Reduce register usage in 4-way SHA-256

commit 80c762b0da (parent 7ca71eb324)
2 changed files with 120 additions and 152 deletions

sha2-x64.S: 252 lines changed
@@ -233,21 +233,21 @@ _sha256_init_4way:
 .endm
 
 .macro sha256_sse2_main_round i
-	movdqa 16*\i(%rax), %xmm6
-	paddd 16*\i(%rcx), %xmm6
-	paddd %xmm10, %xmm6
+	movdqa 16*(\i)(%rax), %xmm6
+	paddd 16*(\i)(%rcx), %xmm6
+	paddd 32(%rsp), %xmm6
 
 	movdqa %xmm0, %xmm1
-	movdqa %xmm9, %xmm2
+	movdqa 16(%rsp), %xmm2
 	pandn %xmm2, %xmm1
 
-	movdqa %xmm2, %xmm10
-	movdqa %xmm8, %xmm2
-	movdqa %xmm2, %xmm9
+	movdqa %xmm2, 32(%rsp)
+	movdqa 0(%rsp), %xmm2
+	movdqa %xmm2, 16(%rsp)
 
 	pand %xmm0, %xmm2
 	pxor %xmm2, %xmm1
-	movdqa %xmm0, %xmm8
+	movdqa %xmm0, 0(%rsp)
 
 	paddd %xmm1, %xmm6
 
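
A reader's annotation, not part of the commit: in this round macro %xmm0 appears to hold the working variable e, while f, g and h, which previously lived in %xmm8, %xmm9 and %xmm10, now live in the three 16-byte spill slots at 0(%rsp), 16(%rsp) and 32(%rsp). Under that inferred assignment, the sequence above computes the SHA-256 choose function and rotates the working variables:

	# Ch(e, f, g) = (e & f) ^ (~e & g), four lanes at a time
	movdqa %xmm0, %xmm1       # xmm1 = e
	movdqa 16(%rsp), %xmm2    # xmm2 = g   (formerly %xmm9)
	pandn %xmm2, %xmm1        # xmm1 = ~e & g
	movdqa %xmm2, 32(%rsp)    # h <- g     (rotate working variables)
	movdqa 0(%rsp), %xmm2     # xmm2 = f   (formerly %xmm8)
	movdqa %xmm2, 16(%rsp)    # g <- f
	pand %xmm0, %xmm2         # xmm2 = e & f
	pxor %xmm2, %xmm1         # xmm1 = Ch(e, f, g)
	movdqa %xmm0, 0(%rsp)     # f <- e
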
@@ -297,6 +297,13 @@ _sha256_init_4way:
 	paddd %xmm6, %xmm7
 .endm
 
+.macro sha256_sse2_main_quadround i
+	sha256_sse2_main_round \i+0
+	sha256_sse2_main_round \i+1
+	sha256_sse2_main_round \i+2
+	sha256_sse2_main_round \i+3
+.endm
+
 
 #if defined(USE_AVX)
 
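
The quadround macro is plain shorthand: with standard GAS .macro expansion, one invocation unrolls into four numbered rounds. It is also why the round macro above switched from 16*\i to 16*(\i): with a compound argument like 4+1, the unparenthesized form would expand to 16*4+1 and address the wrong slot.

	# sha256_sse2_main_quadround 4 expands to:
	sha256_sse2_main_round 4+0    # round 4
	sha256_sse2_main_round 4+1    # round 5
	sha256_sse2_main_round 4+2    # round 6
	sha256_sse2_main_round 4+3    # round 7
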
@@ -969,19 +976,16 @@ _sha256d_ms_4way:
 sha256d_ms_4way_sse2:
 #if defined(WIN64)
 	pushq %rdi
-	subq $80, %rsp
+	subq $32, %rsp
 	movdqa %xmm6, 0(%rsp)
 	movdqa %xmm7, 16(%rsp)
-	movdqa %xmm8, 32(%rsp)
-	movdqa %xmm9, 48(%rsp)
-	movdqa %xmm10, 64(%rsp)
 	pushq %rsi
 	movq %rcx, %rdi
 	movq %rdx, %rsi
 	movq %r8, %rdx
 	movq %r9, %rcx
 #endif
-	subq $1032, %rsp
+	subq $8+67*16, %rsp
 
 	leaq 256(%rsi), %rax
 
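
Frame arithmetic, for reference (inferred from the offsets in this commit): the old frame was $1032 = 8 + 64*16; the new $8+67*16 = 1080 adds three 16-byte spill slots at the bottom, so every other offset in the function shifts up by 48 bytes. The Win64 prologue shrinks because %xmm8-%xmm10, which that ABI makes callee-saved, are no longer touched at all:

	subq $8+67*16, %rsp   # 1080 bytes; was $1032 = 8+64*16
	# 0(%rsp)..47(%rsp)   three new spill slots replacing %xmm8-%xmm10
	# 48+N(%rsp)          everything that previously lived at N(%rsp)
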
@@ -989,8 +993,8 @@ sha256d_ms_4way_sse2_extend_loop1:
 	movdqa 3*16(%rsi), %xmm0
 	movdqa 2*16(%rax), %xmm3
 	movdqa 3*16(%rax), %xmm7
-	movdqa %xmm3, 2*16(%rsp)
-	movdqa %xmm7, 3*16(%rsp)
+	movdqa %xmm3, 5*16(%rsp)
+	movdqa %xmm7, 6*16(%rsp)
 	movdqa %xmm0, %xmm2
 	paddd %xmm0, %xmm7
 	psrld $3, %xmm0
@@ -1008,7 +1012,7 @@ sha256d_ms_4way_sse2_extend_loop1:
 	movdqa %xmm7, 3*16(%rax)
 
 	movdqa 4*16(%rax), %xmm0
-	movdqa %xmm0, 4*16(%rsp)
+	movdqa %xmm0, 7*16(%rsp)
 	movdqa %xmm3, %xmm2
 	movdqa %xmm7, %xmm6
 	psrld $10, %xmm3
@@ -1037,8 +1041,8 @@ sha256d_ms_4way_sse2_extend_loop1:
 
 	movdqa 6*16(%rax), %xmm0
 	movdqa 7*16(%rax), %xmm4
-	movdqa %xmm0, 6*16(%rsp)
-	movdqa %xmm4, 7*16(%rsp)
+	movdqa %xmm0, 9*16(%rsp)
+	movdqa %xmm4, 10*16(%rsp)
 	movdqa %xmm3, %xmm2
 	movdqa %xmm7, %xmm6
 	psrld $10, %xmm3
@@ -1068,7 +1072,7 @@ sha256d_ms_4way_sse2_extend_loop1:
 
 	movdqa 8*16(%rax), %xmm0
 	movdqa 2*16(%rax), %xmm4
-	movdqa %xmm0, 8*16(%rsp)
+	movdqa %xmm0, 11*16(%rsp)
 	movdqa %xmm3, %xmm2
 	movdqa %xmm7, %xmm6
 	psrld $10, %xmm3
@@ -1152,8 +1156,8 @@ sha256d_ms_4way_sse2_extend_loop1:
 
 	movdqa 14*16(%rax), %xmm0
 	movdqa 15*16(%rax), %xmm4
-	movdqa %xmm0, 14*16(%rsp)
-	movdqa %xmm4, 15*16(%rsp)
+	movdqa %xmm0, 17*16(%rsp)
+	movdqa %xmm4, 18*16(%rsp)
 	movdqa %xmm3, %xmm2
 	movdqa %xmm7, %xmm6
 	psrld $10, %xmm3
@@ -1204,12 +1208,15 @@ sha256d_ms_4way_sse2_extend_loop2:
 
 	movdqa 0(%rcx), %xmm3
 	movdqa 16(%rcx), %xmm0
-	movdqa 32(%rcx), %xmm8
-	movdqa 48(%rcx), %xmm9
-	movdqa 64(%rcx), %xmm10
+	movdqa 32(%rcx), %xmm1
+	movdqa 48(%rcx), %xmm2
+	movdqa 64(%rcx), %xmm6
 	movdqa 80(%rcx), %xmm7
 	movdqa 96(%rcx), %xmm5
 	movdqa 112(%rcx), %xmm4
+	movdqa %xmm1, 0(%rsp)
+	movdqa %xmm2, 16(%rsp)
+	movdqa %xmm6, 32(%rsp)
 
 	movq %rsi, %rax
 	leaq sha256_4k(%rip), %rcx
@@ -1221,118 +1228,79 @@ sha256d_ms_4way_sse2_main_loop2:
 	sha256_sse2_main_round 2
 sha256d_ms_4way_sse2_main_loop1:
 	sha256_sse2_main_round 3
-	sha256_sse2_main_round 4
-	sha256_sse2_main_round 5
-	sha256_sse2_main_round 6
-	sha256_sse2_main_round 7
-	sha256_sse2_main_round 8
-	sha256_sse2_main_round 9
-	sha256_sse2_main_round 10
-	sha256_sse2_main_round 11
-	sha256_sse2_main_round 12
-	sha256_sse2_main_round 13
-	sha256_sse2_main_round 14
-	sha256_sse2_main_round 15
-	sha256_sse2_main_round 16
-	sha256_sse2_main_round 17
-	sha256_sse2_main_round 18
-	sha256_sse2_main_round 19
-	sha256_sse2_main_round 20
-	sha256_sse2_main_round 21
-	sha256_sse2_main_round 22
-	sha256_sse2_main_round 23
-	sha256_sse2_main_round 24
-	sha256_sse2_main_round 25
-	sha256_sse2_main_round 26
-	sha256_sse2_main_round 27
-	sha256_sse2_main_round 28
-	sha256_sse2_main_round 29
-	sha256_sse2_main_round 30
-	sha256_sse2_main_round 31
-	sha256_sse2_main_round 32
-	sha256_sse2_main_round 33
-	sha256_sse2_main_round 34
-	sha256_sse2_main_round 35
-	sha256_sse2_main_round 36
-	sha256_sse2_main_round 37
-	sha256_sse2_main_round 38
-	sha256_sse2_main_round 39
-	sha256_sse2_main_round 40
-	sha256_sse2_main_round 41
-	sha256_sse2_main_round 42
-	sha256_sse2_main_round 43
-	sha256_sse2_main_round 44
-	sha256_sse2_main_round 45
-	sha256_sse2_main_round 46
-	sha256_sse2_main_round 47
-	sha256_sse2_main_round 48
-	sha256_sse2_main_round 49
-	sha256_sse2_main_round 50
-	sha256_sse2_main_round 51
-	sha256_sse2_main_round 52
-	sha256_sse2_main_round 53
-	sha256_sse2_main_round 54
-	sha256_sse2_main_round 55
+	sha256_sse2_main_quadround 4
+	sha256_sse2_main_quadround 8
+	sha256_sse2_main_quadround 12
+	sha256_sse2_main_quadround 16
+	sha256_sse2_main_quadround 20
+	sha256_sse2_main_quadround 24
+	sha256_sse2_main_quadround 28
+	sha256_sse2_main_quadround 32
+	sha256_sse2_main_quadround 36
+	sha256_sse2_main_quadround 40
+	sha256_sse2_main_quadround 44
+	sha256_sse2_main_quadround 48
+	sha256_sse2_main_quadround 52
 	sha256_sse2_main_round 56
 	jz sha256d_ms_4way_sse2_finish
 	sha256_sse2_main_round 57
 	sha256_sse2_main_round 58
 	sha256_sse2_main_round 59
-	sha256_sse2_main_round 60
-	sha256_sse2_main_round 61
-	sha256_sse2_main_round 62
-	sha256_sse2_main_round 63
+	sha256_sse2_main_quadround 60
 
-	movdqa 2*16(%rsp), %xmm1
-	movdqa 3*16(%rsp), %xmm2
-	movdqa 4*16(%rsp), %xmm6
+	movdqa 5*16(%rsp), %xmm1
+	movdqa 6*16(%rsp), %xmm2
+	movdqa 7*16(%rsp), %xmm6
 	movdqa %xmm1, 18*16(%rsi)
 	movdqa %xmm2, 19*16(%rsi)
 	movdqa %xmm6, 20*16(%rsi)
-	movdqa 6*16(%rsp), %xmm1
-	movdqa 7*16(%rsp), %xmm2
-	movdqa 8*16(%rsp), %xmm6
+	movdqa 9*16(%rsp), %xmm1
+	movdqa 10*16(%rsp), %xmm2
+	movdqa 11*16(%rsp), %xmm6
 	movdqa %xmm1, 22*16(%rsi)
 	movdqa %xmm2, 23*16(%rsi)
 	movdqa %xmm6, 24*16(%rsi)
-	movdqa 14*16(%rsp), %xmm1
-	movdqa 15*16(%rsp), %xmm2
+	movdqa 17*16(%rsp), %xmm1
+	movdqa 18*16(%rsp), %xmm2
 	movdqa %xmm1, 30*16(%rsi)
 	movdqa %xmm2, 31*16(%rsi)
 
+	movdqa 0(%rsp), %xmm1
+	movdqa 16(%rsp), %xmm2
+	movdqa 32(%rsp), %xmm6
 	paddd 0(%rdx), %xmm7
 	paddd 16(%rdx), %xmm5
 	paddd 32(%rdx), %xmm4
 	paddd 48(%rdx), %xmm3
 	paddd 64(%rdx), %xmm0
-	paddd 80(%rdx), %xmm8
-	paddd 96(%rdx), %xmm9
-	paddd 112(%rdx), %xmm10
+	paddd 80(%rdx), %xmm1
+	paddd 96(%rdx), %xmm2
+	paddd 112(%rdx), %xmm6
 
-	movdqa %xmm7, 0(%rsp)
-	movdqa %xmm5, 16(%rsp)
-	movdqa %xmm4, 32(%rsp)
-	movdqa %xmm3, 48(%rsp)
-	movdqa %xmm0, 64(%rsp)
-	movdqa %xmm8, 80(%rsp)
-	movdqa %xmm9, 96(%rsp)
-	movdqa %xmm10, 112(%rsp)
+	movdqa %xmm7, 48+0(%rsp)
+	movdqa %xmm5, 48+16(%rsp)
+	movdqa %xmm4, 48+32(%rsp)
+	movdqa %xmm3, 48+48(%rsp)
+	movdqa %xmm0, 48+64(%rsp)
+	movdqa %xmm1, 48+80(%rsp)
+	movdqa %xmm2, 48+96(%rsp)
+	movdqa %xmm6, 48+112(%rsp)
 
 	pxor %xmm0, %xmm0
 	movq $0x8000000000000100, %rax
 	movd %rax, %xmm1
 	pshufd $0x55, %xmm1, %xmm2
 	pshufd $0x00, %xmm1, %xmm1
-	movdqa %xmm2, 128(%rsp)
-	movdqa %xmm0, 144(%rsp)
-	movdqa %xmm0, 160(%rsp)
-	movdqa %xmm0, 176(%rsp)
-	movdqa %xmm0, 192(%rsp)
-	movdqa %xmm0, 208(%rsp)
-	movdqa %xmm0, 224(%rsp)
-	movdqa %xmm1, 240(%rsp)
+	movdqa %xmm2, 48+128(%rsp)
+	movdqa %xmm0, 48+144(%rsp)
+	movdqa %xmm0, 48+160(%rsp)
+	movdqa %xmm0, 48+176(%rsp)
+	movdqa %xmm0, 48+192(%rsp)
+	movdqa %xmm0, 48+208(%rsp)
+	movdqa %xmm0, 48+224(%rsp)
+	movdqa %xmm1, 48+240(%rsp)
 
-	leaq 256(%rsp), %rax
+	leaq 19*16(%rsp), %rax
 	cmpq %rax, %rax
 
 	movdqa -15*16(%rax), %xmm0
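
Two reader's notes on the epilogue above (inferred, not stated in the commit): the three added loads recover f, g and h from their spill slots before the state addition, with %rdx apparently pointing at the saved midstate, and leaq 19*16(%rsp) is the old leaq 256(%rsp) shifted past the new slots, since 19*16 = 48 + 256:

	movdqa 0(%rsp), %xmm1     # reload f (previously live in %xmm8)
	movdqa 16(%rsp), %xmm2    # reload g (previously live in %xmm9)
	movdqa 32(%rsp), %xmm6    # reload h (previously live in %xmm10)
	# ... the adds for the other state words elided ...
	paddd 80(%rdx), %xmm1     # f += midstate word 5
	paddd 96(%rdx), %xmm2     # g += midstate word 6
	paddd 112(%rdx), %xmm6    # h += midstate word 7
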
@@ -1550,61 +1518,63 @@ sha256d_ms_4way_sse2_extend_coda2:
 	movdqa sha256_4h+32(%rip), %xmm4
 	movdqa sha256_4h+48(%rip), %xmm3
 	movdqa sha256_4h+64(%rip), %xmm0
-	movdqa sha256_4h+80(%rip), %xmm8
-	movdqa sha256_4h+96(%rip), %xmm9
-	movdqa sha256_4h+112(%rip), %xmm10
+	movdqa sha256_4h+80(%rip), %xmm1
+	movdqa sha256_4h+96(%rip), %xmm2
+	movdqa sha256_4h+112(%rip), %xmm6
+	movdqa %xmm1, 0(%rsp)
+	movdqa %xmm2, 16(%rsp)
+	movdqa %xmm6, 32(%rsp)
 
-	movq %rsp, %rax
+	leaq 48(%rsp), %rax
 	leaq sha256_4k(%rip), %rcx
 	jmp sha256d_ms_4way_sse2_main_loop2
 
-.macro sha256_sse2_main_round_red i, r0, r1, r2, r3, r4
+.macro sha256_sse2_main_round_red i, r7
 	movdqa 16*\i(%rax), %xmm6
 	paddd 16*\i(%rcx), %xmm6
-	paddd \r0, %xmm6
-	movdqa \r3, %xmm1
-	movdqa \r1, %xmm2
+	paddd 32(%rsp), %xmm6
+	movdqa %xmm0, %xmm1
+	movdqa 16(%rsp), %xmm2
+	paddd \r7, %xmm6
 	pandn %xmm2, %xmm1
-	movdqa \r2, %xmm2
-	pand \r3, %xmm2
+	movdqa %xmm2, 32(%rsp)
+	movdqa 0(%rsp), %xmm2
+	movdqa %xmm2, 16(%rsp)
+	pand %xmm0, %xmm2
 	pxor %xmm2, %xmm1
-	movdqa \r3, \r0
+	movdqa %xmm0, 0(%rsp)
 	paddd %xmm1, %xmm6
-	movdqa \r3, %xmm1
-	psrld $6, \r0
-	movdqa \r0, %xmm2
+	movdqa %xmm0, %xmm1
+	psrld $6, %xmm0
+	movdqa %xmm0, %xmm2
 	pslld $7, %xmm1
 	psrld $5, %xmm2
-	pxor %xmm1, \r0
-	pxor %xmm2, \r0
+	pxor %xmm1, %xmm0
+	pxor %xmm2, %xmm0
 	pslld $14, %xmm1
 	psrld $14, %xmm2
-	pxor %xmm1, \r0
-	pxor %xmm2, \r0
+	pxor %xmm1, %xmm0
+	pxor %xmm2, %xmm0
 	pslld $5, %xmm1
-	pxor %xmm1, \r0
-	paddd %xmm6, \r0
-	paddd \r4, \r0
+	pxor %xmm1, %xmm0
+	paddd %xmm6, %xmm0
 .endm
 
 sha256d_ms_4way_sse2_finish:
-	sha256_sse2_main_round_red 57, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3
-	sha256_sse2_main_round_red 58, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4
-	sha256_sse2_main_round_red 59, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5
-	sha256_sse2_main_round_red 60, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7
+	sha256_sse2_main_round_red 57, %xmm3
+	sha256_sse2_main_round_red 58, %xmm4
+	sha256_sse2_main_round_red 59, %xmm5
+	sha256_sse2_main_round_red 60, %xmm7
 
 	paddd sha256_4h+112(%rip), %xmm0
 	movdqa %xmm0, 112(%rdi)
 
-	addq $1032, %rsp
+	addq $8+67*16, %rsp
 #if defined(WIN64)
 	popq %rsi
 	movdqa 0(%rsp), %xmm6
 	movdqa 16(%rsp), %xmm7
-	movdqa 32(%rsp), %xmm8
-	movdqa 48(%rsp), %xmm9
-	movdqa 64(%rsp), %xmm10
-	addq $80, %rsp
+	addq $32, %rsp
 	popq %rdi
 #endif
 	ret
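
In the reduced final-round macro the same transformation is visible (a reader's inference): e stays in %xmm0 and f, g, h come from the stack slots, so the four rotation parameters r0..r3 vanish. The one surviving parameter, renamed r7, is the per-round addend, now folded into the %xmm6 accumulator early (paddd \r7, %xmm6) rather than added into the result at the end as the old paddd \r4, \r0:

	# only the addend still varies between the four final rounds:
	sha256_sse2_main_round_red 57, %xmm3
	sha256_sse2_main_round_red 58, %xmm4
	sha256_sse2_main_round_red 59, %xmm5
	sha256_sse2_main_round_red 60, %xmm7
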
@@ -2596,13 +2566,13 @@ _sha256_use_4way:
 	cpuid
 	andl $0x18000000, %ecx
 	cmpl $0x18000000, %ecx
-	jne sha256_use_4way_sse2
+	jne sha256_use_4way_base
 	# Check for XMM and YMM state support
 	xorl %ecx, %ecx
 	xgetbv
 	andl $0x00000006, %eax
 	cmpl $0x00000006, %eax
-	jne sha256_use_4way_sse2
+	jne sha256_use_4way_base
 #if defined(USE_XOP)
 	# Check for XOP support
 	movl $0x80000001, %eax
@@ -2622,7 +2592,7 @@ sha256_use_4way_avx:
 	jmp sha256_use_4way_done
 #endif /* USE_AVX */
 
-sha256_use_4way_sse2:
+sha256_use_4way_base:
 	leaq sha256d_ms_4way_sse2(%rip), %rcx
 	leaq sha256_transform_4way_core_sse2(%rip), %rdx
 
sha2-x86.S: 20 lines changed
@@ -115,25 +115,23 @@ sha256d_4preext2_30:
 	.globl _sha256_init_4way
 sha256_init_4way:
 _sha256_init_4way:
-	pushl %edi
-	movl 8(%esp), %edi
+	movl 8(%esp), %edx
 	movdqa sha256_4h+0, %xmm0
 	movdqa sha256_4h+16, %xmm1
 	movdqa sha256_4h+32, %xmm2
 	movdqa sha256_4h+48, %xmm3
-	movdqu %xmm0, 0(%edi)
-	movdqu %xmm1, 16(%edi)
-	movdqu %xmm2, 32(%edi)
-	movdqu %xmm3, 48(%edi)
+	movdqu %xmm0, 0(%edx)
+	movdqu %xmm1, 16(%edx)
+	movdqu %xmm2, 32(%edx)
+	movdqu %xmm3, 48(%edx)
 	movdqa sha256_4h+64, %xmm0
 	movdqa sha256_4h+80, %xmm1
 	movdqa sha256_4h+96, %xmm2
 	movdqa sha256_4h+112, %xmm3
-	movdqu %xmm0, 64(%edi)
-	movdqu %xmm1, 80(%edi)
-	movdqu %xmm2, 96(%edi)
-	movdqu %xmm3, 112(%edi)
-	popl %edi
+	movdqu %xmm0, 64(%edx)
+	movdqu %xmm1, 80(%edx)
+	movdqu %xmm2, 96(%edx)
+	movdqu %xmm3, 112(%edx)
 	ret
 
 
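The 32-bit file gets the same treatment in miniature (a reader's note): the output pointer moves from %edi, which is callee-saved and therefore forced the old pushl/popl pair, into %edx, which the cdecl convention lets the function clobber freely:

	movl 8(%esp), %edx    # load the output pointer into %edx; %edx is
	                      # call-clobbered under cdecl, so no save/restore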