/*
 * Copyright 2012-2015 pooler@litecoinpool.org
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version. See COPYING for more details.
 */

#include "cpuminer-config.h"

#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif

#if defined(USE_ASM) && defined(__x86_64__)

	.data
	.p2align 4
/* SHA-256 initial hash values H0..H7 (FIPS 180-4) */
sha256_h:
	.long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
	.long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19

	.data
	.p2align 6
/* SHA-256 round constants K0..K63 */
sha256_k:
	.long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2

/* pshufb mask that byte-swaps each 32-bit word of an XMM register */
bswap_xmm_mask:
	.long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
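
/*
 * sha256_mixed_quadround: four scalar SHA-256 rounds (working variables in
 * \ra..\rh, with W[i]+K[i] already summed into 0..3*4(%rsp)) interleaved
 * with SSE2 message-schedule work that computes the next four schedule
 * words into \x0.  Descriptive comment added; register roles are inferred
 * from the code below, not taken from any original documentation.
 */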
.macro sha256_mixed_quadround ra, rb, rc, rd, re, rf, rg, rh, x0, x1, x2, x3
	movdqa \x3, %xmm4
	movl \re, %eax
	movdqa \x2, %xmm6
	rorl $(25-11), %eax
	movl \ra, %ebx
	pslldq $12, %xmm4
	rorl $(22-13), %ebx
	psrldq $4, %xmm6
	xorl \re, %eax
	movl \rf, %ecx
	rorl $(11-6), %eax
	pxor %xmm6, %xmm4
	movdqa \x1, %xmm5
	xorl \ra, %ebx
	xorl \rg, %ecx
	xorl \re, %eax
	paddd \x0, %xmm4
	movdqa \x0, %xmm7
	andl \re, %ecx
	rorl $(13-2), %ebx
	xorl \ra, %ebx
	pslldq $12, %xmm5
	psrldq $4, %xmm7
	rorl $6, %eax
	xorl \rg, %ecx
	pxor %xmm7, %xmm5
	rorl $2, %ebx
	addl %eax, %ecx
	addl (%rsp), %ecx
	movdqa %xmm5, %xmm6
	movl \ra, %eax
	addl %ecx, \rh
	movl \ra, %ecx
	movdqa %xmm5, %xmm7
	orl \rc, %eax
	addl \rh, \rd
	andl \rc, %ecx
	pslld $(32-7), %xmm5
	psrld $7, %xmm6
	andl \rb, %eax
	addl %ebx, \rh
	orl %ecx, %eax
	por %xmm6, %xmm5
	addl %eax, \rh

	movl \rd, %eax
	movdqa %xmm7, %xmm6
	movl \rh, %ebx
	rorl $(25-11), %eax
	xorl \rd, %eax
	movdqa %xmm7, %xmm8
	movl \re, %ecx
	rorl $(22-13), %ebx
	xorl \rh, %ebx
	pslld $(32-18), %xmm7
	rorl $(11-6), %eax
	xorl \rf, %ecx
	rorl $(13-2), %ebx
	psrld $18, %xmm6
	xorl \rd, %eax
	andl \rd, %ecx
	rorl $6, %eax
	pxor %xmm7, %xmm5
	xorl \rh, %ebx
	xorl \rf, %ecx
	psrld $3, %xmm8
	addl %eax, %ecx
	addl 1*4(%rsp), %ecx
	rorl $2, %ebx
	pxor %xmm6, %xmm5
	movl \rh, %eax
	addl %ecx, \rg
	movl \rh, %ecx
	pxor %xmm8, %xmm5
	orl \rb, %eax
	addl \rg, \rc
	andl \rb, %ecx
	pshufd $0xfa, \x3, %xmm6
	andl \ra, %eax
	addl %ebx, \rg
	paddd %xmm5, %xmm4
	orl %ecx, %eax
	addl %eax, \rg

	movl \rc, %eax
	movdqa %xmm6, %xmm7
	movl \rg, %ebx
	rorl $(25-11), %eax
	xorl \rc, %eax
	movdqa %xmm6, %xmm8
	rorl $(22-13), %ebx
	movl \rd, %ecx
	xorl \rg, %ebx
	psrlq $17, %xmm6
	psrlq $19, %xmm7
	rorl $(11-6), %eax
	xorl \re, %ecx
	xorl \rc, %eax
	psrld $10, %xmm8
	pxor %xmm7, %xmm6
	andl \rc, %ecx
	rorl $(13-2), %ebx
	xorl \rg, %ebx
	pxor %xmm6, %xmm8
	xorl \re, %ecx
	rorl $6, %eax
	addl %eax, %ecx
	pshufd $0x8f, %xmm8, %xmm8
	rorl $2, %ebx
	addl 2*4(%rsp), %ecx
	movl \rg, %eax
	psrldq $8, %xmm8
	addl %ecx, \rf
	movl \rg, %ecx
	orl \ra, %eax
	paddd %xmm8, %xmm4
	addl \rf, \rb
	andl \ra, %ecx
	andl \rh, %eax
	pshufd $0x50, %xmm4, %xmm6
	addl %ebx, \rf
	orl %ecx, %eax
	addl %eax, \rf

	movdqa %xmm6, %xmm7
	movl \rb, %eax
	rorl $(25-11), %eax
	movl \rf, %ebx
	movdqa %xmm6, \x0
	rorl $(22-13), %ebx
	xorl \rb, %eax
	movl \rc, %ecx
	psrlq $17, %xmm6
	rorl $(11-6), %eax
	xorl \rf, %ebx
	xorl \rd, %ecx
	psrlq $19, %xmm7
	xorl \rb, %eax
	andl \rb, %ecx
	rorl $(13-2), %ebx
	psrld $10, \x0
	xorl \rf, %ebx
	rorl $6, %eax
	pxor %xmm7, %xmm6
	xorl \rd, %ecx
	rorl $2, %ebx
	addl %eax, %ecx
	pxor %xmm6, \x0
	addl 3*4(%rsp), %ecx
	movl \rf, %eax
	addl %ecx, \re
	pshufd $0xf8, \x0, \x0
	movl \rf, %ecx
	orl \rh, %eax
	addl \re, \ra
	pslldq $8, \x0
	andl \rh, %ecx
	andl \rg, %eax
	paddd %xmm4, \x0
	addl %ebx, \re
	orl %ecx, %eax
	addl %eax, \re
.endm

/*
 * sha256_main_round: one scalar SHA-256 round.  \ra..\rh are the working
 * variables a..h; the pre-added W[i]+K[i] word is taken from \i*4(%rsp).
 */
.macro sha256_main_round i, ra, rb, rc, rd, re, rf, rg, rh
	movl \re, %eax
	rorl $(25-11), %eax
	movl \ra, %ebx
	xorl \re, %eax
	rorl $(22-13), %ebx
	movl \rf, %ecx
	xorl \ra, %ebx
	rorl $(11-6), %eax
	xorl \rg, %ecx
	xorl \re, %eax
	rorl $(13-2), %ebx
	andl \re, %ecx
	xorl \ra, %ebx
	rorl $6, %eax
	xorl \rg, %ecx
	addl %eax, %ecx
	rorl $2, %ebx
	addl \i*4(%rsp), %ecx
	movl \ra, %eax
	addl %ecx, \rh
	movl \ra, %ecx
	orl \rc, %eax
	addl \rh, \rd
	andl \rc, %ecx
	andl \rb, %eax
	addl %ebx, \rh
	orl %ecx, %eax
	addl %eax, \rh
.endm

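/*
 * sha256_transform_sse2:
 *   %rdi: pointer to the eight 32-bit state words (updated in place)
 *   %rsi: pointer to the 64-byte message block
 *   %rdx: if non-zero, byte-swap the block words before hashing
 *
 * Presumed C prototype, inferred from how the registers are used below
 * rather than from a header file:
 *
 *   void sha256_transform(uint32_t *state, const uint32_t *block, int swap);
 */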
	.text
	.p2align 6
sha256_transform_sse2:
	pushq %rbx
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq %rdi
	pushq %rsi
	subq $5*16, %rsp
	movdqa %xmm6, 1*16(%rsp)
	movdqa %xmm7, 2*16(%rsp)
	movdqa %xmm8, 3*16(%rsp)
	movdqa %xmm9, 4*16(%rsp)
	movq %rcx, %rdi
	movq %rdx, %rsi
	movq %r8, %rdx
#else
	subq $16, %rsp
#endif

	movl 0*4(%rdi), %r8d
	movl 1*4(%rdi), %r9d
	movl 2*4(%rdi), %r10d
	movl 3*4(%rdi), %r11d
	movl 4*4(%rdi), %r12d
	movl 5*4(%rdi), %r13d
	movl 6*4(%rdi), %r14d
	movl 7*4(%rdi), %r15d

	testq %rdx, %rdx
	jnz sha256_transform_sse2_swap

	movdqu 0*16(%rsi), %xmm0
	movdqu 1*16(%rsi), %xmm1
	movdqu 2*16(%rsi), %xmm2
	movdqu 3*16(%rsi), %xmm3
	jmp sha256_transform_sse2_core

sha256_transform_sse2_swap:
	movdqu 0*16(%rsi), %xmm0
	movdqu 1*16(%rsi), %xmm1
	movdqu 2*16(%rsi), %xmm2
	movdqu 3*16(%rsi), %xmm3
	pshuflw $0xb1, %xmm0, %xmm0
	pshuflw $0xb1, %xmm1, %xmm1
	pshuflw $0xb1, %xmm2, %xmm2
	pshuflw $0xb1, %xmm3, %xmm3
	pshufhw $0xb1, %xmm0, %xmm0
	pshufhw $0xb1, %xmm1, %xmm1
	pshufhw $0xb1, %xmm2, %xmm2
	pshufhw $0xb1, %xmm3, %xmm3
	movdqa %xmm0, %xmm4
	movdqa %xmm1, %xmm5
	movdqa %xmm2, %xmm6
	movdqa %xmm3, %xmm7
	psrlw $8, %xmm4
	psrlw $8, %xmm5
	psrlw $8, %xmm6
	psrlw $8, %xmm7
	psllw $8, %xmm0
	psllw $8, %xmm1
	psllw $8, %xmm2
	psllw $8, %xmm3
	pxor %xmm4, %xmm0
	pxor %xmm5, %xmm1
	pxor %xmm6, %xmm2
	pxor %xmm7, %xmm3

sha256_transform_sse2_core:
	leaq sha256_k(%rip), %rdx
	movq $48, %rsi
	.p2align 4
sha256_transform_sse2_loop:
	movdqa 0*16(%rdx), %xmm9
	paddd %xmm0, %xmm9
	movdqa %xmm9, (%rsp)
	sha256_mixed_quadround %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %xmm0, %xmm1, %xmm2, %xmm3
	movdqa 1*16(%rdx), %xmm9
	paddd %xmm1, %xmm9
	movdqa %xmm9, (%rsp)
	sha256_mixed_quadround %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %xmm1, %xmm2, %xmm3, %xmm0
	movdqa 2*16(%rdx), %xmm9
	paddd %xmm2, %xmm9
	movdqa %xmm9, (%rsp)
	sha256_mixed_quadround %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %xmm2, %xmm3, %xmm0, %xmm1
	movdqa 3*16(%rdx), %xmm9
	paddd %xmm3, %xmm9
	movdqa %xmm9, (%rsp)
	addq $4*16, %rdx
	sha256_mixed_quadround %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %xmm3, %xmm0, %xmm1, %xmm2

	subq $16, %rsi
	jne sha256_transform_sse2_loop

	paddd 0*16(%rdx), %xmm0
	movdqa %xmm0, (%rsp)
	sha256_main_round 0, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d
	sha256_main_round 1, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d
	sha256_main_round 2, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d
	sha256_main_round 3, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d
	paddd 1*16(%rdx), %xmm1
	movdqa %xmm1, (%rsp)
	sha256_main_round 0, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d
	sha256_main_round 1, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d
	sha256_main_round 2, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d
	sha256_main_round 3, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d
	paddd 2*16(%rdx), %xmm2
	movdqa %xmm2, (%rsp)
	sha256_main_round 0, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d
	sha256_main_round 1, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d
	sha256_main_round 2, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d
	sha256_main_round 3, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d
	paddd 3*16(%rdx), %xmm3
	movdqa %xmm3, (%rsp)
	sha256_main_round 0, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d
	sha256_main_round 1, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d
	sha256_main_round 2, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d
	sha256_main_round 3, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d

	addl %r8d, 0*4(%rdi)
	addl %r9d, 1*4(%rdi)
	addl %r10d, 2*4(%rdi)
	addl %r11d, 3*4(%rdi)
	addl %r12d, 4*4(%rdi)
	addl %r13d, 5*4(%rdi)
	addl %r14d, 6*4(%rdi)
	addl %r15d, 7*4(%rdi)

#if defined(_WIN64) || defined(__CYGWIN__)
	movdqa 1*16(%rsp), %xmm6
	movdqa 2*16(%rsp), %xmm7
	movdqa 3*16(%rsp), %xmm8
	movdqa 4*16(%rsp), %xmm9
	addq $5*16, %rsp
	popq %rsi
	popq %rdi
#else
	addq $16, %rsp
#endif
	popq %r15
	popq %r14
	popq %r13
	popq %r12
	popq %rbx
	ret

	.text
	.p2align 6
/*
 * Same interface as sha256_transform_sse2, implemented with the XSHA256
 * instruction of the VIA PadLock Hash Engine ("phe" is assumed to stand for
 * PadLock Hash Engine; the instruction is emitted as raw bytes below).  The
 * block is first staged in a 64-byte-aligned stack buffer.
 */
sha256_transform_phe:
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq %rdi
	pushq %rsi
	movq %rcx, %rdi
	movq %rdx, %rsi
	movq %r8, %rdx
#endif
	movq %rsp, %r8
	subq $64, %rsp
	andq $-64, %rsp

	testq %rdx, %rdx
	jnz sha256_transform_phe_noswap

	movl 0*4(%rsi), %eax
	movl 1*4(%rsi), %ecx
	movl 2*4(%rsi), %edx
	movl 3*4(%rsi), %r9d
	bswapl %eax
	bswapl %ecx
	bswapl %edx
	bswapl %r9d
	movl %eax, 0*4(%rsp)
	movl %ecx, 1*4(%rsp)
	movl %edx, 2*4(%rsp)
	movl %r9d, 3*4(%rsp)
	movl 4*4(%rsi), %eax
	movl 5*4(%rsi), %ecx
	movl 6*4(%rsi), %edx
	movl 7*4(%rsi), %r9d
	bswapl %eax
	bswapl %ecx
	bswapl %edx
	bswapl %r9d
	movl %eax, 4*4(%rsp)
	movl %ecx, 5*4(%rsp)
	movl %edx, 6*4(%rsp)
	movl %r9d, 7*4(%rsp)

	movdqu 2*16(%rsi), %xmm0
	movdqu 3*16(%rsi), %xmm2
	pshuflw $0xb1, %xmm0, %xmm0
	pshuflw $0xb1, %xmm2, %xmm2
	pshufhw $0xb1, %xmm0, %xmm0
	pshufhw $0xb1, %xmm2, %xmm2
	movdqa %xmm0, %xmm1
	movdqa %xmm2, %xmm3
	psrlw $8, %xmm1
	psrlw $8, %xmm3
	psllw $8, %xmm0
	psllw $8, %xmm2
	pxor %xmm1, %xmm0
	pxor %xmm3, %xmm2
	movdqa %xmm0, 2*16(%rsp)
	movdqa %xmm2, 3*16(%rsp)

	jmp sha256_transform_phe_core

sha256_transform_phe_noswap:
	movdqu 0*16(%rsi), %xmm0
	movdqu 1*16(%rsi), %xmm1
	movdqu 2*16(%rsi), %xmm2
	movdqu 3*16(%rsi), %xmm3
	movdqa %xmm0, 0*16(%rsp)
	movdqa %xmm1, 1*16(%rsp)
	movdqa %xmm2, 2*16(%rsp)
	movdqa %xmm3, 3*16(%rsp)

sha256_transform_phe_core:
	movq %rsp, %rsi
	movq $-1, %rax
	movq $1, %rcx
	/* rep xsha256 */
	.byte 0xf3, 0x0f, 0xa6, 0xd0

	movq %r8, %rsp
#if defined(_WIN64) || defined(__CYGWIN__)
	popq %rsi
	popq %rdi
#endif
	ret

	.data
	.p2align 3
/*
 * sha256_transform is called through this pointer.  It defaults to the SSE2
 * routine; presumably the run-time CPU detection that lives outside this
 * excerpt rewrites it to sha256_transform_phe on PadLock-capable CPUs.
 */
sha256_transform_addr:
	.quad sha256_transform_sse2

	.text
	.p2align 3
	.globl sha256_transform
	.globl _sha256_transform
sha256_transform:
_sha256_transform:
	jmp *sha256_transform_addr(%rip)


	.text
	.p2align 6
	.globl sha256d_ms
	.globl _sha256d_ms
sha256d_ms:
_sha256d_ms:
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq %rdi
	pushq %rsi
	movq %rcx, %rdi
	movq %rdx, %rsi
	movq %r8, %rdx
#endif
	movq %rsp, %r8
	subq $32, %rsp
	andq $-32, %rsp

	movdqa 0*16(%rdx), %xmm0
	movdqa 1*16(%rdx), %xmm1
	movdqa %xmm0, 0*16(%rdi)
	movdqa %xmm1, 1*16(%rdi)

	movl 0*4(%rsi), %eax
	movl 1*4(%rsi), %ecx
	movl 2*4(%rsi), %edx
	movl 3*4(%rsi), %r9d
	bswapl %eax
	bswapl %ecx
	bswapl %edx
	bswapl %r9d
	movl %eax, 0*4(%rsp)
	movl %ecx, 1*4(%rsp)
	movl %edx, 2*4(%rsp)
	movl %r9d, 3*4(%rsp)

	movq %rsp, %rsi
	movl $64, %eax
	movl $80, %ecx
	/* rep xsha256 */
	.byte 0xf3, 0x0f, 0xa6, 0xd0

	movdqa bswap_xmm_mask(%rip), %xmm1
	movdqa 0*16(%rdi), %xmm0
	movdqa 1*16(%rdi), %xmm2
	pshufb %xmm1, %xmm0
	pshufb %xmm1, %xmm2
	movdqa %xmm0, 0*16(%rsp)
	movdqa %xmm2, 1*16(%rsp)

	movdqa sha256_h+0*16(%rip), %xmm0
	movdqa sha256_h+1*16(%rip), %xmm1
	movdqa %xmm0, 0*16(%rdi)
	movdqa %xmm1, 1*16(%rdi)

	movq %rsp, %rsi
	xorq %rax, %rax
	movl $32, %ecx
	/* rep xsha256 */
	.byte 0xf3, 0x0f, 0xa6, 0xd0

	movq %r8, %rsp
#if defined(_WIN64) || defined(__CYGWIN__)
	popq %rsi
	popq %rdi
#endif
	ret
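
/*
 * As used above, sha256d_ms computes the double SHA-256 of an 80-byte block
 * header with the PadLock engine: %rdx supplies the midstate of the first
 * 64 bytes (copied into the output buffer before the first XSHA256), %rsi
 * supplies the remaining header words (only the first four are read), and
 * %rdi receives the final hash.  This note is inferred from the register
 * usage in this routine, not from an authoritative header file.
 */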
2012-03-12 13:32:11 +01:00
|
|
|
.data
|
2012-03-21 23:07:56 +01:00
|
|
|
.p2align 7
|
2012-03-12 13:32:11 +01:00
|
|
|
sha256_4h:
|
|
|
|
.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
|
|
|
|
.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
|
|
|
|
.long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
|
|
|
|
.long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
|
|
|
|
.long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
|
|
|
|
.long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
|
|
|
|
.long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
|
|
|
|
.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
|
|
|
|
|
|
|
|
.data
|
2012-03-21 23:07:56 +01:00
|
|
|
.p2align 7
|
2012-03-12 13:32:11 +01:00
|
|
|
sha256_4k:
|
|
|
|
.long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
|
|
|
|
.long 0x71374491, 0x71374491, 0x71374491, 0x71374491
|
|
|
|
.long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
|
|
|
|
.long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
|
|
|
|
.long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
|
|
|
|
.long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
|
|
|
|
.long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
|
|
|
|
.long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
|
|
|
|
.long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
|
|
|
|
.long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
|
|
|
|
.long 0x243185be, 0x243185be, 0x243185be, 0x243185be
|
|
|
|
.long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
|
|
|
|
.long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
|
|
|
|
.long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
|
|
|
|
.long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
|
|
|
|
.long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
|
|
|
|
.long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
|
|
|
|
.long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
|
|
|
|
.long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
|
|
|
|
.long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
|
|
|
|
.long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
|
|
|
|
.long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
|
|
|
|
.long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
|
|
|
|
.long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
|
|
|
|
.long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
|
|
|
|
.long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
|
|
|
|
.long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
|
|
|
|
.long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
|
|
|
|
.long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
|
|
|
|
.long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
|
|
|
|
.long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
|
|
|
|
.long 0x14292967, 0x14292967, 0x14292967, 0x14292967
|
|
|
|
.long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
|
|
|
|
.long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
|
|
|
|
.long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
|
|
|
|
.long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
|
|
|
|
.long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
|
|
|
|
.long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
|
|
|
|
.long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
|
|
|
|
.long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
|
|
|
|
.long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
|
|
|
|
.long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
|
|
|
|
.long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
|
|
|
|
.long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
|
|
|
|
.long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
|
|
|
|
.long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
|
|
|
|
.long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
|
|
|
|
.long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
|
|
|
|
.long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
|
|
|
|
.long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
|
|
|
|
.long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
|
|
|
|
.long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
|
|
|
|
.long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
|
|
|
|
.long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
|
|
|
|
.long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
|
|
|
|
.long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
|
|
|
|
.long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
|
|
|
|
.long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
|
|
|
|
.long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
|
|
|
|
.long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
|
|
|
|
.long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
|
|
|
|
.long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
|
|
|
|
.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
|
|
|
|
.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
|
|
|
|
|
2012-03-30 00:40:41 +02:00
|
|
|
.data
|
|
|
|
.p2align 6
|
|
|
|
sha256d_4preext2_17:
|
|
|
|
.long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000
|
|
|
|
sha256d_4preext2_23:
|
|
|
|
.long 0x11002000, 0x11002000, 0x11002000, 0x11002000
|
|
|
|
sha256d_4preext2_24:
|
|
|
|
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
|
|
|
|
sha256d_4preext2_30:
|
|
|
|
.long 0x00400022, 0x00400022, 0x00400022, 0x00400022
|
|
|
|
|
2012-03-25 15:43:49 +02:00
|
|
|
|
2013-07-05 18:25:34 +02:00
|
|
|
#ifdef USE_AVX2
|
|
|
|
|
|
|
|
.data
|
|
|
|
.p2align 7
|
|
|
|
sha256_8h:
|
|
|
|
.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
|
|
|
|
.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
|
|
|
|
.long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
|
|
|
|
.long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
|
|
|
|
.long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
|
|
|
|
.long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
|
|
|
|
.long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
|
|
|
|
.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
|
|
|
|
|
|
|
|
.data
|
|
|
|
.p2align 7
|
|
|
|
sha256_8k:
|
|
|
|
.long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
|
|
|
|
.long 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491
|
|
|
|
.long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
|
|
|
|
.long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
|
|
|
|
.long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
|
|
|
|
.long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
|
|
|
|
.long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
|
|
|
|
.long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
|
|
|
|
.long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
|
|
|
|
.long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
|
|
|
|
.long 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be
|
|
|
|
.long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
|
|
|
|
.long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
|
|
|
|
.long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
|
|
|
|
.long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
|
|
|
|
.long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
|
|
|
|
.long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
|
|
|
|
.long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
|
|
|
|
.long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
|
|
|
|
.long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
|
|
|
|
.long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
|
|
|
|
.long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
|
|
|
|
.long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
|
|
|
|
.long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
|
|
|
|
.long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
|
|
|
|
.long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
|
|
|
|
.long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
|
|
|
|
.long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
|
|
|
|
.long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
|
|
|
|
.long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
|
|
|
|
.long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
|
|
|
|
.long 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967
|
|
|
|
.long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
|
|
|
|
.long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
|
|
|
|
.long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
|
|
|
|
.long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
|
|
|
|
.long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
|
|
|
|
.long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
|
|
|
|
.long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
|
|
|
|
.long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
|
|
|
|
.long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
|
|
|
|
.long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
|
|
|
|
.long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
|
|
|
|
.long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
|
|
|
|
.long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
|
|
|
|
.long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
|
|
|
|
.long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
|
|
|
|
.long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
|
|
|
|
.long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
|
|
|
|
.long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
|
|
|
|
.long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
|
|
|
|
.long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
|
|
|
|
.long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
|
|
|
|
.long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
|
|
|
|
.long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
|
|
|
|
.long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
|
|
|
|
.long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
|
|
|
|
.long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
|
|
|
|
.long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
|
|
|
|
.long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
|
|
|
|
.long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
|
|
|
|
.long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
|
|
|
|
.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
|
|
|
|
.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
|
|
|
|
|
|
|
|
.data
|
|
|
|
.p2align 6
|
|
|
|
sha256d_8preext2_17:
|
|
|
|
.long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000
|
|
|
|
sha256d_8preext2_23:
|
|
|
|
.long 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000
|
|
|
|
sha256d_8preext2_24:
|
|
|
|
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000
|
|
|
|
sha256d_8preext2_30:
|
|
|
|
.long 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022
|
|
|
|
|
|
|
|
#endif /* USE_AVX2 */
|
|
|
|
|
|
|
|
|

	.text
	.p2align 6
	.globl sha256_init_4way
	.globl _sha256_init_4way
sha256_init_4way:
_sha256_init_4way:
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq %rdi
	movq %rcx, %rdi
#endif
	movdqa sha256_4h+0(%rip), %xmm0
	movdqa sha256_4h+16(%rip), %xmm1
	movdqa sha256_4h+32(%rip), %xmm2
	movdqa sha256_4h+48(%rip), %xmm3
	movdqu %xmm0, 0(%rdi)
	movdqu %xmm1, 16(%rdi)
	movdqu %xmm2, 32(%rdi)
	movdqu %xmm3, 48(%rdi)
	movdqa sha256_4h+64(%rip), %xmm0
	movdqa sha256_4h+80(%rip), %xmm1
	movdqa sha256_4h+96(%rip), %xmm2
	movdqa sha256_4h+112(%rip), %xmm3
	movdqu %xmm0, 64(%rdi)
	movdqu %xmm1, 80(%rdi)
	movdqu %xmm2, 96(%rdi)
	movdqu %xmm3, 112(%rdi)
#if defined(_WIN64) || defined(__CYGWIN__)
	popq %rdi
#endif
	ret


#ifdef USE_AVX2
	.text
	.p2align 6
	.globl sha256_init_8way
	.globl _sha256_init_8way
sha256_init_8way:
_sha256_init_8way:
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq %rdi
	movq %rcx, %rdi
#endif
	vpbroadcastd sha256_4h+0(%rip), %ymm0
	vpbroadcastd sha256_4h+16(%rip), %ymm1
	vpbroadcastd sha256_4h+32(%rip), %ymm2
	vpbroadcastd sha256_4h+48(%rip), %ymm3
	vmovdqu %ymm0, 0*32(%rdi)
	vmovdqu %ymm1, 1*32(%rdi)
	vmovdqu %ymm2, 2*32(%rdi)
	vmovdqu %ymm3, 3*32(%rdi)
	vpbroadcastd sha256_4h+64(%rip), %ymm0
	vpbroadcastd sha256_4h+80(%rip), %ymm1
	vpbroadcastd sha256_4h+96(%rip), %ymm2
	vpbroadcastd sha256_4h+112(%rip), %ymm3
	vmovdqu %ymm0, 4*32(%rdi)
	vmovdqu %ymm1, 5*32(%rdi)
	vmovdqu %ymm2, 6*32(%rdi)
	vmovdqu %ymm3, 7*32(%rdi)
#if defined(_WIN64) || defined(__CYGWIN__)
	popq %rdi
#endif
	ret
#endif /* USE_AVX2 */
2012-03-21 23:07:56 +01:00
|
|
|
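/*
 * The extend macros below implement the SHA-256 message-schedule recurrence
 *   W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16]
 * with s0(x) = (x >>> 7) ^ (x >>> 18) ^ (x >> 3) and
 *      s1(x) = (x >>> 17) ^ (x >>> 19) ^ (x >> 10),
 * four lanes at a time; rotates are built from paired shifts and xors.
 * Descriptive comment added; the formulas are the standard FIPS 180-4
 * definitions matching the shift counts used in the code.
 */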
.macro sha256_sse2_extend_round i
|
2012-03-25 15:43:49 +02:00
|
|
|
movdqa (\i-15)*16(%rax), %xmm0
|
|
|
|
movdqa %xmm0, %xmm2
|
|
|
|
psrld $3, %xmm0
|
|
|
|
movdqa %xmm0, %xmm1
|
|
|
|
pslld $14, %xmm2
|
|
|
|
psrld $4, %xmm1
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm2, %xmm0
|
|
|
|
psrld $11, %xmm1
|
|
|
|
pslld $11, %xmm2
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm2, %xmm0
|
|
|
|
paddd (\i-16)*16(%rax), %xmm0
|
|
|
|
paddd (\i-7)*16(%rax), %xmm0
|
|
|
|
|
2012-03-26 14:15:35 +02:00
|
|
|
movdqa %xmm3, %xmm2
|
2012-03-25 15:43:49 +02:00
|
|
|
psrld $10, %xmm3
|
|
|
|
pslld $13, %xmm2
|
2012-03-26 14:15:35 +02:00
|
|
|
movdqa %xmm3, %xmm1
|
2012-03-25 15:43:49 +02:00
|
|
|
psrld $7, %xmm1
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
psrld $2, %xmm1
|
|
|
|
pslld $2, %xmm2
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm2, %xmm3
|
2012-03-26 14:15:35 +02:00
|
|
|
paddd %xmm0, %xmm3
|
|
|
|
movdqa %xmm3, \i*16(%rax)
|
2012-03-25 15:43:49 +02:00
|
|
|
.endm
|
|
|
|
|
|
|
|
.macro sha256_sse2_extend_doubleround i
|
2012-03-24 01:27:23 +01:00
|
|
|
movdqa (\i-15)*16(%rax), %xmm0
|
|
|
|
movdqa (\i-14)*16(%rax), %xmm4
|
2012-03-21 23:07:56 +01:00
|
|
|
movdqa %xmm0, %xmm2
|
|
|
|
movdqa %xmm4, %xmm6
|
|
|
|
psrld $3, %xmm0
|
|
|
|
psrld $3, %xmm4
|
|
|
|
movdqa %xmm0, %xmm1
|
|
|
|
movdqa %xmm4, %xmm5
|
|
|
|
pslld $14, %xmm2
|
|
|
|
pslld $14, %xmm6
|
|
|
|
psrld $4, %xmm1
|
|
|
|
psrld $4, %xmm5
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm5, %xmm4
|
|
|
|
psrld $11, %xmm1
|
|
|
|
psrld $11, %xmm5
|
|
|
|
pxor %xmm2, %xmm0
|
|
|
|
pxor %xmm6, %xmm4
|
|
|
|
pslld $11, %xmm2
|
|
|
|
pslld $11, %xmm6
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm5, %xmm4
|
|
|
|
pxor %xmm2, %xmm0
|
|
|
|
pxor %xmm6, %xmm4
|
|
|
|
|
2012-03-24 01:27:23 +01:00
|
|
|
paddd (\i-16)*16(%rax), %xmm0
|
|
|
|
paddd (\i-15)*16(%rax), %xmm4
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
movdqa %xmm3, %xmm2
|
|
|
|
movdqa %xmm7, %xmm6
|
|
|
|
psrld $10, %xmm3
|
|
|
|
psrld $10, %xmm7
|
|
|
|
movdqa %xmm3, %xmm1
|
|
|
|
movdqa %xmm7, %xmm5
|
|
|
|
pslld $13, %xmm2
|
|
|
|
pslld $13, %xmm6
|
|
|
|
psrld $7, %xmm1
|
|
|
|
psrld $7, %xmm5
|
|
|
|
|
2012-03-30 00:40:41 +02:00
|
|
|
paddd (\i-7)*16(%rax), %xmm0
|
2012-03-24 01:27:23 +01:00
|
|
|
paddd (\i-6)*16(%rax), %xmm4
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
psrld $2, %xmm1
|
|
|
|
psrld $2, %xmm5
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
pslld $2, %xmm2
|
|
|
|
pslld $2, %xmm6
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
|
2012-03-26 14:15:35 +02:00
|
|
|
paddd %xmm0, %xmm3
|
|
|
|
paddd %xmm4, %xmm7
|
|
|
|
movdqa %xmm3, \i*16(%rax)
|
|
|
|
movdqa %xmm7, (\i+1)*16(%rax)
|
2012-03-21 23:07:56 +01:00
|
|
|
.endm
|
|
|
|
|
|
|
|
.macro sha256_sse2_main_round i
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa 16*(\i)(%rax), %xmm6
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
movdqa %xmm0, %xmm1
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa 16(%rsp), %xmm2
|
2012-03-21 23:07:56 +01:00
|
|
|
pandn %xmm2, %xmm1
|
2012-05-26 17:04:49 +02:00
|
|
|
paddd 32(%rsp), %xmm6
|
2012-03-21 23:07:56 +01:00
|
|
|
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa %xmm2, 32(%rsp)
|
|
|
|
movdqa 0(%rsp), %xmm2
|
|
|
|
movdqa %xmm2, 16(%rsp)
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
pand %xmm0, %xmm2
|
|
|
|
pxor %xmm2, %xmm1
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa %xmm0, 0(%rsp)
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
paddd %xmm1, %xmm6
|
|
|
|
|
|
|
|
movdqa %xmm0, %xmm1
|
|
|
|
psrld $6, %xmm0
|
2012-05-26 17:04:49 +02:00
|
|
|
paddd 16*(\i)(%rcx), %xmm6
|
2012-03-21 23:07:56 +01:00
|
|
|
movdqa %xmm0, %xmm2
|
|
|
|
pslld $7, %xmm1
|
|
|
|
psrld $5, %xmm2
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm2, %xmm0
|
|
|
|
pslld $14, %xmm1
|
|
|
|
psrld $14, %xmm2
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pslld $5, %xmm1
|
2012-05-26 17:04:49 +02:00
|
|
|
pxor %xmm2, %xmm0
|
2012-03-21 23:07:56 +01:00
|
|
|
pxor %xmm1, %xmm0
|
2012-05-26 17:04:49 +02:00
|
|
|
movdqa %xmm5, %xmm1
|
2012-03-21 23:07:56 +01:00
|
|
|
paddd %xmm0, %xmm6
|
|
|
|
|
|
|
|
movdqa %xmm3, %xmm0
|
|
|
|
movdqa %xmm4, %xmm3
|
|
|
|
movdqa %xmm4, %xmm2
|
2012-05-26 17:04:49 +02:00
|
|
|
paddd %xmm6, %xmm0
|
2012-03-21 23:07:56 +01:00
|
|
|
pand %xmm5, %xmm2
|
|
|
|
pand %xmm7, %xmm1
|
2012-05-26 17:04:49 +02:00
|
|
|
pand %xmm7, %xmm4
|
2012-03-21 23:07:56 +01:00
|
|
|
pxor %xmm4, %xmm1
|
|
|
|
movdqa %xmm5, %xmm4
|
|
|
|
movdqa %xmm7, %xmm5
|
|
|
|
pxor %xmm2, %xmm1
|
|
|
|
paddd %xmm1, %xmm6
|
|
|
|
|
|
|
|
movdqa %xmm7, %xmm2
|
|
|
|
psrld $2, %xmm7
|
|
|
|
movdqa %xmm7, %xmm1
|
|
|
|
pslld $10, %xmm2
|
|
|
|
psrld $11, %xmm1
|
|
|
|
pxor %xmm2, %xmm7
|
|
|
|
pslld $9, %xmm2
|
2012-05-26 17:04:49 +02:00
|
|
|
pxor %xmm1, %xmm7
|
2012-03-21 23:07:56 +01:00
|
|
|
psrld $9, %xmm1
|
|
|
|
pxor %xmm2, %xmm7
|
|
|
|
pslld $11, %xmm2
|
2012-05-26 17:04:49 +02:00
|
|
|
pxor %xmm1, %xmm7
|
2012-03-21 23:07:56 +01:00
|
|
|
pxor %xmm2, %xmm7
|
|
|
|
paddd %xmm6, %xmm7
|
|
|
|
.endm
|
|
|
|
|
2012-04-01 19:39:01 +02:00
|
|
|
.macro sha256_sse2_main_quadround i
|
|
|
|
sha256_sse2_main_round \i+0
|
|
|
|
sha256_sse2_main_round \i+1
|
|
|
|
sha256_sse2_main_round \i+2
|
|
|
|
sha256_sse2_main_round \i+3
|
|
|
|
.endm
|
|
|
|
|
2012-03-21 23:07:56 +01:00
|
|
|
|
2012-03-25 15:43:49 +02:00
|
|
|
#if defined(USE_AVX)
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
.macro sha256_avx_extend_round i
|
2012-03-25 15:43:49 +02:00
|
|
|
vmovdqa (\i-15)*16(%rax), %xmm0
|
|
|
|
vpslld $14, %xmm0, %xmm2
|
|
|
|
vpsrld $3, %xmm0, %xmm0
|
|
|
|
vpsrld $4, %xmm0, %xmm1
|
|
|
|
vpxor %xmm1, %xmm0, %xmm0
|
|
|
|
vpxor %xmm2, %xmm0, %xmm0
|
|
|
|
vpsrld $11, %xmm1, %xmm1
|
|
|
|
vpslld $11, %xmm2, %xmm2
|
|
|
|
vpxor %xmm1, %xmm0, %xmm0
|
|
|
|
vpxor %xmm2, %xmm0, %xmm0
|
|
|
|
vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
|
|
|
|
vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
|
|
|
|
|
|
|
|
vpslld $13, %xmm3, %xmm2
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $7, %xmm3, %xmm1
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpsrld $2, %xmm1, %xmm1
|
|
|
|
vpslld $2, %xmm2, %xmm2
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
2012-03-26 14:15:35 +02:00
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vmovdqa %xmm3, \i*16(%rax)
|
2012-03-25 15:43:49 +02:00
|
|
|
.endm
|
|
|
|
|
|
|
|
.macro sha256_avx_extend_doubleround i
|
2012-03-24 01:27:23 +01:00
|
|
|
vmovdqa (\i-15)*16(%rax), %xmm0
|
|
|
|
vmovdqa (\i-14)*16(%rax), %xmm4
|
2012-03-21 23:07:56 +01:00
|
|
|
vpslld $14, %xmm0, %xmm2
|
|
|
|
vpslld $14, %xmm4, %xmm6
|
2012-03-30 00:40:41 +02:00
|
|
|
vpsrld $3, %xmm0, %xmm8
|
2012-03-22 17:38:35 +01:00
|
|
|
vpsrld $3, %xmm4, %xmm4
|
2012-03-30 00:40:41 +02:00
|
|
|
vpsrld $7, %xmm0, %xmm1
|
2012-03-21 23:07:56 +01:00
|
|
|
vpsrld $4, %xmm4, %xmm5
|
2012-03-30 00:40:41 +02:00
|
|
|
vpxor %xmm1, %xmm8, %xmm8
|
2012-03-22 17:38:35 +01:00
|
|
|
vpxor %xmm5, %xmm4, %xmm4
|
|
|
|
vpsrld $11, %xmm1, %xmm1
|
|
|
|
vpsrld $11, %xmm5, %xmm5
|
2012-03-30 00:40:41 +02:00
|
|
|
vpxor %xmm2, %xmm8, %xmm8
|
2012-03-22 17:38:35 +01:00
|
|
|
vpxor %xmm6, %xmm4, %xmm4
|
|
|
|
vpslld $11, %xmm2, %xmm2
|
|
|
|
vpslld $11, %xmm6, %xmm6
|
2012-03-30 00:40:41 +02:00
|
|
|
vpxor %xmm1, %xmm8, %xmm8
|
2012-03-22 17:38:35 +01:00
|
|
|
vpxor %xmm5, %xmm4, %xmm4
|
2012-03-30 00:40:41 +02:00
|
|
|
vpxor %xmm2, %xmm8, %xmm8
|
2012-03-22 17:38:35 +01:00
|
|
|
vpxor %xmm6, %xmm4, %xmm4
|
2012-03-21 23:07:56 +01:00
|
|
|
|
2012-03-30 00:40:41 +02:00
|
|
|
vpaddd %xmm0, %xmm4, %xmm4
|
|
|
|
vpaddd (\i-16)*16(%rax), %xmm8, %xmm0
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
vpslld $13, %xmm3, %xmm2
|
|
|
|
vpslld $13, %xmm7, %xmm6
|
2012-03-22 17:38:35 +01:00
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
2012-03-21 23:07:56 +01:00
|
|
|
|
2012-03-24 01:27:23 +01:00
|
|
|
vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
|
2012-03-30 00:40:41 +02:00
|
|
|
vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
vpsrld $7, %xmm3, %xmm1
|
|
|
|
vpsrld $7, %xmm7, %xmm5
|
2012-03-22 17:38:35 +01:00
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpsrld $2, %xmm1, %xmm1
|
|
|
|
vpsrld $2, %xmm5, %xmm5
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpslld $2, %xmm2, %xmm2
|
|
|
|
vpslld $2, %xmm6, %xmm6
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
2012-03-21 23:07:56 +01:00
|
|
|
|
2012-03-26 14:15:35 +02:00
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vpaddd %xmm4, %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, \i*16(%rax)
|
|
|
|
vmovdqa %xmm7, (\i+1)*16(%rax)
|
2012-03-21 23:07:56 +01:00
|
|
|
.endm
|
|
|
|
|
|
|
|
.macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
|
|
|
|
vpaddd 16*(\i)(%rax), \r0, %xmm6
|
2012-03-22 17:38:35 +01:00
|
|
|
vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
vpandn \r1, \r3, %xmm1
|
|
|
|
vpand \r3, \r2, %xmm2
|
2012-03-22 17:38:35 +01:00
|
|
|
vpxor %xmm2, %xmm1, %xmm1
|
|
|
|
vpaddd %xmm1, %xmm6, %xmm6
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
vpslld $7, \r3, %xmm1
|
|
|
|
vpsrld $6, \r3, \r0
|
|
|
|
vpsrld $5, \r0, %xmm2
|
2012-03-22 17:38:35 +01:00
|
|
|
vpxor %xmm1, \r0, \r0
|
|
|
|
vpxor %xmm2, \r0, \r0
|
|
|
|
vpslld $14, %xmm1, %xmm1
|
|
|
|
vpsrld $14, %xmm2, %xmm2
|
|
|
|
vpxor %xmm1, \r0, \r0
|
|
|
|
vpxor %xmm2, \r0, \r0
|
|
|
|
vpslld $5, %xmm1, %xmm1
|
|
|
|
vpxor %xmm1, \r0, \r0
|
|
|
|
vpaddd \r0, %xmm6, %xmm6
|
2012-03-21 23:07:56 +01:00
|
|
|
vpaddd %xmm6, \r4, \r0
|
|
|
|
|
|
|
|
vpand \r6, \r5, %xmm2
|
|
|
|
vpand \r7, \r5, \r4
|
|
|
|
vpand \r7, \r6, %xmm1
|
2012-03-22 17:38:35 +01:00
|
|
|
vpxor \r4, %xmm1, %xmm1
|
|
|
|
vpxor %xmm2, %xmm1, %xmm1
|
|
|
|
vpaddd %xmm1, %xmm6, %xmm6
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
vpslld $10, \r7, %xmm2
|
|
|
|
vpsrld $2, \r7, \r4
|
|
|
|
vpsrld $11, \r4, %xmm1
|
2012-03-22 17:38:35 +01:00
|
|
|
vpxor %xmm2, \r4, \r4
|
|
|
|
vpxor %xmm1, \r4, \r4
|
|
|
|
vpslld $9, %xmm2, %xmm2
|
|
|
|
vpsrld $9, %xmm1, %xmm1
|
|
|
|
vpxor %xmm2, \r4, \r4
|
|
|
|
vpxor %xmm1, \r4, \r4
|
|
|
|
vpslld $11, %xmm2, %xmm2
|
|
|
|
vpxor %xmm2, \r4, \r4
|
|
|
|
vpaddd %xmm6, \r4, \r4
|
2012-03-21 23:07:56 +01:00
|
|
|
.endm
|
|
|
|
|
|
|
|
.macro sha256_avx_main_quadround i
|
|
|
|
sha256_avx_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
|
|
|
|
sha256_avx_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
|
|
|
|
sha256_avx_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
|
|
|
|
sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
|
|
|
|
.endm
|
|
|
|
|
|
|
|
#endif /* USE_AVX */
|
|
|
|
|
|
|
|
|
2013-07-05 18:25:34 +02:00
|
|
|
#if defined(USE_AVX2)
|
|
|
|
|
|
|
|
.macro sha256_avx2_extend_round i
|
|
|
|
vmovdqa (\i-15)*32(%rax), %ymm0
|
|
|
|
vpslld $14, %ymm0, %ymm2
|
|
|
|
vpsrld $3, %ymm0, %ymm0
|
|
|
|
vpsrld $4, %ymm0, %ymm1
|
|
|
|
vpxor %ymm1, %ymm0, %ymm0
|
|
|
|
vpxor %ymm2, %ymm0, %ymm0
|
|
|
|
vpsrld $11, %ymm1, %ymm1
|
|
|
|
vpslld $11, %ymm2, %ymm2
|
|
|
|
vpxor %ymm1, %ymm0, %ymm0
|
|
|
|
vpxor %ymm2, %ymm0, %ymm0
|
|
|
|
vpaddd (\i-16)*32(%rax), %ymm0, %ymm0
|
|
|
|
vpaddd (\i-7)*32(%rax), %ymm0, %ymm0
|
|
|
|
|
|
|
|
vpslld $13, %ymm3, %ymm2
|
|
|
|
vpsrld $10, %ymm3, %ymm3
|
|
|
|
vpsrld $7, %ymm3, %ymm1
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpsrld $2, %ymm1, %ymm1
|
|
|
|
vpslld $2, %ymm2, %ymm2
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpaddd %ymm0, %ymm3, %ymm3
|
|
|
|
vmovdqa %ymm3, \i*32(%rax)
|
|
|
|
.endm
|
|
|
|
|
|
|
|
.macro sha256_avx2_extend_doubleround i
|
|
|
|
vmovdqa (\i-15)*32(%rax), %ymm0
|
|
|
|
vmovdqa (\i-14)*32(%rax), %ymm4
|
|
|
|
vpslld $14, %ymm0, %ymm2
|
|
|
|
vpslld $14, %ymm4, %ymm6
|
|
|
|
vpsrld $3, %ymm0, %ymm8
|
|
|
|
vpsrld $3, %ymm4, %ymm4
|
|
|
|
vpsrld $7, %ymm0, %ymm1
|
|
|
|
vpsrld $4, %ymm4, %ymm5
|
|
|
|
vpxor %ymm1, %ymm8, %ymm8
|
|
|
|
vpxor %ymm5, %ymm4, %ymm4
|
|
|
|
vpsrld $11, %ymm1, %ymm1
|
|
|
|
vpsrld $11, %ymm5, %ymm5
|
|
|
|
vpxor %ymm2, %ymm8, %ymm8
|
|
|
|
vpxor %ymm6, %ymm4, %ymm4
|
|
|
|
vpslld $11, %ymm2, %ymm2
|
|
|
|
vpslld $11, %ymm6, %ymm6
|
|
|
|
vpxor %ymm1, %ymm8, %ymm8
|
|
|
|
vpxor %ymm5, %ymm4, %ymm4
|
|
|
|
vpxor %ymm2, %ymm8, %ymm8
|
|
|
|
vpxor %ymm6, %ymm4, %ymm4
|
|
|
|
|
|
|
|
vpaddd %ymm0, %ymm4, %ymm4
|
|
|
|
vpaddd (\i-16)*32(%rax), %ymm8, %ymm0
|
|
|
|
|
|
|
|
vpslld $13, %ymm3, %ymm2
|
|
|
|
vpslld $13, %ymm7, %ymm6
|
|
|
|
vpsrld $10, %ymm3, %ymm3
|
|
|
|
vpsrld $10, %ymm7, %ymm7
|
|
|
|
|
|
|
|
vpaddd (\i-7)*32(%rax), %ymm0, %ymm0
|
|
|
|
vpaddd (\i-6)*32(%rax), %ymm4, %ymm4
|
|
|
|
|
|
|
|
vpsrld $7, %ymm3, %ymm1
|
|
|
|
vpsrld $7, %ymm7, %ymm5
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpsrld $2, %ymm1, %ymm1
|
|
|
|
vpsrld $2, %ymm5, %ymm5
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpslld $2, %ymm2, %ymm2
|
|
|
|
vpslld $2, %ymm6, %ymm6
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
|
|
|
|
vpaddd %ymm0, %ymm3, %ymm3
|
|
|
|
vpaddd %ymm4, %ymm7, %ymm7
|
|
|
|
vmovdqa %ymm3, \i*32(%rax)
|
|
|
|
vmovdqa %ymm7, (\i+1)*32(%rax)
|
|
|
|
.endm
|
|
|
|
|
|
|
|
.macro sha256_avx2_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
|
|
|
|
vpaddd 32*(\i)(%rax), \r0, %ymm6
|
|
|
|
vpaddd 32*(\i)(%rcx), %ymm6, %ymm6
|
|
|
|
|
|
|
|
vpandn \r1, \r3, %ymm1
|
|
|
|
vpand \r3, \r2, %ymm2
|
|
|
|
vpxor %ymm2, %ymm1, %ymm1
|
|
|
|
vpaddd %ymm1, %ymm6, %ymm6
|
|
|
|
|
|
|
|
vpslld $7, \r3, %ymm1
|
|
|
|
vpsrld $6, \r3, \r0
|
|
|
|
vpsrld $5, \r0, %ymm2
|
|
|
|
vpxor %ymm1, \r0, \r0
|
|
|
|
vpxor %ymm2, \r0, \r0
|
|
|
|
vpslld $14, %ymm1, %ymm1
|
|
|
|
vpsrld $14, %ymm2, %ymm2
|
|
|
|
vpxor %ymm1, \r0, \r0
|
|
|
|
vpxor %ymm2, \r0, \r0
|
|
|
|
vpslld $5, %ymm1, %ymm1
|
|
|
|
vpxor %ymm1, \r0, \r0
|
|
|
|
vpaddd \r0, %ymm6, %ymm6
|
|
|
|
vpaddd %ymm6, \r4, \r0
|
|
|
|
|
|
|
|
vpand \r6, \r5, %ymm2
|
|
|
|
vpand \r7, \r5, \r4
|
|
|
|
vpand \r7, \r6, %ymm1
|
|
|
|
vpxor \r4, %ymm1, %ymm1
|
|
|
|
vpxor %ymm2, %ymm1, %ymm1
|
|
|
|
vpaddd %ymm1, %ymm6, %ymm6
|
|
|
|
|
|
|
|
vpslld $10, \r7, %ymm2
|
|
|
|
vpsrld $2, \r7, \r4
|
|
|
|
vpsrld $11, \r4, %ymm1
|
|
|
|
vpxor %ymm2, \r4, \r4
|
|
|
|
vpxor %ymm1, \r4, \r4
|
|
|
|
vpslld $9, %ymm2, %ymm2
|
|
|
|
vpsrld $9, %ymm1, %ymm1
|
|
|
|
vpxor %ymm2, \r4, \r4
|
|
|
|
vpxor %ymm1, \r4, \r4
|
|
|
|
vpslld $11, %ymm2, %ymm2
|
|
|
|
vpxor %ymm2, \r4, \r4
|
|
|
|
vpaddd %ymm6, \r4, \r4
|
|
|
|
.endm
|
|
|
|
|
|
|
|
.macro sha256_avx2_main_quadround i
|
|
|
|
sha256_avx2_main_round \i+0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
|
|
|
|
sha256_avx2_main_round \i+1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
|
|
|
|
sha256_avx2_main_round \i+2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
|
|
|
|
sha256_avx2_main_round \i+3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
|
|
|
|
.endm
|
|
|
|
|
|
|
|
#endif /* USE_AVX2 */
|
|
|
|
|
|
|
|
|
2012-03-22 17:38:35 +01:00
|
|
|
#if defined(USE_XOP)
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
.macro sha256_xop_extend_round i
|
2012-03-25 15:43:49 +02:00
|
|
|
vmovdqa (\i-15)*16(%rax), %xmm0
|
|
|
|
vprotd $25, %xmm0, %xmm1
|
|
|
|
vprotd $14, %xmm0, %xmm2
|
|
|
|
vpsrld $3, %xmm0, %xmm0
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm2, %xmm0, %xmm0
|
|
|
|
|
|
|
|
vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
|
|
|
|
vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
|
|
|
|
|
|
|
|
vprotd $15, %xmm3, %xmm1
|
|
|
|
vprotd $13, %xmm3, %xmm2
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
2012-03-26 14:15:35 +02:00
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vmovdqa %xmm3, \i*16(%rax)
|
2012-03-25 15:43:49 +02:00
|
|
|
.endm
|
|
|
|
|
|
|
|
.macro sha256_xop_extend_doubleround i
|
2012-03-24 01:27:23 +01:00
|
|
|
vmovdqa (\i-15)*16(%rax), %xmm0
|
|
|
|
vmovdqa (\i-14)*16(%rax), %xmm4
|
2012-03-21 23:07:56 +01:00
|
|
|
vprotd $25, %xmm0, %xmm1
|
|
|
|
vprotd $25, %xmm4, %xmm5
|
|
|
|
vprotd $14, %xmm0, %xmm2
|
|
|
|
vprotd $14, %xmm4, %xmm6
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm5, %xmm6, %xmm6
|
|
|
|
vpsrld $3, %xmm0, %xmm0
|
|
|
|
vpsrld $3, %xmm4, %xmm4
|
|
|
|
vpxor %xmm2, %xmm0, %xmm0
|
|
|
|
vpxor %xmm6, %xmm4, %xmm4
|
|
|
|
|
2012-03-24 01:27:23 +01:00
|
|
|
vpaddd (\i-16)*16(%rax), %xmm0, %xmm0
|
|
|
|
vpaddd (\i-15)*16(%rax), %xmm4, %xmm4
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
vprotd $15, %xmm3, %xmm1
|
|
|
|
vprotd $15, %xmm7, %xmm5
|
|
|
|
vprotd $13, %xmm3, %xmm2
|
|
|
|
vprotd $13, %xmm7, %xmm6
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm5, %xmm6, %xmm6
|
|
|
|
|
2012-03-24 01:27:23 +01:00
|
|
|
vpaddd (\i-7)*16(%rax), %xmm0, %xmm0
|
|
|
|
vpaddd (\i-6)*16(%rax), %xmm4, %xmm4
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
|
2012-03-26 14:15:35 +02:00
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vpaddd %xmm4, %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, \i*16(%rax)
|
|
|
|
vmovdqa %xmm7, (\i+1)*16(%rax)
|
2012-03-21 23:07:56 +01:00
|
|
|
.endm
|
|
|
|
|
|
|
|
.macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7
|
|
|
|
vpaddd 16*(\i)(%rax), \r0, %xmm6
|
|
|
|
vpaddd 16*(\i)(%rcx), %xmm6, %xmm6
|
|
|
|
|
|
|
|
vpandn \r1, \r3, %xmm1
|
|
|
|
vpand \r3, \r2, %xmm2
|
|
|
|
vpxor %xmm2, %xmm1, %xmm1
|
|
|
|
vpaddd %xmm1, %xmm6, %xmm6
|
|
|
|
|
|
|
|
vprotd $26, \r3, %xmm1
|
|
|
|
vprotd $21, \r3, %xmm2
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vprotd $7, \r3, \r0
|
|
|
|
vpxor %xmm2, \r0, \r0
|
|
|
|
vpaddd \r0, %xmm6, %xmm6
|
|
|
|
vpaddd %xmm6, \r4, \r0
|
|
|
|
|
|
|
|
vpand \r6, \r5, %xmm2
|
|
|
|
vpand \r7, \r5, \r4
|
|
|
|
vpand \r7, \r6, %xmm1
|
|
|
|
vpxor \r4, %xmm1, %xmm1
|
|
|
|
vpxor %xmm2, %xmm1, %xmm1
|
|
|
|
vpaddd %xmm1, %xmm6, %xmm6
|
|
|
|
|
|
|
|
vprotd $30, \r7, %xmm1
|
|
|
|
vprotd $19, \r7, %xmm2
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vprotd $10, \r7, \r4
|
|
|
|
vpxor %xmm2, \r4, \r4
|
|
|
|
vpaddd %xmm6, \r4, \r4
|
|
|
|
.endm
|
|
|
|
|
|
|
|
.macro sha256_xop_main_quadround i
|
|
|
|
sha256_xop_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
|
|
|
|
sha256_xop_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
|
|
|
|
sha256_xop_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
|
|
|
|
sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
|
|
|
|
.endm
|
|
|
|
|
|
|
|
#endif /* USE_XOP */
|
|
|
|
|
|
|
|
|
2012-03-12 13:32:11 +01:00
|
|
|
.text
|
|
|
|
.p2align 6
|
2012-03-22 17:38:35 +01:00
|
|
|
sha256_transform_4way_core_sse2:
|
2012-03-12 13:32:11 +01:00
|
|
|
leaq 256(%rsp), %rcx
|
|
|
|
leaq 48*16(%rcx), %rax
|
2012-03-30 00:40:41 +02:00
|
|
|
movdqa -2*16(%rcx), %xmm3
|
|
|
|
movdqa -1*16(%rcx), %xmm7
|
2012-03-22 17:38:35 +01:00
|
|
|
sha256_transform_4way_sse2_extend_loop:
|
2012-03-12 13:32:11 +01:00
|
|
|
movdqa -15*16(%rcx), %xmm0
|
|
|
|
movdqa -14*16(%rcx), %xmm4
|
|
|
|
movdqa %xmm0, %xmm2
|
|
|
|
movdqa %xmm4, %xmm6
|
|
|
|
psrld $3, %xmm0
|
|
|
|
psrld $3, %xmm4
|
|
|
|
movdqa %xmm0, %xmm1
|
|
|
|
movdqa %xmm4, %xmm5
|
|
|
|
pslld $14, %xmm2
|
|
|
|
pslld $14, %xmm6
|
|
|
|
psrld $4, %xmm1
|
|
|
|
psrld $4, %xmm5
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm5, %xmm4
|
|
|
|
psrld $11, %xmm1
|
|
|
|
psrld $11, %xmm5
|
|
|
|
pxor %xmm2, %xmm0
|
|
|
|
pxor %xmm6, %xmm4
|
|
|
|
pslld $11, %xmm2
|
|
|
|
pslld $11, %xmm6
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm5, %xmm4
|
|
|
|
pxor %xmm2, %xmm0
|
|
|
|
pxor %xmm6, %xmm4
|
|
|
|
|
|
|
|
paddd -16*16(%rcx), %xmm0
|
|
|
|
paddd -15*16(%rcx), %xmm4
|
|
|
|
|
|
|
|
movdqa %xmm3, %xmm2
|
|
|
|
movdqa %xmm7, %xmm6
|
|
|
|
psrld $10, %xmm3
|
|
|
|
psrld $10, %xmm7
|
|
|
|
movdqa %xmm3, %xmm1
|
|
|
|
movdqa %xmm7, %xmm5
|
|
|
|
pslld $13, %xmm2
|
|
|
|
pslld $13, %xmm6
|
|
|
|
psrld $7, %xmm1
|
|
|
|
psrld $7, %xmm5
|
|
|
|
|
2012-03-30 00:40:41 +02:00
|
|
|
paddd -7*16(%rcx), %xmm0
|
2012-03-12 13:32:11 +01:00
|
|
|
paddd -6*16(%rcx), %xmm4
|
|
|
|
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
psrld $2, %xmm1
|
|
|
|
psrld $2, %xmm5
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
pslld $2, %xmm2
|
|
|
|
pslld $2, %xmm6
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
|
2012-03-30 00:40:41 +02:00
|
|
|
paddd %xmm0, %xmm3
|
|
|
|
paddd %xmm4, %xmm7
|
|
|
|
movdqa %xmm3, (%rcx)
|
|
|
|
movdqa %xmm7, 16(%rcx)
|
2012-03-12 13:32:11 +01:00
|
|
|
addq $2*16, %rcx
|
|
|
|
cmpq %rcx, %rax
|
2012-03-22 17:38:35 +01:00
|
|
|
jne sha256_transform_4way_sse2_extend_loop
|
2012-03-12 13:32:11 +01:00
|
|
|
|
|
|
|
movdqu 0(%rdi), %xmm7
|
|
|
|
movdqu 16(%rdi), %xmm5
|
|
|
|
movdqu 32(%rdi), %xmm4
|
|
|
|
movdqu 48(%rdi), %xmm3
|
|
|
|
movdqu 64(%rdi), %xmm0
|
|
|
|
movdqu 80(%rdi), %xmm8
|
|
|
|
movdqu 96(%rdi), %xmm9
|
|
|
|
movdqu 112(%rdi), %xmm10
|
|
|
|
|
|
|
|
leaq sha256_4k(%rip), %rcx
|
|
|
|
xorq %rax, %rax
|
2012-03-22 17:38:35 +01:00
|
|
|
sha256_transform_4way_sse2_main_loop:
|
2012-03-12 13:32:11 +01:00
|
|
|
movdqa (%rsp, %rax), %xmm6
|
|
|
|
paddd (%rcx, %rax), %xmm6
|
|
|
|
paddd %xmm10, %xmm6
|
|
|
|
|
|
|
|
movdqa %xmm0, %xmm1
|
|
|
|
movdqa %xmm9, %xmm2
|
|
|
|
pandn %xmm2, %xmm1
|
|
|
|
|
|
|
|
movdqa %xmm2, %xmm10
|
|
|
|
movdqa %xmm8, %xmm2
|
|
|
|
movdqa %xmm2, %xmm9
|
|
|
|
|
|
|
|
pand %xmm0, %xmm2
|
|
|
|
pxor %xmm2, %xmm1
|
|
|
|
movdqa %xmm0, %xmm8
|
|
|
|
|
|
|
|
paddd %xmm1, %xmm6
|
|
|
|
|
|
|
|
movdqa %xmm0, %xmm1
|
|
|
|
psrld $6, %xmm0
|
|
|
|
movdqa %xmm0, %xmm2
|
|
|
|
pslld $7, %xmm1
|
|
|
|
psrld $5, %xmm2
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm2, %xmm0
|
|
|
|
pslld $14, %xmm1
|
|
|
|
psrld $14, %xmm2
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm2, %xmm0
|
|
|
|
pslld $5, %xmm1
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
paddd %xmm0, %xmm6
|
|
|
|
|
|
|
|
movdqa %xmm3, %xmm0
|
|
|
|
paddd %xmm6, %xmm0
|
|
|
|
|
|
|
|
movdqa %xmm5, %xmm1
|
|
|
|
movdqa %xmm4, %xmm3
|
|
|
|
movdqa %xmm4, %xmm2
|
|
|
|
pand %xmm5, %xmm2
|
|
|
|
pand %xmm7, %xmm4
|
|
|
|
pand %xmm7, %xmm1
|
|
|
|
pxor %xmm4, %xmm1
|
|
|
|
movdqa %xmm5, %xmm4
|
|
|
|
movdqa %xmm7, %xmm5
|
|
|
|
pxor %xmm2, %xmm1
|
|
|
|
paddd %xmm1, %xmm6
|
|
|
|
|
|
|
|
movdqa %xmm7, %xmm2
|
|
|
|
psrld $2, %xmm7
|
|
|
|
movdqa %xmm7, %xmm1
|
|
|
|
pslld $10, %xmm2
|
|
|
|
psrld $11, %xmm1
|
|
|
|
pxor %xmm2, %xmm7
|
|
|
|
pxor %xmm1, %xmm7
|
|
|
|
pslld $9, %xmm2
|
|
|
|
psrld $9, %xmm1
|
|
|
|
pxor %xmm2, %xmm7
|
|
|
|
pxor %xmm1, %xmm7
|
|
|
|
pslld $11, %xmm2
|
|
|
|
pxor %xmm2, %xmm7
|
|
|
|
paddd %xmm6, %xmm7
|
|
|
|
|
|
|
|
addq $16, %rax
|
|
|
|
cmpq $16*64, %rax
|
2012-03-22 17:38:35 +01:00
|
|
|
jne sha256_transform_4way_sse2_main_loop
|
|
|
|
jmp sha256_transform_4way_finish
|
|
|
|
|
2012-03-24 01:27:23 +01:00
|
|
|
|
2012-03-22 17:38:35 +01:00
|
|
|
#if defined(USE_AVX)
|
|
|
|
.text
|
|
|
|
.p2align 6
|
|
|
|
sha256_transform_4way_core_avx:
	leaq 256(%rsp), %rax
	movdqa -2*16(%rax), %xmm3
	movdqa -1*16(%rax), %xmm7
	sha256_avx_extend_doubleround 0
	sha256_avx_extend_doubleround 2
	sha256_avx_extend_doubleround 4
	sha256_avx_extend_doubleround 6
	sha256_avx_extend_doubleround 8
	sha256_avx_extend_doubleround 10
	sha256_avx_extend_doubleround 12
	sha256_avx_extend_doubleround 14
	sha256_avx_extend_doubleround 16
	sha256_avx_extend_doubleround 18
	sha256_avx_extend_doubleround 20
	sha256_avx_extend_doubleround 22
	sha256_avx_extend_doubleround 24
	sha256_avx_extend_doubleround 26
	sha256_avx_extend_doubleround 28
	sha256_avx_extend_doubleround 30
	sha256_avx_extend_doubleround 32
	sha256_avx_extend_doubleround 34
	sha256_avx_extend_doubleround 36
	sha256_avx_extend_doubleround 38
	sha256_avx_extend_doubleround 40
	sha256_avx_extend_doubleround 42
	sha256_avx_extend_doubleround 44
	sha256_avx_extend_doubleround 46
	movdqu 0(%rdi), %xmm7
	movdqu 16(%rdi), %xmm5
	movdqu 32(%rdi), %xmm4
	movdqu 48(%rdi), %xmm3
	movdqu 64(%rdi), %xmm0
	movdqu 80(%rdi), %xmm8
	movdqu 96(%rdi), %xmm9
	movdqu 112(%rdi), %xmm10
	movq %rsp, %rax
	leaq sha256_4k(%rip), %rcx
	sha256_avx_main_quadround 0
	sha256_avx_main_quadround 4
	sha256_avx_main_quadround 8
	sha256_avx_main_quadround 12
	sha256_avx_main_quadround 16
	sha256_avx_main_quadround 20
	sha256_avx_main_quadround 24
	sha256_avx_main_quadround 28
	sha256_avx_main_quadround 32
	sha256_avx_main_quadround 36
	sha256_avx_main_quadround 40
	sha256_avx_main_quadround 44
	sha256_avx_main_quadround 48
	sha256_avx_main_quadround 52
	sha256_avx_main_quadround 56
	sha256_avx_main_quadround 60
	jmp sha256_transform_4way_finish
#endif /* USE_AVX */
|
|
|
|
|
2012-03-24 01:27:23 +01:00
|
|
|
|
2012-03-22 17:38:35 +01:00
|
|
|
#if defined(USE_XOP)
|
|
|
|
.text
|
|
|
|
.p2align 6
|
|
|
|
sha256_transform_4way_core_xop:
|
2012-03-24 01:27:23 +01:00
|
|
|
leaq 256(%rsp), %rax
|
2012-03-26 14:15:35 +02:00
|
|
|
movdqa -2*16(%rax), %xmm3
|
|
|
|
movdqa -1*16(%rax), %xmm7
|
2012-03-25 15:43:49 +02:00
|
|
|
sha256_xop_extend_doubleround 0
|
|
|
|
sha256_xop_extend_doubleround 2
|
|
|
|
sha256_xop_extend_doubleround 4
|
|
|
|
sha256_xop_extend_doubleround 6
|
|
|
|
sha256_xop_extend_doubleround 8
|
|
|
|
sha256_xop_extend_doubleround 10
|
|
|
|
sha256_xop_extend_doubleround 12
|
|
|
|
sha256_xop_extend_doubleround 14
|
|
|
|
sha256_xop_extend_doubleround 16
|
|
|
|
sha256_xop_extend_doubleround 18
|
|
|
|
sha256_xop_extend_doubleround 20
|
|
|
|
sha256_xop_extend_doubleround 22
|
|
|
|
sha256_xop_extend_doubleround 24
|
|
|
|
sha256_xop_extend_doubleround 26
|
|
|
|
sha256_xop_extend_doubleround 28
|
|
|
|
sha256_xop_extend_doubleround 30
|
|
|
|
sha256_xop_extend_doubleround 32
|
|
|
|
sha256_xop_extend_doubleround 34
|
|
|
|
sha256_xop_extend_doubleround 36
|
|
|
|
sha256_xop_extend_doubleround 38
|
|
|
|
sha256_xop_extend_doubleround 40
|
|
|
|
sha256_xop_extend_doubleround 42
|
|
|
|
sha256_xop_extend_doubleround 44
|
|
|
|
sha256_xop_extend_doubleround 46
|
2012-03-22 17:38:35 +01:00
|
|
|
movdqu 0(%rdi), %xmm7
|
|
|
|
movdqu 16(%rdi), %xmm5
|
|
|
|
movdqu 32(%rdi), %xmm4
|
|
|
|
movdqu 48(%rdi), %xmm3
|
|
|
|
movdqu 64(%rdi), %xmm0
|
|
|
|
movdqu 80(%rdi), %xmm8
|
|
|
|
movdqu 96(%rdi), %xmm9
|
|
|
|
movdqu 112(%rdi), %xmm10
|
|
|
|
movq %rsp, %rax
|
|
|
|
leaq sha256_4k(%rip), %rcx
|
2012-03-25 15:43:49 +02:00
|
|
|
sha256_xop_main_quadround 0
|
|
|
|
sha256_xop_main_quadround 4
|
|
|
|
sha256_xop_main_quadround 8
|
|
|
|
sha256_xop_main_quadround 12
|
|
|
|
sha256_xop_main_quadround 16
|
|
|
|
sha256_xop_main_quadround 20
|
|
|
|
sha256_xop_main_quadround 24
|
|
|
|
sha256_xop_main_quadround 28
|
|
|
|
sha256_xop_main_quadround 32
|
|
|
|
sha256_xop_main_quadround 36
|
|
|
|
sha256_xop_main_quadround 40
|
|
|
|
sha256_xop_main_quadround 44
|
|
|
|
sha256_xop_main_quadround 48
|
|
|
|
sha256_xop_main_quadround 52
|
|
|
|
sha256_xop_main_quadround 56
|
|
|
|
sha256_xop_main_quadround 60
|
2012-03-22 17:38:35 +01:00
|
|
|
jmp sha256_transform_4way_finish
|
|
|
|
#endif /* USE_XOP */

.data
	.p2align 3
sha256_transform_4way_core_addr:
	.quad 0x0

.macro p2bswap_rsi_rsp i
	movdqu \i*16(%rsi), %xmm0
	movdqu (\i+1)*16(%rsi), %xmm2
	pshuflw $0xb1, %xmm0, %xmm0
	pshuflw $0xb1, %xmm2, %xmm2
	pshufhw $0xb1, %xmm0, %xmm0
	pshufhw $0xb1, %xmm2, %xmm2
	movdqa %xmm0, %xmm1
	movdqa %xmm2, %xmm3
	psrlw $8, %xmm1
	psrlw $8, %xmm3
	psllw $8, %xmm0
	psllw $8, %xmm2
	pxor %xmm1, %xmm0
	pxor %xmm3, %xmm2
	movdqa %xmm0, \i*16(%rsp)
	movdqa %xmm2, (\i+1)*16(%rsp)
.endm
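/*
 * p2bswap_rsi_rsp byte-swaps two groups of four 32-bit words from the input
 * block into the stack workspace without SSSE3 pshufb: pshuflw/pshufhw with
 * 0xb1 swap the 16-bit halves of every dword, and the psllw/psrlw/pxor pair
 * then swaps the bytes inside each 16-bit half.  Per word this amounts to a
 * plain 32-bit bswap (reference C sketch, not part of this file):
 *
 *   x = (x << 16) | (x >> 16);
 *   x = ((x & 0x00ff00ffu) << 8) | ((x >> 8) & 0x00ff00ffu);
 *
 * converting the big-endian message words to the host byte order used by
 * the rest of the code.
 */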

.text
	.p2align 6
	.globl sha256_transform_4way
	.globl _sha256_transform_4way
sha256_transform_4way:
_sha256_transform_4way:
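/*
 * Entry point.  On the SysV AMD64 ABI the arguments arrive as
 * %rdi = state (8 words x 4 lanes), %rsi = message block (16 words x 4
 * lanes), %rdx = nonzero if the block still needs byte-swapping; the
 * _WIN64/Cygwin block below just remaps %rcx/%rdx/%r8 onto those registers
 * and preserves the callee-saved XMM registers.  The C-side declaration is
 * presumably along the lines of (assumption, see the C sources):
 *
 *   void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap);
 */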
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq %rdi
	subq $96, %rsp
	movdqa %xmm6, 0(%rsp)
	movdqa %xmm7, 16(%rsp)
	movdqa %xmm8, 32(%rsp)
	movdqa %xmm9, 48(%rsp)
	movdqa %xmm10, 64(%rsp)
	movdqa %xmm11, 80(%rsp)
	pushq %rsi
	movq %rcx, %rdi
	movq %rdx, %rsi
	movq %r8, %rdx
#endif
	movq %rsp, %r8
	subq $1032, %rsp
	andq $-128, %rsp

	testq %rdx, %rdx
	jnz sha256_transform_4way_swap
	movdqu 0*16(%rsi), %xmm0
	movdqu 1*16(%rsi), %xmm1
	movdqu 2*16(%rsi), %xmm2
	movdqu 3*16(%rsi), %xmm3
	movdqu 4*16(%rsi), %xmm4
	movdqu 5*16(%rsi), %xmm5
	movdqu 6*16(%rsi), %xmm6
	movdqu 7*16(%rsi), %xmm7
	movdqa %xmm0, 0*16(%rsp)
	movdqa %xmm1, 1*16(%rsp)
	movdqa %xmm2, 2*16(%rsp)
	movdqa %xmm3, 3*16(%rsp)
	movdqa %xmm4, 4*16(%rsp)
	movdqa %xmm5, 5*16(%rsp)
	movdqa %xmm6, 6*16(%rsp)
	movdqa %xmm7, 7*16(%rsp)
	movdqu 8*16(%rsi), %xmm0
	movdqu 9*16(%rsi), %xmm1
	movdqu 10*16(%rsi), %xmm2
	movdqu 11*16(%rsi), %xmm3
	movdqu 12*16(%rsi), %xmm4
	movdqu 13*16(%rsi), %xmm5
	movdqu 14*16(%rsi), %xmm6
	movdqu 15*16(%rsi), %xmm7
	movdqa %xmm0, 8*16(%rsp)
	movdqa %xmm1, 9*16(%rsp)
	movdqa %xmm2, 10*16(%rsp)
	movdqa %xmm3, 11*16(%rsp)
	movdqa %xmm4, 12*16(%rsp)
	movdqa %xmm5, 13*16(%rsp)
	movdqa %xmm6, 14*16(%rsp)
	movdqa %xmm7, 15*16(%rsp)
	jmp *sha256_transform_4way_core_addr(%rip)

	.p2align 6
sha256_transform_4way_swap:
	p2bswap_rsi_rsp 0
	p2bswap_rsi_rsp 2
	p2bswap_rsi_rsp 4
	p2bswap_rsi_rsp 6
	p2bswap_rsi_rsp 8
	p2bswap_rsi_rsp 10
	p2bswap_rsi_rsp 12
	p2bswap_rsi_rsp 14
	jmp *sha256_transform_4way_core_addr(%rip)

	.p2align 6
sha256_transform_4way_finish:
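/*
 * Finish: Davies-Meyer feed-forward.  The working variables are added back
 * into the caller's state, lane by lane; per lane this is just
 * (reference C sketch, where work[0..7] stand for a..h):
 *
 *   for (i = 0; i < 8; i++)
 *       state[i] += work[i];
 */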
	movdqu 0(%rdi), %xmm2
	movdqu 16(%rdi), %xmm6
	movdqu 32(%rdi), %xmm11
	movdqu 48(%rdi), %xmm1
	paddd %xmm2, %xmm7
	paddd %xmm6, %xmm5
	paddd %xmm11, %xmm4
	paddd %xmm1, %xmm3
	movdqu 64(%rdi), %xmm2
	movdqu 80(%rdi), %xmm6
	movdqu 96(%rdi), %xmm11
	movdqu 112(%rdi), %xmm1
	paddd %xmm2, %xmm0
	paddd %xmm6, %xmm8
	paddd %xmm11, %xmm9
	paddd %xmm1, %xmm10

	movdqu %xmm7, 0(%rdi)
	movdqu %xmm5, 16(%rdi)
	movdqu %xmm4, 32(%rdi)
	movdqu %xmm3, 48(%rdi)
	movdqu %xmm0, 64(%rdi)
	movdqu %xmm8, 80(%rdi)
	movdqu %xmm9, 96(%rdi)
	movdqu %xmm10, 112(%rdi)

	movq %r8, %rsp
#if defined(_WIN64) || defined(__CYGWIN__)
	popq %rsi
	movdqa 0(%rsp), %xmm6
	movdqa 16(%rsp), %xmm7
	movdqa 32(%rsp), %xmm8
	movdqa 48(%rsp), %xmm9
	movdqa 64(%rsp), %xmm10
	movdqa 80(%rsp), %xmm11
	addq $96, %rsp
	popq %rdi
#endif
	ret

#ifdef USE_AVX2
|
|
|
|
|
|
|
|
.text
|
|
|
|
.p2align 6
|
|
|
|
sha256_transform_8way_core_avx2:
|
|
|
|
leaq 8*64(%rsp), %rax
|
|
|
|
vmovdqa -2*32(%rax), %ymm3
|
|
|
|
vmovdqa -1*32(%rax), %ymm7
|
|
|
|
sha256_avx2_extend_doubleround 0
|
|
|
|
sha256_avx2_extend_doubleround 2
|
|
|
|
sha256_avx2_extend_doubleround 4
|
|
|
|
sha256_avx2_extend_doubleround 6
|
|
|
|
sha256_avx2_extend_doubleround 8
|
|
|
|
sha256_avx2_extend_doubleround 10
|
|
|
|
sha256_avx2_extend_doubleround 12
|
|
|
|
sha256_avx2_extend_doubleround 14
|
|
|
|
sha256_avx2_extend_doubleround 16
|
|
|
|
sha256_avx2_extend_doubleround 18
|
|
|
|
sha256_avx2_extend_doubleround 20
|
|
|
|
sha256_avx2_extend_doubleround 22
|
|
|
|
sha256_avx2_extend_doubleround 24
|
|
|
|
sha256_avx2_extend_doubleround 26
|
|
|
|
sha256_avx2_extend_doubleround 28
|
|
|
|
sha256_avx2_extend_doubleround 30
|
|
|
|
sha256_avx2_extend_doubleround 32
|
|
|
|
sha256_avx2_extend_doubleround 34
|
|
|
|
sha256_avx2_extend_doubleround 36
|
|
|
|
sha256_avx2_extend_doubleround 38
|
|
|
|
sha256_avx2_extend_doubleround 40
|
|
|
|
sha256_avx2_extend_doubleround 42
|
|
|
|
sha256_avx2_extend_doubleround 44
|
|
|
|
sha256_avx2_extend_doubleround 46
|
|
|
|
vmovdqu 0*32(%rdi), %ymm7
|
|
|
|
vmovdqu 1*32(%rdi), %ymm5
|
|
|
|
vmovdqu 2*32(%rdi), %ymm4
|
|
|
|
vmovdqu 3*32(%rdi), %ymm3
|
|
|
|
vmovdqu 4*32(%rdi), %ymm0
|
|
|
|
vmovdqu 5*32(%rdi), %ymm8
|
|
|
|
vmovdqu 6*32(%rdi), %ymm9
|
|
|
|
vmovdqu 7*32(%rdi), %ymm10
|
|
|
|
movq %rsp, %rax
|
|
|
|
leaq sha256_8k(%rip), %rcx
|
|
|
|
sha256_avx2_main_quadround 0
|
|
|
|
sha256_avx2_main_quadround 4
|
|
|
|
sha256_avx2_main_quadround 8
|
|
|
|
sha256_avx2_main_quadround 12
|
|
|
|
sha256_avx2_main_quadround 16
|
|
|
|
sha256_avx2_main_quadround 20
|
|
|
|
sha256_avx2_main_quadround 24
|
|
|
|
sha256_avx2_main_quadround 28
|
|
|
|
sha256_avx2_main_quadround 32
|
|
|
|
sha256_avx2_main_quadround 36
|
|
|
|
sha256_avx2_main_quadround 40
|
|
|
|
sha256_avx2_main_quadround 44
|
|
|
|
sha256_avx2_main_quadround 48
|
|
|
|
sha256_avx2_main_quadround 52
|
|
|
|
sha256_avx2_main_quadround 56
|
|
|
|
sha256_avx2_main_quadround 60
|
|
|
|
jmp sha256_transform_8way_finish
|
|
|
|
|
|
|
|
.macro p2bswap_avx2_rsi_rsp i
|
|
|
|
vmovdqu \i*32(%rsi), %ymm0
|
|
|
|
vmovdqu (\i+1)*32(%rsi), %ymm2
|
|
|
|
vpshuflw $0xb1, %ymm0, %ymm0
|
|
|
|
vpshuflw $0xb1, %ymm2, %ymm2
|
|
|
|
vpshufhw $0xb1, %ymm0, %ymm0
|
|
|
|
vpshufhw $0xb1, %ymm2, %ymm2
|
|
|
|
vpsrlw $8, %ymm0, %ymm1
|
|
|
|
vpsrlw $8, %ymm2, %ymm3
|
|
|
|
vpsllw $8, %ymm0, %ymm0
|
|
|
|
vpsllw $8, %ymm2, %ymm2
|
|
|
|
vpxor %ymm1, %ymm0, %ymm0
|
|
|
|
vpxor %ymm3, %ymm2, %ymm2
|
|
|
|
vmovdqa %ymm0, \i*32(%rsp)
|
|
|
|
vmovdqa %ymm2, (\i+1)*32(%rsp)
|
|
|
|
.endm
|
|
|
|
|
|
|
|
.text
|
|
|
|
.p2align 6
|
|
|
|
.globl sha256_transform_8way
|
|
|
|
.globl _sha256_transform_8way
|
|
|
|
sha256_transform_8way:
|
|
|
|
_sha256_transform_8way:
|
2014-03-06 10:39:21 +01:00
|
|
|
#if defined(_WIN64) || defined(__CYGWIN__)
|
2013-07-05 18:25:34 +02:00
|
|
|
pushq %rdi
|
|
|
|
subq $96, %rsp
|
|
|
|
vmovdqa %xmm6, 0(%rsp)
|
|
|
|
vmovdqa %xmm7, 16(%rsp)
|
|
|
|
vmovdqa %xmm8, 32(%rsp)
|
|
|
|
vmovdqa %xmm9, 48(%rsp)
|
|
|
|
vmovdqa %xmm10, 64(%rsp)
|
|
|
|
vmovdqa %xmm11, 80(%rsp)
|
|
|
|
pushq %rsi
|
|
|
|
movq %rcx, %rdi
|
|
|
|
movq %rdx, %rsi
|
|
|
|
movq %r8, %rdx
|
|
|
|
#endif
|
|
|
|
movq %rsp, %r8
|
|
|
|
subq $64*32, %rsp
|
|
|
|
andq $-128, %rsp
|
|
|
|
|
|
|
|
testq %rdx, %rdx
|
|
|
|
jnz sha256_transform_8way_swap
|
|
|
|
|
|
|
|
vmovdqu 0*32(%rsi), %ymm0
|
|
|
|
vmovdqu 1*32(%rsi), %ymm1
|
|
|
|
vmovdqu 2*32(%rsi), %ymm2
|
|
|
|
vmovdqu 3*32(%rsi), %ymm3
|
|
|
|
vmovdqu 4*32(%rsi), %ymm4
|
|
|
|
vmovdqu 5*32(%rsi), %ymm5
|
|
|
|
vmovdqu 6*32(%rsi), %ymm6
|
|
|
|
vmovdqu 7*32(%rsi), %ymm7
|
|
|
|
vmovdqa %ymm0, 0*32(%rsp)
|
|
|
|
vmovdqa %ymm1, 1*32(%rsp)
|
|
|
|
vmovdqa %ymm2, 2*32(%rsp)
|
|
|
|
vmovdqa %ymm3, 3*32(%rsp)
|
|
|
|
vmovdqa %ymm4, 4*32(%rsp)
|
|
|
|
vmovdqa %ymm5, 5*32(%rsp)
|
|
|
|
vmovdqa %ymm6, 6*32(%rsp)
|
|
|
|
vmovdqa %ymm7, 7*32(%rsp)
|
|
|
|
vmovdqu 8*32(%rsi), %ymm0
|
|
|
|
vmovdqu 9*32(%rsi), %ymm1
|
|
|
|
vmovdqu 10*32(%rsi), %ymm2
|
|
|
|
vmovdqu 11*32(%rsi), %ymm3
|
|
|
|
vmovdqu 12*32(%rsi), %ymm4
|
|
|
|
vmovdqu 13*32(%rsi), %ymm5
|
|
|
|
vmovdqu 14*32(%rsi), %ymm6
|
|
|
|
vmovdqu 15*32(%rsi), %ymm7
|
|
|
|
vmovdqa %ymm0, 8*32(%rsp)
|
|
|
|
vmovdqa %ymm1, 9*32(%rsp)
|
|
|
|
vmovdqa %ymm2, 10*32(%rsp)
|
|
|
|
vmovdqa %ymm3, 11*32(%rsp)
|
|
|
|
vmovdqa %ymm4, 12*32(%rsp)
|
|
|
|
vmovdqa %ymm5, 13*32(%rsp)
|
|
|
|
vmovdqa %ymm6, 14*32(%rsp)
|
|
|
|
vmovdqa %ymm7, 15*32(%rsp)
|
|
|
|
jmp sha256_transform_8way_core_avx2
|
|
|
|
|
|
|
|
.p2align 6
|
|
|
|
sha256_transform_8way_swap:
|
|
|
|
p2bswap_avx2_rsi_rsp 0
|
|
|
|
p2bswap_avx2_rsi_rsp 2
|
|
|
|
p2bswap_avx2_rsi_rsp 4
|
|
|
|
p2bswap_avx2_rsi_rsp 6
|
|
|
|
p2bswap_avx2_rsi_rsp 8
|
|
|
|
p2bswap_avx2_rsi_rsp 10
|
|
|
|
p2bswap_avx2_rsi_rsp 12
|
|
|
|
p2bswap_avx2_rsi_rsp 14
|
|
|
|
jmp sha256_transform_8way_core_avx2
|
|
|
|
|
|
|
|
.p2align 6
|
|
|
|
sha256_transform_8way_finish:
|
|
|
|
vmovdqu 0*32(%rdi), %ymm2
|
|
|
|
vmovdqu 1*32(%rdi), %ymm6
|
|
|
|
vmovdqu 2*32(%rdi), %ymm11
|
|
|
|
vmovdqu 3*32(%rdi), %ymm1
|
|
|
|
vpaddd %ymm2, %ymm7, %ymm7
|
|
|
|
vpaddd %ymm6, %ymm5, %ymm5
|
|
|
|
vpaddd %ymm11, %ymm4, %ymm4
|
|
|
|
vpaddd %ymm1, %ymm3, %ymm3
|
|
|
|
vmovdqu 4*32(%rdi), %ymm2
|
|
|
|
vmovdqu 5*32(%rdi), %ymm6
|
|
|
|
vmovdqu 6*32(%rdi), %ymm11
|
|
|
|
vmovdqu 7*32(%rdi), %ymm1
|
|
|
|
vpaddd %ymm2, %ymm0, %ymm0
|
|
|
|
vpaddd %ymm6, %ymm8, %ymm8
|
|
|
|
vpaddd %ymm11, %ymm9, %ymm9
|
|
|
|
vpaddd %ymm1, %ymm10, %ymm10
|
|
|
|
|
|
|
|
vmovdqu %ymm7, 0*32(%rdi)
|
|
|
|
vmovdqu %ymm5, 1*32(%rdi)
|
|
|
|
vmovdqu %ymm4, 2*32(%rdi)
|
|
|
|
vmovdqu %ymm3, 3*32(%rdi)
|
|
|
|
vmovdqu %ymm0, 4*32(%rdi)
|
|
|
|
vmovdqu %ymm8, 5*32(%rdi)
|
|
|
|
vmovdqu %ymm9, 6*32(%rdi)
|
|
|
|
vmovdqu %ymm10, 7*32(%rdi)
|
|
|
|
|
|
|
|
movq %r8, %rsp
|
2014-03-06 10:39:21 +01:00
|
|
|
#if defined(_WIN64) || defined(__CYGWIN__)
|
2013-07-05 18:25:34 +02:00
|
|
|
popq %rsi
|
|
|
|
vmovdqa 0(%rsp), %xmm6
|
|
|
|
vmovdqa 16(%rsp), %xmm7
|
|
|
|
vmovdqa 32(%rsp), %xmm8
|
|
|
|
vmovdqa 48(%rsp), %xmm9
|
|
|
|
vmovdqa 64(%rsp), %xmm10
|
|
|
|
vmovdqa 80(%rsp), %xmm11
|
|
|
|
addq $96, %rsp
|
|
|
|
popq %rdi
|
|
|
|
#endif
|
|
|
|
ret
|
|
|
|
|
|
|
|
#endif /* USE_AVX2 */
|

.data
	.p2align 3
sha256d_ms_4way_addr:
	.quad 0x0

.text
	.p2align 6
	.globl sha256d_ms_4way
	.globl _sha256d_ms_4way
sha256d_ms_4way:
_sha256d_ms_4way:
	jmp *sha256d_ms_4way_addr(%rip)
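/*
 * sha256d_ms_4way is dispatched through the pointer stored in
 * sha256d_ms_4way_addr, which is presumably filled in by the CPU-feature
 * detection code on the C side before the first call (pointing at the
 * _sse2, _avx or _xop body below).  In C terms this is roughly (sketch,
 * names assumed):
 *
 *   static void (*sha256d_ms_4way_impl)(void *out, void *data,
 *                                       const void *midstate, const void *prehash);
 *   // dispatcher: sha256d_ms_4way(...) { sha256d_ms_4way_impl(...); }
 */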

	.p2align 6
sha256d_ms_4way_sse2:
#if defined(_WIN64) || defined(__CYGWIN__)
	pushq %rdi
	subq $32, %rsp
	movdqa %xmm6, 0(%rsp)
	movdqa %xmm7, 16(%rsp)
	pushq %rsi
	movq %rcx, %rdi
	movq %rdx, %rsi
	movq %r8, %rdx
	movq %r9, %rcx
#endif
	subq $8+67*16, %rsp

	leaq 256(%rsi), %rax

sha256d_ms_4way_sse2_extend_loop1:
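/*
 * sha256d_ms ("midstate") performs the double SHA-256 of a block header for
 * four nonces at once, reusing precomputed work: between calls only message
 * word 3 of the final chunk (read from 3*16(%rsi)) appears to change, so the
 * caller keeps the rest of the schedule cached in the same buffer and this
 * code recomputes only the W[i] entries that depend on it, saving the
 * clobbered cached values on the stack and writing them back into the
 * caller's buffer afterwards.  The recomputed entries still follow the
 * ordinary recurrence per lane (C sketch):
 *
 *   w[i] = w[i-16] + s0(w[i-15]) + w[i-7] + s1(w[i-2]);
 */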
|
2012-03-30 00:40:41 +02:00
|
|
|
movdqa 3*16(%rsi), %xmm0
|
|
|
|
movdqa 2*16(%rax), %xmm3
|
|
|
|
movdqa 3*16(%rax), %xmm7
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa %xmm3, 5*16(%rsp)
|
|
|
|
movdqa %xmm7, 6*16(%rsp)
|
2012-03-30 00:40:41 +02:00
|
|
|
movdqa %xmm0, %xmm2
|
|
|
|
paddd %xmm0, %xmm7
|
|
|
|
psrld $3, %xmm0
|
|
|
|
movdqa %xmm0, %xmm1
|
|
|
|
pslld $14, %xmm2
|
|
|
|
psrld $4, %xmm1
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm2, %xmm0
|
|
|
|
psrld $11, %xmm1
|
|
|
|
pslld $11, %xmm2
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm2, %xmm0
|
|
|
|
paddd %xmm0, %xmm3
|
|
|
|
movdqa %xmm3, 2*16(%rax)
|
|
|
|
movdqa %xmm7, 3*16(%rax)
|
|
|
|
|
|
|
|
movdqa 4*16(%rax), %xmm0
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa %xmm0, 7*16(%rsp)
|
2012-03-30 00:40:41 +02:00
|
|
|
movdqa %xmm3, %xmm2
|
|
|
|
movdqa %xmm7, %xmm6
|
|
|
|
psrld $10, %xmm3
|
|
|
|
psrld $10, %xmm7
|
|
|
|
movdqa %xmm3, %xmm1
|
|
|
|
movdqa %xmm7, %xmm5
|
|
|
|
pslld $13, %xmm2
|
|
|
|
pslld $13, %xmm6
|
|
|
|
psrld $7, %xmm1
|
|
|
|
psrld $7, %xmm5
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
psrld $2, %xmm1
|
|
|
|
psrld $2, %xmm5
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
pslld $2, %xmm2
|
|
|
|
pslld $2, %xmm6
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
paddd %xmm0, %xmm3
|
|
|
|
movdqa %xmm3, 4*16(%rax)
|
|
|
|
movdqa %xmm7, 5*16(%rax)
|
|
|
|
|
|
|
|
movdqa 6*16(%rax), %xmm0
|
|
|
|
movdqa 7*16(%rax), %xmm4
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa %xmm0, 9*16(%rsp)
|
|
|
|
movdqa %xmm4, 10*16(%rsp)
|
2012-03-30 00:40:41 +02:00
|
|
|
movdqa %xmm3, %xmm2
|
|
|
|
movdqa %xmm7, %xmm6
|
|
|
|
psrld $10, %xmm3
|
|
|
|
psrld $10, %xmm7
|
|
|
|
movdqa %xmm3, %xmm1
|
|
|
|
movdqa %xmm7, %xmm5
|
|
|
|
pslld $13, %xmm2
|
|
|
|
pslld $13, %xmm6
|
|
|
|
psrld $7, %xmm1
|
|
|
|
psrld $7, %xmm5
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
psrld $2, %xmm1
|
|
|
|
psrld $2, %xmm5
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
pslld $2, %xmm2
|
|
|
|
pslld $2, %xmm6
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
paddd %xmm0, %xmm3
|
|
|
|
paddd %xmm4, %xmm7
|
|
|
|
movdqa %xmm3, 6*16(%rax)
|
|
|
|
movdqa %xmm7, 7*16(%rax)
|
|
|
|
|
|
|
|
movdqa 8*16(%rax), %xmm0
|
|
|
|
movdqa 2*16(%rax), %xmm4
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa %xmm0, 11*16(%rsp)
|
2012-03-30 00:40:41 +02:00
|
|
|
movdqa %xmm3, %xmm2
|
|
|
|
movdqa %xmm7, %xmm6
|
|
|
|
psrld $10, %xmm3
|
|
|
|
psrld $10, %xmm7
|
|
|
|
movdqa %xmm3, %xmm1
|
|
|
|
movdqa %xmm7, %xmm5
|
|
|
|
pslld $13, %xmm2
|
|
|
|
pslld $13, %xmm6
|
|
|
|
psrld $7, %xmm1
|
|
|
|
psrld $7, %xmm5
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
psrld $2, %xmm1
|
|
|
|
psrld $2, %xmm5
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
pslld $2, %xmm2
|
|
|
|
pslld $2, %xmm6
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
paddd %xmm0, %xmm3
|
|
|
|
paddd %xmm4, %xmm7
|
|
|
|
movdqa %xmm3, 8*16(%rax)
|
|
|
|
movdqa %xmm7, 9*16(%rax)
|
|
|
|
|
|
|
|
movdqa %xmm3, %xmm2
|
|
|
|
movdqa %xmm7, %xmm6
|
|
|
|
psrld $10, %xmm3
|
|
|
|
psrld $10, %xmm7
|
|
|
|
movdqa %xmm3, %xmm1
|
|
|
|
movdqa %xmm7, %xmm5
|
|
|
|
pslld $13, %xmm2
|
|
|
|
pslld $13, %xmm6
|
|
|
|
psrld $7, %xmm1
|
|
|
|
psrld $7, %xmm5
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
psrld $2, %xmm1
|
|
|
|
psrld $2, %xmm5
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
pslld $2, %xmm2
|
|
|
|
pslld $2, %xmm6
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
paddd 3*16(%rax), %xmm3
|
|
|
|
paddd 4*16(%rax), %xmm7
|
|
|
|
movdqa %xmm3, 10*16(%rax)
|
|
|
|
movdqa %xmm7, 11*16(%rax)
|
|
|
|
|
|
|
|
movdqa %xmm3, %xmm2
|
|
|
|
movdqa %xmm7, %xmm6
|
|
|
|
psrld $10, %xmm3
|
|
|
|
psrld $10, %xmm7
|
|
|
|
movdqa %xmm3, %xmm1
|
|
|
|
movdqa %xmm7, %xmm5
|
|
|
|
pslld $13, %xmm2
|
|
|
|
pslld $13, %xmm6
|
|
|
|
psrld $7, %xmm1
|
|
|
|
psrld $7, %xmm5
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
psrld $2, %xmm1
|
|
|
|
psrld $2, %xmm5
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
pslld $2, %xmm2
|
|
|
|
pslld $2, %xmm6
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
paddd 5*16(%rax), %xmm3
|
|
|
|
paddd 6*16(%rax), %xmm7
|
|
|
|
movdqa %xmm3, 12*16(%rax)
|
|
|
|
movdqa %xmm7, 13*16(%rax)
|
|
|
|
|
|
|
|
movdqa 14*16(%rax), %xmm0
|
|
|
|
movdqa 15*16(%rax), %xmm4
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa %xmm0, 17*16(%rsp)
|
|
|
|
movdqa %xmm4, 18*16(%rsp)
|
2012-03-30 00:40:41 +02:00
|
|
|
movdqa %xmm3, %xmm2
|
|
|
|
movdqa %xmm7, %xmm6
|
|
|
|
psrld $10, %xmm3
|
|
|
|
psrld $10, %xmm7
|
|
|
|
movdqa %xmm3, %xmm1
|
|
|
|
movdqa %xmm7, %xmm5
|
|
|
|
paddd 7*16(%rax), %xmm0
|
|
|
|
paddd 8*16(%rax), %xmm4
|
|
|
|
pslld $13, %xmm2
|
|
|
|
pslld $13, %xmm6
|
|
|
|
psrld $7, %xmm1
|
|
|
|
psrld $7, %xmm5
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
psrld $2, %xmm1
|
|
|
|
psrld $2, %xmm5
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
pslld $2, %xmm2
|
|
|
|
pslld $2, %xmm6
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
paddd %xmm0, %xmm3
|
|
|
|
paddd %xmm4, %xmm7
|
|
|
|
movdqa %xmm3, 14*16(%rax)
|
|
|
|
movdqa %xmm7, 15*16(%rax)
|
|
|
|
|
|
|
|
sha256d_ms_4way_sse2_extend_loop2:
|
2012-03-25 15:43:49 +02:00
|
|
|
sha256_sse2_extend_doubleround 16
|
|
|
|
sha256_sse2_extend_doubleround 18
|
|
|
|
sha256_sse2_extend_doubleround 20
|
|
|
|
sha256_sse2_extend_doubleround 22
|
|
|
|
sha256_sse2_extend_doubleround 24
|
|
|
|
sha256_sse2_extend_doubleround 26
|
|
|
|
sha256_sse2_extend_doubleround 28
|
|
|
|
sha256_sse2_extend_doubleround 30
|
|
|
|
sha256_sse2_extend_doubleround 32
|
|
|
|
sha256_sse2_extend_doubleround 34
|
|
|
|
sha256_sse2_extend_doubleround 36
|
|
|
|
sha256_sse2_extend_doubleround 38
|
|
|
|
sha256_sse2_extend_doubleround 40
|
|
|
|
sha256_sse2_extend_doubleround 42
|
|
|
|
jz sha256d_ms_4way_sse2_extend_coda2
|
|
|
|
sha256_sse2_extend_doubleround 44
|
|
|
|
sha256_sse2_extend_doubleround 46
|
2012-03-21 23:07:56 +01:00
|
|
|
|
2012-03-24 01:27:23 +01:00
|
|
|
movdqa 0(%rcx), %xmm3
|
|
|
|
movdqa 16(%rcx), %xmm0
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa 32(%rcx), %xmm1
|
|
|
|
movdqa 48(%rcx), %xmm2
|
|
|
|
movdqa 64(%rcx), %xmm6
|
2012-03-24 01:27:23 +01:00
|
|
|
movdqa 80(%rcx), %xmm7
|
|
|
|
movdqa 96(%rcx), %xmm5
|
|
|
|
movdqa 112(%rcx), %xmm4
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa %xmm1, 0(%rsp)
|
|
|
|
movdqa %xmm2, 16(%rsp)
|
|
|
|
movdqa %xmm6, 32(%rsp)
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
movq %rsi, %rax
|
|
|
|
leaq sha256_4k(%rip), %rcx
|
2012-03-25 15:43:49 +02:00
|
|
|
jmp sha256d_ms_4way_sse2_main_loop1
|
|
|
|
|
|
|
|
sha256d_ms_4way_sse2_main_loop2:
|
|
|
|
sha256_sse2_main_round 0
|
|
|
|
sha256_sse2_main_round 1
|
|
|
|
sha256_sse2_main_round 2
|
|
|
|
sha256d_ms_4way_sse2_main_loop1:
|
|
|
|
sha256_sse2_main_round 3
|
2012-04-01 19:39:01 +02:00
|
|
|
sha256_sse2_main_quadround 4
|
|
|
|
sha256_sse2_main_quadround 8
|
|
|
|
sha256_sse2_main_quadround 12
|
|
|
|
sha256_sse2_main_quadround 16
|
|
|
|
sha256_sse2_main_quadround 20
|
|
|
|
sha256_sse2_main_quadround 24
|
|
|
|
sha256_sse2_main_quadround 28
|
|
|
|
sha256_sse2_main_quadround 32
|
|
|
|
sha256_sse2_main_quadround 36
|
|
|
|
sha256_sse2_main_quadround 40
|
|
|
|
sha256_sse2_main_quadround 44
|
|
|
|
sha256_sse2_main_quadround 48
|
|
|
|
sha256_sse2_main_quadround 52
|
2012-03-25 15:43:49 +02:00
|
|
|
sha256_sse2_main_round 56
|
2012-03-26 14:15:35 +02:00
|
|
|
jz sha256d_ms_4way_sse2_finish
|
2012-03-25 15:43:49 +02:00
|
|
|
sha256_sse2_main_round 57
|
|
|
|
sha256_sse2_main_round 58
|
|
|
|
sha256_sse2_main_round 59
|
2012-04-01 19:39:01 +02:00
|
|
|
sha256_sse2_main_quadround 60
|
2012-03-21 23:07:56 +01:00
|
|
|
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa 5*16(%rsp), %xmm1
|
|
|
|
movdqa 6*16(%rsp), %xmm2
|
|
|
|
movdqa 7*16(%rsp), %xmm6
|
2012-03-30 00:40:41 +02:00
|
|
|
movdqa %xmm1, 18*16(%rsi)
|
|
|
|
movdqa %xmm2, 19*16(%rsi)
|
|
|
|
movdqa %xmm6, 20*16(%rsi)
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa 9*16(%rsp), %xmm1
|
|
|
|
movdqa 10*16(%rsp), %xmm2
|
|
|
|
movdqa 11*16(%rsp), %xmm6
|
2012-03-30 00:40:41 +02:00
|
|
|
movdqa %xmm1, 22*16(%rsi)
|
|
|
|
movdqa %xmm2, 23*16(%rsi)
|
|
|
|
movdqa %xmm6, 24*16(%rsi)
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa 17*16(%rsp), %xmm1
|
|
|
|
movdqa 18*16(%rsp), %xmm2
|
2012-03-30 00:40:41 +02:00
|
|
|
movdqa %xmm1, 30*16(%rsi)
|
|
|
|
movdqa %xmm2, 31*16(%rsi)
|
|
|
|
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa 0(%rsp), %xmm1
|
|
|
|
movdqa 16(%rsp), %xmm2
|
|
|
|
movdqa 32(%rsp), %xmm6
|
2012-03-21 23:07:56 +01:00
|
|
|
paddd 0(%rdx), %xmm7
|
|
|
|
paddd 16(%rdx), %xmm5
|
|
|
|
paddd 32(%rdx), %xmm4
|
|
|
|
paddd 48(%rdx), %xmm3
|
|
|
|
paddd 64(%rdx), %xmm0
|
2012-04-01 19:39:01 +02:00
|
|
|
paddd 80(%rdx), %xmm1
|
|
|
|
paddd 96(%rdx), %xmm2
|
|
|
|
paddd 112(%rdx), %xmm6
|
|
|
|
|
|
|
|
movdqa %xmm7, 48+0(%rsp)
|
|
|
|
movdqa %xmm5, 48+16(%rsp)
|
|
|
|
movdqa %xmm4, 48+32(%rsp)
|
|
|
|
movdqa %xmm3, 48+48(%rsp)
|
|
|
|
movdqa %xmm0, 48+64(%rsp)
|
|
|
|
movdqa %xmm1, 48+80(%rsp)
|
|
|
|
movdqa %xmm2, 48+96(%rsp)
|
|
|
|
movdqa %xmm6, 48+112(%rsp)
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
pxor %xmm0, %xmm0
|
|
|
|
movq $0x8000000000000100, %rax
|
|
|
|
movd %rax, %xmm1
|
|
|
|
pshufd $0x55, %xmm1, %xmm2
|
|
|
|
pshufd $0x00, %xmm1, %xmm1
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa %xmm2, 48+128(%rsp)
|
|
|
|
movdqa %xmm0, 48+144(%rsp)
|
|
|
|
movdqa %xmm0, 48+160(%rsp)
|
|
|
|
movdqa %xmm0, 48+176(%rsp)
|
|
|
|
movdqa %xmm0, 48+192(%rsp)
|
|
|
|
movdqa %xmm0, 48+208(%rsp)
|
|
|
|
movdqa %xmm0, 48+224(%rsp)
|
|
|
|
movdqa %xmm1, 48+240(%rsp)
|
|
|
|
|
|
|
|
leaq 19*16(%rsp), %rax
|
2012-03-25 15:43:49 +02:00
|
|
|
cmpq %rax, %rax
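/*
 * Note: comparing a register against itself here is intentional.  It forces
 * ZF=1 before falling into the shared extend/main loops for the second hash;
 * none of the SSE integer instructions in between touch EFLAGS, so the later
 * "jz ...extend_coda2" / "jz ...finish" tests still see this flag and take
 * the shortened second-hash paths, while the first pass (entered with ZF
 * still clear from the earlier stack arithmetic) runs the same loops in
 * full.  The AVX and XOP variants below use the same trick.
 */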
|
|
|
|
|
2012-03-30 00:40:41 +02:00
|
|
|
movdqa -15*16(%rax), %xmm0
|
|
|
|
movdqa -14*16(%rax), %xmm4
|
|
|
|
movdqa %xmm0, %xmm2
|
|
|
|
movdqa %xmm4, %xmm6
|
|
|
|
psrld $3, %xmm0
|
|
|
|
psrld $3, %xmm4
|
|
|
|
movdqa %xmm0, %xmm1
|
|
|
|
movdqa %xmm4, %xmm5
|
|
|
|
pslld $14, %xmm2
|
|
|
|
pslld $14, %xmm6
|
|
|
|
psrld $4, %xmm1
|
|
|
|
psrld $4, %xmm5
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm5, %xmm4
|
|
|
|
psrld $11, %xmm1
|
|
|
|
psrld $11, %xmm5
|
|
|
|
pxor %xmm2, %xmm0
|
|
|
|
pxor %xmm6, %xmm4
|
|
|
|
pslld $11, %xmm2
|
|
|
|
pslld $11, %xmm6
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm5, %xmm4
|
|
|
|
pxor %xmm2, %xmm0
|
|
|
|
pxor %xmm6, %xmm4
|
|
|
|
paddd -16*16(%rax), %xmm0
|
|
|
|
paddd -15*16(%rax), %xmm4
|
|
|
|
paddd sha256d_4preext2_17(%rip), %xmm4
|
|
|
|
movdqa %xmm0, %xmm3
|
|
|
|
movdqa %xmm4, %xmm7
|
|
|
|
movdqa %xmm3, 0*16(%rax)
|
|
|
|
movdqa %xmm7, 1*16(%rax)
|
2012-03-21 23:07:56 +01:00
|
|
|
|
2012-03-30 00:40:41 +02:00
|
|
|
sha256_sse2_extend_doubleround 2
|
|
|
|
sha256_sse2_extend_doubleround 4
|
2012-03-21 23:07:56 +01:00
|
|
|
|
2012-03-30 00:40:41 +02:00
|
|
|
movdqa -9*16(%rax), %xmm0
|
|
|
|
movdqa sha256d_4preext2_23(%rip), %xmm4
|
|
|
|
movdqa %xmm0, %xmm2
|
|
|
|
psrld $3, %xmm0
|
|
|
|
movdqa %xmm0, %xmm1
|
|
|
|
pslld $14, %xmm2
|
|
|
|
psrld $4, %xmm1
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm2, %xmm0
|
|
|
|
psrld $11, %xmm1
|
|
|
|
pslld $11, %xmm2
|
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm2, %xmm0
|
|
|
|
paddd -10*16(%rax), %xmm0
|
|
|
|
paddd -9*16(%rax), %xmm4
|
|
|
|
movdqa %xmm3, %xmm2
|
|
|
|
movdqa %xmm7, %xmm6
|
|
|
|
psrld $10, %xmm3
|
|
|
|
psrld $10, %xmm7
|
|
|
|
movdqa %xmm3, %xmm1
|
|
|
|
movdqa %xmm7, %xmm5
|
|
|
|
paddd -1*16(%rax), %xmm0
|
|
|
|
pslld $13, %xmm2
|
|
|
|
pslld $13, %xmm6
|
|
|
|
psrld $7, %xmm1
|
|
|
|
psrld $7, %xmm5
|
|
|
|
paddd 0*16(%rax), %xmm4
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
psrld $2, %xmm1
|
|
|
|
psrld $2, %xmm5
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
pslld $2, %xmm2
|
|
|
|
pslld $2, %xmm6
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
paddd %xmm0, %xmm3
|
|
|
|
paddd %xmm4, %xmm7
|
|
|
|
movdqa %xmm3, 6*16(%rax)
|
|
|
|
movdqa %xmm7, 7*16(%rax)
|
|
|
|
|
|
|
|
movdqa sha256d_4preext2_24(%rip), %xmm0
|
|
|
|
movdqa %xmm3, %xmm2
|
|
|
|
movdqa %xmm7, %xmm6
|
|
|
|
psrld $10, %xmm3
|
|
|
|
psrld $10, %xmm7
|
|
|
|
movdqa %xmm3, %xmm1
|
|
|
|
movdqa %xmm7, %xmm5
|
|
|
|
paddd 1*16(%rax), %xmm0
|
|
|
|
pslld $13, %xmm2
|
|
|
|
pslld $13, %xmm6
|
|
|
|
psrld $7, %xmm1
|
|
|
|
psrld $7, %xmm5
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
psrld $2, %xmm1
|
|
|
|
psrld $2, %xmm5
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
pslld $2, %xmm2
|
|
|
|
pslld $2, %xmm6
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
paddd %xmm0, %xmm3
|
|
|
|
paddd 2*16(%rax), %xmm7
|
|
|
|
movdqa %xmm3, 8*16(%rax)
|
|
|
|
movdqa %xmm7, 9*16(%rax)
|
|
|
|
|
|
|
|
movdqa %xmm3, %xmm2
|
|
|
|
movdqa %xmm7, %xmm6
|
|
|
|
psrld $10, %xmm3
|
|
|
|
psrld $10, %xmm7
|
|
|
|
movdqa %xmm3, %xmm1
|
|
|
|
movdqa %xmm7, %xmm5
|
|
|
|
pslld $13, %xmm2
|
|
|
|
pslld $13, %xmm6
|
|
|
|
psrld $7, %xmm1
|
|
|
|
psrld $7, %xmm5
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
psrld $2, %xmm1
|
|
|
|
psrld $2, %xmm5
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
pslld $2, %xmm2
|
|
|
|
pslld $2, %xmm6
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
paddd 3*16(%rax), %xmm3
|
|
|
|
paddd 4*16(%rax), %xmm7
|
|
|
|
movdqa %xmm3, 10*16(%rax)
|
|
|
|
movdqa %xmm7, 11*16(%rax)
|
|
|
|
|
|
|
|
movdqa %xmm3, %xmm2
|
|
|
|
movdqa %xmm7, %xmm6
|
|
|
|
psrld $10, %xmm3
|
|
|
|
psrld $10, %xmm7
|
|
|
|
movdqa %xmm3, %xmm1
|
|
|
|
movdqa %xmm7, %xmm5
|
|
|
|
pslld $13, %xmm2
|
|
|
|
pslld $13, %xmm6
|
|
|
|
psrld $7, %xmm1
|
|
|
|
psrld $7, %xmm5
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
psrld $2, %xmm1
|
|
|
|
psrld $2, %xmm5
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
pslld $2, %xmm2
|
|
|
|
pslld $2, %xmm6
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
paddd 5*16(%rax), %xmm3
|
|
|
|
paddd 6*16(%rax), %xmm7
|
|
|
|
movdqa %xmm3, 12*16(%rax)
|
|
|
|
movdqa %xmm7, 13*16(%rax)
|
|
|
|
|
|
|
|
movdqa sha256d_4preext2_30(%rip), %xmm0
|
|
|
|
movdqa 0*16(%rax), %xmm4
|
|
|
|
movdqa %xmm4, %xmm6
|
|
|
|
psrld $3, %xmm4
|
|
|
|
movdqa %xmm4, %xmm5
|
|
|
|
pslld $14, %xmm6
|
|
|
|
psrld $4, %xmm5
|
|
|
|
pxor %xmm5, %xmm4
|
|
|
|
pxor %xmm6, %xmm4
|
|
|
|
psrld $11, %xmm5
|
|
|
|
pslld $11, %xmm6
|
|
|
|
pxor %xmm5, %xmm4
|
|
|
|
pxor %xmm6, %xmm4
|
|
|
|
paddd -1*16(%rax), %xmm4
|
|
|
|
movdqa %xmm3, %xmm2
|
|
|
|
movdqa %xmm7, %xmm6
|
|
|
|
psrld $10, %xmm3
|
|
|
|
psrld $10, %xmm7
|
|
|
|
movdqa %xmm3, %xmm1
|
|
|
|
movdqa %xmm7, %xmm5
|
|
|
|
paddd 7*16(%rax), %xmm0
|
|
|
|
pslld $13, %xmm2
|
|
|
|
pslld $13, %xmm6
|
|
|
|
psrld $7, %xmm1
|
|
|
|
psrld $7, %xmm5
|
|
|
|
paddd 8*16(%rax), %xmm4
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
psrld $2, %xmm1
|
|
|
|
psrld $2, %xmm5
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
pslld $2, %xmm2
|
|
|
|
pslld $2, %xmm6
|
|
|
|
pxor %xmm1, %xmm3
|
|
|
|
pxor %xmm5, %xmm7
|
|
|
|
pxor %xmm2, %xmm3
|
|
|
|
pxor %xmm6, %xmm7
|
|
|
|
paddd %xmm0, %xmm3
|
|
|
|
paddd %xmm4, %xmm7
|
|
|
|
movdqa %xmm3, 14*16(%rax)
|
|
|
|
movdqa %xmm7, 15*16(%rax)
|
|
|
|
|
|
|
|
jmp sha256d_ms_4way_sse2_extend_loop2
|
|
|
|
|
|
|
|
sha256d_ms_4way_sse2_extend_coda2:
|
|
|
|
sha256_sse2_extend_round 44
|
|
|
|
|
|
|
|
movdqa sha256_4h+0(%rip), %xmm7
|
|
|
|
movdqa sha256_4h+16(%rip), %xmm5
|
|
|
|
movdqa sha256_4h+32(%rip), %xmm4
|
|
|
|
movdqa sha256_4h+48(%rip), %xmm3
|
|
|
|
movdqa sha256_4h+64(%rip), %xmm0
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa sha256_4h+80(%rip), %xmm1
|
|
|
|
movdqa sha256_4h+96(%rip), %xmm2
|
|
|
|
movdqa sha256_4h+112(%rip), %xmm6
|
|
|
|
movdqa %xmm1, 0(%rsp)
|
|
|
|
movdqa %xmm2, 16(%rsp)
|
|
|
|
movdqa %xmm6, 32(%rsp)
|
2012-03-30 00:40:41 +02:00
|
|
|
|
2012-04-01 19:39:01 +02:00
|
|
|
leaq 48(%rsp), %rax
|
2012-03-30 00:40:41 +02:00
|
|
|
leaq sha256_4k(%rip), %rcx
|
|
|
|
jmp sha256d_ms_4way_sse2_main_loop2
|
|
|
|
|
2012-04-01 19:39:01 +02:00
|
|
|
.macro sha256_sse2_main_round_red i, r7
|
2012-03-30 00:40:41 +02:00
|
|
|
movdqa 16*\i(%rax), %xmm6
|
|
|
|
paddd 16*\i(%rcx), %xmm6
|
2012-04-01 19:39:01 +02:00
|
|
|
paddd 32(%rsp), %xmm6
|
|
|
|
movdqa %xmm0, %xmm1
|
|
|
|
movdqa 16(%rsp), %xmm2
|
|
|
|
paddd \r7, %xmm6
|
2012-03-30 00:40:41 +02:00
|
|
|
pandn %xmm2, %xmm1
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa %xmm2, 32(%rsp)
|
|
|
|
movdqa 0(%rsp), %xmm2
|
|
|
|
movdqa %xmm2, 16(%rsp)
|
|
|
|
pand %xmm0, %xmm2
|
2012-03-30 00:40:41 +02:00
|
|
|
pxor %xmm2, %xmm1
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa %xmm0, 0(%rsp)
|
2012-03-30 00:40:41 +02:00
|
|
|
paddd %xmm1, %xmm6
|
2012-04-01 19:39:01 +02:00
|
|
|
movdqa %xmm0, %xmm1
|
|
|
|
psrld $6, %xmm0
|
|
|
|
movdqa %xmm0, %xmm2
|
2012-03-30 00:40:41 +02:00
|
|
|
pslld $7, %xmm1
|
|
|
|
psrld $5, %xmm2
|
2012-04-01 19:39:01 +02:00
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm2, %xmm0
|
2012-03-30 00:40:41 +02:00
|
|
|
pslld $14, %xmm1
|
|
|
|
psrld $14, %xmm2
|
2012-04-01 19:39:01 +02:00
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
pxor %xmm2, %xmm0
|
2012-03-26 14:15:35 +02:00
|
|
|
pslld $5, %xmm1
|
2012-04-01 19:39:01 +02:00
|
|
|
pxor %xmm1, %xmm0
|
|
|
|
paddd %xmm6, %xmm0
|
2012-03-26 14:15:35 +02:00
|
|
|
.endm
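/*
 * "Reduced" main round, used only for the final rounds of the second hash,
 * where the full state is no longer needed.  It updates just the e-chain,
 * i.e. per lane (reference C sketch):
 *
 *   e_new = d + h + S1(e) + Ch(e, f, g) + K[i] + W[i];
 *
 * and drops the a/b/c/Maj half of the round, because the finish code below
 * only produces the last output word of the hash, which is presumably all
 * the caller needs for its quick target test.
 */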
|
|
|
|
|
2012-03-25 15:43:49 +02:00
|
|
|
sha256d_ms_4way_sse2_finish:
|
2012-04-01 19:39:01 +02:00
|
|
|
sha256_sse2_main_round_red 57, %xmm3
|
|
|
|
sha256_sse2_main_round_red 58, %xmm4
|
|
|
|
sha256_sse2_main_round_red 59, %xmm5
|
|
|
|
sha256_sse2_main_round_red 60, %xmm7
|
2012-03-26 14:15:35 +02:00
|
|
|
|
2012-03-25 15:43:49 +02:00
|
|
|
paddd sha256_4h+112(%rip), %xmm0
|
|
|
|
movdqa %xmm0, 112(%rdi)
|
2012-03-21 23:07:56 +01:00
|
|
|
|
2012-04-01 19:39:01 +02:00
|
|
|
addq $8+67*16, %rsp
|
2014-03-06 10:39:21 +01:00
|
|
|
#if defined(_WIN64) || defined(__CYGWIN__)
|
2012-03-21 23:07:56 +01:00
|
|
|
popq %rsi
|
|
|
|
movdqa 0(%rsp), %xmm6
|
|
|
|
movdqa 16(%rsp), %xmm7
|
2012-04-01 19:39:01 +02:00
|
|
|
addq $32, %rsp
|
2012-03-21 23:07:56 +01:00
|
|
|
popq %rdi
|
|
|
|
#endif
|
|
|
|
ret
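/*
 * The AVX and XOP builds of sha256d_ms_4way that follow are algorithmically
 * identical to the SSE2 version above.  AVX mainly removes register-to-
 * register moves thanks to three-operand VEX encodings, and XOP additionally
 * provides a true 32-bit vector rotate (vprotd), so each sigma collapses to
 * two rotates and one shift instead of the shift/xor ladders used above,
 * e.g. per lane (C sketch, rotl by k being rotr by 32-k):
 *
 *   s0 = rotl(x, 25) ^ rotl(x, 14) ^ (x >> 3);
 */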
|
|
|
|
|
|
|
|
|
|
|
|
#if defined(USE_AVX)
|
|
|
|
|
|
|
|
.p2align 6
|
2012-03-25 15:43:49 +02:00
|
|
|
sha256d_ms_4way_avx:
|
2014-03-06 10:39:21 +01:00
|
|
|
#if defined(_WIN64) || defined(__CYGWIN__)
|
2012-03-21 23:07:56 +01:00
|
|
|
pushq %rdi
|
|
|
|
subq $80, %rsp
|
|
|
|
movdqa %xmm6, 0(%rsp)
|
|
|
|
movdqa %xmm7, 16(%rsp)
|
|
|
|
movdqa %xmm8, 32(%rsp)
|
|
|
|
movdqa %xmm9, 48(%rsp)
|
|
|
|
movdqa %xmm10, 64(%rsp)
|
|
|
|
pushq %rsi
|
|
|
|
movq %rcx, %rdi
|
|
|
|
movq %rdx, %rsi
|
|
|
|
movq %r8, %rdx
|
2012-03-24 01:27:23 +01:00
|
|
|
movq %r9, %rcx
|
2012-03-21 23:07:56 +01:00
|
|
|
#endif
|
|
|
|
subq $1032, %rsp
|
|
|
|
|
2012-03-24 01:27:23 +01:00
|
|
|
leaq 256(%rsi), %rax
|
2012-03-25 15:43:49 +02:00
|
|
|
|
|
|
|
sha256d_ms_4way_avx_extend_loop1:
|
2012-03-30 00:40:41 +02:00
|
|
|
vmovdqa 3*16(%rsi), %xmm0
|
|
|
|
vmovdqa 2*16(%rax), %xmm3
|
|
|
|
vmovdqa 3*16(%rax), %xmm7
|
|
|
|
vmovdqa %xmm3, 2*16(%rsp)
|
|
|
|
vmovdqa %xmm7, 3*16(%rsp)
|
|
|
|
vpaddd %xmm0, %xmm7, %xmm7
|
|
|
|
vpslld $14, %xmm0, %xmm2
|
|
|
|
vpsrld $3, %xmm0, %xmm0
|
|
|
|
vpsrld $4, %xmm0, %xmm1
|
|
|
|
vpxor %xmm1, %xmm0, %xmm0
|
|
|
|
vpxor %xmm2, %xmm0, %xmm0
|
|
|
|
vpsrld $11, %xmm1, %xmm1
|
|
|
|
vpslld $11, %xmm2, %xmm2
|
|
|
|
vpxor %xmm1, %xmm0, %xmm0
|
|
|
|
vpxor %xmm2, %xmm0, %xmm0
|
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vmovdqa %xmm3, 2*16(%rax)
|
|
|
|
vmovdqa %xmm7, 3*16(%rax)
|
|
|
|
|
|
|
|
vmovdqa 4*16(%rax), %xmm0
|
|
|
|
vmovdqa %xmm0, 4*16(%rsp)
|
|
|
|
vpslld $13, %xmm3, %xmm2
|
|
|
|
vpslld $13, %xmm7, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpsrld $7, %xmm3, %xmm1
|
|
|
|
vpsrld $7, %xmm7, %xmm5
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpsrld $2, %xmm1, %xmm1
|
|
|
|
vpsrld $2, %xmm5, %xmm5
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpslld $2, %xmm2, %xmm2
|
|
|
|
vpslld $2, %xmm6, %xmm6
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vmovdqa %xmm3, 4*16(%rax)
|
|
|
|
vmovdqa %xmm7, 5*16(%rax)
|
|
|
|
|
|
|
|
vmovdqa 6*16(%rax), %xmm0
|
|
|
|
vmovdqa 7*16(%rax), %xmm4
|
|
|
|
vmovdqa %xmm0, 6*16(%rsp)
|
|
|
|
vmovdqa %xmm4, 7*16(%rsp)
|
|
|
|
vpslld $13, %xmm3, %xmm2
|
|
|
|
vpslld $13, %xmm7, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpsrld $7, %xmm3, %xmm1
|
|
|
|
vpsrld $7, %xmm7, %xmm5
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpsrld $2, %xmm1, %xmm1
|
|
|
|
vpsrld $2, %xmm5, %xmm5
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpslld $2, %xmm2, %xmm2
|
|
|
|
vpslld $2, %xmm6, %xmm6
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vpaddd %xmm4, %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 6*16(%rax)
|
|
|
|
vmovdqa %xmm7, 7*16(%rax)
|
|
|
|
|
|
|
|
vmovdqa 8*16(%rax), %xmm0
|
|
|
|
vmovdqa 2*16(%rax), %xmm4
|
|
|
|
vmovdqa %xmm0, 8*16(%rsp)
|
|
|
|
vpslld $13, %xmm3, %xmm2
|
|
|
|
vpslld $13, %xmm7, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpsrld $7, %xmm3, %xmm1
|
|
|
|
vpsrld $7, %xmm7, %xmm5
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpsrld $2, %xmm1, %xmm1
|
|
|
|
vpsrld $2, %xmm5, %xmm5
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpslld $2, %xmm2, %xmm2
|
|
|
|
vpslld $2, %xmm6, %xmm6
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vpaddd %xmm4, %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 8*16(%rax)
|
|
|
|
vmovdqa %xmm7, 9*16(%rax)
|
|
|
|
|
|
|
|
vpslld $13, %xmm3, %xmm2
|
|
|
|
vpslld $13, %xmm7, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpsrld $7, %xmm3, %xmm1
|
|
|
|
vpsrld $7, %xmm7, %xmm5
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpsrld $2, %xmm1, %xmm1
|
|
|
|
vpsrld $2, %xmm5, %xmm5
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpslld $2, %xmm2, %xmm2
|
|
|
|
vpslld $2, %xmm6, %xmm6
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd 3*16(%rax), %xmm3, %xmm3
|
|
|
|
vpaddd 4*16(%rax), %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 10*16(%rax)
|
|
|
|
vmovdqa %xmm7, 11*16(%rax)
|
|
|
|
|
|
|
|
vpslld $13, %xmm3, %xmm2
|
|
|
|
vpslld $13, %xmm7, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpsrld $7, %xmm3, %xmm1
|
|
|
|
vpsrld $7, %xmm7, %xmm5
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpsrld $2, %xmm1, %xmm1
|
|
|
|
vpsrld $2, %xmm5, %xmm5
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpslld $2, %xmm2, %xmm2
|
|
|
|
vpslld $2, %xmm6, %xmm6
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd 5*16(%rax), %xmm3, %xmm3
|
|
|
|
vpaddd 6*16(%rax), %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 12*16(%rax)
|
|
|
|
vmovdqa %xmm7, 13*16(%rax)
|
|
|
|
|
|
|
|
vmovdqa 14*16(%rax), %xmm0
|
|
|
|
vmovdqa 15*16(%rax), %xmm4
|
|
|
|
vmovdqa %xmm0, 14*16(%rsp)
|
|
|
|
vmovdqa %xmm4, 15*16(%rsp)
|
|
|
|
vpslld $13, %xmm3, %xmm2
|
|
|
|
vpslld $13, %xmm7, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpaddd 7*16(%rax), %xmm0, %xmm0
|
|
|
|
vpaddd 8*16(%rax), %xmm4, %xmm4
|
|
|
|
vpsrld $7, %xmm3, %xmm1
|
|
|
|
vpsrld $7, %xmm7, %xmm5
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpsrld $2, %xmm1, %xmm1
|
|
|
|
vpsrld $2, %xmm5, %xmm5
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpslld $2, %xmm2, %xmm2
|
|
|
|
vpslld $2, %xmm6, %xmm6
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vpaddd %xmm4, %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 14*16(%rax)
|
|
|
|
vmovdqa %xmm7, 15*16(%rax)
|
|
|
|
|
|
|
|
sha256d_ms_4way_avx_extend_loop2:
|
2012-03-25 15:43:49 +02:00
|
|
|
sha256_avx_extend_doubleround 16
|
|
|
|
sha256_avx_extend_doubleround 18
|
|
|
|
sha256_avx_extend_doubleround 20
|
|
|
|
sha256_avx_extend_doubleround 22
|
|
|
|
sha256_avx_extend_doubleround 24
|
|
|
|
sha256_avx_extend_doubleround 26
|
|
|
|
sha256_avx_extend_doubleround 28
|
|
|
|
sha256_avx_extend_doubleround 30
|
|
|
|
sha256_avx_extend_doubleround 32
|
|
|
|
sha256_avx_extend_doubleround 34
|
|
|
|
sha256_avx_extend_doubleround 36
|
|
|
|
sha256_avx_extend_doubleround 38
|
|
|
|
sha256_avx_extend_doubleround 40
|
|
|
|
sha256_avx_extend_doubleround 42
|
|
|
|
jz sha256d_ms_4way_avx_extend_coda2
|
|
|
|
sha256_avx_extend_doubleround 44
|
|
|
|
sha256_avx_extend_doubleround 46
|
2012-03-21 23:07:56 +01:00
|
|
|
|
2012-03-24 01:27:23 +01:00
|
|
|
movdqa 0(%rcx), %xmm7
|
|
|
|
movdqa 16(%rcx), %xmm8
|
|
|
|
movdqa 32(%rcx), %xmm9
|
|
|
|
movdqa 48(%rcx), %xmm10
|
|
|
|
movdqa 64(%rcx), %xmm0
|
|
|
|
movdqa 80(%rcx), %xmm5
|
|
|
|
movdqa 96(%rcx), %xmm4
|
|
|
|
movdqa 112(%rcx), %xmm3
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
movq %rsi, %rax
|
|
|
|
leaq sha256_4k(%rip), %rcx
|
2012-03-25 15:43:49 +02:00
|
|
|
jmp sha256d_ms_4way_avx_main_loop1
|
|
|
|
|
|
|
|
sha256d_ms_4way_avx_main_loop2:
|
|
|
|
sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
|
|
|
|
sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
|
|
|
|
sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
|
|
|
|
sha256d_ms_4way_avx_main_loop1:
|
|
|
|
sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
|
|
|
|
sha256_avx_main_quadround 4
|
|
|
|
sha256_avx_main_quadround 8
|
|
|
|
sha256_avx_main_quadround 12
|
|
|
|
sha256_avx_main_quadround 16
|
|
|
|
sha256_avx_main_quadround 20
|
|
|
|
sha256_avx_main_quadround 24
|
|
|
|
sha256_avx_main_quadround 28
|
|
|
|
sha256_avx_main_quadround 32
|
|
|
|
sha256_avx_main_quadround 36
|
|
|
|
sha256_avx_main_quadround 40
|
|
|
|
sha256_avx_main_quadround 44
|
|
|
|
sha256_avx_main_quadround 48
|
|
|
|
sha256_avx_main_quadround 52
|
2012-03-26 14:15:35 +02:00
|
|
|
sha256_avx_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
|
2012-03-25 15:43:49 +02:00
|
|
|
jz sha256d_ms_4way_avx_finish
|
2012-03-26 14:15:35 +02:00
|
|
|
sha256_avx_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
|
|
|
|
sha256_avx_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
|
|
|
|
sha256_avx_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
|
|
|
|
sha256_avx_main_quadround 60
|
2012-03-21 23:07:56 +01:00
|
|
|
|
2012-03-30 00:40:41 +02:00
|
|
|
movdqa 2*16(%rsp), %xmm1
|
|
|
|
movdqa 3*16(%rsp), %xmm2
|
|
|
|
movdqa 4*16(%rsp), %xmm6
|
|
|
|
movdqa %xmm1, 18*16(%rsi)
|
|
|
|
movdqa %xmm2, 19*16(%rsi)
|
|
|
|
movdqa %xmm6, 20*16(%rsi)
|
|
|
|
movdqa 6*16(%rsp), %xmm1
|
|
|
|
movdqa 7*16(%rsp), %xmm2
|
|
|
|
movdqa 8*16(%rsp), %xmm6
|
|
|
|
movdqa %xmm1, 22*16(%rsi)
|
|
|
|
movdqa %xmm2, 23*16(%rsi)
|
|
|
|
movdqa %xmm6, 24*16(%rsi)
|
|
|
|
movdqa 14*16(%rsp), %xmm1
|
|
|
|
movdqa 15*16(%rsp), %xmm2
|
|
|
|
movdqa %xmm1, 30*16(%rsi)
|
|
|
|
movdqa %xmm2, 31*16(%rsi)
|
|
|
|
|
2012-03-21 23:07:56 +01:00
|
|
|
paddd 0(%rdx), %xmm7
|
|
|
|
paddd 16(%rdx), %xmm5
|
|
|
|
paddd 32(%rdx), %xmm4
|
|
|
|
paddd 48(%rdx), %xmm3
|
|
|
|
paddd 64(%rdx), %xmm0
|
|
|
|
paddd 80(%rdx), %xmm8
|
|
|
|
paddd 96(%rdx), %xmm9
|
|
|
|
paddd 112(%rdx), %xmm10
|
|
|
|
|
2012-03-30 00:40:41 +02:00
|
|
|
movdqa %xmm7, 0(%rsp)
|
|
|
|
movdqa %xmm5, 16(%rsp)
|
|
|
|
movdqa %xmm4, 32(%rsp)
|
|
|
|
movdqa %xmm3, 48(%rsp)
|
|
|
|
movdqa %xmm0, 64(%rsp)
|
|
|
|
movdqa %xmm8, 80(%rsp)
|
|
|
|
movdqa %xmm9, 96(%rsp)
|
|
|
|
movdqa %xmm10, 112(%rsp)
|
|
|
|
|
|
|
|
pxor %xmm0, %xmm0
|
|
|
|
movq $0x8000000000000100, %rax
|
|
|
|
movd %rax, %xmm1
|
|
|
|
pshufd $0x55, %xmm1, %xmm2
|
|
|
|
pshufd $0x00, %xmm1, %xmm1
|
|
|
|
movdqa %xmm2, 128(%rsp)
|
|
|
|
movdqa %xmm0, 144(%rsp)
|
|
|
|
movdqa %xmm0, 160(%rsp)
|
|
|
|
movdqa %xmm0, 176(%rsp)
|
|
|
|
movdqa %xmm0, 192(%rsp)
|
|
|
|
movdqa %xmm0, 208(%rsp)
|
|
|
|
movdqa %xmm0, 224(%rsp)
|
|
|
|
movdqa %xmm1, 240(%rsp)
|
|
|
|
|
|
|
|
leaq 256(%rsp), %rax
|
|
|
|
cmpq %rax, %rax
|
|
|
|
|
|
|
|
vmovdqa -15*16(%rax), %xmm0
|
|
|
|
vmovdqa -14*16(%rax), %xmm4
|
|
|
|
vpslld $14, %xmm0, %xmm2
|
|
|
|
vpslld $14, %xmm4, %xmm6
|
|
|
|
vpsrld $3, %xmm0, %xmm8
|
|
|
|
vpsrld $3, %xmm4, %xmm4
|
|
|
|
vpsrld $7, %xmm0, %xmm1
|
|
|
|
vpsrld $4, %xmm4, %xmm5
|
|
|
|
vpxor %xmm1, %xmm8, %xmm8
|
|
|
|
vpxor %xmm5, %xmm4, %xmm4
|
|
|
|
vpsrld $11, %xmm1, %xmm1
|
|
|
|
vpsrld $11, %xmm5, %xmm5
|
|
|
|
vpxor %xmm2, %xmm8, %xmm8
|
|
|
|
vpxor %xmm6, %xmm4, %xmm4
|
|
|
|
vpslld $11, %xmm2, %xmm2
|
|
|
|
vpslld $11, %xmm6, %xmm6
|
|
|
|
vpxor %xmm1, %xmm8, %xmm8
|
|
|
|
vpxor %xmm5, %xmm4, %xmm4
|
|
|
|
vpxor %xmm2, %xmm8, %xmm8
|
|
|
|
vpxor %xmm6, %xmm4, %xmm4
|
|
|
|
vpaddd %xmm0, %xmm4, %xmm4
|
|
|
|
vpaddd -16*16(%rax), %xmm8, %xmm3
|
|
|
|
vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7
|
|
|
|
vmovdqa %xmm3, 0*16(%rax)
|
|
|
|
vmovdqa %xmm7, 1*16(%rax)
|
|
|
|
|
|
|
|
sha256_avx_extend_doubleround 2
|
|
|
|
sha256_avx_extend_doubleround 4
|
|
|
|
|
|
|
|
vmovdqa -9*16(%rax), %xmm0
|
|
|
|
vpslld $14, %xmm0, %xmm2
|
|
|
|
vpsrld $3, %xmm0, %xmm8
|
|
|
|
vpsrld $7, %xmm0, %xmm1
|
|
|
|
vpxor %xmm1, %xmm8, %xmm8
|
|
|
|
vpxor %xmm2, %xmm8, %xmm8
|
|
|
|
vpsrld $11, %xmm1, %xmm1
|
|
|
|
vpslld $11, %xmm2, %xmm2
|
|
|
|
vpxor %xmm1, %xmm8, %xmm8
|
|
|
|
vpxor %xmm2, %xmm8, %xmm8
|
|
|
|
vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4
|
|
|
|
vpaddd -10*16(%rax), %xmm8, %xmm0
|
|
|
|
vpslld $13, %xmm3, %xmm2
|
|
|
|
vpslld $13, %xmm7, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpaddd -1*16(%rax), %xmm0, %xmm0
|
|
|
|
vpaddd 0*16(%rax), %xmm4, %xmm4
|
|
|
|
vpsrld $7, %xmm3, %xmm1
|
|
|
|
vpsrld $7, %xmm7, %xmm5
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpsrld $2, %xmm1, %xmm1
|
|
|
|
vpsrld $2, %xmm5, %xmm5
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpslld $2, %xmm2, %xmm2
|
|
|
|
vpslld $2, %xmm6, %xmm6
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vpaddd %xmm4, %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 6*16(%rax)
|
|
|
|
vmovdqa %xmm7, 7*16(%rax)
|
|
|
|
|
|
|
|
vpslld $13, %xmm3, %xmm2
|
|
|
|
vpslld $13, %xmm7, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpsrld $7, %xmm3, %xmm1
|
|
|
|
vpsrld $7, %xmm7, %xmm5
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpsrld $2, %xmm1, %xmm1
|
|
|
|
vpsrld $2, %xmm5, %xmm5
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpslld $2, %xmm2, %xmm2
|
|
|
|
vpslld $2, %xmm6, %xmm6
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3
|
|
|
|
vpaddd 1*16(%rax), %xmm3, %xmm3
|
|
|
|
vpaddd 2*16(%rax), %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 8*16(%rax)
|
|
|
|
vmovdqa %xmm7, 9*16(%rax)
|
|
|
|
|
|
|
|
vpslld $13, %xmm3, %xmm2
|
|
|
|
vpslld $13, %xmm7, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpsrld $7, %xmm3, %xmm1
|
|
|
|
vpsrld $7, %xmm7, %xmm5
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpsrld $2, %xmm1, %xmm1
|
|
|
|
vpsrld $2, %xmm5, %xmm5
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpslld $2, %xmm2, %xmm2
|
|
|
|
vpslld $2, %xmm6, %xmm6
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd 3*16(%rax), %xmm3, %xmm3
|
|
|
|
vpaddd 4*16(%rax), %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 10*16(%rax)
|
|
|
|
vmovdqa %xmm7, 11*16(%rax)
|
|
|
|
|
|
|
|
vpslld $13, %xmm3, %xmm2
|
|
|
|
vpslld $13, %xmm7, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpsrld $7, %xmm3, %xmm1
|
|
|
|
vpsrld $7, %xmm7, %xmm5
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpsrld $2, %xmm1, %xmm1
|
|
|
|
vpsrld $2, %xmm5, %xmm5
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpslld $2, %xmm2, %xmm2
|
|
|
|
vpslld $2, %xmm6, %xmm6
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd 5*16(%rax), %xmm3, %xmm3
|
|
|
|
vpaddd 6*16(%rax), %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 12*16(%rax)
|
|
|
|
vmovdqa %xmm7, 13*16(%rax)
|
2012-03-21 23:07:56 +01:00
|
|
|
|
2012-03-30 00:40:41 +02:00
|
|
|
vmovdqa sha256d_4preext2_30(%rip), %xmm0
|
|
|
|
vmovdqa 0*16(%rax), %xmm4
|
|
|
|
vpslld $14, %xmm4, %xmm6
|
|
|
|
vpsrld $3, %xmm4, %xmm4
|
|
|
|
vpsrld $4, %xmm4, %xmm5
|
|
|
|
vpxor %xmm5, %xmm4, %xmm4
|
|
|
|
vpxor %xmm6, %xmm4, %xmm4
|
|
|
|
vpsrld $11, %xmm5, %xmm5
|
|
|
|
vpslld $11, %xmm6, %xmm6
|
|
|
|
vpxor %xmm5, %xmm4, %xmm4
|
|
|
|
vpxor %xmm6, %xmm4, %xmm4
|
|
|
|
vpaddd -1*16(%rax), %xmm4, %xmm4
|
|
|
|
vpslld $13, %xmm3, %xmm2
|
|
|
|
vpslld $13, %xmm7, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpaddd 7*16(%rax), %xmm0, %xmm0
|
|
|
|
vpaddd 8*16(%rax), %xmm4, %xmm4
|
|
|
|
vpsrld $7, %xmm3, %xmm1
|
|
|
|
vpsrld $7, %xmm7, %xmm5
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpsrld $2, %xmm1, %xmm1
|
|
|
|
vpsrld $2, %xmm5, %xmm5
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpslld $2, %xmm2, %xmm2
|
|
|
|
vpslld $2, %xmm6, %xmm6
|
|
|
|
vpxor %xmm1, %xmm3, %xmm3
|
|
|
|
vpxor %xmm5, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vpaddd %xmm4, %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 14*16(%rax)
|
|
|
|
vmovdqa %xmm7, 15*16(%rax)
|
2012-03-21 23:07:56 +01:00
|
|
|
|
2012-03-25 15:43:49 +02:00
|
|
|
jmp sha256d_ms_4way_avx_extend_loop2
|
|
|
|
|
|
|
|
sha256d_ms_4way_avx_extend_coda2:
|
|
|
|
sha256_avx_extend_round 44
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
movdqa sha256_4h+0(%rip), %xmm7
|
|
|
|
movdqa sha256_4h+16(%rip), %xmm5
|
|
|
|
movdqa sha256_4h+32(%rip), %xmm4
|
|
|
|
movdqa sha256_4h+48(%rip), %xmm3
|
|
|
|
movdqa sha256_4h+64(%rip), %xmm0
|
|
|
|
movdqa sha256_4h+80(%rip), %xmm8
|
|
|
|
movdqa sha256_4h+96(%rip), %xmm9
|
|
|
|
movdqa sha256_4h+112(%rip), %xmm10
|
|
|
|
|
|
|
|
movq %rsp, %rax
|
|
|
|
leaq sha256_4k(%rip), %rcx
|
2012-03-25 15:43:49 +02:00
|
|
|
jmp sha256d_ms_4way_avx_main_loop2
|
2012-03-26 14:15:35 +02:00
|
|
|
|
|
|
|
.macro sha256_avx_main_round_red i, r0, r1, r2, r3, r4
|
|
|
|
vpaddd 16*\i(%rax), \r0, %xmm6
|
|
|
|
vpaddd 16*\i(%rcx), %xmm6, %xmm6
|
|
|
|
vpandn \r1, \r3, %xmm1
|
|
|
|
vpand \r3, \r2, %xmm2
|
|
|
|
vpxor %xmm2, %xmm1, %xmm1
|
|
|
|
vpaddd %xmm1, %xmm6, %xmm6
|
|
|
|
vpslld $7, \r3, %xmm1
|
|
|
|
vpsrld $6, \r3, \r0
|
|
|
|
vpsrld $5, \r0, %xmm2
|
|
|
|
vpxor %xmm1, \r0, \r0
|
|
|
|
vpxor %xmm2, \r0, \r0
|
|
|
|
vpslld $14, %xmm1, %xmm1
|
|
|
|
vpsrld $14, %xmm2, %xmm2
|
|
|
|
vpxor %xmm1, \r0, \r0
|
|
|
|
vpxor %xmm2, \r0, \r0
|
|
|
|
vpslld $5, %xmm1, %xmm1
|
|
|
|
vpxor %xmm1, \r0, \r0
|
|
|
|
vpaddd \r0, %xmm6, %xmm6
|
|
|
|
vpaddd %xmm6, \r4, \r0
|
|
|
|
.endm
|
|
|
|
|
2012-03-25 15:43:49 +02:00
|
|
|
sha256d_ms_4way_avx_finish:
|
2012-03-26 14:15:35 +02:00
|
|
|
sha256_avx_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4
|
|
|
|
sha256_avx_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5
|
|
|
|
sha256_avx_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7
|
|
|
|
sha256_avx_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3
|
|
|
|
|
2012-03-25 15:43:49 +02:00
|
|
|
paddd sha256_4h+112(%rip), %xmm10
|
2012-03-21 23:07:56 +01:00
|
|
|
movdqa %xmm10, 112(%rdi)
|
|
|
|
|
|
|
|
addq $1032, %rsp
|
2014-03-06 10:39:21 +01:00
|
|
|
#if defined(_WIN64) || defined(__CYGWIN__)
|
2012-03-21 23:07:56 +01:00
|
|
|
popq %rsi
|
|
|
|
movdqa 0(%rsp), %xmm6
|
|
|
|
movdqa 16(%rsp), %xmm7
|
|
|
|
movdqa 32(%rsp), %xmm8
|
|
|
|
movdqa 48(%rsp), %xmm9
|
|
|
|
movdqa 64(%rsp), %xmm10
|
|
|
|
addq $80, %rsp
|
|
|
|
popq %rdi
|
|
|
|
#endif
|
|
|
|
ret
|
|
|
|
|
|
|
|
#endif /* USE_AVX */
|
|
|
|
|
|
|
|
|
|
|
|
#if defined(USE_XOP)
|
|
|
|
|
|
|
|
.p2align 6
|
2012-03-25 15:43:49 +02:00
|
|
|
sha256d_ms_4way_xop:
|
2014-03-06 10:39:21 +01:00
|
|
|
#if defined(_WIN64) || defined(__CYGWIN__)
|
2012-03-21 23:07:56 +01:00
|
|
|
pushq %rdi
|
|
|
|
subq $80, %rsp
|
|
|
|
movdqa %xmm6, 0(%rsp)
|
|
|
|
movdqa %xmm7, 16(%rsp)
|
|
|
|
movdqa %xmm8, 32(%rsp)
|
|
|
|
movdqa %xmm9, 48(%rsp)
|
|
|
|
movdqa %xmm10, 64(%rsp)
|
|
|
|
pushq %rsi
|
|
|
|
movq %rcx, %rdi
|
|
|
|
movq %rdx, %rsi
|
|
|
|
movq %r8, %rdx
|
2012-03-24 01:27:23 +01:00
|
|
|
movq %r9, %rcx
|
2012-03-21 23:07:56 +01:00
|
|
|
#endif
|
|
|
|
subq $1032, %rsp
|
|
|
|
|
2012-03-24 01:27:23 +01:00
|
|
|
leaq 256(%rsi), %rax
|
2012-03-25 15:43:49 +02:00
|
|
|
|
|
|
|
sha256d_ms_4way_xop_extend_loop1:
|
2012-03-30 00:40:41 +02:00
|
|
|
vmovdqa 3*16(%rsi), %xmm0
|
|
|
|
vmovdqa 2*16(%rax), %xmm3
|
|
|
|
vmovdqa 3*16(%rax), %xmm7
|
|
|
|
vmovdqa %xmm3, 2*16(%rsp)
|
|
|
|
vmovdqa %xmm7, 3*16(%rsp)
|
|
|
|
vpaddd %xmm0, %xmm7, %xmm7
|
|
|
|
vprotd $25, %xmm0, %xmm1
|
|
|
|
vprotd $14, %xmm0, %xmm2
|
|
|
|
vpsrld $3, %xmm0, %xmm0
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm2, %xmm0, %xmm0
|
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vmovdqa %xmm3, 2*16(%rax)
|
|
|
|
vmovdqa %xmm7, 3*16(%rax)
|
|
|
|
|
|
|
|
vmovdqa 4*16(%rax), %xmm0
|
|
|
|
vmovdqa %xmm0, 4*16(%rsp)
|
|
|
|
vprotd $15, %xmm3, %xmm1
|
|
|
|
vprotd $15, %xmm7, %xmm5
|
|
|
|
vprotd $13, %xmm3, %xmm2
|
|
|
|
vprotd $13, %xmm7, %xmm6
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm5, %xmm6, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vmovdqa %xmm3, 4*16(%rax)
|
|
|
|
vmovdqa %xmm7, 5*16(%rax)
|
|
|
|
|
|
|
|
vmovdqa 6*16(%rax), %xmm0
|
|
|
|
vmovdqa 7*16(%rax), %xmm4
|
|
|
|
vmovdqa %xmm0, 6*16(%rsp)
|
|
|
|
vmovdqa %xmm4, 7*16(%rsp)
|
|
|
|
vprotd $15, %xmm3, %xmm1
|
|
|
|
vprotd $15, %xmm7, %xmm5
|
|
|
|
vprotd $13, %xmm3, %xmm2
|
|
|
|
vprotd $13, %xmm7, %xmm6
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm5, %xmm6, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vpaddd %xmm4, %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 6*16(%rax)
|
|
|
|
vmovdqa %xmm7, 7*16(%rax)
|
|
|
|
|
|
|
|
vmovdqa 8*16(%rax), %xmm0
|
|
|
|
vmovdqa 2*16(%rax), %xmm4
|
|
|
|
vmovdqa %xmm0, 8*16(%rsp)
|
|
|
|
vprotd $15, %xmm3, %xmm1
|
|
|
|
vprotd $15, %xmm7, %xmm5
|
|
|
|
vprotd $13, %xmm3, %xmm2
|
|
|
|
vprotd $13, %xmm7, %xmm6
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm5, %xmm6, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vpaddd %xmm4, %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 8*16(%rax)
|
|
|
|
vmovdqa %xmm7, 9*16(%rax)
|
|
|
|
|
|
|
|
vprotd $15, %xmm3, %xmm1
|
|
|
|
vprotd $15, %xmm7, %xmm5
|
|
|
|
vprotd $13, %xmm3, %xmm2
|
|
|
|
vprotd $13, %xmm7, %xmm6
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm5, %xmm6, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd 3*16(%rax), %xmm3, %xmm3
|
|
|
|
vpaddd 4*16(%rax), %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 10*16(%rax)
|
|
|
|
vmovdqa %xmm7, 11*16(%rax)
|
|
|
|
|
|
|
|
vprotd $15, %xmm3, %xmm1
|
|
|
|
vprotd $15, %xmm7, %xmm5
|
|
|
|
vprotd $13, %xmm3, %xmm2
|
|
|
|
vprotd $13, %xmm7, %xmm6
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm5, %xmm6, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd 5*16(%rax), %xmm3, %xmm3
|
|
|
|
vpaddd 6*16(%rax), %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 12*16(%rax)
|
|
|
|
vmovdqa %xmm7, 13*16(%rax)
|
|
|
|
|
|
|
|
vmovdqa 14*16(%rax), %xmm0
|
|
|
|
vmovdqa 15*16(%rax), %xmm4
|
|
|
|
vmovdqa %xmm0, 14*16(%rsp)
|
|
|
|
vmovdqa %xmm4, 15*16(%rsp)
|
|
|
|
vprotd $15, %xmm3, %xmm1
|
|
|
|
vprotd $15, %xmm7, %xmm5
|
|
|
|
vprotd $13, %xmm3, %xmm2
|
|
|
|
vprotd $13, %xmm7, %xmm6
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm5, %xmm6, %xmm6
|
|
|
|
vpaddd 7*16(%rax), %xmm0, %xmm0
|
|
|
|
vpaddd 8*16(%rax), %xmm4, %xmm4
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vpaddd %xmm4, %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 14*16(%rax)
|
|
|
|
vmovdqa %xmm7, 15*16(%rax)
|
|
|
|
|
|
|
|
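/* remainder of the schedule extension, shared between the first and second hash */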
sha256d_ms_4way_xop_extend_loop2:
|
2012-03-25 15:43:49 +02:00
|
|
|
sha256_xop_extend_doubleround 16
|
|
|
|
sha256_xop_extend_doubleround 18
|
|
|
|
sha256_xop_extend_doubleround 20
|
|
|
|
sha256_xop_extend_doubleround 22
|
|
|
|
sha256_xop_extend_doubleround 24
|
|
|
|
sha256_xop_extend_doubleround 26
|
|
|
|
sha256_xop_extend_doubleround 28
|
|
|
|
sha256_xop_extend_doubleround 30
|
|
|
|
sha256_xop_extend_doubleround 32
|
|
|
|
sha256_xop_extend_doubleround 34
|
|
|
|
sha256_xop_extend_doubleround 36
|
|
|
|
sha256_xop_extend_doubleround 38
|
|
|
|
sha256_xop_extend_doubleround 40
|
|
|
|
sha256_xop_extend_doubleround 42
|
|
|
|
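/* ZF is clear on the first pass (fall through) and set on the second pass, which needs a shorter schedule */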
jz sha256d_ms_4way_xop_extend_coda2
|
|
|
|
sha256_xop_extend_doubleround 44
|
|
|
|
sha256_xop_extend_doubleround 46
|
2012-03-21 23:07:56 +01:00
|
|
|
|
2012-03-24 01:27:23 +01:00
|
|
|
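/* load the precomputed partial working state (prehash argument) for the first hash */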
movdqa 0(%rcx), %xmm7
|
|
|
|
movdqa 16(%rcx), %xmm8
|
|
|
|
movdqa 32(%rcx), %xmm9
|
|
|
|
movdqa 48(%rcx), %xmm10
|
|
|
|
movdqa 64(%rcx), %xmm0
|
|
|
|
movdqa 80(%rcx), %xmm5
|
|
|
|
movdqa 96(%rcx), %xmm4
|
|
|
|
movdqa 112(%rcx), %xmm3
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
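/* %rax -> message schedule, %rcx -> 4-way round constant table */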
movq %rsi, %rax
|
|
|
|
leaq sha256_4k(%rip), %rcx
|
2012-03-25 15:43:49 +02:00
|
|
|
jmp sha256d_ms_4way_xop_main_loop1
|
|
|
|
|
|
|
|
sha256d_ms_4way_xop_main_loop2:
|
|
|
|
sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
|
|
|
|
sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
|
|
|
|
sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
|
|
|
|
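/* the first pass enters here, skipping rounds 0-2, which are already covered by the prehash */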
sha256d_ms_4way_xop_main_loop1:
|
|
|
|
sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
|
|
|
|
sha256_xop_main_quadround 4
|
|
|
|
sha256_xop_main_quadround 8
|
|
|
|
sha256_xop_main_quadround 12
|
|
|
|
sha256_xop_main_quadround 16
|
|
|
|
sha256_xop_main_quadround 20
|
|
|
|
sha256_xop_main_quadround 24
|
|
|
|
sha256_xop_main_quadround 28
|
|
|
|
sha256_xop_main_quadround 32
|
|
|
|
sha256_xop_main_quadround 36
|
|
|
|
sha256_xop_main_quadround 40
|
|
|
|
sha256_xop_main_quadround 44
|
|
|
|
sha256_xop_main_quadround 48
|
|
|
|
sha256_xop_main_quadround 52
|
2012-03-26 14:15:35 +02:00
|
|
|
sha256_xop_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7
|
2012-03-25 15:43:49 +02:00
|
|
|
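/* second pass: branch to the reduced tail that produces only the final state word */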
jz sha256d_ms_4way_xop_finish
|
2012-03-26 14:15:35 +02:00
|
|
|
sha256_xop_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3
|
|
|
|
sha256_xop_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4
|
|
|
|
sha256_xop_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5
|
|
|
|
sha256_xop_main_quadround 60
|
2012-03-21 23:07:56 +01:00
|
|
|
|
2012-03-30 00:40:41 +02:00
|
|
|
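/* write back the schedule words saved on the stack, restoring the precomputed entries that were overwritten above */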
movdqa 2*16(%rsp), %xmm1
|
|
|
|
movdqa 3*16(%rsp), %xmm2
|
|
|
|
movdqa 4*16(%rsp), %xmm6
|
|
|
|
movdqa %xmm1, 18*16(%rsi)
|
|
|
|
movdqa %xmm2, 19*16(%rsi)
|
|
|
|
movdqa %xmm6, 20*16(%rsi)
|
|
|
|
movdqa 6*16(%rsp), %xmm1
|
|
|
|
movdqa 7*16(%rsp), %xmm2
|
|
|
|
movdqa 8*16(%rsp), %xmm6
|
|
|
|
movdqa %xmm1, 22*16(%rsi)
|
|
|
|
movdqa %xmm2, 23*16(%rsi)
|
|
|
|
movdqa %xmm6, 24*16(%rsi)
|
|
|
|
movdqa 14*16(%rsp), %xmm1
|
|
|
|
movdqa 15*16(%rsp), %xmm2
|
|
|
|
movdqa %xmm1, 30*16(%rsi)
|
|
|
|
movdqa %xmm2, 31*16(%rsi)
|
|
|
|
|
2012-03-21 23:07:56 +01:00
|
|
|
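/* complete the first hash by adding the midstate passed in %rdx */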
paddd 0(%rdx), %xmm7
|
|
|
|
paddd 16(%rdx), %xmm5
|
|
|
|
paddd 32(%rdx), %xmm4
|
|
|
|
paddd 48(%rdx), %xmm3
|
|
|
|
paddd 64(%rdx), %xmm0
|
|
|
|
paddd 80(%rdx), %xmm8
|
|
|
|
paddd 96(%rdx), %xmm9
|
|
|
|
paddd 112(%rdx), %xmm10
|
|
|
|
|
|
|
|
movdqa %xmm7, 0(%rsp)
|
|
|
|
movdqa %xmm5, 16(%rsp)
|
|
|
|
movdqa %xmm4, 32(%rsp)
|
|
|
|
movdqa %xmm3, 48(%rsp)
|
|
|
|
movdqa %xmm0, 64(%rsp)
|
|
|
|
movdqa %xmm8, 80(%rsp)
|
|
|
|
movdqa %xmm9, 96(%rsp)
|
|
|
|
movdqa %xmm10, 112(%rsp)
|
|
|
|
|
|
|
|
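/* the first hash result sits in 0..112(%rsp); append the 0x80000000 padding word, zero fill, and the 256-bit length to form the second block */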
pxor %xmm0, %xmm0
|
|
|
|
movq $0x8000000000000100, %rax
|
|
|
|
movd %rax, %xmm1
|
|
|
|
pshufd $0x55, %xmm1, %xmm2
|
|
|
|
pshufd $0x00, %xmm1, %xmm1
|
|
|
|
movdqa %xmm2, 128(%rsp)
|
|
|
|
movdqa %xmm0, 144(%rsp)
|
|
|
|
movdqa %xmm0, 160(%rsp)
|
|
|
|
movdqa %xmm0, 176(%rsp)
|
|
|
|
movdqa %xmm0, 192(%rsp)
|
|
|
|
movdqa %xmm0, 208(%rsp)
|
|
|
|
movdqa %xmm0, 224(%rsp)
|
|
|
|
movdqa %xmm1, 240(%rsp)
|
|
|
|
|
2012-03-24 01:27:23 +01:00
|
|
|
leaq 256(%rsp), %rax
|
2012-03-25 15:43:49 +02:00
|
|
|
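/* force ZF so the shared extend and main loops take their second-pass exits */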
cmpq %rax, %rax
|
2012-03-30 00:40:41 +02:00
|
|
|
|
|
|
|
vmovdqa -15*16(%rax), %xmm0
|
|
|
|
vmovdqa -14*16(%rax), %xmm4
|
|
|
|
vprotd $25, %xmm0, %xmm1
|
|
|
|
vprotd $25, %xmm4, %xmm5
|
|
|
|
vprotd $14, %xmm0, %xmm2
|
|
|
|
vprotd $14, %xmm4, %xmm6
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm5, %xmm6, %xmm6
|
|
|
|
vpsrld $3, %xmm0, %xmm8
|
|
|
|
vpsrld $3, %xmm4, %xmm4
|
|
|
|
vpxor %xmm2, %xmm8, %xmm8
|
|
|
|
vpxor %xmm6, %xmm4, %xmm4
|
|
|
|
vpaddd %xmm0, %xmm4, %xmm4
|
|
|
|
vpaddd -16*16(%rax), %xmm8, %xmm3
|
|
|
|
vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7
|
|
|
|
vmovdqa %xmm3, 0*16(%rax)
|
|
|
|
vmovdqa %xmm7, 1*16(%rax)
|
|
|
|
|
|
|
|
sha256_xop_extend_doubleround 2
|
|
|
|
sha256_xop_extend_doubleround 4
|
|
|
|
|
|
|
|
vmovdqa -9*16(%rax), %xmm0
|
|
|
|
vprotd $25, %xmm0, %xmm1
|
|
|
|
vprotd $14, %xmm0, %xmm2
|
|
|
|
vpsrld $3, %xmm0, %xmm8
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm2, %xmm8, %xmm8
|
|
|
|
vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4
|
|
|
|
vpaddd -10*16(%rax), %xmm8, %xmm0
|
|
|
|
vprotd $15, %xmm3, %xmm1
|
|
|
|
vprotd $15, %xmm7, %xmm5
|
|
|
|
vprotd $13, %xmm3, %xmm2
|
|
|
|
vprotd $13, %xmm7, %xmm6
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm5, %xmm6, %xmm6
|
|
|
|
vpaddd -1*16(%rax), %xmm0, %xmm0
|
|
|
|
vpaddd 0*16(%rax), %xmm4, %xmm4
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vpaddd %xmm4, %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 6*16(%rax)
|
|
|
|
vmovdqa %xmm7, 7*16(%rax)
|
|
|
|
|
|
|
|
vprotd $15, %xmm3, %xmm1
|
|
|
|
vprotd $15, %xmm7, %xmm5
|
|
|
|
vprotd $13, %xmm3, %xmm2
|
|
|
|
vprotd $13, %xmm7, %xmm6
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm5, %xmm6, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3
|
|
|
|
vpaddd 1*16(%rax), %xmm3, %xmm3
|
|
|
|
vpaddd 2*16(%rax), %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 8*16(%rax)
|
|
|
|
vmovdqa %xmm7, 9*16(%rax)
|
|
|
|
|
|
|
|
vprotd $15, %xmm3, %xmm1
|
|
|
|
vprotd $15, %xmm7, %xmm5
|
|
|
|
vprotd $13, %xmm3, %xmm2
|
|
|
|
vprotd $13, %xmm7, %xmm6
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm5, %xmm6, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd 3*16(%rax), %xmm3, %xmm3
|
|
|
|
vpaddd 4*16(%rax), %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 10*16(%rax)
|
|
|
|
vmovdqa %xmm7, 11*16(%rax)
|
|
|
|
|
|
|
|
vprotd $15, %xmm3, %xmm1
|
|
|
|
vprotd $15, %xmm7, %xmm5
|
|
|
|
vprotd $13, %xmm3, %xmm2
|
|
|
|
vprotd $13, %xmm7, %xmm6
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm5, %xmm6, %xmm6
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd 5*16(%rax), %xmm3, %xmm3
|
|
|
|
vpaddd 6*16(%rax), %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 12*16(%rax)
|
|
|
|
vmovdqa %xmm7, 13*16(%rax)
|
|
|
|
|
|
|
|
vmovdqa sha256d_4preext2_30(%rip), %xmm0
|
|
|
|
vmovdqa 0*16(%rax), %xmm4
|
|
|
|
vprotd $25, %xmm4, %xmm5
|
|
|
|
vprotd $14, %xmm4, %xmm6
|
|
|
|
vpxor %xmm5, %xmm6, %xmm6
|
|
|
|
vpsrld $3, %xmm4, %xmm4
|
|
|
|
vpxor %xmm6, %xmm4, %xmm4
|
|
|
|
vpaddd -1*16(%rax), %xmm4, %xmm4
|
|
|
|
vprotd $15, %xmm3, %xmm1
|
|
|
|
vprotd $15, %xmm7, %xmm5
|
|
|
|
vprotd $13, %xmm3, %xmm2
|
|
|
|
vprotd $13, %xmm7, %xmm6
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vpxor %xmm5, %xmm6, %xmm6
|
|
|
|
vpaddd 7*16(%rax), %xmm0, %xmm0
|
|
|
|
vpaddd 8*16(%rax), %xmm4, %xmm4
|
|
|
|
vpsrld $10, %xmm3, %xmm3
|
|
|
|
vpsrld $10, %xmm7, %xmm7
|
|
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
|
|
vpxor %xmm6, %xmm7, %xmm7
|
|
|
|
vpaddd %xmm0, %xmm3, %xmm3
|
|
|
|
vpaddd %xmm4, %xmm7, %xmm7
|
|
|
|
vmovdqa %xmm3, 14*16(%rax)
|
|
|
|
vmovdqa %xmm7, 15*16(%rax)
|
|
|
|
|
2012-03-25 15:43:49 +02:00
|
|
|
jmp sha256d_ms_4way_xop_extend_loop2
|
|
|
|
|
|
|
|
sha256d_ms_4way_xop_extend_coda2:
|
|
|
|
sha256_xop_extend_round 44
|
2012-03-21 23:07:56 +01:00
|
|
|
|
|
|
|
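/* the second hash starts from the standard SHA-256 initial state */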
movdqa sha256_4h+0(%rip), %xmm7
|
|
|
|
movdqa sha256_4h+16(%rip), %xmm5
|
|
|
|
movdqa sha256_4h+32(%rip), %xmm4
|
|
|
|
movdqa sha256_4h+48(%rip), %xmm3
|
|
|
|
movdqa sha256_4h+64(%rip), %xmm0
|
|
|
|
movdqa sha256_4h+80(%rip), %xmm8
|
|
|
|
movdqa sha256_4h+96(%rip), %xmm9
|
|
|
|
movdqa sha256_4h+112(%rip), %xmm10
|
|
|
|
|
|
|
|
movq %rsp, %rax
|
|
|
|
leaq sha256_4k(%rip), %rcx
|
2012-03-25 15:43:49 +02:00
|
|
|
jmp sha256d_ms_4way_xop_main_loop2
|
2012-03-26 14:15:35 +02:00
|
|
|
|
|
|
|
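/* reduced main round: computes only the next e value (d + W + K + Ch + Sigma1); the Maj/Sigma0 half of the round is skipped */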
.macro sha256_xop_main_round_red i, r0, r1, r2, r3, r4
|
|
|
|
vpaddd 16*\i(%rax), \r0, %xmm6
|
|
|
|
vpaddd 16*\i(%rcx), %xmm6, %xmm6
|
|
|
|
vpandn \r1, \r3, %xmm1
|
|
|
|
vpand \r3, \r2, %xmm2
|
|
|
|
vpxor %xmm2, %xmm1, %xmm1
|
|
|
|
vpaddd %xmm1, %xmm6, %xmm6
|
|
|
|
vprotd $26, \r3, %xmm1
|
|
|
|
vprotd $21, \r3, %xmm2
|
|
|
|
vpxor %xmm1, %xmm2, %xmm2
|
|
|
|
vprotd $7, \r3, \r0
|
|
|
|
vpxor %xmm2, \r0, \r0
|
|
|
|
vpaddd \r0, %xmm6, %xmm6
|
|
|
|
vpaddd %xmm6, \r4, \r0
|
|
|
|
.endm
|
|
|
|
|
2012-03-25 15:43:49 +02:00
|
|
|
sha256d_ms_4way_xop_finish:
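/* tail of the second pass: finish just enough reduced rounds to produce H[7] */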
|
2012-03-26 14:15:35 +02:00
|
|
|
sha256_xop_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4
|
|
|
|
sha256_xop_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5
|
|
|
|
sha256_xop_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7
|
|
|
|
sha256_xop_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3
|
|
|
|
|
2012-03-25 15:43:49 +02:00
|
|
|
paddd sha256_4h+112(%rip), %xmm10
|
2012-03-21 23:07:56 +01:00
|
|
|
movdqa %xmm10, 112(%rdi)
|
|
|
|
|
|
|
|
addq $1032, %rsp
|
2014-03-06 10:39:21 +01:00
|
|
|
#if defined(_WIN64) || defined(__CYGWIN__)
|
2012-03-21 23:07:56 +01:00
|
|
|
popq %rsi
|
|
|
|
movdqa 0(%rsp), %xmm6
|
|
|
|
movdqa 16(%rsp), %xmm7
|
|
|
|
movdqa 32(%rsp), %xmm8
|
|
|
|
movdqa 48(%rsp), %xmm9
|
|
|
|
movdqa 64(%rsp), %xmm10
|
|
|
|
addq $80, %rsp
|
|
|
|
popq %rdi
|
|
|
|
#endif
|
|
|
|
ret
|
|
|
|
|
|
|
|
#endif /* USE_XOP */
|
2012-03-12 13:32:11 +01:00
|
|
|
|
2012-03-22 17:38:35 +01:00
|
|
|
|
2012-03-23 16:35:21 +01:00
|
|
|
.text
|
2012-03-22 17:38:35 +01:00
|
|
|
.p2align 6
|
2012-03-23 16:35:21 +01:00
|
|
|
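/* select the 4-way SHA-256 implementation at run time; returns 0 if the VIA PadLock hash engine is used instead, 1 otherwise */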
.globl sha256_use_4way
|
|
|
|
.globl _sha256_use_4way
|
|
|
|
sha256_use_4way:
|
|
|
|
_sha256_use_4way:
|
2012-03-22 17:38:35 +01:00
|
|
|
pushq %rbx
|
|
|
|
pushq %rcx
|
|
|
|
pushq %rdx
|
|
|
|
|
2015-05-17 16:21:30 +02:00
|
|
|
/* Check for VIA PadLock Hash Engine */
|
|
|
|
movl $0xc0000000, %eax
|
|
|
|
cpuid
|
|
|
|
cmpl $0xc0000001, %eax
|
|
|
|
jb sha256_use_4way_no_phe
|
|
|
|
movl $0xc0000001, %eax
|
|
|
|
cpuid
|
|
|
|
andl $0x00000c00, %edx
|
|
|
|
cmpl $0x00000c00, %edx
|
|
|
|
jne sha256_use_4way_no_phe
|
|
|
|
leaq sha256_transform_phe(%rip), %rdx
|
|
|
|
movq %rdx, sha256_transform_addr(%rip)
|
|
|
|
xorl %eax, %eax
|
|
|
|
jmp sha256_use_4way_exit
|
|
|
|
sha256_use_4way_no_phe:
|
2012-03-22 17:38:35 +01:00
|
|
|
#if defined(USE_AVX)
|
2012-04-06 19:53:48 +02:00
|
|
|
/* Check for AVX and OSXSAVE support */
|
2012-03-22 17:38:35 +01:00
|
|
|
movl $1, %eax
|
|
|
|
cpuid
|
|
|
|
andl $0x18000000, %ecx
|
|
|
|
cmpl $0x18000000, %ecx
|
2012-04-01 19:39:01 +02:00
|
|
|
jne sha256_use_4way_base
|
2012-04-06 19:53:48 +02:00
|
|
|
/* Check for XMM and YMM state support */
|
2012-03-22 17:38:35 +01:00
|
|
|
xorl %ecx, %ecx
|
|
|
|
xgetbv
|
|
|
|
andl $0x00000006, %eax
|
|
|
|
cmpl $0x00000006, %eax
|
2012-04-01 19:39:01 +02:00
|
|
|
jne sha256_use_4way_base
|
2012-03-22 17:38:35 +01:00
|
|
|
#if defined(USE_XOP)
|
2012-04-06 19:53:48 +02:00
|
|
|
/* Check for XOP support */
|
2012-03-22 17:38:35 +01:00
|
|
|
movl $0x80000001, %eax
|
|
|
|
cpuid
|
|
|
|
andl $0x00000800, %ecx
|
2012-03-30 00:40:41 +02:00
|
|
|
jz sha256_use_4way_avx
|
2012-03-22 17:38:35 +01:00
|
|
|
|
2012-03-30 00:40:41 +02:00
|
|
|
sha256_use_4way_xop:
|
|
|
|
leaq sha256d_ms_4way_xop(%rip), %rcx
|
2012-03-22 17:38:35 +01:00
|
|
|
leaq sha256_transform_4way_core_xop(%rip), %rdx
|
2012-03-30 00:40:41 +02:00
|
|
|
jmp sha256_use_4way_done
|
2012-03-22 17:38:35 +01:00
|
|
|
#endif /* USE_XOP */
|
|
|
|
|
2012-03-30 00:40:41 +02:00
|
|
|
sha256_use_4way_avx:
|
|
|
|
leaq sha256d_ms_4way_avx(%rip), %rcx
|
2012-03-22 17:38:35 +01:00
|
|
|
leaq sha256_transform_4way_core_avx(%rip), %rdx
|
2012-03-30 00:40:41 +02:00
|
|
|
jmp sha256_use_4way_done
|
2012-03-22 17:38:35 +01:00
|
|
|
#endif /* USE_AVX */
|
|
|
|
|
2012-04-01 19:39:01 +02:00
|
|
|
sha256_use_4way_base:
|
2012-03-30 00:40:41 +02:00
|
|
|
leaq sha256d_ms_4way_sse2(%rip), %rcx
|
2012-03-22 17:38:35 +01:00
|
|
|
leaq sha256_transform_4way_core_sse2(%rip), %rdx
|
|
|
|
|
2012-03-30 00:40:41 +02:00
|
|
|
sha256_use_4way_done:
|
|
|
|
movq %rcx, sha256d_ms_4way_addr(%rip)
|
2012-03-22 17:38:35 +01:00
|
|
|
movq %rdx, sha256_transform_4way_core_addr(%rip)
|
2015-05-17 16:21:30 +02:00
|
|
|
movl $1, %eax
|
|
|
|
sha256_use_4way_exit:
|
2012-03-22 17:38:35 +01:00
|
|
|
popq %rdx
|
|
|
|
popq %rcx
|
|
|
|
popq %rbx
|
|
|
|
ret
|
|
|
|
|
2013-07-05 18:25:34 +02:00
|
|
|
|
|
|
|
#if defined(USE_AVX2)
|
|
|
|
|
|
|
|
.text
|
|
|
|
.p2align 6
|
|
|
|
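/* 8-way double-SHA256 using AVX2: same structure as the 4-way XOP path, with rotates built from shift/xor sequences since AVX2 lacks vprotd */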
.globl sha256d_ms_8way
|
|
|
|
.globl _sha256d_ms_8way
|
|
|
|
sha256d_ms_8way:
|
|
|
|
_sha256d_ms_8way:
|
|
|
|
sha256d_ms_8way_avx2:
|
2014-03-06 10:39:21 +01:00
|
|
|
#if defined(_WIN64) || defined(__CYGWIN__)
|
2013-07-05 18:25:34 +02:00
|
|
|
pushq %rdi
|
|
|
|
subq $80, %rsp
|
|
|
|
vmovdqa %xmm6, 0(%rsp)
|
|
|
|
vmovdqa %xmm7, 16(%rsp)
|
|
|
|
vmovdqa %xmm8, 32(%rsp)
|
|
|
|
vmovdqa %xmm9, 48(%rsp)
|
|
|
|
vmovdqa %xmm10, 64(%rsp)
|
|
|
|
pushq %rsi
|
|
|
|
movq %rcx, %rdi
|
|
|
|
movq %rdx, %rsi
|
|
|
|
movq %r8, %rdx
|
|
|
|
movq %r9, %rcx
|
|
|
|
#endif
|
|
|
|
pushq %rbp
|
|
|
|
movq %rsp, %rbp
|
|
|
|
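/* scratch for the second hash's state, block and extended schedule, aligned for 32-byte stores */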
subq $64*32, %rsp
|
|
|
|
andq $-128, %rsp
|
|
|
|
|
|
|
|
leaq 16*32(%rsi), %rax
|
|
|
|
|
|
|
|
sha256d_ms_8way_avx2_extend_loop1:
|
|
|
|
vmovdqa 3*32(%rsi), %ymm0
|
|
|
|
vmovdqa 2*32(%rax), %ymm3
|
|
|
|
vmovdqa 3*32(%rax), %ymm7
|
|
|
|
vmovdqa %ymm3, 2*32(%rsp)
|
|
|
|
vmovdqa %ymm7, 3*32(%rsp)
|
|
|
|
vpaddd %ymm0, %ymm7, %ymm7
|
|
|
|
vpslld $14, %ymm0, %ymm2
|
|
|
|
vpsrld $3, %ymm0, %ymm0
|
|
|
|
vpsrld $4, %ymm0, %ymm1
|
|
|
|
vpxor %ymm1, %ymm0, %ymm0
|
|
|
|
vpxor %ymm2, %ymm0, %ymm0
|
|
|
|
vpsrld $11, %ymm1, %ymm1
|
|
|
|
vpslld $11, %ymm2, %ymm2
|
|
|
|
vpxor %ymm1, %ymm0, %ymm0
|
|
|
|
vpxor %ymm2, %ymm0, %ymm0
|
|
|
|
vpaddd %ymm0, %ymm3, %ymm3
|
|
|
|
vmovdqa %ymm3, 2*32(%rax)
|
|
|
|
vmovdqa %ymm7, 3*32(%rax)
|
|
|
|
|
|
|
|
vmovdqa 4*32(%rax), %ymm0
|
|
|
|
vmovdqa %ymm0, 4*32(%rsp)
|
|
|
|
vpslld $13, %ymm3, %ymm2
|
|
|
|
vpslld $13, %ymm7, %ymm6
|
|
|
|
vpsrld $10, %ymm3, %ymm3
|
|
|
|
vpsrld $10, %ymm7, %ymm7
|
|
|
|
vpsrld $7, %ymm3, %ymm1
|
|
|
|
vpsrld $7, %ymm7, %ymm5
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpsrld $2, %ymm1, %ymm1
|
|
|
|
vpsrld $2, %ymm5, %ymm5
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpslld $2, %ymm2, %ymm2
|
|
|
|
vpslld $2, %ymm6, %ymm6
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpaddd %ymm0, %ymm3, %ymm3
|
|
|
|
vmovdqa %ymm3, 4*32(%rax)
|
|
|
|
vmovdqa %ymm7, 5*32(%rax)
|
|
|
|
|
|
|
|
vmovdqa 6*32(%rax), %ymm0
|
|
|
|
vmovdqa 7*32(%rax), %ymm4
|
|
|
|
vmovdqa %ymm0, 6*32(%rsp)
|
|
|
|
vmovdqa %ymm4, 7*32(%rsp)
|
|
|
|
vpslld $13, %ymm3, %ymm2
|
|
|
|
vpslld $13, %ymm7, %ymm6
|
|
|
|
vpsrld $10, %ymm3, %ymm3
|
|
|
|
vpsrld $10, %ymm7, %ymm7
|
|
|
|
vpsrld $7, %ymm3, %ymm1
|
|
|
|
vpsrld $7, %ymm7, %ymm5
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpsrld $2, %ymm1, %ymm1
|
|
|
|
vpsrld $2, %ymm5, %ymm5
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpslld $2, %ymm2, %ymm2
|
|
|
|
vpslld $2, %ymm6, %ymm6
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpaddd %ymm0, %ymm3, %ymm3
|
|
|
|
vpaddd %ymm4, %ymm7, %ymm7
|
|
|
|
vmovdqa %ymm3, 6*32(%rax)
|
|
|
|
vmovdqa %ymm7, 7*32(%rax)
|
|
|
|
|
|
|
|
vmovdqa 8*32(%rax), %ymm0
|
|
|
|
vmovdqa 2*32(%rax), %ymm4
|
|
|
|
vmovdqa %ymm0, 8*32(%rsp)
|
|
|
|
vpslld $13, %ymm3, %ymm2
|
|
|
|
vpslld $13, %ymm7, %ymm6
|
|
|
|
vpsrld $10, %ymm3, %ymm3
|
|
|
|
vpsrld $10, %ymm7, %ymm7
|
|
|
|
vpsrld $7, %ymm3, %ymm1
|
|
|
|
vpsrld $7, %ymm7, %ymm5
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpsrld $2, %ymm1, %ymm1
|
|
|
|
vpsrld $2, %ymm5, %ymm5
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpslld $2, %ymm2, %ymm2
|
|
|
|
vpslld $2, %ymm6, %ymm6
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpaddd %ymm0, %ymm3, %ymm3
|
|
|
|
vpaddd %ymm4, %ymm7, %ymm7
|
|
|
|
vmovdqa %ymm3, 8*32(%rax)
|
|
|
|
vmovdqa %ymm7, 9*32(%rax)
|
|
|
|
|
|
|
|
vpslld $13, %ymm3, %ymm2
|
|
|
|
vpslld $13, %ymm7, %ymm6
|
|
|
|
vpsrld $10, %ymm3, %ymm3
|
|
|
|
vpsrld $10, %ymm7, %ymm7
|
|
|
|
vpsrld $7, %ymm3, %ymm1
|
|
|
|
vpsrld $7, %ymm7, %ymm5
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpsrld $2, %ymm1, %ymm1
|
|
|
|
vpsrld $2, %ymm5, %ymm5
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpslld $2, %ymm2, %ymm2
|
|
|
|
vpslld $2, %ymm6, %ymm6
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpaddd 3*32(%rax), %ymm3, %ymm3
|
|
|
|
vpaddd 4*32(%rax), %ymm7, %ymm7
|
|
|
|
vmovdqa %ymm3, 10*32(%rax)
|
|
|
|
vmovdqa %ymm7, 11*32(%rax)
|
|
|
|
|
|
|
|
vpslld $13, %ymm3, %ymm2
|
|
|
|
vpslld $13, %ymm7, %ymm6
|
|
|
|
vpsrld $10, %ymm3, %ymm3
|
|
|
|
vpsrld $10, %ymm7, %ymm7
|
|
|
|
vpsrld $7, %ymm3, %ymm1
|
|
|
|
vpsrld $7, %ymm7, %ymm5
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpsrld $2, %ymm1, %ymm1
|
|
|
|
vpsrld $2, %ymm5, %ymm5
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpslld $2, %ymm2, %ymm2
|
|
|
|
vpslld $2, %ymm6, %ymm6
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpaddd 5*32(%rax), %ymm3, %ymm3
|
|
|
|
vpaddd 6*32(%rax), %ymm7, %ymm7
|
|
|
|
vmovdqa %ymm3, 12*32(%rax)
|
|
|
|
vmovdqa %ymm7, 13*32(%rax)
|
|
|
|
|
|
|
|
vmovdqa 14*32(%rax), %ymm0
|
|
|
|
vmovdqa 15*32(%rax), %ymm4
|
|
|
|
vmovdqa %ymm0, 14*32(%rsp)
|
|
|
|
vmovdqa %ymm4, 15*32(%rsp)
|
|
|
|
vpslld $13, %ymm3, %ymm2
|
|
|
|
vpslld $13, %ymm7, %ymm6
|
|
|
|
vpsrld $10, %ymm3, %ymm3
|
|
|
|
vpsrld $10, %ymm7, %ymm7
|
|
|
|
vpaddd 7*32(%rax), %ymm0, %ymm0
|
|
|
|
vpaddd 8*32(%rax), %ymm4, %ymm4
|
|
|
|
vpsrld $7, %ymm3, %ymm1
|
|
|
|
vpsrld $7, %ymm7, %ymm5
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpsrld $2, %ymm1, %ymm1
|
|
|
|
vpsrld $2, %ymm5, %ymm5
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpslld $2, %ymm2, %ymm2
|
|
|
|
vpslld $2, %ymm6, %ymm6
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpaddd %ymm0, %ymm3, %ymm3
|
|
|
|
vpaddd %ymm4, %ymm7, %ymm7
|
|
|
|
vmovdqa %ymm3, 14*32(%rax)
|
|
|
|
vmovdqa %ymm7, 15*32(%rax)
|
|
|
|
|
|
|
|
sha256d_ms_8way_avx2_extend_loop2:
|
|
|
|
sha256_avx2_extend_doubleround 16
|
|
|
|
sha256_avx2_extend_doubleround 18
|
|
|
|
sha256_avx2_extend_doubleround 20
|
|
|
|
sha256_avx2_extend_doubleround 22
|
|
|
|
sha256_avx2_extend_doubleround 24
|
|
|
|
sha256_avx2_extend_doubleround 26
|
|
|
|
sha256_avx2_extend_doubleround 28
|
|
|
|
sha256_avx2_extend_doubleround 30
|
|
|
|
sha256_avx2_extend_doubleround 32
|
|
|
|
sha256_avx2_extend_doubleround 34
|
|
|
|
sha256_avx2_extend_doubleround 36
|
|
|
|
sha256_avx2_extend_doubleround 38
|
|
|
|
sha256_avx2_extend_doubleround 40
|
|
|
|
sha256_avx2_extend_doubleround 42
|
|
|
|
jz sha256d_ms_8way_avx2_extend_coda2
|
|
|
|
sha256_avx2_extend_doubleround 44
|
|
|
|
sha256_avx2_extend_doubleround 46
|
|
|
|
|
|
|
|
vmovdqa 0(%rcx), %ymm7
|
|
|
|
vmovdqa 32(%rcx), %ymm8
|
|
|
|
vmovdqa 64(%rcx), %ymm9
|
|
|
|
vmovdqa 96(%rcx), %ymm10
|
|
|
|
vmovdqa 128(%rcx), %ymm0
|
|
|
|
vmovdqa 160(%rcx), %ymm5
|
|
|
|
vmovdqa 192(%rcx), %ymm4
|
|
|
|
vmovdqa 224(%rcx), %ymm3
|
|
|
|
|
|
|
|
movq %rsi, %rax
|
|
|
|
leaq sha256_8k(%rip), %rcx
|
|
|
|
jmp sha256d_ms_8way_avx2_main_loop1
|
|
|
|
|
|
|
|
sha256d_ms_8way_avx2_main_loop2:
|
|
|
|
sha256_avx2_main_round 0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
|
|
|
|
sha256_avx2_main_round 1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
|
|
|
|
sha256_avx2_main_round 2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
|
|
|
|
sha256d_ms_8way_avx2_main_loop1:
|
|
|
|
sha256_avx2_main_round 3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
|
|
|
|
sha256_avx2_main_quadround 4
|
|
|
|
sha256_avx2_main_quadround 8
|
|
|
|
sha256_avx2_main_quadround 12
|
|
|
|
sha256_avx2_main_quadround 16
|
|
|
|
sha256_avx2_main_quadround 20
|
|
|
|
sha256_avx2_main_quadround 24
|
|
|
|
sha256_avx2_main_quadround 28
|
|
|
|
sha256_avx2_main_quadround 32
|
|
|
|
sha256_avx2_main_quadround 36
|
|
|
|
sha256_avx2_main_quadround 40
|
|
|
|
sha256_avx2_main_quadround 44
|
|
|
|
sha256_avx2_main_quadround 48
|
|
|
|
sha256_avx2_main_quadround 52
|
|
|
|
sha256_avx2_main_round 56, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7
|
|
|
|
jz sha256d_ms_8way_avx2_finish
|
|
|
|
sha256_avx2_main_round 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3
|
|
|
|
sha256_avx2_main_round 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4
|
|
|
|
sha256_avx2_main_round 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5
|
|
|
|
sha256_avx2_main_quadround 60
|
|
|
|
|
|
|
|
vmovdqa 2*32(%rsp), %ymm1
|
|
|
|
vmovdqa 3*32(%rsp), %ymm2
|
|
|
|
vmovdqa 4*32(%rsp), %ymm6
|
|
|
|
vmovdqa %ymm1, 18*32(%rsi)
|
|
|
|
vmovdqa %ymm2, 19*32(%rsi)
|
|
|
|
vmovdqa %ymm6, 20*32(%rsi)
|
|
|
|
vmovdqa 6*32(%rsp), %ymm1
|
|
|
|
vmovdqa 7*32(%rsp), %ymm2
|
|
|
|
vmovdqa 8*32(%rsp), %ymm6
|
|
|
|
vmovdqa %ymm1, 22*32(%rsi)
|
|
|
|
vmovdqa %ymm2, 23*32(%rsi)
|
|
|
|
vmovdqa %ymm6, 24*32(%rsi)
|
|
|
|
vmovdqa 14*32(%rsp), %ymm1
|
|
|
|
vmovdqa 15*32(%rsp), %ymm2
|
|
|
|
vmovdqa %ymm1, 30*32(%rsi)
|
|
|
|
vmovdqa %ymm2, 31*32(%rsi)
|
|
|
|
|
|
|
|
vpaddd 0(%rdx), %ymm7, %ymm7
|
|
|
|
vpaddd 32(%rdx), %ymm5, %ymm5
|
|
|
|
vpaddd 64(%rdx), %ymm4, %ymm4
|
|
|
|
vpaddd 96(%rdx), %ymm3, %ymm3
|
|
|
|
vpaddd 128(%rdx), %ymm0, %ymm0
|
|
|
|
vpaddd 160(%rdx), %ymm8, %ymm8
|
|
|
|
vpaddd 192(%rdx), %ymm9, %ymm9
|
|
|
|
vpaddd 224(%rdx), %ymm10, %ymm10
|
|
|
|
|
|
|
|
vmovdqa %ymm7, 0(%rsp)
|
|
|
|
vmovdqa %ymm5, 32(%rsp)
|
|
|
|
vmovdqa %ymm4, 64(%rsp)
|
|
|
|
vmovdqa %ymm3, 96(%rsp)
|
|
|
|
vmovdqa %ymm0, 128(%rsp)
|
|
|
|
vmovdqa %ymm8, 160(%rsp)
|
|
|
|
vmovdqa %ymm9, 192(%rsp)
|
|
|
|
vmovdqa %ymm10, 224(%rsp)
|
|
|
|
|
|
|
|
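/* build the second block: padding word, zero fill and 256-bit length, broadcast across both 128-bit lanes */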
vpxor %ymm0, %ymm0, %ymm0
|
|
|
|
movq $0x8000000000000100, %rax
|
|
|
|
vmovd %rax, %xmm1
|
|
|
|
vinserti128 $1, %xmm1, %ymm1, %ymm1
|
|
|
|
vpshufd $0x55, %ymm1, %ymm2
|
|
|
|
vpshufd $0x00, %ymm1, %ymm1
|
|
|
|
vmovdqa %ymm2, 8*32(%rsp)
|
|
|
|
vmovdqa %ymm0, 9*32(%rsp)
|
|
|
|
vmovdqa %ymm0, 10*32(%rsp)
|
|
|
|
vmovdqa %ymm0, 11*32(%rsp)
|
|
|
|
vmovdqa %ymm0, 12*32(%rsp)
|
|
|
|
vmovdqa %ymm0, 13*32(%rsp)
|
|
|
|
vmovdqa %ymm0, 14*32(%rsp)
|
|
|
|
vmovdqa %ymm1, 15*32(%rsp)
|
|
|
|
|
|
|
|
leaq 16*32(%rsp), %rax
|
|
|
|
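/* as in the 4-way path, force ZF so the shared loops take their second-pass exits */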
cmpq %rax, %rax
|
|
|
|
|
|
|
|
vmovdqa -15*32(%rax), %ymm0
|
|
|
|
vmovdqa -14*32(%rax), %ymm4
|
|
|
|
vpslld $14, %ymm0, %ymm2
|
|
|
|
vpslld $14, %ymm4, %ymm6
|
|
|
|
vpsrld $3, %ymm0, %ymm8
|
|
|
|
vpsrld $3, %ymm4, %ymm4
|
|
|
|
vpsrld $7, %ymm0, %ymm1
|
|
|
|
vpsrld $4, %ymm4, %ymm5
|
|
|
|
vpxor %ymm1, %ymm8, %ymm8
|
|
|
|
vpxor %ymm5, %ymm4, %ymm4
|
|
|
|
vpsrld $11, %ymm1, %ymm1
|
|
|
|
vpsrld $11, %ymm5, %ymm5
|
|
|
|
vpxor %ymm2, %ymm8, %ymm8
|
|
|
|
vpxor %ymm6, %ymm4, %ymm4
|
|
|
|
vpslld $11, %ymm2, %ymm2
|
|
|
|
vpslld $11, %ymm6, %ymm6
|
|
|
|
vpxor %ymm1, %ymm8, %ymm8
|
|
|
|
vpxor %ymm5, %ymm4, %ymm4
|
|
|
|
vpxor %ymm2, %ymm8, %ymm8
|
|
|
|
vpxor %ymm6, %ymm4, %ymm4
|
|
|
|
vpaddd %ymm0, %ymm4, %ymm4
|
|
|
|
vpaddd -16*32(%rax), %ymm8, %ymm3
|
|
|
|
vpaddd sha256d_8preext2_17(%rip), %ymm4, %ymm7
|
|
|
|
vmovdqa %ymm3, 0*32(%rax)
|
|
|
|
vmovdqa %ymm7, 1*32(%rax)
|
|
|
|
|
|
|
|
sha256_avx2_extend_doubleround 2
|
|
|
|
sha256_avx2_extend_doubleround 4
|
|
|
|
|
|
|
|
vmovdqa -9*32(%rax), %ymm0
|
|
|
|
vpslld $14, %ymm0, %ymm2
|
|
|
|
vpsrld $3, %ymm0, %ymm8
|
|
|
|
vpsrld $7, %ymm0, %ymm1
|
|
|
|
vpxor %ymm1, %ymm8, %ymm8
|
|
|
|
vpxor %ymm2, %ymm8, %ymm8
|
|
|
|
vpsrld $11, %ymm1, %ymm1
|
|
|
|
vpslld $11, %ymm2, %ymm2
|
|
|
|
vpxor %ymm1, %ymm8, %ymm8
|
|
|
|
vpxor %ymm2, %ymm8, %ymm8
|
|
|
|
vpaddd sha256d_8preext2_23(%rip), %ymm0, %ymm4
|
|
|
|
vpaddd -10*32(%rax), %ymm8, %ymm0
|
|
|
|
vpslld $13, %ymm3, %ymm2
|
|
|
|
vpslld $13, %ymm7, %ymm6
|
|
|
|
vpsrld $10, %ymm3, %ymm3
|
|
|
|
vpsrld $10, %ymm7, %ymm7
|
|
|
|
vpaddd -1*32(%rax), %ymm0, %ymm0
|
|
|
|
vpaddd 0*32(%rax), %ymm4, %ymm4
|
|
|
|
vpsrld $7, %ymm3, %ymm1
|
|
|
|
vpsrld $7, %ymm7, %ymm5
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpsrld $2, %ymm1, %ymm1
|
|
|
|
vpsrld $2, %ymm5, %ymm5
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpslld $2, %ymm2, %ymm2
|
|
|
|
vpslld $2, %ymm6, %ymm6
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpaddd %ymm0, %ymm3, %ymm3
|
|
|
|
vpaddd %ymm4, %ymm7, %ymm7
|
|
|
|
vmovdqa %ymm3, 6*32(%rax)
|
|
|
|
vmovdqa %ymm7, 7*32(%rax)
|
|
|
|
|
|
|
|
vpslld $13, %ymm3, %ymm2
|
|
|
|
vpslld $13, %ymm7, %ymm6
|
|
|
|
vpsrld $10, %ymm3, %ymm3
|
|
|
|
vpsrld $10, %ymm7, %ymm7
|
|
|
|
vpsrld $7, %ymm3, %ymm1
|
|
|
|
vpsrld $7, %ymm7, %ymm5
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpsrld $2, %ymm1, %ymm1
|
|
|
|
vpsrld $2, %ymm5, %ymm5
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpslld $2, %ymm2, %ymm2
|
|
|
|
vpslld $2, %ymm6, %ymm6
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpaddd sha256d_8preext2_24(%rip), %ymm3, %ymm3
|
|
|
|
vpaddd 1*32(%rax), %ymm3, %ymm3
|
|
|
|
vpaddd 2*32(%rax), %ymm7, %ymm7
|
|
|
|
vmovdqa %ymm3, 8*32(%rax)
|
|
|
|
vmovdqa %ymm7, 9*32(%rax)
|
|
|
|
|
|
|
|
vpslld $13, %ymm3, %ymm2
|
|
|
|
vpslld $13, %ymm7, %ymm6
|
|
|
|
vpsrld $10, %ymm3, %ymm3
|
|
|
|
vpsrld $10, %ymm7, %ymm7
|
|
|
|
vpsrld $7, %ymm3, %ymm1
|
|
|
|
vpsrld $7, %ymm7, %ymm5
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpsrld $2, %ymm1, %ymm1
|
|
|
|
vpsrld $2, %ymm5, %ymm5
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpslld $2, %ymm2, %ymm2
|
|
|
|
vpslld $2, %ymm6, %ymm6
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpaddd 3*32(%rax), %ymm3, %ymm3
|
|
|
|
vpaddd 4*32(%rax), %ymm7, %ymm7
|
|
|
|
vmovdqa %ymm3, 10*32(%rax)
|
|
|
|
vmovdqa %ymm7, 11*32(%rax)
|
|
|
|
|
|
|
|
vpslld $13, %ymm3, %ymm2
|
|
|
|
vpslld $13, %ymm7, %ymm6
|
|
|
|
vpsrld $10, %ymm3, %ymm3
|
|
|
|
vpsrld $10, %ymm7, %ymm7
|
|
|
|
vpsrld $7, %ymm3, %ymm1
|
|
|
|
vpsrld $7, %ymm7, %ymm5
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpsrld $2, %ymm1, %ymm1
|
|
|
|
vpsrld $2, %ymm5, %ymm5
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpslld $2, %ymm2, %ymm2
|
|
|
|
vpslld $2, %ymm6, %ymm6
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpaddd 5*32(%rax), %ymm3, %ymm3
|
|
|
|
vpaddd 6*32(%rax), %ymm7, %ymm7
|
|
|
|
vmovdqa %ymm3, 12*32(%rax)
|
|
|
|
vmovdqa %ymm7, 13*32(%rax)
|
|
|
|
|
|
|
|
vmovdqa sha256d_8preext2_30(%rip), %ymm0
|
|
|
|
vmovdqa 0*32(%rax), %ymm4
|
|
|
|
vpslld $14, %ymm4, %ymm6
|
|
|
|
vpsrld $3, %ymm4, %ymm4
|
|
|
|
vpsrld $4, %ymm4, %ymm5
|
|
|
|
vpxor %ymm5, %ymm4, %ymm4
|
|
|
|
vpxor %ymm6, %ymm4, %ymm4
|
|
|
|
vpsrld $11, %ymm5, %ymm5
|
|
|
|
vpslld $11, %ymm6, %ymm6
|
|
|
|
vpxor %ymm5, %ymm4, %ymm4
|
|
|
|
vpxor %ymm6, %ymm4, %ymm4
|
|
|
|
vpaddd -1*32(%rax), %ymm4, %ymm4
|
|
|
|
vpslld $13, %ymm3, %ymm2
|
|
|
|
vpslld $13, %ymm7, %ymm6
|
|
|
|
vpsrld $10, %ymm3, %ymm3
|
|
|
|
vpsrld $10, %ymm7, %ymm7
|
|
|
|
vpaddd 7*32(%rax), %ymm0, %ymm0
|
|
|
|
vpaddd 8*32(%rax), %ymm4, %ymm4
|
|
|
|
vpsrld $7, %ymm3, %ymm1
|
|
|
|
vpsrld $7, %ymm7, %ymm5
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpsrld $2, %ymm1, %ymm1
|
|
|
|
vpsrld $2, %ymm5, %ymm5
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpslld $2, %ymm2, %ymm2
|
|
|
|
vpslld $2, %ymm6, %ymm6
|
|
|
|
vpxor %ymm1, %ymm3, %ymm3
|
|
|
|
vpxor %ymm5, %ymm7, %ymm7
|
|
|
|
vpxor %ymm2, %ymm3, %ymm3
|
|
|
|
vpxor %ymm6, %ymm7, %ymm7
|
|
|
|
vpaddd %ymm0, %ymm3, %ymm3
|
|
|
|
vpaddd %ymm4, %ymm7, %ymm7
|
|
|
|
vmovdqa %ymm3, 14*32(%rax)
|
|
|
|
vmovdqa %ymm7, 15*32(%rax)
|
|
|
|
|
|
|
|
jmp sha256d_ms_8way_avx2_extend_loop2
|
|
|
|
|
|
|
|
sha256d_ms_8way_avx2_extend_coda2:
|
|
|
|
sha256_avx2_extend_round 44
|
|
|
|
|
|
|
|
vmovdqa sha256_8h+0(%rip), %ymm7
|
|
|
|
vmovdqa sha256_8h+32(%rip), %ymm5
|
|
|
|
vmovdqa sha256_8h+64(%rip), %ymm4
|
|
|
|
vmovdqa sha256_8h+96(%rip), %ymm3
|
|
|
|
vmovdqa sha256_8h+128(%rip), %ymm0
|
|
|
|
vmovdqa sha256_8h+160(%rip), %ymm8
|
|
|
|
vmovdqa sha256_8h+192(%rip), %ymm9
|
|
|
|
vmovdqa sha256_8h+224(%rip), %ymm10
|
|
|
|
|
|
|
|
movq %rsp, %rax
|
|
|
|
leaq sha256_8k(%rip), %rcx
|
|
|
|
jmp sha256d_ms_8way_avx2_main_loop2
|
|
|
|
|
|
|
|
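/* reduced AVX2 main round, equivalent to sha256_xop_main_round_red with shift-based rotates */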
.macro sha256_avx2_main_round_red i, r0, r1, r2, r3, r4
|
|
|
|
vpaddd 32*\i(%rax), \r0, %ymm6
|
|
|
|
vpaddd 32*\i(%rcx), %ymm6, %ymm6
|
|
|
|
vpandn \r1, \r3, %ymm1
|
|
|
|
vpand \r3, \r2, %ymm2
|
|
|
|
vpxor %ymm2, %ymm1, %ymm1
|
|
|
|
vpaddd %ymm1, %ymm6, %ymm6
|
|
|
|
vpslld $7, \r3, %ymm1
|
|
|
|
vpsrld $6, \r3, \r0
|
|
|
|
vpsrld $5, \r0, %ymm2
|
|
|
|
vpxor %ymm1, \r0, \r0
|
|
|
|
vpxor %ymm2, \r0, \r0
|
|
|
|
vpslld $14, %ymm1, %ymm1
|
|
|
|
vpsrld $14, %ymm2, %ymm2
|
|
|
|
vpxor %ymm1, \r0, \r0
|
|
|
|
vpxor %ymm2, \r0, \r0
|
|
|
|
vpslld $5, %ymm1, %ymm1
|
|
|
|
vpxor %ymm1, \r0, \r0
|
|
|
|
vpaddd \r0, %ymm6, %ymm6
|
|
|
|
vpaddd %ymm6, \r4, \r0
|
|
|
|
.endm
|
|
|
|
|
|
|
|
sha256d_ms_8way_avx2_finish:
|
|
|
|
sha256_avx2_main_round_red 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4
|
|
|
|
sha256_avx2_main_round_red 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5
|
|
|
|
sha256_avx2_main_round_red 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7
|
|
|
|
sha256_avx2_main_round_red 60, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3
|
|
|
|
|
|
|
|
vpaddd sha256_8h+224(%rip), %ymm10, %ymm10
|
|
|
|
vmovdqa %ymm10, 224(%rdi)
|
|
|
|
|
|
|
|
movq %rbp, %rsp
|
|
|
|
popq %rbp
|
2014-03-06 10:39:21 +01:00
|
|
|
#if defined(_WIN64) || defined(__CYGWIN__)
|
2013-07-05 18:25:34 +02:00
|
|
|
popq %rsi
|
|
|
|
vmovdqa 0(%rsp), %xmm6
|
|
|
|
vmovdqa 16(%rsp), %xmm7
|
|
|
|
vmovdqa 32(%rsp), %xmm8
|
|
|
|
vmovdqa 48(%rsp), %xmm9
|
|
|
|
vmovdqa 64(%rsp), %xmm10
|
|
|
|
addq $80, %rsp
|
|
|
|
popq %rdi
|
|
|
|
#endif
|
|
|
|
ret
|
|
|
|
|
|
|
|
|
|
|
|
.text
|
|
|
|
.p2align 6
|
|
|
|
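/* returns 1 only when AVX, AVX2 and OS-enabled YMM state are all available */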
.globl sha256_use_8way
|
|
|
|
.globl _sha256_use_8way
|
|
|
|
sha256_use_8way:
|
|
|
|
_sha256_use_8way:
|
|
|
|
pushq %rbx
|
|
|
|
|
|
|
|
/* Check for AVX and OSXSAVE support */
|
|
|
|
movl $1, %eax
|
|
|
|
cpuid
|
|
|
|
andl $0x18000000, %ecx
|
|
|
|
cmpl $0x18000000, %ecx
|
|
|
|
jne sha256_use_8way_no
|
|
|
|
/* Check for AVX2 support */
|
|
|
|
movl $7, %eax
|
|
|
|
xorl %ecx, %ecx
|
|
|
|
cpuid
|
|
|
|
andl $0x00000020, %ebx
|
|
|
|
cmpl $0x00000020, %ebx
|
|
|
|
jne sha256_use_8way_no
|
|
|
|
/* Check for XMM and YMM state support */
|
|
|
|
xorl %ecx, %ecx
|
|
|
|
xgetbv
|
|
|
|
andl $0x00000006, %eax
|
|
|
|
cmpl $0x00000006, %eax
|
|
|
|
jne sha256_use_8way_no
|
|
|
|
|
|
|
|
sha256_use_8way_yes:
|
|
|
|
movl $1, %eax
|
|
|
|
jmp sha256_use_8way_done
|
|
|
|
|
|
|
|
sha256_use_8way_no:
|
|
|
|
xorl %eax, %eax
|
|
|
|
|
|
|
|
sha256_use_8way_done:
|
|
|
|
popq %rbx
|
|
|
|
ret
|
|
|
|
|
|
|
|
#endif /* USE_AVX2 */
|
|
|
|
|
2012-03-12 13:32:11 +01:00
|
|
|
#endif
|