cpuminer/scrypt-x64.S

/*
* Copyright 2011-2012 pooler@litecoinpool.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "cpuminer-config.h"
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
#if defined(__x86_64__)
.data
.p2align 6
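# sha256_4h: the eight SHA-256 initial hash values H0..H7, each replicated
# across four 32-bit lanes for the 4-way SSE2 implementation.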
sha256_4h:
.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
.long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
.long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
.long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
.long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
.long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19
.data
.p2align 6
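# sha256_4k: the 64 SHA-256 round constants K[0..63], each replicated
# across four 32-bit lanes.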
sha256_4k:
.long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
.long 0x71374491, 0x71374491, 0x71374491, 0x71374491
.long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
.long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
.long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
.long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
.long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
.long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
.long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
.long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
.long 0x243185be, 0x243185be, 0x243185be, 0x243185be
.long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
.long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
.long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
.long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
.long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
.long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
.long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
.long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
.long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
.long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
.long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
.long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
.long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
.long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
.long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
.long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
.long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
.long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
.long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
.long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
.long 0x14292967, 0x14292967, 0x14292967, 0x14292967
.long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
.long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
.long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
.long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
.long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
.long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
.long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
.long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
.long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
.long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
.long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
.long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
.long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
.long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
.long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
.long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
.long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
.long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
.long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
.long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
.long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
.long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
.long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
.long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
.long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
.long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
.long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
.long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
.long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
.long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2
.text
.p2align 6
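# SHA256_InitState_4way(state): copy the 4-way replicated SHA-256 initial
# state into the 128-byte buffer given as the first argument
# (%rdi; on Win64 it arrives in %rcx and is moved to %rdi below).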
.globl SHA256_InitState_4way
.globl _SHA256_InitState_4way
SHA256_InitState_4way:
_SHA256_InitState_4way:
#if defined(WIN64)
pushq %rdi
movq %rcx, %rdi
#endif
movdqa sha256_4h+0(%rip), %xmm0
movdqa sha256_4h+16(%rip), %xmm1
movdqa sha256_4h+32(%rip), %xmm2
movdqa sha256_4h+48(%rip), %xmm3
movdqu %xmm0, 0(%rdi)
movdqu %xmm1, 16(%rdi)
movdqu %xmm2, 32(%rdi)
movdqu %xmm3, 48(%rdi)
movdqa sha256_4h+64(%rip), %xmm0
movdqa sha256_4h+80(%rip), %xmm1
movdqa sha256_4h+96(%rip), %xmm2
movdqa sha256_4h+112(%rip), %xmm3
movdqu %xmm0, 64(%rdi)
movdqu %xmm1, 80(%rdi)
movdqu %xmm2, 96(%rdi)
movdqu %xmm3, 112(%rdi)
#if defined(WIN64)
popq %rdi
#endif
ret
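# p2bswap_rsi_rsp i: byte-swap eight 32-bit words loaded from offsets i*16
# and (i+1)*16 of the input (%rsi) and store them at the same offsets on
# the stack. pshuflw/pshufhw swap the 16-bit halves of each word and the
# shift/xor pair swaps the bytes within each half, giving a full 32-bit
# byte swap without pshufb (SSE2 only).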
.macro p2bswap_rsi_rsp i
movdqu \i*16(%rsi), %xmm0
movdqu (\i+1)*16(%rsi), %xmm2
pshuflw $0xb1, %xmm0, %xmm0
pshuflw $0xb1, %xmm2, %xmm2
pshufhw $0xb1, %xmm0, %xmm0
pshufhw $0xb1, %xmm2, %xmm2
movdqa %xmm0, %xmm1
movdqa %xmm2, %xmm3
psrlw $8, %xmm1
psrlw $8, %xmm3
psllw $8, %xmm0
psllw $8, %xmm2
pxor %xmm1, %xmm0
pxor %xmm3, %xmm2
movdqa %xmm0, \i*16(%rsp)
movdqa %xmm2, (\i+1)*16(%rsp)
.endm
.text
.p2align 6
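# SHA256_Transform_4way(state, block, swap): one 4-way SHA-256 compression.
# %rdi = 4x8-word state, %rsi = 4x16-word message block, %rdx = nonzero if
# the message words must first be byte-swapped to big-endian.
# (On Win64 the arguments arrive in %rcx, %rdx, %r8.)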
.globl SHA256_Transform_4way
.globl _SHA256_Transform_4way
SHA256_Transform_4way:
_SHA256_Transform_4way:
#if defined(WIN64)
pushq %rdi
subq $96, %rsp
movdqa %xmm6, 0(%rsp)
movdqa %xmm7, 16(%rsp)
movdqa %xmm8, 32(%rsp)
movdqa %xmm9, 48(%rsp)
movdqa %xmm10, 64(%rsp)
movdqa %xmm11, 80(%rsp)
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
#endif
subq $1032, %rsp
testq %rdx, %rdx
jz sha256_transform_4way_block_copy
p2bswap_rsi_rsp 0
p2bswap_rsi_rsp 2
p2bswap_rsi_rsp 4
p2bswap_rsi_rsp 6
p2bswap_rsi_rsp 8
p2bswap_rsi_rsp 10
p2bswap_rsi_rsp 12
p2bswap_rsi_rsp 14
jmp sha256_transform_4way_extend
.p2align 6
sha256_transform_4way_block_copy:
movdqu 0*16(%rsi), %xmm0
movdqu 1*16(%rsi), %xmm1
movdqu 2*16(%rsi), %xmm2
movdqu 3*16(%rsi), %xmm3
movdqu 4*16(%rsi), %xmm4
movdqu 5*16(%rsi), %xmm5
movdqu 6*16(%rsi), %xmm6
movdqu 7*16(%rsi), %xmm7
movdqa %xmm0, 0*16(%rsp)
movdqa %xmm1, 1*16(%rsp)
movdqa %xmm2, 2*16(%rsp)
movdqa %xmm3, 3*16(%rsp)
movdqa %xmm4, 4*16(%rsp)
movdqa %xmm5, 5*16(%rsp)
movdqa %xmm6, 6*16(%rsp)
movdqa %xmm7, 7*16(%rsp)
movdqu 8*16(%rsi), %xmm0
movdqu 9*16(%rsi), %xmm1
movdqu 10*16(%rsi), %xmm2
movdqu 11*16(%rsi), %xmm3
movdqu 12*16(%rsi), %xmm4
movdqu 13*16(%rsi), %xmm5
movdqu 14*16(%rsi), %xmm6
movdqu 15*16(%rsi), %xmm7
movdqa %xmm0, 8*16(%rsp)
movdqa %xmm1, 9*16(%rsp)
movdqa %xmm2, 10*16(%rsp)
movdqa %xmm3, 11*16(%rsp)
movdqa %xmm4, 12*16(%rsp)
movdqa %xmm5, 13*16(%rsp)
movdqa %xmm6, 14*16(%rsp)
movdqa %xmm7, 15*16(%rsp)
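# Message schedule: extend W[0..15] on the stack to W[16..63], two
# elements per iteration, using the SHA-256 sigma0/sigma1 functions.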
sha256_transform_4way_extend:
leaq 256(%rsp), %rcx
leaq 48*16(%rcx), %rax
sha256_transform_4way_extend_loop:
movdqa -15*16(%rcx), %xmm0
movdqa -14*16(%rcx), %xmm4
movdqa %xmm0, %xmm2
movdqa %xmm4, %xmm6
psrld $3, %xmm0
psrld $3, %xmm4
movdqa %xmm0, %xmm1
movdqa %xmm4, %xmm5
pslld $14, %xmm2
pslld $14, %xmm6
psrld $4, %xmm1
psrld $4, %xmm5
pxor %xmm1, %xmm0
pxor %xmm5, %xmm4
psrld $11, %xmm1
psrld $11, %xmm5
pxor %xmm2, %xmm0
pxor %xmm6, %xmm4
pslld $11, %xmm2
pslld $11, %xmm6
pxor %xmm1, %xmm0
pxor %xmm5, %xmm4
pxor %xmm2, %xmm0
pxor %xmm6, %xmm4
movdqa -2*16(%rcx), %xmm3
movdqa -1*16(%rcx), %xmm7
paddd -16*16(%rcx), %xmm0
paddd -15*16(%rcx), %xmm4
movdqa %xmm3, %xmm2
movdqa %xmm7, %xmm6
psrld $10, %xmm3
psrld $10, %xmm7
movdqa %xmm3, %xmm1
movdqa %xmm7, %xmm5
paddd -7*16(%rcx), %xmm0
pslld $13, %xmm2
pslld $13, %xmm6
psrld $7, %xmm1
psrld $7, %xmm5
paddd -6*16(%rcx), %xmm4
pxor %xmm1, %xmm3
pxor %xmm5, %xmm7
psrld $2, %xmm1
psrld $2, %xmm5
pxor %xmm2, %xmm3
pxor %xmm6, %xmm7
pslld $2, %xmm2
pslld $2, %xmm6
pxor %xmm1, %xmm3
pxor %xmm5, %xmm7
pxor %xmm2, %xmm3
pxor %xmm6, %xmm7
paddd %xmm3, %xmm0
paddd %xmm7, %xmm4
movdqa %xmm0, (%rcx)
movdqa %xmm4, 16(%rcx)
addq $2*16, %rcx
cmpq %rcx, %rax
jne sha256_transform_4way_extend_loop
movdqu 0(%rdi), %xmm7
movdqu 16(%rdi), %xmm5
movdqu 32(%rdi), %xmm4
movdqu 48(%rdi), %xmm3
movdqu 64(%rdi), %xmm0
movdqu 80(%rdi), %xmm8
movdqu 96(%rdi), %xmm9
movdqu 112(%rdi), %xmm10
leaq sha256_4k(%rip), %rcx
xorq %rax, %rax
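# 64 compression rounds. The eight working variables a..h live in
# %xmm7, %xmm5, %xmm4, %xmm3, %xmm0, %xmm8, %xmm9, %xmm10 and are rotated
# by register renaming each round; %xmm6 accumulates T1 as
# h + Ch(e,f,g) + Sigma1(e) + K[i] + W[i].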
sha256_transform_4way_main_loop:
movdqa (%rsp, %rax), %xmm6
paddd (%rcx, %rax), %xmm6
paddd %xmm10, %xmm6
movdqa %xmm0, %xmm1
movdqa %xmm9, %xmm2
pandn %xmm2, %xmm1
movdqa %xmm2, %xmm10
movdqa %xmm8, %xmm2
movdqa %xmm2, %xmm9
pand %xmm0, %xmm2
pxor %xmm2, %xmm1
movdqa %xmm0, %xmm8
paddd %xmm1, %xmm6
movdqa %xmm0, %xmm1
psrld $6, %xmm0
movdqa %xmm0, %xmm2
pslld $7, %xmm1
psrld $5, %xmm2
pxor %xmm1, %xmm0
pxor %xmm2, %xmm0
pslld $14, %xmm1
psrld $14, %xmm2
pxor %xmm1, %xmm0
pxor %xmm2, %xmm0
pslld $5, %xmm1
pxor %xmm1, %xmm0
paddd %xmm0, %xmm6
movdqa %xmm3, %xmm0
paddd %xmm6, %xmm0
movdqa %xmm5, %xmm1
movdqa %xmm4, %xmm3
movdqa %xmm4, %xmm2
pand %xmm5, %xmm2
pand %xmm7, %xmm4
pand %xmm7, %xmm1
pxor %xmm4, %xmm1
movdqa %xmm5, %xmm4
movdqa %xmm7, %xmm5
pxor %xmm2, %xmm1
paddd %xmm1, %xmm6
movdqa %xmm7, %xmm2
psrld $2, %xmm7
movdqa %xmm7, %xmm1
pslld $10, %xmm2
psrld $11, %xmm1
pxor %xmm2, %xmm7
pxor %xmm1, %xmm7
pslld $9, %xmm2
psrld $9, %xmm1
pxor %xmm2, %xmm7
pxor %xmm1, %xmm7
pslld $11, %xmm2
pxor %xmm2, %xmm7
paddd %xmm6, %xmm7
addq $16, %rax
cmpq $16*64, %rax
jne sha256_transform_4way_main_loop
movdqu 0(%rdi), %xmm2
movdqu 16(%rdi), %xmm6
movdqu 32(%rdi), %xmm11
movdqu 48(%rdi), %xmm1
paddd %xmm2, %xmm7
paddd %xmm6, %xmm5
paddd %xmm11, %xmm4
paddd %xmm1, %xmm3
movdqu 64(%rdi), %xmm2
movdqu 80(%rdi), %xmm6
movdqu 96(%rdi), %xmm11
movdqu 112(%rdi), %xmm1
paddd %xmm2, %xmm0
paddd %xmm6, %xmm8
paddd %xmm11, %xmm9
paddd %xmm1, %xmm10
movdqu %xmm7, 0(%rdi)
movdqu %xmm5, 16(%rdi)
movdqu %xmm4, 32(%rdi)
movdqu %xmm3, 48(%rdi)
movdqu %xmm0, 64(%rdi)
movdqu %xmm8, 80(%rdi)
movdqu %xmm9, 96(%rdi)
movdqu %xmm10, 112(%rdi)
addq $1032, %rsp
#if defined(WIN64)
popq %rsi
movdqa 0(%rsp), %xmm6
movdqa 16(%rsp), %xmm7
movdqa 32(%rsp), %xmm8
movdqa 48(%rsp), %xmm9
movdqa 64(%rsp), %xmm10
movdqa 80(%rsp), %xmm11
addq $96, %rsp
popq %rdi
#endif
ret
.text
.p2align 6
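# scrypt_best_throughput(): return how many scrypt hashes to interleave.
# Returns 1 for AMD CPUs whose CPUID extended family field is zero,
# and 3 otherwise.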
.globl scrypt_best_throughput
.globl _scrypt_best_throughput
scrypt_best_throughput:
_scrypt_best_throughput:
pushq %rbx
xorq %rax, %rax
cpuid
movl $3, %eax
cmpl $0x444d4163, %ecx
jne scrypt_best_throughput_exit
cmpl $0x69746e65, %edx
jne scrypt_best_throughput_exit
cmpl $0x68747541, %ebx
jne scrypt_best_throughput_exit
movl $1, %eax
cpuid
andl $0x0ff00000, %eax
movl $3, %eax
jnz scrypt_best_throughput_exit
movl $1, %eax
scrypt_best_throughput_exit:
popq %rbx
ret
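# scrypt_shuffle src, so, dest, do: copy a 64-byte Salsa20 block from
# so(src) to do(dest) while permuting its sixteen 32-bit words into the
# diagonal layout used by the SSE2 salsa8 cores. The permutation is its
# own inverse, so the same macro also converts back.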
.macro scrypt_shuffle src, so, dest, do
movl \so+60(\src), %r8d
movl \so+44(\src), %r9d
movl \so+28(\src), %r10d
movl \so+12(\src), %r11d
movl %r8d, \do+12(\dest)
movl %r9d, \do+28(\dest)
movl %r10d, \do+44(\dest)
movl %r11d, \do+60(\dest)
movl \so+40(\src), %r8d
movl \so+8(\src), %r9d
movl \so+48(\src), %r10d
movl \so+16(\src), %r11d
movl %r8d, \do+8(\dest)
movl %r9d, \do+40(\dest)
movl %r10d, \do+16(\dest)
movl %r11d, \do+48(\dest)
movl \so+20(\src), %r8d
movl \so+4(\src), %r9d
movl \so+52(\src), %r10d
movl \so+36(\src), %r11d
movl %r8d, \do+4(\dest)
movl %r9d, \do+20(\dest)
movl %r10d, \do+36(\dest)
movl %r11d, \do+52(\dest)
movl \so+0(\src), %r8d
movl \so+24(\src), %r9d
movl \so+32(\src), %r10d
movl \so+56(\src), %r11d
movl %r8d, \do+0(\dest)
movl %r9d, \do+24(\dest)
movl %r10d, \do+32(\dest)
movl %r11d, \do+56(\dest)
.endm
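# salsa8_core_gen_doubleround: one Salsa20 double round (a column round
# followed by a row round) on the 16 state words, held in general-purpose
# registers plus the spill slots 48(%rsp), 72(%rsp) and 88(%rsp).
# Each quarter-round step is x ^= (a + b) <<< r with r = 7, 9, 13, 18.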
.macro salsa8_core_gen_doubleround
movq 72(%rsp), %r15
leaq (%r14, %rdx), %rbp
roll $7, %ebp
xorq %rbp, %r9
leaq (%rdi, %r15), %rbp
roll $7, %ebp
xorq %rbp, %r10
leaq (%rdx, %r9), %rbp
roll $9, %ebp
xorq %rbp, %r11
leaq (%r15, %r10), %rbp
roll $9, %ebp
xorq %rbp, %r13
leaq (%r9, %r11), %rbp
roll $13, %ebp
xorq %rbp, %r14
leaq (%r10, %r13), %rbp
roll $13, %ebp
xorq %rbp, %rdi
leaq (%r11, %r14), %rbp
roll $18, %ebp
xorq %rbp, %rdx
leaq (%r13, %rdi), %rbp
roll $18, %ebp
xorq %rbp, %r15
movq 48(%rsp), %rbp
movq %r15, 72(%rsp)
leaq (%rax, %rbp), %r15
roll $7, %r15d
xorq %r15, %rbx
leaq (%rbp, %rbx), %r15
roll $9, %r15d
xorq %r15, %rcx
leaq (%rbx, %rcx), %r15
roll $13, %r15d
xorq %r15, %rax
leaq (%rcx, %rax), %r15
roll $18, %r15d
xorq %r15, %rbp
movq 88(%rsp), %r15
movq %rbp, 48(%rsp)
leaq (%r12, %r15), %rbp
roll $7, %ebp
xorq %rbp, %rsi
leaq (%r15, %rsi), %rbp
roll $9, %ebp
xorq %rbp, %r8
leaq (%rsi, %r8), %rbp
roll $13, %ebp
xorq %rbp, %r12
leaq (%r8, %r12), %rbp
roll $18, %ebp
xorq %rbp, %r15
movq %r15, 88(%rsp)
movq 72(%rsp), %r15
leaq (%rsi, %rdx), %rbp
roll $7, %ebp
xorq %rbp, %rdi
leaq (%r9, %r15), %rbp
roll $7, %ebp
xorq %rbp, %rax
leaq (%rdx, %rdi), %rbp
roll $9, %ebp
xorq %rbp, %rcx
leaq (%r15, %rax), %rbp
roll $9, %ebp
xorq %rbp, %r8
leaq (%rdi, %rcx), %rbp
roll $13, %ebp
xorq %rbp, %rsi
leaq (%rax, %r8), %rbp
roll $13, %ebp
xorq %rbp, %r9
leaq (%rcx, %rsi), %rbp
roll $18, %ebp
xorq %rbp, %rdx
leaq (%r8, %r9), %rbp
roll $18, %ebp
xorq %rbp, %r15
movq 48(%rsp), %rbp
movq %r15, 72(%rsp)
leaq (%r10, %rbp), %r15
roll $7, %r15d
xorq %r15, %r12
leaq (%rbp, %r12), %r15
roll $9, %r15d
xorq %r15, %r11
leaq (%r12, %r11), %r15
roll $13, %r15d
xorq %r15, %r10
leaq (%r11, %r10), %r15
roll $18, %r15d
xorq %r15, %rbp
movq 88(%rsp), %r15
movq %rbp, 48(%rsp)
leaq (%rbx, %r15), %rbp
roll $7, %ebp
xorq %rbp, %r14
leaq (%r15, %r14), %rbp
roll $9, %ebp
xorq %rbp, %r13
leaq (%r14, %r13), %rbp
roll $13, %ebp
xorq %rbp, %rbx
leaq (%r13, %rbx), %rbp
roll $18, %ebp
xorq %rbp, %r15
movq %r15, 88(%rsp)
.endm
.text
.p2align 6
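# salsa8_core_gen: generic Salsa20/8 core using integer registers.
# Reads the 64-byte input block from the caller's stack frame
# (8(%rsp) onward after the call) and returns the mixed words packed into
# %xmm0-%xmm3; the caller adds them back into its block.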
salsa8_core_gen:
# 0: %rdx, %rdi, %rcx, %rsi
movq 8(%rsp), %rdi
movq %rdi, %rdx
shrq $32, %rdi
movq 16(%rsp), %rsi
movq %rsi, %rcx
shrq $32, %rsi
# 1: %r9, 72(%rsp), %rax, %r8
movq 24(%rsp), %r8
movq %r8, %r9
shrq $32, %r8
movq %r8, 72(%rsp)
movq 32(%rsp), %r8
movq %r8, %rax
shrq $32, %r8
# 2: %r11, %r10, 48(%rsp), %r12
movq 40(%rsp), %r10
movq %r10, %r11
shrq $32, %r10
movq 48(%rsp), %r12
#movq %r12, %r13
#movq %r13, 48(%rsp)
shrq $32, %r12
# 3: %r14, %r13, %rbx, 88(%rsp)
movq 56(%rsp), %r13
movq %r13, %r14
shrq $32, %r13
movq 64(%rsp), %r15
movq %r15, %rbx
shrq $32, %r15
movq %r15, 88(%rsp)
salsa8_core_gen_doubleround
salsa8_core_gen_doubleround
salsa8_core_gen_doubleround
salsa8_core_gen_doubleround
movl %edx, %edx
shlq $32, %rdi
addq %rdi, %rdx
movd %rdx, %xmm0
movl %ecx, %ecx
shlq $32, %rsi
addq %rsi, %rcx
movd %rcx, %xmm4
movq 72(%rsp), %rdi
movl %r9d, %r9d
shlq $32, %rdi
addq %rdi, %r9
movd %r9, %xmm1
movl %eax, %eax
shlq $32, %r8
addq %r8, %rax
movd %rax, %xmm5
movl %r11d, %r11d
shlq $32, %r10
addq %r10, %r11
movd %r11, %xmm2
movl 48(%rsp), %r8d
shlq $32, %r12
addq %r12, %r8
movd %r8, %xmm6
movl %r14d, %r14d
shlq $32, %r13
addq %r13, %r14
movd %r14, %xmm3
movq 88(%rsp), %rdi
movl %ebx, %ebx
shlq $32, %rdi
addq %rdi, %rbx
movd %rbx, %xmm7
punpcklqdq %xmm4, %xmm0
punpcklqdq %xmm5, %xmm1
punpcklqdq %xmm6, %xmm2
punpcklqdq %xmm7, %xmm3
#movq %rdx, 8(%rsp)
#movq %rcx, 16(%rsp)
#movq %r9, 24(%rsp)
#movq %rax, 32(%rsp)
#movq %r11, 40(%rsp)
#movq %r8, 48(%rsp)
#movq %r14, 56(%rsp)
#movq %rbx, 64(%rsp)
ret
.text
.p2align 6
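# scrypt_core(X, V): scrypt ROMix with N = 1024, r = 1.
# %rdi = X, the 128-byte working block; %rsi = V, the 128 KiB scratchpad
# (on Win64 the arguments arrive in %rcx and %rdx). The first loop fills
# the 1024 scratchpad entries while mixing X with Salsa20/8; the second
# loop runs 1024 iterations, each XORing in the entry selected by
# X[16] mod 1024 and mixing again.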
.globl scrypt_core
.globl _scrypt_core
scrypt_core:
_scrypt_core:
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
#if defined(WIN64)
subq $176, %rsp
movdqa %xmm6, 8(%rsp)
movdqa %xmm7, 24(%rsp)
movdqa %xmm8, 40(%rsp)
movdqa %xmm9, 56(%rsp)
movdqa %xmm10, 72(%rsp)
movdqa %xmm11, 88(%rsp)
movdqa %xmm12, 104(%rsp)
movdqa %xmm13, 120(%rsp)
movdqa %xmm14, 136(%rsp)
movdqa %xmm15, 152(%rsp)
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
#endif
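# scrypt_core_cleanup: shared epilogue that restores the callee-saved
# registers pushed above; used by both the generic and the SSE2 paths.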
.macro scrypt_core_cleanup
#if defined(WIN64)
popq %rsi
popq %rdi
movdqa 8(%rsp), %xmm6
movdqa 24(%rsp), %xmm7
movdqa 40(%rsp), %xmm8
movdqa 56(%rsp), %xmm9
movdqa 72(%rsp), %xmm10
movdqa 88(%rsp), %xmm11
movdqa 104(%rsp), %xmm12
movdqa 120(%rsp), %xmm13
movdqa 136(%rsp), %xmm14
movdqa 152(%rsp), %xmm15
addq $176, %rsp
#endif
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
.endm
# Dispatch: GenuineIntel processors have fast SIMD, so use the SSE2 (xmm)
# core on them and the general-purpose-register core otherwise
xorl %eax, %eax
cpuid
cmpl $0x6c65746e, %ecx
jne scrypt_core_gen
cmpl $0x49656e69, %edx
jne scrypt_core_gen
cmpl $0x756e6547, %ebx
je scrypt_core_xmm
.p2align 6
scrypt_core_gen:
subq $136, %rsp
movdqa 0(%rdi), %xmm8
movdqa 16(%rdi), %xmm9
movdqa 32(%rdi), %xmm10
movdqa 48(%rdi), %xmm11
movdqa 64(%rdi), %xmm12
movdqa 80(%rdi), %xmm13
movdqa 96(%rdi), %xmm14
movdqa 112(%rdi), %xmm15
leaq 131072(%rsi), %rcx
movq %rdi, 104(%rsp)
movq %rsi, 112(%rsp)
movq %rcx, 120(%rsp)
scrypt_core_gen_loop1:
movdqa %xmm8, 0(%rsi)
movdqa %xmm9, 16(%rsi)
movdqa %xmm10, 32(%rsi)
movdqa %xmm11, 48(%rsi)
movdqa %xmm12, 64(%rsi)
movdqa %xmm13, 80(%rsi)
movdqa %xmm14, 96(%rsi)
movdqa %xmm15, 112(%rsi)
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
pxor %xmm14, %xmm10
pxor %xmm15, %xmm11
movdqa %xmm8, 0(%rsp)
movdqa %xmm9, 16(%rsp)
movdqa %xmm10, 32(%rsp)
movdqa %xmm11, 48(%rsp)
movq %rsi, 128(%rsp)
call salsa8_core_gen
paddd %xmm0, %xmm8
paddd %xmm1, %xmm9
paddd %xmm2, %xmm10
paddd %xmm3, %xmm11
pxor %xmm8, %xmm12
pxor %xmm9, %xmm13
pxor %xmm10, %xmm14
pxor %xmm11, %xmm15
movdqa %xmm12, 0(%rsp)
movdqa %xmm13, 16(%rsp)
movdqa %xmm14, 32(%rsp)
movdqa %xmm15, 48(%rsp)
call salsa8_core_gen
movq 128(%rsp), %rsi
paddd %xmm0, %xmm12
paddd %xmm1, %xmm13
paddd %xmm2, %xmm14
paddd %xmm3, %xmm15
addq $128, %rsi
movq 120(%rsp), %rcx
cmpq %rcx, %rsi
jne scrypt_core_gen_loop1
movq $1024, %rcx
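# Second loop: j = X[16] & 1023 (the first word of %xmm12, i.e. the first
# word of the second half of X) scaled by 128 selects the scratchpad
# entry to XOR in.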
scrypt_core_gen_loop2:
movq 112(%rsp), %rsi
movd %xmm12, %edx
andl $1023, %edx
shll $7, %edx
movdqa 0(%rsi, %rdx), %xmm0
movdqa 16(%rsi, %rdx), %xmm1
movdqa 32(%rsi, %rdx), %xmm2
movdqa 48(%rsi, %rdx), %xmm3
movdqa 64(%rsi, %rdx), %xmm4
movdqa 80(%rsi, %rdx), %xmm5
movdqa 96(%rsi, %rdx), %xmm6
movdqa 112(%rsi, %rdx), %xmm7
pxor %xmm0, %xmm8
pxor %xmm1, %xmm9
pxor %xmm2, %xmm10
pxor %xmm3, %xmm11
pxor %xmm4, %xmm12
pxor %xmm5, %xmm13
pxor %xmm6, %xmm14
pxor %xmm7, %xmm15
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
pxor %xmm14, %xmm10
pxor %xmm15, %xmm11
movdqa %xmm8, 0(%rsp)
movdqa %xmm9, 16(%rsp)
movdqa %xmm10, 32(%rsp)
movdqa %xmm11, 48(%rsp)
movq %rcx, 128(%rsp)
call salsa8_core_gen
paddd %xmm0, %xmm8
paddd %xmm1, %xmm9
paddd %xmm2, %xmm10
paddd %xmm3, %xmm11
pxor %xmm8, %xmm12
pxor %xmm9, %xmm13
pxor %xmm10, %xmm14
pxor %xmm11, %xmm15
movdqa %xmm12, 0(%rsp)
movdqa %xmm13, 16(%rsp)
movdqa %xmm14, 32(%rsp)
movdqa %xmm15, 48(%rsp)
call salsa8_core_gen
movq 128(%rsp), %rcx
paddd %xmm0, %xmm12
paddd %xmm1, %xmm13
paddd %xmm2, %xmm14
paddd %xmm3, %xmm15
subq $1, %rcx
ja scrypt_core_gen_loop2
movq 104(%rsp), %rdi
movdqa %xmm8, 0(%rdi)
movdqa %xmm9, 16(%rdi)
movdqa %xmm10, 32(%rdi)
movdqa %xmm11, 48(%rdi)
movdqa %xmm12, 64(%rdi)
movdqa %xmm13, 80(%rdi)
movdqa %xmm14, 96(%rdi)
movdqa %xmm15, 112(%rdi)
addq $136, %rsp
scrypt_core_cleanup
ret
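# salsa8_core_xmm_doubleround: one Salsa20 double round on a single block
# held in %xmm0-%xmm3 (one diagonal per register). Each rotate is a shift
# pair plus xor, and pshufd realigns the diagonals between the column and
# row halves of the round.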
.macro salsa8_core_xmm_doubleround
movdqa %xmm1, %xmm4
paddd %xmm0, %xmm4
movdqa %xmm4, %xmm5
pslld $7, %xmm4
psrld $25, %xmm5
pxor %xmm4, %xmm3
pxor %xmm5, %xmm3
movdqa %xmm0, %xmm4
paddd %xmm3, %xmm4
movdqa %xmm4, %xmm5
pslld $9, %xmm4
psrld $23, %xmm5
pxor %xmm4, %xmm2
movdqa %xmm3, %xmm4
pshufd $0x93, %xmm3, %xmm3
pxor %xmm5, %xmm2
paddd %xmm2, %xmm4
movdqa %xmm4, %xmm5
pslld $13, %xmm4
psrld $19, %xmm5
pxor %xmm4, %xmm1
movdqa %xmm2, %xmm4
pshufd $0x4e, %xmm2, %xmm2
pxor %xmm5, %xmm1
paddd %xmm1, %xmm4
movdqa %xmm4, %xmm5
pslld $18, %xmm4
psrld $14, %xmm5
pxor %xmm4, %xmm0
pshufd $0x39, %xmm1, %xmm1
pxor %xmm5, %xmm0
movdqa %xmm3, %xmm4
paddd %xmm0, %xmm4
movdqa %xmm4, %xmm5
pslld $7, %xmm4
psrld $25, %xmm5
pxor %xmm4, %xmm1
pxor %xmm5, %xmm1
movdqa %xmm0, %xmm4
paddd %xmm1, %xmm4
movdqa %xmm4, %xmm5
pslld $9, %xmm4
psrld $23, %xmm5
pxor %xmm4, %xmm2
movdqa %xmm1, %xmm4
pshufd $0x93, %xmm1, %xmm1
pxor %xmm5, %xmm2
paddd %xmm2, %xmm4
movdqa %xmm4, %xmm5
pslld $13, %xmm4
psrld $19, %xmm5
pxor %xmm4, %xmm3
movdqa %xmm2, %xmm4
pshufd $0x4e, %xmm2, %xmm2
pxor %xmm5, %xmm3
paddd %xmm3, %xmm4
movdqa %xmm4, %xmm5
pslld $18, %xmm4
psrld $14, %xmm5
pxor %xmm4, %xmm0
pshufd $0x39, %xmm3, %xmm3
pxor %xmm5, %xmm0
.endm
.macro salsa8_core_xmm
salsa8_core_xmm_doubleround
salsa8_core_xmm_doubleround
salsa8_core_xmm_doubleround
salsa8_core_xmm_doubleround
.endm
.p2align 6
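# scrypt_core_xmm: SSE2 core keeping both 64-byte halves of X in
# %xmm8-%xmm15 for the whole computation. The movd/pshufd/paddd sequences
# below build the diagonal-shuffled blocks directly in registers, playing
# the same role as scrypt_shuffle but without a pass through memory.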
scrypt_core_xmm:
# shuffle 1st block into %xmm8-%xmm11
movl 60(%rdi), %edx
movl 44(%rdi), %ecx
movl 28(%rdi), %ebx
movl 12(%rdi), %eax
movd %edx, %xmm0
movd %ecx, %xmm1
movd %ebx, %xmm2
movd %eax, %xmm3
movl 40(%rdi), %ecx
movl 24(%rdi), %ebx
movl 8(%rdi), %eax
movl 56(%rdi), %edx
pshufd $0x93, %xmm0, %xmm0
pshufd $0x93, %xmm1, %xmm1
pshufd $0x93, %xmm2, %xmm2
pshufd $0x93, %xmm3, %xmm3
movd %ecx, %xmm4
movd %ebx, %xmm5
movd %eax, %xmm6
movd %edx, %xmm7
paddd %xmm4, %xmm0
paddd %xmm5, %xmm1
paddd %xmm6, %xmm2
paddd %xmm7, %xmm3
movl 20(%rdi), %ebx
movl 4(%rdi), %eax
movl 52(%rdi), %edx
movl 36(%rdi), %ecx
pshufd $0x93, %xmm0, %xmm0
pshufd $0x93, %xmm1, %xmm1
pshufd $0x93, %xmm2, %xmm2
pshufd $0x93, %xmm3, %xmm3
movd %ebx, %xmm4
movd %eax, %xmm5
movd %edx, %xmm6
movd %ecx, %xmm7
paddd %xmm4, %xmm0
paddd %xmm5, %xmm1
paddd %xmm6, %xmm2
paddd %xmm7, %xmm3
movl 0(%rdi), %eax
movl 48(%rdi), %edx
movl 32(%rdi), %ecx
movl 16(%rdi), %ebx
pshufd $0x93, %xmm0, %xmm0
pshufd $0x93, %xmm1, %xmm1
pshufd $0x93, %xmm2, %xmm2
pshufd $0x93, %xmm3, %xmm3
movd %eax, %xmm8
movd %edx, %xmm9
movd %ecx, %xmm10
movd %ebx, %xmm11
paddd %xmm0, %xmm8
paddd %xmm1, %xmm9
paddd %xmm2, %xmm10
paddd %xmm3, %xmm11
# shuffle 2nd block into %xmm12-%xmm15
movl 124(%rdi), %edx
movl 108(%rdi), %ecx
movl 92(%rdi), %ebx
movl 76(%rdi), %eax
movd %edx, %xmm0
movd %ecx, %xmm1
movd %ebx, %xmm2
movd %eax, %xmm3
movl 104(%rdi), %ecx
movl 88(%rdi), %ebx
movl 72(%rdi), %eax
movl 120(%rdi), %edx
pshufd $0x93, %xmm0, %xmm0
pshufd $0x93, %xmm1, %xmm1
pshufd $0x93, %xmm2, %xmm2
pshufd $0x93, %xmm3, %xmm3
movd %ecx, %xmm4
movd %ebx, %xmm5
movd %eax, %xmm6
movd %edx, %xmm7
paddd %xmm4, %xmm0
paddd %xmm5, %xmm1
paddd %xmm6, %xmm2
paddd %xmm7, %xmm3
movl 84(%rdi), %ebx
movl 68(%rdi), %eax
movl 116(%rdi), %edx
movl 100(%rdi), %ecx
pshufd $0x93, %xmm0, %xmm0
pshufd $0x93, %xmm1, %xmm1
pshufd $0x93, %xmm2, %xmm2
pshufd $0x93, %xmm3, %xmm3
movd %ebx, %xmm4
movd %eax, %xmm5
movd %edx, %xmm6
movd %ecx, %xmm7
paddd %xmm4, %xmm0
paddd %xmm5, %xmm1
paddd %xmm6, %xmm2
paddd %xmm7, %xmm3
movl 64(%rdi), %eax
movl 112(%rdi), %edx
movl 96(%rdi), %ecx
movl 80(%rdi), %ebx
pshufd $0x93, %xmm0, %xmm0
pshufd $0x93, %xmm1, %xmm1
pshufd $0x93, %xmm2, %xmm2
pshufd $0x93, %xmm3, %xmm3
movd %eax, %xmm12
movd %edx, %xmm13
movd %ecx, %xmm14
movd %ebx, %xmm15
paddd %xmm0, %xmm12
paddd %xmm1, %xmm13
paddd %xmm2, %xmm14
paddd %xmm3, %xmm15
movq %rsi, %rdx
leaq 131072(%rsi), %rcx
scrypt_core_xmm_loop1:
movdqa %xmm8, 0(%rdx)
movdqa %xmm9, 16(%rdx)
movdqa %xmm10, 32(%rdx)
movdqa %xmm11, 48(%rdx)
movdqa %xmm12, 64(%rdx)
movdqa %xmm13, 80(%rdx)
movdqa %xmm14, 96(%rdx)
movdqa %xmm15, 112(%rdx)
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
pxor %xmm14, %xmm10
pxor %xmm15, %xmm11
movdqa %xmm8, %xmm0
movdqa %xmm9, %xmm1
movdqa %xmm10, %xmm2
movdqa %xmm11, %xmm3
salsa8_core_xmm
paddd %xmm0, %xmm8
paddd %xmm1, %xmm9
paddd %xmm2, %xmm10
paddd %xmm3, %xmm11
pxor %xmm8, %xmm12
pxor %xmm9, %xmm13
pxor %xmm10, %xmm14
pxor %xmm11, %xmm15
movdqa %xmm12, %xmm0
movdqa %xmm13, %xmm1
movdqa %xmm14, %xmm2
movdqa %xmm15, %xmm3
salsa8_core_xmm
paddd %xmm0, %xmm12
paddd %xmm1, %xmm13
paddd %xmm2, %xmm14
paddd %xmm3, %xmm15
addq $128, %rdx
cmpq %rcx, %rdx
jne scrypt_core_xmm_loop1
movq $1024, %rcx
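# Second loop: j = X[16] & 1023 (first word of %xmm12) times 128 selects
# the scratchpad entry to XOR in.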
scrypt_core_xmm_loop2:
movd %xmm12, %edx
andl $1023, %edx
shll $7, %edx
movdqa 0(%rsi, %rdx), %xmm0
movdqa 16(%rsi, %rdx), %xmm1
movdqa 32(%rsi, %rdx), %xmm2
movdqa 48(%rsi, %rdx), %xmm3
movdqa 64(%rsi, %rdx), %xmm4
movdqa 80(%rsi, %rdx), %xmm5
movdqa 96(%rsi, %rdx), %xmm6
movdqa 112(%rsi, %rdx), %xmm7
pxor %xmm0, %xmm8
pxor %xmm1, %xmm9
pxor %xmm2, %xmm10
pxor %xmm3, %xmm11
pxor %xmm4, %xmm12
pxor %xmm5, %xmm13
pxor %xmm6, %xmm14
pxor %xmm7, %xmm15
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
pxor %xmm14, %xmm10
pxor %xmm15, %xmm11
movdqa %xmm8, %xmm0
movdqa %xmm9, %xmm1
movdqa %xmm10, %xmm2
movdqa %xmm11, %xmm3
salsa8_core_xmm
paddd %xmm0, %xmm8
paddd %xmm1, %xmm9
paddd %xmm2, %xmm10
paddd %xmm3, %xmm11
pxor %xmm8, %xmm12
pxor %xmm9, %xmm13
pxor %xmm10, %xmm14
pxor %xmm11, %xmm15
movdqa %xmm12, %xmm0
movdqa %xmm13, %xmm1
movdqa %xmm14, %xmm2
movdqa %xmm15, %xmm3
salsa8_core_xmm
paddd %xmm0, %xmm12
paddd %xmm1, %xmm13
paddd %xmm2, %xmm14
paddd %xmm3, %xmm15
subq $1, %rcx
ja scrypt_core_xmm_loop2
# re-shuffle 1st block back
movd %xmm8, %eax
movd %xmm9, %edx
movd %xmm10, %ecx
movd %xmm11, %ebx
pshufd $0x39, %xmm8, %xmm8
pshufd $0x39, %xmm9, %xmm9
pshufd $0x39, %xmm10, %xmm10
pshufd $0x39, %xmm11, %xmm11
movl %eax, 0(%rdi)
movl %edx, 48(%rdi)
movl %ecx, 32(%rdi)
movl %ebx, 16(%rdi)
movd %xmm8, %ebx
movd %xmm9, %eax
movd %xmm10, %edx
movd %xmm11, %ecx
pshufd $0x39, %xmm8, %xmm8
pshufd $0x39, %xmm9, %xmm9
pshufd $0x39, %xmm10, %xmm10
pshufd $0x39, %xmm11, %xmm11
movl %ebx, 20(%rdi)
movl %eax, 4(%rdi)
movl %edx, 52(%rdi)
movl %ecx, 36(%rdi)
movd %xmm8, %ecx
movd %xmm9, %ebx
movd %xmm10, %eax
movd %xmm11, %edx
pshufd $0x39, %xmm8, %xmm8
pshufd $0x39, %xmm9, %xmm9
pshufd $0x39, %xmm10, %xmm10
pshufd $0x39, %xmm11, %xmm11
movl %ecx, 40(%rdi)
movl %ebx, 24(%rdi)
movl %eax, 8(%rdi)
movl %edx, 56(%rdi)
movd %xmm8, %edx
movd %xmm9, %ecx
movd %xmm10, %ebx
movd %xmm11, %eax
movl %edx, 60(%rdi)
movl %ecx, 44(%rdi)
movl %ebx, 28(%rdi)
movl %eax, 12(%rdi)
# re-shuffle 2nd block back
movd %xmm12, %eax
movd %xmm13, %edx
movd %xmm14, %ecx
movd %xmm15, %ebx
pshufd $0x39, %xmm12, %xmm12
pshufd $0x39, %xmm13, %xmm13
pshufd $0x39, %xmm14, %xmm14
pshufd $0x39, %xmm15, %xmm15
movl %eax, 64(%rdi)
movl %edx, 112(%rdi)
movl %ecx, 96(%rdi)
movl %ebx, 80(%rdi)
movd %xmm12, %ebx
movd %xmm13, %eax
movd %xmm14, %edx
movd %xmm15, %ecx
pshufd $0x39, %xmm12, %xmm12
pshufd $0x39, %xmm13, %xmm13
pshufd $0x39, %xmm14, %xmm14
pshufd $0x39, %xmm15, %xmm15
movl %ebx, 84(%rdi)
movl %eax, 68(%rdi)
movl %edx, 116(%rdi)
movl %ecx, 100(%rdi)
movd %xmm12, %ecx
movd %xmm13, %ebx
movd %xmm14, %eax
movd %xmm15, %edx
pshufd $0x39, %xmm12, %xmm12
pshufd $0x39, %xmm13, %xmm13
pshufd $0x39, %xmm14, %xmm14
pshufd $0x39, %xmm15, %xmm15
movl %ecx, 104(%rdi)
movl %ebx, 88(%rdi)
movl %eax, 72(%rdi)
movl %edx, 120(%rdi)
movd %xmm12, %edx
movd %xmm13, %ecx
movd %xmm14, %ebx
movd %xmm15, %eax
movl %edx, 124(%rdi)
movl %ecx, 108(%rdi)
movl %ebx, 92(%rdi)
movl %eax, 76(%rdi)
scrypt_core_cleanup
ret
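# salsa8_core_2way_xmm_doubleround: the same double round as
# salsa8_core_xmm_doubleround, interleaved for two independent blocks
# (%xmm0-%xmm3 and %xmm8-%xmm11) to hide instruction latencies.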
.macro salsa8_core_2way_xmm_doubleround
movdqa %xmm1, %xmm4
movdqa %xmm9, %xmm6
paddd %xmm0, %xmm4
paddd %xmm8, %xmm6
movdqa %xmm4, %xmm5
movdqa %xmm6, %xmm7
pslld $7, %xmm4
pslld $7, %xmm6
psrld $25, %xmm5
psrld $25, %xmm7
pxor %xmm4, %xmm3
pxor %xmm6, %xmm11
pxor %xmm5, %xmm3
pxor %xmm7, %xmm11
movdqa %xmm0, %xmm4
movdqa %xmm8, %xmm6
paddd %xmm3, %xmm4
paddd %xmm11, %xmm6
movdqa %xmm4, %xmm5
movdqa %xmm6, %xmm7
pslld $9, %xmm4
pslld $9, %xmm6
psrld $23, %xmm5
psrld $23, %xmm7
pxor %xmm4, %xmm2
pxor %xmm6, %xmm10
movdqa %xmm3, %xmm4
movdqa %xmm11, %xmm6
pshufd $0x93, %xmm3, %xmm3
pshufd $0x93, %xmm11, %xmm11
pxor %xmm5, %xmm2
pxor %xmm7, %xmm10
paddd %xmm2, %xmm4
paddd %xmm10, %xmm6
movdqa %xmm4, %xmm5
movdqa %xmm6, %xmm7
pslld $13, %xmm4
pslld $13, %xmm6
psrld $19, %xmm5
psrld $19, %xmm7
pxor %xmm4, %xmm1
pxor %xmm6, %xmm9
movdqa %xmm2, %xmm4
movdqa %xmm10, %xmm6
pshufd $0x4e, %xmm2, %xmm2
pshufd $0x4e, %xmm10, %xmm10
pxor %xmm5, %xmm1
pxor %xmm7, %xmm9
paddd %xmm1, %xmm4
paddd %xmm9, %xmm6
movdqa %xmm4, %xmm5
movdqa %xmm6, %xmm7
pslld $18, %xmm4
pslld $18, %xmm6
psrld $14, %xmm5
psrld $14, %xmm7
pxor %xmm4, %xmm0
pxor %xmm6, %xmm8
pshufd $0x39, %xmm1, %xmm1
pshufd $0x39, %xmm9, %xmm9
pxor %xmm5, %xmm0
pxor %xmm7, %xmm8
movdqa %xmm3, %xmm4
movdqa %xmm11, %xmm6
paddd %xmm0, %xmm4
paddd %xmm8, %xmm6
movdqa %xmm4, %xmm5
movdqa %xmm6, %xmm7
pslld $7, %xmm4
pslld $7, %xmm6
psrld $25, %xmm5
psrld $25, %xmm7
pxor %xmm4, %xmm1
pxor %xmm6, %xmm9
pxor %xmm5, %xmm1
pxor %xmm7, %xmm9
movdqa %xmm0, %xmm4
movdqa %xmm8, %xmm6
paddd %xmm1, %xmm4
paddd %xmm9, %xmm6
movdqa %xmm4, %xmm5
movdqa %xmm6, %xmm7
pslld $9, %xmm4
pslld $9, %xmm6
psrld $23, %xmm5
psrld $23, %xmm7
pxor %xmm4, %xmm2
pxor %xmm6, %xmm10
movdqa %xmm1, %xmm4
movdqa %xmm9, %xmm6
pshufd $0x93, %xmm1, %xmm1
pshufd $0x93, %xmm9, %xmm9
pxor %xmm5, %xmm2
pxor %xmm7, %xmm10
paddd %xmm2, %xmm4
paddd %xmm10, %xmm6
movdqa %xmm4, %xmm5
movdqa %xmm6, %xmm7
pslld $13, %xmm4
pslld $13, %xmm6
psrld $19, %xmm5
psrld $19, %xmm7
pxor %xmm4, %xmm3
pxor %xmm6, %xmm11
movdqa %xmm2, %xmm4
movdqa %xmm10, %xmm6
pshufd $0x4e, %xmm2, %xmm2
pshufd $0x4e, %xmm10, %xmm10
pxor %xmm5, %xmm3
pxor %xmm7, %xmm11
paddd %xmm3, %xmm4
paddd %xmm11, %xmm6
movdqa %xmm4, %xmm5
movdqa %xmm6, %xmm7
pslld $18, %xmm4
pslld $18, %xmm6
psrld $14, %xmm5
psrld $14, %xmm7
pxor %xmm4, %xmm0
pxor %xmm6, %xmm8
pshufd $0x39, %xmm3, %xmm3
pshufd $0x39, %xmm11, %xmm11
pxor %xmm5, %xmm0
pxor %xmm7, %xmm8
.endm
.macro salsa8_core_2way_xmm
salsa8_core_2way_xmm_doubleround
salsa8_core_2way_xmm_doubleround
salsa8_core_2way_xmm_doubleround
salsa8_core_2way_xmm_doubleround
.endm
.text
.p2align 6
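# scrypt_core_2way(X, V): two interleaved scrypt hashes. X holds two
# 128-byte working blocks back to back; V is a 256 KiB scratchpad whose
# entries interleave 128 bytes from each hash (256-byte stride), N = 1024.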
.globl scrypt_core_2way
.globl _scrypt_core_2way
scrypt_core_2way:
_scrypt_core_2way:
pushq %rbx
pushq %rbp
#if defined(WIN64)
subq $176, %rsp
movdqa %xmm6, 8(%rsp)
movdqa %xmm7, 24(%rsp)
movdqa %xmm8, 40(%rsp)
movdqa %xmm9, 56(%rsp)
movdqa %xmm10, 72(%rsp)
movdqa %xmm11, 88(%rsp)
movdqa %xmm12, 104(%rsp)
movdqa %xmm13, 120(%rsp)
movdqa %xmm14, 136(%rsp)
movdqa %xmm15, 152(%rsp)
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
#endif
subq $264, %rsp
scrypt_shuffle %rdi, 0, %rsp, 0
scrypt_shuffle %rdi, 64, %rsp, 64
scrypt_shuffle %rdi, 128, %rsp, 128
scrypt_shuffle %rdi, 192, %rsp, 192
movdqa 192(%rsp), %xmm12
movdqa 208(%rsp), %xmm13
movdqa 224(%rsp), %xmm14
movdqa 240(%rsp), %xmm15
movq %rsi, %rbp
leaq 262144(%rsi), %rcx
scrypt_core_2way_loop1:
movdqa 0(%rsp), %xmm0
movdqa 16(%rsp), %xmm1
movdqa 32(%rsp), %xmm2
movdqa 48(%rsp), %xmm3
movdqa 64(%rsp), %xmm4
movdqa 80(%rsp), %xmm5
movdqa 96(%rsp), %xmm6
movdqa 112(%rsp), %xmm7
movdqa 128(%rsp), %xmm8
movdqa 144(%rsp), %xmm9
movdqa 160(%rsp), %xmm10
movdqa 176(%rsp), %xmm11
pxor %xmm4, %xmm0
pxor %xmm5, %xmm1
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
movdqa %xmm0, 0(%rbp)
movdqa %xmm1, 16(%rbp)
movdqa %xmm2, 32(%rbp)
movdqa %xmm3, 48(%rbp)
movdqa %xmm4, 64(%rbp)
movdqa %xmm5, 80(%rbp)
movdqa %xmm6, 96(%rbp)
movdqa %xmm7, 112(%rbp)
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
pxor %xmm14, %xmm10
pxor %xmm15, %xmm11
movdqa %xmm8, 128(%rbp)
movdqa %xmm9, 144(%rbp)
movdqa %xmm10, 160(%rbp)
movdqa %xmm11, 176(%rbp)
movdqa %xmm12, 192(%rbp)
movdqa %xmm13, 208(%rbp)
movdqa %xmm14, 224(%rbp)
movdqa %xmm15, 240(%rbp)
salsa8_core_2way_xmm
paddd 0(%rbp), %xmm0
paddd 16(%rbp), %xmm1
paddd 32(%rbp), %xmm2
paddd 48(%rbp), %xmm3
paddd 128(%rbp), %xmm8
paddd 144(%rbp), %xmm9
paddd 160(%rbp), %xmm10
paddd 176(%rbp), %xmm11
movdqa %xmm0, 0(%rsp)
movdqa %xmm1, 16(%rsp)
movdqa %xmm2, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm8, 128(%rsp)
movdqa %xmm9, 144(%rsp)
movdqa %xmm10, 160(%rsp)
movdqa %xmm11, 176(%rsp)
pxor 64(%rsp), %xmm0
pxor 80(%rsp), %xmm1
pxor 96(%rsp), %xmm2
pxor 112(%rsp), %xmm3
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
pxor %xmm14, %xmm10
pxor %xmm15, %xmm11
movdqa %xmm0, 64(%rsp)
movdqa %xmm1, 80(%rsp)
movdqa %xmm2, 96(%rsp)
movdqa %xmm3, 112(%rsp)
movdqa %xmm8, %xmm12
movdqa %xmm9, %xmm13
movdqa %xmm10, %xmm14
movdqa %xmm11, %xmm15
salsa8_core_2way_xmm
paddd 64(%rsp), %xmm0
paddd 80(%rsp), %xmm1
paddd 96(%rsp), %xmm2
paddd 112(%rsp), %xmm3
paddd %xmm8, %xmm12
paddd %xmm9, %xmm13
paddd %xmm10, %xmm14
paddd %xmm11, %xmm15
movdqa %xmm0, 64(%rsp)
movdqa %xmm1, 80(%rsp)
movdqa %xmm2, 96(%rsp)
movdqa %xmm3, 112(%rsp)
addq $256, %rbp
cmpq %rcx, %rbp
jne scrypt_core_2way_loop1
movq $1024, %rcx
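# Second loop: each lane computes its own index j = X[16] & 1023; lane 0
# reads at j*256 and lane 1 at j*256 + 128 within the interleaved
# scratchpad.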
scrypt_core_2way_loop2:
movdqa 0(%rsp), %xmm0
movdqa 16(%rsp), %xmm1
movdqa 32(%rsp), %xmm2
movdqa 48(%rsp), %xmm3
movdqa 64(%rsp), %xmm4
movdqa 80(%rsp), %xmm5
movdqa 96(%rsp), %xmm6
movdqa 112(%rsp), %xmm7
movdqa 128(%rsp), %xmm8
movdqa 144(%rsp), %xmm9
movdqa 160(%rsp), %xmm10
movdqa 176(%rsp), %xmm11
movd %xmm4, %ebp
andl $1023, %ebp
shll $8, %ebp
pxor 0(%rsi, %rbp), %xmm0
pxor 16(%rsi, %rbp), %xmm1
pxor 32(%rsi, %rbp), %xmm2
pxor 48(%rsi, %rbp), %xmm3
movd %xmm12, %ebx
andl $1023, %ebx
shll $8, %ebx
addl $128, %ebx
pxor 0(%rsi, %rbx), %xmm8
pxor 16(%rsi, %rbx), %xmm9
pxor 32(%rsi, %rbx), %xmm10
pxor 48(%rsi, %rbx), %xmm11
pxor %xmm4, %xmm0
pxor %xmm5, %xmm1
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
pxor %xmm14, %xmm10
pxor %xmm15, %xmm11
movdqa %xmm0, 0(%rsp)
movdqa %xmm1, 16(%rsp)
movdqa %xmm2, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm8, 128(%rsp)
movdqa %xmm9, 144(%rsp)
movdqa %xmm10, 160(%rsp)
movdqa %xmm11, 176(%rsp)
salsa8_core_2way_xmm
paddd 0(%rsp), %xmm0
paddd 16(%rsp), %xmm1
paddd 32(%rsp), %xmm2
paddd 48(%rsp), %xmm3
paddd 128(%rsp), %xmm8
paddd 144(%rsp), %xmm9
paddd 160(%rsp), %xmm10
paddd 176(%rsp), %xmm11
movdqa %xmm0, 0(%rsp)
movdqa %xmm1, 16(%rsp)
movdqa %xmm2, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm8, 128(%rsp)
movdqa %xmm9, 144(%rsp)
movdqa %xmm10, 160(%rsp)
movdqa %xmm11, 176(%rsp)
pxor 64(%rsi, %rbp), %xmm0
pxor 80(%rsi, %rbp), %xmm1
pxor 96(%rsi, %rbp), %xmm2
pxor 112(%rsi, %rbp), %xmm3
pxor 64(%rsi, %rbx), %xmm8
pxor 80(%rsi, %rbx), %xmm9
pxor 96(%rsi, %rbx), %xmm10
pxor 112(%rsi, %rbx), %xmm11
pxor 64(%rsp), %xmm0
pxor 80(%rsp), %xmm1
pxor 96(%rsp), %xmm2
pxor 112(%rsp), %xmm3
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
pxor %xmm14, %xmm10
pxor %xmm15, %xmm11
movdqa %xmm0, 64(%rsp)
movdqa %xmm1, 80(%rsp)
movdqa %xmm2, 96(%rsp)
movdqa %xmm3, 112(%rsp)
movdqa %xmm8, %xmm12
movdqa %xmm9, %xmm13
movdqa %xmm10, %xmm14
movdqa %xmm11, %xmm15
salsa8_core_2way_xmm
paddd 64(%rsp), %xmm0
paddd 80(%rsp), %xmm1
paddd 96(%rsp), %xmm2
paddd 112(%rsp), %xmm3
paddd %xmm8, %xmm12
paddd %xmm9, %xmm13
paddd %xmm10, %xmm14
paddd %xmm11, %xmm15
movdqa %xmm0, 64(%rsp)
movdqa %xmm1, 80(%rsp)
movdqa %xmm2, 96(%rsp)
movdqa %xmm3, 112(%rsp)
subq $1, %rcx
ja scrypt_core_2way_loop2
movdqa %xmm12, 192(%rsp)
movdqa %xmm13, 208(%rsp)
movdqa %xmm14, 224(%rsp)
movdqa %xmm15, 240(%rsp)
scrypt_shuffle %rsp, 0, %rdi, 0
scrypt_shuffle %rsp, 64, %rdi, 64
scrypt_shuffle %rsp, 128, %rdi, 128
scrypt_shuffle %rsp, 192, %rdi, 192
addq $264, %rsp
#if defined(WIN64)
popq %rsi
popq %rdi
movdqa 8(%rsp), %xmm6
movdqa 24(%rsp), %xmm7
movdqa 40(%rsp), %xmm8
movdqa 56(%rsp), %xmm9
movdqa 72(%rsp), %xmm10
movdqa 88(%rsp), %xmm11
movdqa 104(%rsp), %xmm12
movdqa 120(%rsp), %xmm13
movdqa 136(%rsp), %xmm14
movdqa 152(%rsp), %xmm15
addq $176, %rsp
#endif
popq %rbp
popq %rbx
ret
#if defined(USE_AVX)
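# AVX variant of the 3-way Salsa20/8 double round: three independent
# blocks in %xmm0-%xmm3, %xmm8-%xmm11 and %xmm12-%xmm15. The three-operand
# VEX forms remove most of the register copies needed in the SSE2 version.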
.macro salsa8_core_3way_avx_doubleround
vpaddd %xmm0, %xmm1, %xmm4
vpaddd %xmm8, %xmm9, %xmm6
vpaddd %xmm12, %xmm13, %xmm7
vpslld $7, %xmm4, %xmm5
vpsrld $25, %xmm4, %xmm4
vpxor %xmm4, %xmm3, %xmm3
vpxor %xmm5, %xmm3, %xmm3
vpslld $7, %xmm6, %xmm5
vpsrld $25, %xmm6, %xmm6
vpxor %xmm6, %xmm11, %xmm11
vpxor %xmm5, %xmm11, %xmm11
vpslld $7, %xmm7, %xmm5
vpsrld $25, %xmm7, %xmm7
vpxor %xmm7, %xmm15, %xmm15
vpxor %xmm5, %xmm15, %xmm15
vpaddd %xmm3, %xmm0, %xmm4
vpaddd %xmm11, %xmm8, %xmm6
vpaddd %xmm15, %xmm12, %xmm7
vpslld $9, %xmm4, %xmm5
vpsrld $23, %xmm4, %xmm4
vpxor %xmm4, %xmm2, %xmm2
vpxor %xmm5, %xmm2, %xmm2
vpslld $9, %xmm6, %xmm5
vpsrld $23, %xmm6, %xmm6
vpxor %xmm6, %xmm10, %xmm10
vpxor %xmm5, %xmm10, %xmm10
vpslld $9, %xmm7, %xmm5
vpsrld $23, %xmm7, %xmm7
vpxor %xmm7, %xmm14, %xmm14
vpxor %xmm5, %xmm14, %xmm14
vpaddd %xmm2, %xmm3, %xmm4
vpaddd %xmm10, %xmm11, %xmm6
vpaddd %xmm14, %xmm15, %xmm7
vpslld $13, %xmm4, %xmm5
vpsrld $19, %xmm4, %xmm4
vpshufd $0x93, %xmm3, %xmm3
vpshufd $0x93, %xmm11, %xmm11
vpshufd $0x93, %xmm15, %xmm15
vpxor %xmm4, %xmm1, %xmm1
vpxor %xmm5, %xmm1, %xmm1
vpslld $13, %xmm6, %xmm5
vpsrld $19, %xmm6, %xmm6
vpxor %xmm6, %xmm9, %xmm9
vpxor %xmm5, %xmm9, %xmm9
vpslld $13, %xmm7, %xmm5
vpsrld $19, %xmm7, %xmm7
vpxor %xmm7, %xmm13, %xmm13
vpxor %xmm5, %xmm13, %xmm13
vpaddd %xmm1, %xmm2, %xmm4
vpaddd %xmm9, %xmm10, %xmm6
vpaddd %xmm13, %xmm14, %xmm7
vpslld $18, %xmm4, %xmm5
vpsrld $14, %xmm4, %xmm4
vpshufd $0x4e, %xmm2, %xmm2
vpshufd $0x4e, %xmm10, %xmm10
vpshufd $0x4e, %xmm14, %xmm14
vpxor %xmm4, %xmm0, %xmm0
vpxor %xmm5, %xmm0, %xmm0
vpslld $18, %xmm6, %xmm5
vpsrld $14, %xmm6, %xmm6
vpxor %xmm6, %xmm8, %xmm8
vpxor %xmm5, %xmm8, %xmm8
vpslld $18, %xmm7, %xmm5
vpsrld $14, %xmm7, %xmm7
vpxor %xmm7, %xmm12, %xmm12
vpxor %xmm5, %xmm12, %xmm12
vpaddd %xmm0, %xmm3, %xmm4
vpaddd %xmm8, %xmm11, %xmm6
vpaddd %xmm12, %xmm15, %xmm7
vpslld $7, %xmm4, %xmm5
vpsrld $25, %xmm4, %xmm4
vpshufd $0x39, %xmm1, %xmm1
vpshufd $0x39, %xmm9, %xmm9
vpshufd $0x39, %xmm13, %xmm13
vpxor %xmm4, %xmm1, %xmm1
vpxor %xmm5, %xmm1, %xmm1
vpslld $7, %xmm6, %xmm5
vpsrld $25, %xmm6, %xmm6
vpxor %xmm6, %xmm9, %xmm9
vpxor %xmm5, %xmm9, %xmm9
vpslld $7, %xmm7, %xmm5
vpsrld $25, %xmm7, %xmm7
vpxor %xmm7, %xmm13, %xmm13
vpxor %xmm5, %xmm13, %xmm13
vpaddd %xmm1, %xmm0, %xmm4
vpaddd %xmm9, %xmm8, %xmm6
vpaddd %xmm13, %xmm12, %xmm7
vpslld $9, %xmm4, %xmm5
vpsrld $23, %xmm4, %xmm4
vpxor %xmm4, %xmm2, %xmm2
vpxor %xmm5, %xmm2, %xmm2
vpslld $9, %xmm6, %xmm5
vpsrld $23, %xmm6, %xmm6
vpxor %xmm6, %xmm10, %xmm10
vpxor %xmm5, %xmm10, %xmm10
vpslld $9, %xmm7, %xmm5
vpsrld $23, %xmm7, %xmm7
vpxor %xmm7, %xmm14, %xmm14
vpxor %xmm5, %xmm14, %xmm14
vpaddd %xmm2, %xmm1, %xmm4
vpaddd %xmm10, %xmm9, %xmm6
vpaddd %xmm14, %xmm13, %xmm7
vpslld $13, %xmm4, %xmm5
vpsrld $19, %xmm4, %xmm4
vpshufd $0x93, %xmm1, %xmm1
vpshufd $0x93, %xmm9, %xmm9
vpshufd $0x93, %xmm13, %xmm13
vpxor %xmm4, %xmm3, %xmm3
vpxor %xmm5, %xmm3, %xmm3
vpslld $13, %xmm6, %xmm5
vpsrld $19, %xmm6, %xmm6
vpxor %xmm6, %xmm11, %xmm11
vpxor %xmm5, %xmm11, %xmm11
vpslld $13, %xmm7, %xmm5
vpsrld $19, %xmm7, %xmm7
vpxor %xmm7, %xmm15, %xmm15
vpxor %xmm5, %xmm15, %xmm15
vpaddd %xmm3, %xmm2, %xmm4
vpaddd %xmm11, %xmm10, %xmm6
vpaddd %xmm15, %xmm14, %xmm7
vpslld $18, %xmm4, %xmm5
vpsrld $14, %xmm4, %xmm4
vpshufd $0x4e, %xmm2, %xmm2
vpshufd $0x4e, %xmm10, %xmm10
vpshufd $0x4e, %xmm14, %xmm14
vpxor %xmm4, %xmm0, %xmm0
vpxor %xmm5, %xmm0, %xmm0
vpslld $18, %xmm6, %xmm5
vpsrld $14, %xmm6, %xmm6
vpxor %xmm6, %xmm8, %xmm8
vpxor %xmm5, %xmm8, %xmm8
vpslld $18, %xmm7, %xmm5
vpsrld $14, %xmm7, %xmm7
vpshufd $0x39, %xmm3, %xmm3
vpshufd $0x39, %xmm11, %xmm11
vpshufd $0x39, %xmm15, %xmm15
vpxor %xmm7, %xmm12, %xmm12
vpxor %xmm5, %xmm12, %xmm12
.endm
.macro salsa8_core_3way_avx
salsa8_core_3way_avx_doubleround
salsa8_core_3way_avx_doubleround
salsa8_core_3way_avx_doubleround
salsa8_core_3way_avx_doubleround
.endm
#endif
.text
.p2align 6
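# scrypt_core_3way(X, V): three interleaved scrypt hashes. X holds three
# 128-byte working blocks; V is a 384 KiB scratchpad with 384-byte entries
# (128 bytes per lane), N = 1024. When built with USE_AVX it selects the
# AVX core at run time (CPUID/XGETBV checks below); otherwise it always
# uses the SSE2 core.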
.globl scrypt_core_3way
.globl _scrypt_core_3way
scrypt_core_3way:
_scrypt_core_3way:
pushq %rbx
pushq %rbp
#if defined(WIN64)
subq $176, %rsp
movdqa %xmm6, 8(%rsp)
movdqa %xmm7, 24(%rsp)
movdqa %xmm8, 40(%rsp)
movdqa %xmm9, 56(%rsp)
movdqa %xmm10, 72(%rsp)
movdqa %xmm11, 88(%rsp)
movdqa %xmm12, 104(%rsp)
movdqa %xmm13, 120(%rsp)
movdqa %xmm14, 136(%rsp)
movdqa %xmm15, 152(%rsp)
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
#endif
subq $392, %rsp
.macro scrypt_core_3way_cleanup
addq $392, %rsp
#if defined(WIN64)
popq %rsi
popq %rdi
movdqa 8(%rsp), %xmm6
movdqa 24(%rsp), %xmm7
movdqa 40(%rsp), %xmm8
movdqa 56(%rsp), %xmm9
movdqa 72(%rsp), %xmm10
movdqa 88(%rsp), %xmm11
movdqa 104(%rsp), %xmm12
movdqa 120(%rsp), %xmm13
movdqa 136(%rsp), %xmm14
movdqa 152(%rsp), %xmm15
addq $176, %rsp
#endif
popq %rbp
popq %rbx
.endm
#if !defined(USE_AVX)
jmp scrypt_core_3way_xmm
#else
# Check for AVX and OSXSAVE support
movl $1, %eax
cpuid
andl $0x18000000, %ecx
cmpl $0x18000000, %ecx
jne scrypt_core_3way_xmm
# Check for XMM and YMM state support
xorl %ecx, %ecx
xgetbv
andl $0x00000006, %eax
cmpl $0x00000006, %eax
jne scrypt_core_3way_xmm
avx_scrypt_core_3way:
scrypt_shuffle %rdi, 0, %rsp, 0
scrypt_shuffle %rdi, 64, %rsp, 64
scrypt_shuffle %rdi, 128, %rsp, 128
scrypt_shuffle %rdi, 192, %rsp, 192
scrypt_shuffle %rdi, 256, %rsp, 256
scrypt_shuffle %rdi, 320, %rsp, 320
movdqa 128+64(%rsp), %xmm8
movdqa 128+80(%rsp), %xmm9
movdqa 128+96(%rsp), %xmm10
movdqa 128+112(%rsp), %xmm11
movq %rsi, %rbx
leaq 3*131072(%rsi), %rax
scrypt_core_3way_avx_loop1:
movdqa %xmm8, %xmm12
movdqa %xmm9, %xmm13
movdqa %xmm10, %xmm14
movdqa %xmm11, %xmm15
movdqa 64(%rsp), %xmm4
movdqa 80(%rsp), %xmm5
movdqa 96(%rsp), %xmm6
movdqa 112(%rsp), %xmm7
vpxor 0(%rsp), %xmm4, %xmm0
vpxor 16(%rsp), %xmm5, %xmm1
vpxor 32(%rsp), %xmm6, %xmm2
vpxor 48(%rsp), %xmm7, %xmm3
vpxor 128+0(%rsp), %xmm12, %xmm8
vpxor 128+16(%rsp), %xmm13, %xmm9
vpxor 128+32(%rsp), %xmm14, %xmm10
vpxor 128+48(%rsp), %xmm15, %xmm11
movdqa %xmm0, 0(%rbx)
movdqa %xmm1, 16(%rbx)
movdqa %xmm2, 32(%rbx)
movdqa %xmm3, 48(%rbx)
movdqa %xmm4, 64(%rbx)
movdqa %xmm5, 80(%rbx)
movdqa %xmm6, 96(%rbx)
movdqa %xmm7, 112(%rbx)
movdqa %xmm8, 128+0(%rbx)
movdqa %xmm9, 128+16(%rbx)
movdqa %xmm10, 128+32(%rbx)
movdqa %xmm11, 128+48(%rbx)
movdqa %xmm12, 128+64(%rbx)
movdqa %xmm13, 128+80(%rbx)
movdqa %xmm14, 128+96(%rbx)
movdqa %xmm15, 128+112(%rbx)
movdqa 256+64(%rsp), %xmm4
movdqa 256+80(%rsp), %xmm5
movdqa 256+96(%rsp), %xmm6
movdqa 256+112(%rsp), %xmm7
vpxor 256+0(%rsp), %xmm4, %xmm12
vpxor 256+16(%rsp), %xmm5, %xmm13
vpxor 256+32(%rsp), %xmm6, %xmm14
vpxor 256+48(%rsp), %xmm7, %xmm15
movdqa %xmm12, 256+0(%rbx)
movdqa %xmm13, 256+16(%rbx)
movdqa %xmm14, 256+32(%rbx)
movdqa %xmm15, 256+48(%rbx)
movdqa %xmm4, 256+64(%rbx)
movdqa %xmm5, 256+80(%rbx)
movdqa %xmm6, 256+96(%rbx)
movdqa %xmm7, 256+112(%rbx)
salsa8_core_3way_avx
paddd 0(%rbx), %xmm0
paddd 16(%rbx), %xmm1
paddd 32(%rbx), %xmm2
paddd 48(%rbx), %xmm3
paddd 128+0(%rbx), %xmm8
paddd 128+16(%rbx), %xmm9
paddd 128+32(%rbx), %xmm10
paddd 128+48(%rbx), %xmm11
paddd 256+0(%rbx), %xmm12
paddd 256+16(%rbx), %xmm13
paddd 256+32(%rbx), %xmm14
paddd 256+48(%rbx), %xmm15
movdqa %xmm0, 0(%rsp)
movdqa %xmm1, 16(%rsp)
movdqa %xmm2, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm8, 128+0(%rsp)
movdqa %xmm9, 128+16(%rsp)
movdqa %xmm10, 128+32(%rsp)
movdqa %xmm11, 128+48(%rsp)
movdqa %xmm12, 256+0(%rsp)
movdqa %xmm13, 256+16(%rsp)
movdqa %xmm14, 256+32(%rsp)
movdqa %xmm15, 256+48(%rsp)
pxor 64(%rsp), %xmm0
pxor 80(%rsp), %xmm1
pxor 96(%rsp), %xmm2
pxor 112(%rsp), %xmm3
pxor 128+64(%rsp), %xmm8
pxor 128+80(%rsp), %xmm9
pxor 128+96(%rsp), %xmm10
pxor 128+112(%rsp), %xmm11
pxor 256+64(%rsp), %xmm12
pxor 256+80(%rsp), %xmm13
pxor 256+96(%rsp), %xmm14
pxor 256+112(%rsp), %xmm15
movdqa %xmm0, 64(%rsp)
movdqa %xmm1, 80(%rsp)
movdqa %xmm2, 96(%rsp)
movdqa %xmm3, 112(%rsp)
movdqa %xmm8, 128+64(%rsp)
movdqa %xmm9, 128+80(%rsp)
movdqa %xmm10, 128+96(%rsp)
movdqa %xmm11, 128+112(%rsp)
movdqa %xmm12, 256+64(%rsp)
movdqa %xmm13, 256+80(%rsp)
movdqa %xmm14, 256+96(%rsp)
movdqa %xmm15, 256+112(%rsp)
salsa8_core_3way_avx
paddd 64(%rsp), %xmm0
paddd 80(%rsp), %xmm1
paddd 96(%rsp), %xmm2
paddd 112(%rsp), %xmm3
paddd 128+64(%rsp), %xmm8
paddd 128+80(%rsp), %xmm9
paddd 128+96(%rsp), %xmm10
paddd 128+112(%rsp), %xmm11
paddd 256+64(%rsp), %xmm12
paddd 256+80(%rsp), %xmm13
paddd 256+96(%rsp), %xmm14
paddd 256+112(%rsp), %xmm15
movdqa %xmm0, 64(%rsp)
movdqa %xmm1, 80(%rsp)
movdqa %xmm2, 96(%rsp)
movdqa %xmm3, 112(%rsp)
movdqa %xmm8, 128+64(%rsp)
movdqa %xmm9, 128+80(%rsp)
movdqa %xmm10, 128+96(%rsp)
movdqa %xmm11, 128+112(%rsp)
movdqa %xmm12, 256+64(%rsp)
movdqa %xmm13, 256+80(%rsp)
movdqa %xmm14, 256+96(%rsp)
movdqa %xmm15, 256+112(%rsp)
addq $3*128, %rbx
cmpq %rax, %rbx
jne scrypt_core_3way_avx_loop1
movq $1024, %rcx
.p2align 4
scrypt_core_3way_avx_loop2:
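# Compute the three scratchpad byte offsets: j*384 for lane 0,
# j*384 + 128 for lane 1 and j*384 + 256 for lane 2, where each lane's j
# is its own X[16] & 1023.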
movl 64(%rsp), %ebp
andl $1023, %ebp
leaq (%rbp, %rbp, 2), %rbp
movl 128+64(%rsp), %ebx
shll $7, %ebp
movl 256+64(%rsp), %eax
andl $1023, %ebx
leaq (%rbx, %rbx, 2), %rbx
shll $7, %ebx
shll $7, %eax
addl $128, %ebx
andl $131071, %eax
leaq (%rax, %rax, 2), %rax
addl $256, %eax
movdqa 0(%rsp), %xmm0
movdqa 16(%rsp), %xmm1
movdqa 32(%rsp), %xmm2
movdqa 48(%rsp), %xmm3
movdqa 128+0(%rsp), %xmm8
movdqa 128+16(%rsp), %xmm9
movdqa 128+32(%rsp), %xmm10
movdqa 128+48(%rsp), %xmm11
movdqa 256+0(%rsp), %xmm12
movdqa 256+16(%rsp), %xmm13
movdqa 256+32(%rsp), %xmm14
movdqa 256+48(%rsp), %xmm15
pxor 0(%rsi, %rbp), %xmm0
pxor 16(%rsi, %rbp), %xmm1
pxor 32(%rsi, %rbp), %xmm2
pxor 48(%rsi, %rbp), %xmm3
pxor 0(%rsi, %rbx), %xmm8
pxor 16(%rsi, %rbx), %xmm9
pxor 32(%rsi, %rbx), %xmm10
pxor 48(%rsi, %rbx), %xmm11
pxor 0(%rsi, %rax), %xmm12
pxor 16(%rsi, %rax), %xmm13
pxor 32(%rsi, %rax), %xmm14
pxor 48(%rsi, %rax), %xmm15
pxor 64(%rsp), %xmm0
pxor 80(%rsp), %xmm1
pxor 96(%rsp), %xmm2
pxor 112(%rsp), %xmm3
pxor 128+64(%rsp), %xmm8
pxor 128+80(%rsp), %xmm9
pxor 128+96(%rsp), %xmm10
pxor 128+112(%rsp), %xmm11
pxor 256+64(%rsp), %xmm12
pxor 256+80(%rsp), %xmm13
pxor 256+96(%rsp), %xmm14
pxor 256+112(%rsp), %xmm15
movdqa %xmm0, 0(%rsp)
movdqa %xmm1, 16(%rsp)
movdqa %xmm2, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm8, 128+0(%rsp)
movdqa %xmm9, 128+16(%rsp)
movdqa %xmm10, 128+32(%rsp)
movdqa %xmm11, 128+48(%rsp)
movdqa %xmm12, 256+0(%rsp)
movdqa %xmm13, 256+16(%rsp)
movdqa %xmm14, 256+32(%rsp)
movdqa %xmm15, 256+48(%rsp)
salsa8_core_3way_avx
paddd 0(%rsp), %xmm0
paddd 16(%rsp), %xmm1
paddd 32(%rsp), %xmm2
paddd 48(%rsp), %xmm3
paddd 128+0(%rsp), %xmm8
paddd 128+16(%rsp), %xmm9
paddd 128+32(%rsp), %xmm10
paddd 128+48(%rsp), %xmm11
paddd 256+0(%rsp), %xmm12
paddd 256+16(%rsp), %xmm13
paddd 256+32(%rsp), %xmm14
paddd 256+48(%rsp), %xmm15
movdqa %xmm0, 0(%rsp)
movdqa %xmm1, 16(%rsp)
movdqa %xmm2, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm8, 128+0(%rsp)
movdqa %xmm9, 128+16(%rsp)
movdqa %xmm10, 128+32(%rsp)
movdqa %xmm11, 128+48(%rsp)
movdqa %xmm12, 256+0(%rsp)
movdqa %xmm13, 256+16(%rsp)
movdqa %xmm14, 256+32(%rsp)
movdqa %xmm15, 256+48(%rsp)
pxor 64(%rsi, %rbp), %xmm0
pxor 80(%rsi, %rbp), %xmm1
pxor 96(%rsi, %rbp), %xmm2
pxor 112(%rsi, %rbp), %xmm3
pxor 64(%rsi, %rbx), %xmm8
pxor 80(%rsi, %rbx), %xmm9
pxor 96(%rsi, %rbx), %xmm10
pxor 112(%rsi, %rbx), %xmm11
pxor 64(%rsi, %rax), %xmm12
pxor 80(%rsi, %rax), %xmm13
pxor 96(%rsi, %rax), %xmm14
pxor 112(%rsi, %rax), %xmm15
pxor 64(%rsp), %xmm0
pxor 80(%rsp), %xmm1
pxor 96(%rsp), %xmm2
pxor 112(%rsp), %xmm3
pxor 128+64(%rsp), %xmm8
pxor 128+80(%rsp), %xmm9
pxor 128+96(%rsp), %xmm10
pxor 128+112(%rsp), %xmm11
pxor 256+64(%rsp), %xmm12
pxor 256+80(%rsp), %xmm13
pxor 256+96(%rsp), %xmm14
pxor 256+112(%rsp), %xmm15
movdqa %xmm0, 64(%rsp)
movdqa %xmm1, 80(%rsp)
movdqa %xmm2, 96(%rsp)
movdqa %xmm3, 112(%rsp)
movdqa %xmm8, 128+64(%rsp)
movdqa %xmm9, 128+80(%rsp)
movdqa %xmm10, 128+96(%rsp)
movdqa %xmm11, 128+112(%rsp)
movdqa %xmm12, 256+64(%rsp)
movdqa %xmm13, 256+80(%rsp)
movdqa %xmm14, 256+96(%rsp)
movdqa %xmm15, 256+112(%rsp)
salsa8_core_3way_avx
paddd 64(%rsp), %xmm0
paddd 80(%rsp), %xmm1
paddd 96(%rsp), %xmm2
paddd 112(%rsp), %xmm3
paddd 128+64(%rsp), %xmm8
paddd 128+80(%rsp), %xmm9
paddd 128+96(%rsp), %xmm10
paddd 128+112(%rsp), %xmm11
paddd 256+64(%rsp), %xmm12
paddd 256+80(%rsp), %xmm13
paddd 256+96(%rsp), %xmm14
paddd 256+112(%rsp), %xmm15
movdqa %xmm0, 64(%rsp)
movdqa %xmm1, 80(%rsp)
movdqa %xmm2, 96(%rsp)
movdqa %xmm3, 112(%rsp)
movdqa %xmm8, 128+64(%rsp)
movdqa %xmm9, 128+80(%rsp)
movdqa %xmm10, 128+96(%rsp)
movdqa %xmm11, 128+112(%rsp)
movdqa %xmm12, 256+64(%rsp)
movdqa %xmm13, 256+80(%rsp)
movdqa %xmm14, 256+96(%rsp)
movdqa %xmm15, 256+112(%rsp)
subq $1, %rcx
ja scrypt_core_3way_avx_loop2
scrypt_shuffle %rsp, 0, %rdi, 0
scrypt_shuffle %rsp, 64, %rdi, 64
scrypt_shuffle %rsp, 128, %rdi, 128
scrypt_shuffle %rsp, 192, %rdi, 192
scrypt_shuffle %rsp, 256, %rdi, 256
scrypt_shuffle %rsp, 320, %rdi, 320
scrypt_core_3way_cleanup
ret
#endif
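# SSE2 fallback of the 3-way double round: same dataflow as the AVX
# version, with explicit movdqa copies because of the two-operand
# instruction forms.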
.macro salsa8_core_3way_xmm_doubleround
movdqa %xmm1, %xmm4
movdqa %xmm9, %xmm6
movdqa %xmm13, %xmm7
paddd %xmm0, %xmm4
paddd %xmm8, %xmm6
paddd %xmm12, %xmm7
movdqa %xmm4, %xmm5
pslld $7, %xmm4
psrld $25, %xmm5
pxor %xmm4, %xmm3
pxor %xmm5, %xmm3
movdqa %xmm0, %xmm4
movdqa %xmm6, %xmm5
pslld $7, %xmm6
psrld $25, %xmm5
pxor %xmm6, %xmm11
pxor %xmm5, %xmm11
movdqa %xmm8, %xmm6
movdqa %xmm7, %xmm5
pslld $7, %xmm7
psrld $25, %xmm5
pxor %xmm7, %xmm15
pxor %xmm5, %xmm15
movdqa %xmm12, %xmm7
paddd %xmm3, %xmm4
paddd %xmm11, %xmm6
paddd %xmm15, %xmm7
movdqa %xmm4, %xmm5
pslld $9, %xmm4
psrld $23, %xmm5
pxor %xmm4, %xmm2
movdqa %xmm3, %xmm4
pshufd $0x93, %xmm3, %xmm3
pxor %xmm5, %xmm2
movdqa %xmm6, %xmm5
pslld $9, %xmm6
psrld $23, %xmm5
pxor %xmm6, %xmm10
movdqa %xmm11, %xmm6
pshufd $0x93, %xmm11, %xmm11
pxor %xmm5, %xmm10
movdqa %xmm7, %xmm5
pslld $9, %xmm7
psrld $23, %xmm5
pxor %xmm7, %xmm14
movdqa %xmm15, %xmm7
pshufd $0x93, %xmm15, %xmm15
pxor %xmm5, %xmm14
paddd %xmm2, %xmm4
paddd %xmm10, %xmm6
paddd %xmm14, %xmm7
movdqa %xmm4, %xmm5
pslld $13, %xmm4
psrld $19, %xmm5
pxor %xmm4, %xmm1
movdqa %xmm2, %xmm4
pshufd $0x4e, %xmm2, %xmm2
pxor %xmm5, %xmm1
movdqa %xmm6, %xmm5
pslld $13, %xmm6
psrld $19, %xmm5
pxor %xmm6, %xmm9
movdqa %xmm10, %xmm6
pshufd $0x4e, %xmm10, %xmm10
pxor %xmm5, %xmm9
movdqa %xmm7, %xmm5
pslld $13, %xmm7
psrld $19, %xmm5
pxor %xmm7, %xmm13
movdqa %xmm14, %xmm7
pshufd $0x4e, %xmm14, %xmm14
pxor %xmm5, %xmm13
paddd %xmm1, %xmm4
paddd %xmm9, %xmm6
paddd %xmm13, %xmm7
movdqa %xmm4, %xmm5
pslld $18, %xmm4
psrld $14, %xmm5
pxor %xmm4, %xmm0
pshufd $0x39, %xmm1, %xmm1
pxor %xmm5, %xmm0
movdqa %xmm3, %xmm4
movdqa %xmm6, %xmm5
pslld $18, %xmm6
psrld $14, %xmm5
pxor %xmm6, %xmm8
pshufd $0x39, %xmm9, %xmm9
pxor %xmm5, %xmm8
movdqa %xmm11, %xmm6
movdqa %xmm7, %xmm5
pslld $18, %xmm7
psrld $14, %xmm5
pxor %xmm7, %xmm12
pshufd $0x39, %xmm13, %xmm13
pxor %xmm5, %xmm12
movdqa %xmm15, %xmm7
paddd %xmm0, %xmm4
paddd %xmm8, %xmm6
paddd %xmm12, %xmm7
movdqa %xmm4, %xmm5
pslld $7, %xmm4
psrld $25, %xmm5
pxor %xmm4, %xmm1
pxor %xmm5, %xmm1
movdqa %xmm0, %xmm4
movdqa %xmm6, %xmm5
pslld $7, %xmm6
psrld $25, %xmm5
pxor %xmm6, %xmm9
pxor %xmm5, %xmm9
movdqa %xmm8, %xmm6
movdqa %xmm7, %xmm5
pslld $7, %xmm7
psrld $25, %xmm5
pxor %xmm7, %xmm13
pxor %xmm5, %xmm13
movdqa %xmm12, %xmm7
paddd %xmm1, %xmm4
paddd %xmm9, %xmm6
paddd %xmm13, %xmm7
movdqa %xmm4, %xmm5
pslld $9, %xmm4
psrld $23, %xmm5
pxor %xmm4, %xmm2
movdqa %xmm1, %xmm4
pshufd $0x93, %xmm1, %xmm1
pxor %xmm5, %xmm2
movdqa %xmm6, %xmm5
pslld $9, %xmm6
psrld $23, %xmm5
pxor %xmm6, %xmm10
movdqa %xmm9, %xmm6
pshufd $0x93, %xmm9, %xmm9
pxor %xmm5, %xmm10
movdqa %xmm7, %xmm5
pslld $9, %xmm7
psrld $23, %xmm5
pxor %xmm7, %xmm14
movdqa %xmm13, %xmm7
pshufd $0x93, %xmm13, %xmm13
pxor %xmm5, %xmm14
paddd %xmm2, %xmm4
paddd %xmm10, %xmm6
paddd %xmm14, %xmm7
movdqa %xmm4, %xmm5
pslld $13, %xmm4
psrld $19, %xmm5
pxor %xmm4, %xmm3
movdqa %xmm2, %xmm4
pshufd $0x4e, %xmm2, %xmm2
pxor %xmm5, %xmm3
movdqa %xmm6, %xmm5
pslld $13, %xmm6
psrld $19, %xmm5
pxor %xmm6, %xmm11
movdqa %xmm10, %xmm6
pshufd $0x4e, %xmm10, %xmm10
pxor %xmm5, %xmm11
movdqa %xmm7, %xmm5
pslld $13, %xmm7
psrld $19, %xmm5
pxor %xmm7, %xmm15
movdqa %xmm14, %xmm7
pshufd $0x4e, %xmm14, %xmm14
pxor %xmm5, %xmm15
paddd %xmm3, %xmm4
paddd %xmm11, %xmm6
paddd %xmm15, %xmm7
movdqa %xmm4, %xmm5
pslld $18, %xmm4
psrld $14, %xmm5
pxor %xmm4, %xmm0
pshufd $0x39, %xmm3, %xmm3
pxor %xmm5, %xmm0
movdqa %xmm6, %xmm5
pslld $18, %xmm6
psrld $14, %xmm5
pxor %xmm6, %xmm8
pshufd $0x39, %xmm11, %xmm11
pxor %xmm5, %xmm8
movdqa %xmm7, %xmm5
pslld $18, %xmm7
psrld $14, %xmm5
pxor %xmm7, %xmm12
pshufd $0x39, %xmm15, %xmm15
pxor %xmm5, %xmm12
.endm
.macro salsa8_core_3way_xmm
salsa8_core_3way_xmm_doubleround
salsa8_core_3way_xmm_doubleround
salsa8_core_3way_xmm_doubleround
salsa8_core_3way_xmm_doubleround
.endm
.p2align 6
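# scrypt_core_3way_xmm: SSE2 3-way core. The shuffled state for the three
# lanes lives at 0, 128 and 256 bytes into the stack frame; scratchpad
# entries are 384 bytes (three interleaved 128-byte blocks).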
scrypt_core_3way_xmm:
scrypt_shuffle %rdi, 0, %rsp, 0
scrypt_shuffle %rdi, 64, %rsp, 64
scrypt_shuffle %rdi, 128, %rsp, 128
scrypt_shuffle %rdi, 192, %rsp, 192
scrypt_shuffle %rdi, 256, %rsp, 256
scrypt_shuffle %rdi, 320, %rsp, 320
movdqa 128+64(%rsp), %xmm8
movdqa 128+80(%rsp), %xmm9
movdqa 128+96(%rsp), %xmm10
movdqa 128+112(%rsp), %xmm11
movq %rsi, %rbx
leaq 3*131072(%rsi), %rax
scrypt_core_3way_xmm_loop1:
movdqa %xmm8, %xmm12
movdqa %xmm9, %xmm13
movdqa %xmm10, %xmm14
movdqa %xmm11, %xmm15
movdqa 0(%rsp), %xmm0
movdqa 16(%rsp), %xmm1
movdqa 32(%rsp), %xmm2
movdqa 48(%rsp), %xmm3
movdqa 64(%rsp), %xmm4
movdqa 80(%rsp), %xmm5
movdqa 96(%rsp), %xmm6
movdqa 112(%rsp), %xmm7
movdqa 128+0(%rsp), %xmm8
movdqa 128+16(%rsp), %xmm9
movdqa 128+32(%rsp), %xmm10
movdqa 128+48(%rsp), %xmm11
pxor %xmm4, %xmm0
pxor %xmm5, %xmm1
pxor %xmm6, %xmm2
pxor %xmm7, %xmm3
movdqa %xmm0, 0(%rbx)
movdqa %xmm1, 16(%rbx)
movdqa %xmm2, 32(%rbx)
movdqa %xmm3, 48(%rbx)
movdqa %xmm4, 64(%rbx)
movdqa %xmm5, 80(%rbx)
movdqa %xmm6, 96(%rbx)
movdqa %xmm7, 112(%rbx)
pxor %xmm12, %xmm8
pxor %xmm13, %xmm9
pxor %xmm14, %xmm10
pxor %xmm15, %xmm11
movdqa %xmm8, 128+0(%rbx)
movdqa %xmm9, 128+16(%rbx)
movdqa %xmm10, 128+32(%rbx)
movdqa %xmm11, 128+48(%rbx)
movdqa %xmm12, 128+64(%rbx)
movdqa %xmm13, 128+80(%rbx)
movdqa %xmm14, 128+96(%rbx)
movdqa %xmm15, 128+112(%rbx)
movdqa 256+0(%rsp), %xmm12
movdqa 256+16(%rsp), %xmm13
movdqa 256+32(%rsp), %xmm14
movdqa 256+48(%rsp), %xmm15
movdqa 256+64(%rsp), %xmm4
movdqa 256+80(%rsp), %xmm5
movdqa 256+96(%rsp), %xmm6
movdqa 256+112(%rsp), %xmm7
pxor %xmm4, %xmm12
pxor %xmm5, %xmm13
pxor %xmm6, %xmm14
pxor %xmm7, %xmm15
movdqa %xmm12, 256+0(%rbx)
movdqa %xmm13, 256+16(%rbx)
movdqa %xmm14, 256+32(%rbx)
movdqa %xmm15, 256+48(%rbx)
movdqa %xmm4, 256+64(%rbx)
movdqa %xmm5, 256+80(%rbx)
movdqa %xmm6, 256+96(%rbx)
movdqa %xmm7, 256+112(%rbx)
salsa8_core_3way_xmm
paddd 0(%rbx), %xmm0
paddd 16(%rbx), %xmm1
paddd 32(%rbx), %xmm2
paddd 48(%rbx), %xmm3
paddd 128+0(%rbx), %xmm8
paddd 128+16(%rbx), %xmm9
paddd 128+32(%rbx), %xmm10
paddd 128+48(%rbx), %xmm11
paddd 256+0(%rbx), %xmm12
paddd 256+16(%rbx), %xmm13
paddd 256+32(%rbx), %xmm14
paddd 256+48(%rbx), %xmm15
movdqa %xmm0, 0(%rsp)
movdqa %xmm1, 16(%rsp)
movdqa %xmm2, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm8, 128+0(%rsp)
movdqa %xmm9, 128+16(%rsp)
movdqa %xmm10, 128+32(%rsp)
movdqa %xmm11, 128+48(%rsp)
movdqa %xmm12, 256+0(%rsp)
movdqa %xmm13, 256+16(%rsp)
movdqa %xmm14, 256+32(%rsp)
movdqa %xmm15, 256+48(%rsp)
pxor 64(%rsp), %xmm0
pxor 80(%rsp), %xmm1
pxor 96(%rsp), %xmm2
pxor 112(%rsp), %xmm3
pxor 128+64(%rsp), %xmm8
pxor 128+80(%rsp), %xmm9
pxor 128+96(%rsp), %xmm10
pxor 128+112(%rsp), %xmm11
pxor 256+64(%rsp), %xmm12
pxor 256+80(%rsp), %xmm13
pxor 256+96(%rsp), %xmm14
pxor 256+112(%rsp), %xmm15
movdqa %xmm0, 64(%rsp)
movdqa %xmm1, 80(%rsp)
movdqa %xmm2, 96(%rsp)
movdqa %xmm3, 112(%rsp)
movdqa %xmm8, 128+64(%rsp)
movdqa %xmm9, 128+80(%rsp)
movdqa %xmm10, 128+96(%rsp)
movdqa %xmm11, 128+112(%rsp)
movdqa %xmm12, 256+64(%rsp)
movdqa %xmm13, 256+80(%rsp)
movdqa %xmm14, 256+96(%rsp)
movdqa %xmm15, 256+112(%rsp)
salsa8_core_3way_xmm
paddd 64(%rsp), %xmm0
paddd 80(%rsp), %xmm1
paddd 96(%rsp), %xmm2
paddd 112(%rsp), %xmm3
paddd 128+64(%rsp), %xmm8
paddd 128+80(%rsp), %xmm9
paddd 128+96(%rsp), %xmm10
paddd 128+112(%rsp), %xmm11
paddd 256+64(%rsp), %xmm12
paddd 256+80(%rsp), %xmm13
paddd 256+96(%rsp), %xmm14
paddd 256+112(%rsp), %xmm15
movdqa %xmm0, 64(%rsp)
movdqa %xmm1, 80(%rsp)
movdqa %xmm2, 96(%rsp)
movdqa %xmm3, 112(%rsp)
movdqa %xmm8, 128+64(%rsp)
movdqa %xmm9, 128+80(%rsp)
movdqa %xmm10, 128+96(%rsp)
movdqa %xmm11, 128+112(%rsp)
movdqa %xmm12, 256+64(%rsp)
movdqa %xmm13, 256+80(%rsp)
movdqa %xmm14, 256+96(%rsp)
movdqa %xmm15, 256+112(%rsp)
addq $3*128, %rbx
cmpq %rax, %rbx
jne scrypt_core_3way_xmm_loop1
movq $1024, %rcx
.p2align 4
scrypt_core_3way_xmm_loop2:
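# Same three-offset index computation as the AVX path:
# j*384, j*384 + 128 and j*384 + 256 with j = X[16] & 1023 per lane.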
movl 64(%rsp), %ebp
andl $1023, %ebp
leaq (%rbp, %rbp, 2), %rbp
movl 128+64(%rsp), %ebx
shll $7, %ebp
movl 256+64(%rsp), %eax
andl $1023, %ebx
leaq (%rbx, %rbx, 2), %rbx
shll $7, %ebx
shll $7, %eax
addl $128, %ebx
andl $131071, %eax
leaq (%rax, %rax, 2), %rax
addl $256, %eax
movdqa 0(%rsp), %xmm0
movdqa 16(%rsp), %xmm1
movdqa 32(%rsp), %xmm2
movdqa 48(%rsp), %xmm3
movdqa 128+0(%rsp), %xmm8
movdqa 128+16(%rsp), %xmm9
movdqa 128+32(%rsp), %xmm10
movdqa 128+48(%rsp), %xmm11
movdqa 256+0(%rsp), %xmm12
movdqa 256+16(%rsp), %xmm13
movdqa 256+32(%rsp), %xmm14
movdqa 256+48(%rsp), %xmm15
pxor 0(%rsi, %rbp), %xmm0
pxor 16(%rsi, %rbp), %xmm1
pxor 32(%rsi, %rbp), %xmm2
pxor 48(%rsi, %rbp), %xmm3
pxor 0(%rsi, %rbx), %xmm8
pxor 16(%rsi, %rbx), %xmm9
pxor 32(%rsi, %rbx), %xmm10
pxor 48(%rsi, %rbx), %xmm11
pxor 0(%rsi, %rax), %xmm12
pxor 16(%rsi, %rax), %xmm13
pxor 32(%rsi, %rax), %xmm14
pxor 48(%rsi, %rax), %xmm15
pxor 64(%rsp), %xmm0
pxor 80(%rsp), %xmm1
pxor 96(%rsp), %xmm2
pxor 112(%rsp), %xmm3
pxor 128+64(%rsp), %xmm8
pxor 128+80(%rsp), %xmm9
pxor 128+96(%rsp), %xmm10
pxor 128+112(%rsp), %xmm11
pxor 256+64(%rsp), %xmm12
pxor 256+80(%rsp), %xmm13
pxor 256+96(%rsp), %xmm14
pxor 256+112(%rsp), %xmm15
movdqa %xmm0, 0(%rsp)
movdqa %xmm1, 16(%rsp)
movdqa %xmm2, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm8, 128+0(%rsp)
movdqa %xmm9, 128+16(%rsp)
movdqa %xmm10, 128+32(%rsp)
movdqa %xmm11, 128+48(%rsp)
movdqa %xmm12, 256+0(%rsp)
movdqa %xmm13, 256+16(%rsp)
movdqa %xmm14, 256+32(%rsp)
movdqa %xmm15, 256+48(%rsp)
salsa8_core_3way_xmm
paddd 0(%rsp), %xmm0
paddd 16(%rsp), %xmm1
paddd 32(%rsp), %xmm2
paddd 48(%rsp), %xmm3
paddd 128+0(%rsp), %xmm8
paddd 128+16(%rsp), %xmm9
paddd 128+32(%rsp), %xmm10
paddd 128+48(%rsp), %xmm11
paddd 256+0(%rsp), %xmm12
paddd 256+16(%rsp), %xmm13
paddd 256+32(%rsp), %xmm14
paddd 256+48(%rsp), %xmm15
movdqa %xmm0, 0(%rsp)
movdqa %xmm1, 16(%rsp)
movdqa %xmm2, 32(%rsp)
movdqa %xmm3, 48(%rsp)
movdqa %xmm8, 128+0(%rsp)
movdqa %xmm9, 128+16(%rsp)
movdqa %xmm10, 128+32(%rsp)
movdqa %xmm11, 128+48(%rsp)
movdqa %xmm12, 256+0(%rsp)
movdqa %xmm13, 256+16(%rsp)
movdqa %xmm14, 256+32(%rsp)
movdqa %xmm15, 256+48(%rsp)
pxor 64(%rsi, %rbp), %xmm0
pxor 80(%rsi, %rbp), %xmm1
pxor 96(%rsi, %rbp), %xmm2
pxor 112(%rsi, %rbp), %xmm3
pxor 64(%rsi, %rbx), %xmm8
pxor 80(%rsi, %rbx), %xmm9
pxor 96(%rsi, %rbx), %xmm10
pxor 112(%rsi, %rbx), %xmm11
pxor 64(%rsi, %rax), %xmm12
pxor 80(%rsi, %rax), %xmm13
pxor 96(%rsi, %rax), %xmm14
pxor 112(%rsi, %rax), %xmm15
pxor 64(%rsp), %xmm0
pxor 80(%rsp), %xmm1
pxor 96(%rsp), %xmm2
pxor 112(%rsp), %xmm3
pxor 128+64(%rsp), %xmm8
pxor 128+80(%rsp), %xmm9
pxor 128+96(%rsp), %xmm10
pxor 128+112(%rsp), %xmm11
pxor 256+64(%rsp), %xmm12
pxor 256+80(%rsp), %xmm13
pxor 256+96(%rsp), %xmm14
pxor 256+112(%rsp), %xmm15
movdqa %xmm0, 64(%rsp)
movdqa %xmm1, 80(%rsp)
movdqa %xmm2, 96(%rsp)
movdqa %xmm3, 112(%rsp)
movdqa %xmm8, 128+64(%rsp)
movdqa %xmm9, 128+80(%rsp)
movdqa %xmm10, 128+96(%rsp)
movdqa %xmm11, 128+112(%rsp)
movdqa %xmm12, 256+64(%rsp)
movdqa %xmm13, 256+80(%rsp)
movdqa %xmm14, 256+96(%rsp)
movdqa %xmm15, 256+112(%rsp)
salsa8_core_3way_xmm
paddd 64(%rsp), %xmm0
paddd 80(%rsp), %xmm1
paddd 96(%rsp), %xmm2
paddd 112(%rsp), %xmm3
paddd 128+64(%rsp), %xmm8
paddd 128+80(%rsp), %xmm9
paddd 128+96(%rsp), %xmm10
paddd 128+112(%rsp), %xmm11
paddd 256+64(%rsp), %xmm12
paddd 256+80(%rsp), %xmm13
paddd 256+96(%rsp), %xmm14
paddd 256+112(%rsp), %xmm15
movdqa %xmm0, 64(%rsp)
movdqa %xmm1, 80(%rsp)
movdqa %xmm2, 96(%rsp)
movdqa %xmm3, 112(%rsp)
movdqa %xmm8, 128+64(%rsp)
movdqa %xmm9, 128+80(%rsp)
movdqa %xmm10, 128+96(%rsp)
movdqa %xmm11, 128+112(%rsp)
movdqa %xmm12, 256+64(%rsp)
movdqa %xmm13, 256+80(%rsp)
movdqa %xmm14, 256+96(%rsp)
movdqa %xmm15, 256+112(%rsp)
subq $1, %rcx
ja scrypt_core_3way_xmm_loop2
scrypt_shuffle %rsp, 0, %rdi, 0
scrypt_shuffle %rsp, 64, %rdi, 64
scrypt_shuffle %rsp, 128, %rdi, 128
scrypt_shuffle %rsp, 192, %rdi, 192
scrypt_shuffle %rsp, 256, %rdi, 256
scrypt_shuffle %rsp, 320, %rdi, 320
scrypt_core_3way_cleanup
ret
#endif