/*
 * Copyright 2011-2012 pooler@litecoinpool.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif

#if defined(__x86_64__)

# SHA-256 initial hash values H0..H7, each replicated across the 4 lanes.
	.data
	.p2align 6
sha256_4h:
	.long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667
	.long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85
	.long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372
	.long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a
	.long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f
	.long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c
	.long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab
	.long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19

# SHA-256 round constants K0..K63, each replicated across the 4 lanes.
	.data
	.p2align 6
sha256_4k:
	.long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98
	.long 0x71374491, 0x71374491, 0x71374491, 0x71374491
	.long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf
	.long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5
	.long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b
	.long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1
	.long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4
	.long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5
	.long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98
	.long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01
	.long 0x243185be, 0x243185be, 0x243185be, 0x243185be
	.long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3
	.long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74
	.long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe
	.long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7
	.long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174
	.long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1
	.long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786
	.long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6
	.long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc
	.long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f
	.long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa
	.long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc
	.long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da
	.long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152
	.long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d
	.long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8
	.long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7
	.long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3
	.long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147
	.long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351
	.long 0x14292967, 0x14292967, 0x14292967, 0x14292967
	.long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85
	.long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138
	.long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc
	.long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13
	.long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354
	.long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb
	.long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e
	.long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85
	.long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1
	.long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b
	.long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70
	.long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3
	.long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819
	.long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624
	.long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585
	.long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070
	.long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116
	.long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08
	.long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c
	.long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5
	.long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3
	.long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a
	.long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f
	.long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3
	.long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee
	.long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f
	.long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814
	.long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208
	.long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa
	.long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb
	.long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7
	.long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2

# SHA256_InitState_4way: copy the 4-way initial hash values into the state
# buffer at %rdi (Win64 passes the pointer in %rcx); unaligned stores are used.
	.text
	.p2align 5
	.globl SHA256_InitState_4way
	.globl _SHA256_InitState_4way
SHA256_InitState_4way:
_SHA256_InitState_4way:
#if defined(WIN64)
	pushq %rdi
	movq %rcx, %rdi
#endif
	movdqa sha256_4h+0(%rip), %xmm0
	movdqa sha256_4h+16(%rip), %xmm1
	movdqa sha256_4h+32(%rip), %xmm2
	movdqa sha256_4h+48(%rip), %xmm3
	movdqu %xmm0, 0(%rdi)
	movdqu %xmm1, 16(%rdi)
	movdqu %xmm2, 32(%rdi)
	movdqu %xmm3, 48(%rdi)
	movdqa sha256_4h+64(%rip), %xmm0
	movdqa sha256_4h+80(%rip), %xmm1
	movdqa sha256_4h+96(%rip), %xmm2
	movdqa sha256_4h+112(%rip), %xmm3
	movdqu %xmm0, 64(%rdi)
	movdqu %xmm1, 80(%rdi)
	movdqu %xmm2, 96(%rdi)
	movdqu %xmm3, 112(%rdi)
#if defined(WIN64)
	popq %rdi
#endif
	ret

# Byte-swap each 32-bit word of two consecutive 16-byte vectors loaded from
# \i*16(%rsi) and (\i+1)*16(%rsi), storing them at the same offsets off %rsp.
.macro p2bswap_rsi_rsp i
	movdqu \i*16(%rsi), %xmm0
	movdqu (\i+1)*16(%rsi), %xmm2
	pshuflw $0xb1, %xmm0, %xmm0
	pshuflw $0xb1, %xmm2, %xmm2
	pshufhw $0xb1, %xmm0, %xmm0
	pshufhw $0xb1, %xmm2, %xmm2
	movdqa %xmm0, %xmm1
	movdqa %xmm2, %xmm3
	psrlw $8, %xmm1
	psrlw $8, %xmm3
	psllw $8, %xmm0
	psllw $8, %xmm2
	pxor %xmm1, %xmm0
	pxor %xmm3, %xmm2
	movdqa %xmm0, \i*16(%rsp)
	movdqa %xmm2, (\i+1)*16(%rsp)
.endm

# SHA256_Transform_4way: compress one 4-way interleaved block (at %rsi) into
# the 4-way state (at %rdi).  A non-zero third argument (%rdx) requests a
# big-endian byte swap of the input words while they are copied to the
# on-stack workspace; otherwise the block is copied verbatim.
	.text
	.p2align 5
	.globl SHA256_Transform_4way
	.globl _SHA256_Transform_4way
SHA256_Transform_4way:
_SHA256_Transform_4way:
#if defined(WIN64)
	pushq %rdi
	subq $96, %rsp
	movdqa %xmm6, 0(%rsp)
	movdqa %xmm7, 16(%rsp)
	movdqa %xmm8, 32(%rsp)
	movdqa %xmm9, 48(%rsp)
	movdqa %xmm10, 64(%rsp)
	movdqa %xmm11, 80(%rsp)
	pushq %rsi
	movq %rcx, %rdi
	movq %rdx, %rsi
	movq %r8, %rdx
#endif
	subq $1032, %rsp
	testq %rdx, %rdx
	jz sha256_transform_4way_block_copy
	p2bswap_rsi_rsp 0
	p2bswap_rsi_rsp 2
	p2bswap_rsi_rsp 4
	p2bswap_rsi_rsp 6
	p2bswap_rsi_rsp 8
	p2bswap_rsi_rsp 10
	p2bswap_rsi_rsp 12
	p2bswap_rsi_rsp 14
	jmp sha256_transform_4way_extend

	.p2align 5
sha256_transform_4way_block_copy:
	movdqu 0*16(%rsi), %xmm0
	movdqu 1*16(%rsi), %xmm1
	movdqu 2*16(%rsi),
%xmm2 movdqu 3*16(%rsi), %xmm3 movdqu 4*16(%rsi), %xmm4 movdqu 5*16(%rsi), %xmm5 movdqu 6*16(%rsi), %xmm6 movdqu 7*16(%rsi), %xmm7 movdqa %xmm0, 0*16(%rsp) movdqa %xmm1, 1*16(%rsp) movdqa %xmm2, 2*16(%rsp) movdqa %xmm3, 3*16(%rsp) movdqa %xmm4, 4*16(%rsp) movdqa %xmm5, 5*16(%rsp) movdqa %xmm6, 6*16(%rsp) movdqa %xmm7, 7*16(%rsp) movdqu 8*16(%rsi), %xmm0 movdqu 9*16(%rsi), %xmm1 movdqu 10*16(%rsi), %xmm2 movdqu 11*16(%rsi), %xmm3 movdqu 12*16(%rsi), %xmm4 movdqu 13*16(%rsi), %xmm5 movdqu 14*16(%rsi), %xmm6 movdqu 15*16(%rsi), %xmm7 movdqa %xmm0, 8*16(%rsp) movdqa %xmm1, 9*16(%rsp) movdqa %xmm2, 10*16(%rsp) movdqa %xmm3, 11*16(%rsp) movdqa %xmm4, 12*16(%rsp) movdqa %xmm5, 13*16(%rsp) movdqa %xmm6, 14*16(%rsp) movdqa %xmm7, 15*16(%rsp) sha256_transform_4way_extend: leaq 256(%rsp), %rcx leaq 48*16(%rcx), %rax sha256_transform_4way_extend_loop: movdqa -15*16(%rcx), %xmm0 movdqa -14*16(%rcx), %xmm4 movdqa %xmm0, %xmm2 movdqa %xmm4, %xmm6 psrld $3, %xmm0 psrld $3, %xmm4 movdqa %xmm0, %xmm1 movdqa %xmm4, %xmm5 pslld $14, %xmm2 pslld $14, %xmm6 psrld $4, %xmm1 psrld $4, %xmm5 pxor %xmm1, %xmm0 pxor %xmm5, %xmm4 psrld $11, %xmm1 psrld $11, %xmm5 pxor %xmm2, %xmm0 pxor %xmm6, %xmm4 pslld $11, %xmm2 pslld $11, %xmm6 pxor %xmm1, %xmm0 pxor %xmm5, %xmm4 pxor %xmm2, %xmm0 pxor %xmm6, %xmm4 movdqa -2*16(%rcx), %xmm3 movdqa -1*16(%rcx), %xmm7 paddd -16*16(%rcx), %xmm0 paddd -15*16(%rcx), %xmm4 movdqa %xmm3, %xmm2 movdqa %xmm7, %xmm6 psrld $10, %xmm3 psrld $10, %xmm7 movdqa %xmm3, %xmm1 movdqa %xmm7, %xmm5 paddd -7*16(%rcx), %xmm0 pslld $13, %xmm2 pslld $13, %xmm6 psrld $7, %xmm1 psrld $7, %xmm5 paddd -6*16(%rcx), %xmm4 pxor %xmm1, %xmm3 pxor %xmm5, %xmm7 psrld $2, %xmm1 psrld $2, %xmm5 pxor %xmm2, %xmm3 pxor %xmm6, %xmm7 pslld $2, %xmm2 pslld $2, %xmm6 pxor %xmm1, %xmm3 pxor %xmm5, %xmm7 pxor %xmm2, %xmm3 pxor %xmm6, %xmm7 paddd %xmm3, %xmm0 paddd %xmm7, %xmm4 movdqa %xmm0, (%rcx) movdqa %xmm4, 16(%rcx) addq $2*16, %rcx cmpq %rcx, %rax jne sha256_transform_4way_extend_loop movdqu 0(%rdi), %xmm7 movdqu 16(%rdi), %xmm5 movdqu 32(%rdi), %xmm4 movdqu 48(%rdi), %xmm3 movdqu 64(%rdi), %xmm0 movdqu 80(%rdi), %xmm8 movdqu 96(%rdi), %xmm9 movdqu 112(%rdi), %xmm10 leaq sha256_4k(%rip), %rcx xorq %rax, %rax sha256_transform_4way_main_loop: movdqa (%rsp, %rax), %xmm6 paddd (%rcx, %rax), %xmm6 paddd %xmm10, %xmm6 movdqa %xmm0, %xmm1 movdqa %xmm9, %xmm2 pandn %xmm2, %xmm1 movdqa %xmm2, %xmm10 movdqa %xmm8, %xmm2 movdqa %xmm2, %xmm9 pand %xmm0, %xmm2 pxor %xmm2, %xmm1 movdqa %xmm0, %xmm8 paddd %xmm1, %xmm6 movdqa %xmm0, %xmm1 psrld $6, %xmm0 movdqa %xmm0, %xmm2 pslld $7, %xmm1 psrld $5, %xmm2 pxor %xmm1, %xmm0 pxor %xmm2, %xmm0 pslld $14, %xmm1 psrld $14, %xmm2 pxor %xmm1, %xmm0 pxor %xmm2, %xmm0 pslld $5, %xmm1 pxor %xmm1, %xmm0 paddd %xmm0, %xmm6 movdqa %xmm3, %xmm0 paddd %xmm6, %xmm0 movdqa %xmm5, %xmm1 movdqa %xmm4, %xmm3 movdqa %xmm4, %xmm2 pand %xmm5, %xmm2 pand %xmm7, %xmm4 pand %xmm7, %xmm1 pxor %xmm4, %xmm1 movdqa %xmm5, %xmm4 movdqa %xmm7, %xmm5 pxor %xmm2, %xmm1 paddd %xmm1, %xmm6 movdqa %xmm7, %xmm2 psrld $2, %xmm7 movdqa %xmm7, %xmm1 pslld $10, %xmm2 psrld $11, %xmm1 pxor %xmm2, %xmm7 pxor %xmm1, %xmm7 pslld $9, %xmm2 psrld $9, %xmm1 pxor %xmm2, %xmm7 pxor %xmm1, %xmm7 pslld $11, %xmm2 pxor %xmm2, %xmm7 paddd %xmm6, %xmm7 addq $16, %rax cmpq $16*64, %rax jne sha256_transform_4way_main_loop movdqu 0(%rdi), %xmm2 movdqu 16(%rdi), %xmm6 movdqu 32(%rdi), %xmm11 movdqu 48(%rdi), %xmm1 paddd %xmm2, %xmm7 paddd %xmm6, %xmm5 paddd %xmm11, %xmm4 paddd %xmm1, %xmm3 movdqu 64(%rdi), %xmm2 movdqu 80(%rdi), %xmm6 movdqu 
96(%rdi), %xmm11 movdqu 112(%rdi), %xmm1 paddd %xmm2, %xmm0 paddd %xmm6, %xmm8 paddd %xmm11, %xmm9 paddd %xmm1, %xmm10 movdqu %xmm7, 0(%rdi) movdqu %xmm5, 16(%rdi) movdqu %xmm4, 32(%rdi) movdqu %xmm3, 48(%rdi) movdqu %xmm0, 64(%rdi) movdqu %xmm8, 80(%rdi) movdqu %xmm9, 96(%rdi) movdqu %xmm10, 112(%rdi) addq $1032, %rsp #if defined(WIN64) popq %rsi movdqa 0(%rsp), %xmm6 movdqa 16(%rsp), %xmm7 movdqa 32(%rsp), %xmm8 movdqa 48(%rsp), %xmm9 movdqa 64(%rsp), %xmm10 movdqa 80(%rsp), %xmm11 addq $96, %rsp popq %rdi #endif ret .macro scrypt_shuffle src, so, dest, do movl \so+60(\src), %r8d movl \so+44(\src), %r9d movl \so+28(\src), %r10d movl \so+12(\src), %r11d movl %r8d, \do+12(\dest) movl %r9d, \do+28(\dest) movl %r10d, \do+44(\dest) movl %r11d, \do+60(\dest) movl \so+40(\src), %r8d movl \so+8(\src), %r9d movl \so+48(\src), %r10d movl \so+16(\src), %r11d movl %r8d, \do+8(\dest) movl %r9d, \do+40(\dest) movl %r10d, \do+16(\dest) movl %r11d, \do+48(\dest) movl \so+20(\src), %r8d movl \so+4(\src), %r9d movl \so+52(\src), %r10d movl \so+36(\src), %r11d movl %r8d, \do+4(\dest) movl %r9d, \do+20(\dest) movl %r10d, \do+36(\dest) movl %r11d, \do+52(\dest) movl \so+0(\src), %r8d movl \so+24(\src), %r9d movl \so+32(\src), %r10d movl \so+56(\src), %r11d movl %r8d, \do+0(\dest) movl %r9d, \do+24(\dest) movl %r10d, \do+32(\dest) movl %r11d, \do+56(\dest) .endm .macro gen_salsa8_core_doubleround movq 72(%rsp), %r15 leaq (%r14, %rdx), %rbp roll $7, %ebp xorq %rbp, %r9 leaq (%rdi, %r15), %rbp roll $7, %ebp xorq %rbp, %r10 leaq (%rdx, %r9), %rbp roll $9, %ebp xorq %rbp, %r11 leaq (%r15, %r10), %rbp roll $9, %ebp xorq %rbp, %r13 leaq (%r9, %r11), %rbp roll $13, %ebp xorq %rbp, %r14 leaq (%r10, %r13), %rbp roll $13, %ebp xorq %rbp, %rdi leaq (%r11, %r14), %rbp roll $18, %ebp xorq %rbp, %rdx leaq (%r13, %rdi), %rbp roll $18, %ebp xorq %rbp, %r15 movq 48(%rsp), %rbp movq %r15, 72(%rsp) leaq (%rax, %rbp), %r15 roll $7, %r15d xorq %r15, %rbx leaq (%rbp, %rbx), %r15 roll $9, %r15d xorq %r15, %rcx leaq (%rbx, %rcx), %r15 roll $13, %r15d xorq %r15, %rax leaq (%rcx, %rax), %r15 roll $18, %r15d xorq %r15, %rbp movq 88(%rsp), %r15 movq %rbp, 48(%rsp) leaq (%r12, %r15), %rbp roll $7, %ebp xorq %rbp, %rsi leaq (%r15, %rsi), %rbp roll $9, %ebp xorq %rbp, %r8 leaq (%rsi, %r8), %rbp roll $13, %ebp xorq %rbp, %r12 leaq (%r8, %r12), %rbp roll $18, %ebp xorq %rbp, %r15 movq %r15, 88(%rsp) movq 72(%rsp), %r15 leaq (%rsi, %rdx), %rbp roll $7, %ebp xorq %rbp, %rdi leaq (%r9, %r15), %rbp roll $7, %ebp xorq %rbp, %rax leaq (%rdx, %rdi), %rbp roll $9, %ebp xorq %rbp, %rcx leaq (%r15, %rax), %rbp roll $9, %ebp xorq %rbp, %r8 leaq (%rdi, %rcx), %rbp roll $13, %ebp xorq %rbp, %rsi leaq (%rax, %r8), %rbp roll $13, %ebp xorq %rbp, %r9 leaq (%rcx, %rsi), %rbp roll $18, %ebp xorq %rbp, %rdx leaq (%r8, %r9), %rbp roll $18, %ebp xorq %rbp, %r15 movq 48(%rsp), %rbp movq %r15, 72(%rsp) leaq (%r10, %rbp), %r15 roll $7, %r15d xorq %r15, %r12 leaq (%rbp, %r12), %r15 roll $9, %r15d xorq %r15, %r11 leaq (%r12, %r11), %r15 roll $13, %r15d xorq %r15, %r10 leaq (%r11, %r10), %r15 roll $18, %r15d xorq %r15, %rbp movq 88(%rsp), %r15 movq %rbp, 48(%rsp) leaq (%rbx, %r15), %rbp roll $7, %ebp xorq %rbp, %r14 leaq (%r15, %r14), %rbp roll $9, %ebp xorq %rbp, %r13 leaq (%r14, %r13), %rbp roll $13, %ebp xorq %rbp, %rbx leaq (%r13, %rbx), %rbp roll $18, %ebp xorq %rbp, %r15 movq %r15, 88(%rsp) .endm .text .p2align 5 gen_salsa8_core: # 0: %rdx, %rdi, %rcx, %rsi movq 8(%rsp), %rdi movq %rdi, %rdx shrq $32, %rdi movq 16(%rsp), %rsi movq %rsi, %rcx shrq $32, %rsi # 1: 
%r9, 72(%rsp), %rax, %r8 movq 24(%rsp), %r8 movq %r8, %r9 shrq $32, %r8 movq %r8, 72(%rsp) movq 32(%rsp), %r8 movq %r8, %rax shrq $32, %r8 # 2: %r11, %r10, 48(%rsp), %r12 movq 40(%rsp), %r10 movq %r10, %r11 shrq $32, %r10 movq 48(%rsp), %r12 #movq %r12, %r13 #movq %r13, 48(%rsp) shrq $32, %r12 # 3: %r14, %r13, %rbx, 88(%rsp) movq 56(%rsp), %r13 movq %r13, %r14 shrq $32, %r13 movq 64(%rsp), %r15 movq %r15, %rbx shrq $32, %r15 movq %r15, 88(%rsp) gen_salsa8_core_doubleround gen_salsa8_core_doubleround gen_salsa8_core_doubleround gen_salsa8_core_doubleround movl %edx, %edx shlq $32, %rdi addq %rdi, %rdx movd %rdx, %xmm0 movl %ecx, %ecx shlq $32, %rsi addq %rsi, %rcx movd %rcx, %xmm4 movq 72(%rsp), %rdi movl %r9d, %r9d shlq $32, %rdi addq %rdi, %r9 movd %r9, %xmm1 movl %eax, %eax shlq $32, %r8 addq %r8, %rax movd %rax, %xmm5 movl %r11d, %r11d shlq $32, %r10 addq %r10, %r11 movd %r11, %xmm2 movl 48(%rsp), %r8d shlq $32, %r12 addq %r12, %r8 movd %r8, %xmm6 movl %r14d, %r14d shlq $32, %r13 addq %r13, %r14 movd %r14, %xmm3 movq 88(%rsp), %rdi movl %ebx, %ebx shlq $32, %rdi addq %rdi, %rbx movd %rbx, %xmm7 punpcklqdq %xmm4, %xmm0 punpcklqdq %xmm5, %xmm1 punpcklqdq %xmm6, %xmm2 punpcklqdq %xmm7, %xmm3 #movq %rdx, 8(%rsp) #movq %rcx, 16(%rsp) #movq %r9, 24(%rsp) #movq %rax, 32(%rsp) #movq %r11, 40(%rsp) #movq %r8, 48(%rsp) #movq %r14, 56(%rsp) #movq %rbx, 64(%rsp) ret .text .p2align 5 .globl scrypt_core .globl _scrypt_core scrypt_core: _scrypt_core: pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 #if defined(WIN64) subq $176, %rsp movdqa %xmm6, 8(%rsp) movdqa %xmm7, 24(%rsp) movdqa %xmm8, 40(%rsp) movdqa %xmm9, 56(%rsp) movdqa %xmm10, 72(%rsp) movdqa %xmm11, 88(%rsp) movdqa %xmm12, 104(%rsp) movdqa %xmm13, 120(%rsp) movdqa %xmm14, 136(%rsp) movdqa %xmm15, 152(%rsp) pushq %rdi pushq %rsi movq %rcx, %rdi movq %rdx, %rsi #endif .macro scrypt_core_cleanup #if defined(WIN64) popq %rsi popq %rdi movdqa 8(%rsp), %xmm6 movdqa 24(%rsp), %xmm7 movdqa 40(%rsp), %xmm8 movdqa 56(%rsp), %xmm9 movdqa 72(%rsp), %xmm10 movdqa 88(%rsp), %xmm11 movdqa 104(%rsp), %xmm12 movdqa 120(%rsp), %xmm13 movdqa 136(%rsp), %xmm14 movdqa 152(%rsp), %xmm15 addq $176, %rsp #endif popq %r15 popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx .endm # GenuineIntel processors have fast SIMD xorl %eax, %eax cpuid cmpl $0x6c65746e, %ecx jne gen_scrypt_core cmpl $0x49656e69, %edx jne gen_scrypt_core cmpl $0x756e6547, %ebx je xmm_scrypt_core gen_scrypt_core: subq $136, %rsp movdqa 0(%rdi), %xmm8 movdqa 16(%rdi), %xmm9 movdqa 32(%rdi), %xmm10 movdqa 48(%rdi), %xmm11 movdqa 64(%rdi), %xmm12 movdqa 80(%rdi), %xmm13 movdqa 96(%rdi), %xmm14 movdqa 112(%rdi), %xmm15 leaq 131072(%rsi), %rcx movq %rdi, 104(%rsp) movq %rsi, 112(%rsp) movq %rcx, 120(%rsp) gen_scrypt_core_loop1: movdqa %xmm8, 0(%rsi) movdqa %xmm9, 16(%rsi) movdqa %xmm10, 32(%rsi) movdqa %xmm11, 48(%rsi) movdqa %xmm12, 64(%rsi) movdqa %xmm13, 80(%rsi) movdqa %xmm14, 96(%rsi) movdqa %xmm15, 112(%rsi) pxor %xmm12, %xmm8 pxor %xmm13, %xmm9 pxor %xmm14, %xmm10 pxor %xmm15, %xmm11 movdqa %xmm8, 0(%rsp) movdqa %xmm9, 16(%rsp) movdqa %xmm10, 32(%rsp) movdqa %xmm11, 48(%rsp) movq %rsi, 128(%rsp) call gen_salsa8_core paddd %xmm0, %xmm8 paddd %xmm1, %xmm9 paddd %xmm2, %xmm10 paddd %xmm3, %xmm11 pxor %xmm8, %xmm12 pxor %xmm9, %xmm13 pxor %xmm10, %xmm14 pxor %xmm11, %xmm15 movdqa %xmm12, 0(%rsp) movdqa %xmm13, 16(%rsp) movdqa %xmm14, 32(%rsp) movdqa %xmm15, 48(%rsp) call gen_salsa8_core movq 128(%rsp), %rsi paddd %xmm0, %xmm12 paddd %xmm1, %xmm13 paddd %xmm2, %xmm14 paddd %xmm3, 
%xmm15 addq $128, %rsi movq 120(%rsp), %rcx cmpq %rcx, %rsi jne gen_scrypt_core_loop1 movq $1024, %rcx gen_scrypt_core_loop2: movq 112(%rsp), %rsi movd %xmm12, %edx andl $1023, %edx shll $7, %edx movdqa 0(%rsi, %rdx), %xmm0 movdqa 16(%rsi, %rdx), %xmm1 movdqa 32(%rsi, %rdx), %xmm2 movdqa 48(%rsi, %rdx), %xmm3 movdqa 64(%rsi, %rdx), %xmm4 movdqa 80(%rsi, %rdx), %xmm5 movdqa 96(%rsi, %rdx), %xmm6 movdqa 112(%rsi, %rdx), %xmm7 pxor %xmm0, %xmm8 pxor %xmm1, %xmm9 pxor %xmm2, %xmm10 pxor %xmm3, %xmm11 pxor %xmm4, %xmm12 pxor %xmm5, %xmm13 pxor %xmm6, %xmm14 pxor %xmm7, %xmm15 pxor %xmm12, %xmm8 pxor %xmm13, %xmm9 pxor %xmm14, %xmm10 pxor %xmm15, %xmm11 movdqa %xmm8, 0(%rsp) movdqa %xmm9, 16(%rsp) movdqa %xmm10, 32(%rsp) movdqa %xmm11, 48(%rsp) movq %rcx, 128(%rsp) call gen_salsa8_core paddd %xmm0, %xmm8 paddd %xmm1, %xmm9 paddd %xmm2, %xmm10 paddd %xmm3, %xmm11 pxor %xmm8, %xmm12 pxor %xmm9, %xmm13 pxor %xmm10, %xmm14 pxor %xmm11, %xmm15 movdqa %xmm12, 0(%rsp) movdqa %xmm13, 16(%rsp) movdqa %xmm14, 32(%rsp) movdqa %xmm15, 48(%rsp) call gen_salsa8_core movq 128(%rsp), %rcx paddd %xmm0, %xmm12 paddd %xmm1, %xmm13 paddd %xmm2, %xmm14 paddd %xmm3, %xmm15 subq $1, %rcx ja gen_scrypt_core_loop2 movq 104(%rsp), %rdi movdqa %xmm8, 0(%rdi) movdqa %xmm9, 16(%rdi) movdqa %xmm10, 32(%rdi) movdqa %xmm11, 48(%rdi) movdqa %xmm12, 64(%rdi) movdqa %xmm13, 80(%rdi) movdqa %xmm14, 96(%rdi) movdqa %xmm15, 112(%rdi) addq $136, %rsp scrypt_core_cleanup ret .macro xmm_salsa8_core_doubleround movdqa %xmm1, %xmm4 paddd %xmm0, %xmm4 movdqa %xmm4, %xmm5 pslld $7, %xmm4 psrld $25, %xmm5 pxor %xmm4, %xmm3 pxor %xmm5, %xmm3 movdqa %xmm0, %xmm4 paddd %xmm3, %xmm4 movdqa %xmm4, %xmm5 pslld $9, %xmm4 psrld $23, %xmm5 pxor %xmm4, %xmm2 movdqa %xmm3, %xmm4 pshufd $0x93, %xmm3, %xmm3 pxor %xmm5, %xmm2 paddd %xmm2, %xmm4 movdqa %xmm4, %xmm5 pslld $13, %xmm4 psrld $19, %xmm5 pxor %xmm4, %xmm1 movdqa %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm2 pxor %xmm5, %xmm1 paddd %xmm1, %xmm4 movdqa %xmm4, %xmm5 pslld $18, %xmm4 psrld $14, %xmm5 pxor %xmm4, %xmm0 pshufd $0x39, %xmm1, %xmm1 pxor %xmm5, %xmm0 movdqa %xmm3, %xmm4 paddd %xmm0, %xmm4 movdqa %xmm4, %xmm5 pslld $7, %xmm4 psrld $25, %xmm5 pxor %xmm4, %xmm1 pxor %xmm5, %xmm1 movdqa %xmm0, %xmm4 paddd %xmm1, %xmm4 movdqa %xmm4, %xmm5 pslld $9, %xmm4 psrld $23, %xmm5 pxor %xmm4, %xmm2 movdqa %xmm1, %xmm4 pshufd $0x93, %xmm1, %xmm1 pxor %xmm5, %xmm2 paddd %xmm2, %xmm4 movdqa %xmm4, %xmm5 pslld $13, %xmm4 psrld $19, %xmm5 pxor %xmm4, %xmm3 movdqa %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm2 pxor %xmm5, %xmm3 paddd %xmm3, %xmm4 movdqa %xmm4, %xmm5 pslld $18, %xmm4 psrld $14, %xmm5 pxor %xmm4, %xmm0 pshufd $0x39, %xmm3, %xmm3 pxor %xmm5, %xmm0 .endm .macro xmm_salsa8_core xmm_salsa8_core_doubleround xmm_salsa8_core_doubleround xmm_salsa8_core_doubleround xmm_salsa8_core_doubleround .endm .p2align 5 xmm_scrypt_core: # shuffle 1st block into %xmm8-%xmm11 movl 60(%rdi), %edx movl 44(%rdi), %ecx movl 28(%rdi), %ebx movl 12(%rdi), %eax movd %edx, %xmm0 movd %ecx, %xmm1 movd %ebx, %xmm2 movd %eax, %xmm3 movl 40(%rdi), %ecx movl 24(%rdi), %ebx movl 8(%rdi), %eax movl 56(%rdi), %edx pshufd $0x93, %xmm0, %xmm0 pshufd $0x93, %xmm1, %xmm1 pshufd $0x93, %xmm2, %xmm2 pshufd $0x93, %xmm3, %xmm3 movd %ecx, %xmm4 movd %ebx, %xmm5 movd %eax, %xmm6 movd %edx, %xmm7 paddd %xmm4, %xmm0 paddd %xmm5, %xmm1 paddd %xmm6, %xmm2 paddd %xmm7, %xmm3 movl 20(%rdi), %ebx movl 4(%rdi), %eax movl 52(%rdi), %edx movl 36(%rdi), %ecx pshufd $0x93, %xmm0, %xmm0 pshufd $0x93, %xmm1, %xmm1 pshufd $0x93, %xmm2, %xmm2 pshufd $0x93, %xmm3, 
%xmm3 movd %ebx, %xmm4 movd %eax, %xmm5 movd %edx, %xmm6 movd %ecx, %xmm7 paddd %xmm4, %xmm0 paddd %xmm5, %xmm1 paddd %xmm6, %xmm2 paddd %xmm7, %xmm3 movl 0(%rdi), %eax movl 48(%rdi), %edx movl 32(%rdi), %ecx movl 16(%rdi), %ebx pshufd $0x93, %xmm0, %xmm0 pshufd $0x93, %xmm1, %xmm1 pshufd $0x93, %xmm2, %xmm2 pshufd $0x93, %xmm3, %xmm3 movd %eax, %xmm8 movd %edx, %xmm9 movd %ecx, %xmm10 movd %ebx, %xmm11 paddd %xmm0, %xmm8 paddd %xmm1, %xmm9 paddd %xmm2, %xmm10 paddd %xmm3, %xmm11 # shuffle 2nd block into %xmm12-%xmm15 movl 124(%rdi), %edx movl 108(%rdi), %ecx movl 92(%rdi), %ebx movl 76(%rdi), %eax movd %edx, %xmm0 movd %ecx, %xmm1 movd %ebx, %xmm2 movd %eax, %xmm3 movl 104(%rdi), %ecx movl 88(%rdi), %ebx movl 72(%rdi), %eax movl 120(%rdi), %edx pshufd $0x93, %xmm0, %xmm0 pshufd $0x93, %xmm1, %xmm1 pshufd $0x93, %xmm2, %xmm2 pshufd $0x93, %xmm3, %xmm3 movd %ecx, %xmm4 movd %ebx, %xmm5 movd %eax, %xmm6 movd %edx, %xmm7 paddd %xmm4, %xmm0 paddd %xmm5, %xmm1 paddd %xmm6, %xmm2 paddd %xmm7, %xmm3 movl 84(%rdi), %ebx movl 68(%rdi), %eax movl 116(%rdi), %edx movl 100(%rdi), %ecx pshufd $0x93, %xmm0, %xmm0 pshufd $0x93, %xmm1, %xmm1 pshufd $0x93, %xmm2, %xmm2 pshufd $0x93, %xmm3, %xmm3 movd %ebx, %xmm4 movd %eax, %xmm5 movd %edx, %xmm6 movd %ecx, %xmm7 paddd %xmm4, %xmm0 paddd %xmm5, %xmm1 paddd %xmm6, %xmm2 paddd %xmm7, %xmm3 movl 64(%rdi), %eax movl 112(%rdi), %edx movl 96(%rdi), %ecx movl 80(%rdi), %ebx pshufd $0x93, %xmm0, %xmm0 pshufd $0x93, %xmm1, %xmm1 pshufd $0x93, %xmm2, %xmm2 pshufd $0x93, %xmm3, %xmm3 movd %eax, %xmm12 movd %edx, %xmm13 movd %ecx, %xmm14 movd %ebx, %xmm15 paddd %xmm0, %xmm12 paddd %xmm1, %xmm13 paddd %xmm2, %xmm14 paddd %xmm3, %xmm15 movq %rsi, %rdx leaq 131072(%rsi), %rcx xmm_scrypt_core_loop1: movdqa %xmm8, 0(%rdx) movdqa %xmm9, 16(%rdx) movdqa %xmm10, 32(%rdx) movdqa %xmm11, 48(%rdx) movdqa %xmm12, 64(%rdx) movdqa %xmm13, 80(%rdx) movdqa %xmm14, 96(%rdx) movdqa %xmm15, 112(%rdx) pxor %xmm12, %xmm8 pxor %xmm13, %xmm9 pxor %xmm14, %xmm10 pxor %xmm15, %xmm11 movdqa %xmm8, %xmm0 movdqa %xmm9, %xmm1 movdqa %xmm10, %xmm2 movdqa %xmm11, %xmm3 xmm_salsa8_core paddd %xmm0, %xmm8 paddd %xmm1, %xmm9 paddd %xmm2, %xmm10 paddd %xmm3, %xmm11 pxor %xmm8, %xmm12 pxor %xmm9, %xmm13 pxor %xmm10, %xmm14 pxor %xmm11, %xmm15 movdqa %xmm12, %xmm0 movdqa %xmm13, %xmm1 movdqa %xmm14, %xmm2 movdqa %xmm15, %xmm3 xmm_salsa8_core paddd %xmm0, %xmm12 paddd %xmm1, %xmm13 paddd %xmm2, %xmm14 paddd %xmm3, %xmm15 addq $128, %rdx cmpq %rcx, %rdx jne xmm_scrypt_core_loop1 movq $1024, %rcx xmm_scrypt_core_loop2: movd %xmm12, %edx andl $1023, %edx shll $7, %edx movdqa 0(%rsi, %rdx), %xmm0 movdqa 16(%rsi, %rdx), %xmm1 movdqa 32(%rsi, %rdx), %xmm2 movdqa 48(%rsi, %rdx), %xmm3 movdqa 64(%rsi, %rdx), %xmm4 movdqa 80(%rsi, %rdx), %xmm5 movdqa 96(%rsi, %rdx), %xmm6 movdqa 112(%rsi, %rdx), %xmm7 pxor %xmm0, %xmm8 pxor %xmm1, %xmm9 pxor %xmm2, %xmm10 pxor %xmm3, %xmm11 pxor %xmm4, %xmm12 pxor %xmm5, %xmm13 pxor %xmm6, %xmm14 pxor %xmm7, %xmm15 pxor %xmm12, %xmm8 pxor %xmm13, %xmm9 pxor %xmm14, %xmm10 pxor %xmm15, %xmm11 movdqa %xmm8, %xmm0 movdqa %xmm9, %xmm1 movdqa %xmm10, %xmm2 movdqa %xmm11, %xmm3 xmm_salsa8_core paddd %xmm0, %xmm8 paddd %xmm1, %xmm9 paddd %xmm2, %xmm10 paddd %xmm3, %xmm11 pxor %xmm8, %xmm12 pxor %xmm9, %xmm13 pxor %xmm10, %xmm14 pxor %xmm11, %xmm15 movdqa %xmm12, %xmm0 movdqa %xmm13, %xmm1 movdqa %xmm14, %xmm2 movdqa %xmm15, %xmm3 xmm_salsa8_core paddd %xmm0, %xmm12 paddd %xmm1, %xmm13 paddd %xmm2, %xmm14 paddd %xmm3, %xmm15 subq $1, %rcx ja xmm_scrypt_core_loop2 # re-shuffle 1st block 
back movd %xmm8, %eax movd %xmm9, %edx movd %xmm10, %ecx movd %xmm11, %ebx pshufd $0x39, %xmm8, %xmm8 pshufd $0x39, %xmm9, %xmm9 pshufd $0x39, %xmm10, %xmm10 pshufd $0x39, %xmm11, %xmm11 movl %eax, 0(%rdi) movl %edx, 48(%rdi) movl %ecx, 32(%rdi) movl %ebx, 16(%rdi) movd %xmm8, %ebx movd %xmm9, %eax movd %xmm10, %edx movd %xmm11, %ecx pshufd $0x39, %xmm8, %xmm8 pshufd $0x39, %xmm9, %xmm9 pshufd $0x39, %xmm10, %xmm10 pshufd $0x39, %xmm11, %xmm11 movl %ebx, 20(%rdi) movl %eax, 4(%rdi) movl %edx, 52(%rdi) movl %ecx, 36(%rdi) movd %xmm8, %ecx movd %xmm9, %ebx movd %xmm10, %eax movd %xmm11, %edx pshufd $0x39, %xmm8, %xmm8 pshufd $0x39, %xmm9, %xmm9 pshufd $0x39, %xmm10, %xmm10 pshufd $0x39, %xmm11, %xmm11 movl %ecx, 40(%rdi) movl %ebx, 24(%rdi) movl %eax, 8(%rdi) movl %edx, 56(%rdi) movd %xmm8, %edx movd %xmm9, %ecx movd %xmm10, %ebx movd %xmm11, %eax movl %edx, 60(%rdi) movl %ecx, 44(%rdi) movl %ebx, 28(%rdi) movl %eax, 12(%rdi) # re-shuffle 2nd block back movd %xmm12, %eax movd %xmm13, %edx movd %xmm14, %ecx movd %xmm15, %ebx pshufd $0x39, %xmm12, %xmm12 pshufd $0x39, %xmm13, %xmm13 pshufd $0x39, %xmm14, %xmm14 pshufd $0x39, %xmm15, %xmm15 movl %eax, 64(%rdi) movl %edx, 112(%rdi) movl %ecx, 96(%rdi) movl %ebx, 80(%rdi) movd %xmm12, %ebx movd %xmm13, %eax movd %xmm14, %edx movd %xmm15, %ecx pshufd $0x39, %xmm12, %xmm12 pshufd $0x39, %xmm13, %xmm13 pshufd $0x39, %xmm14, %xmm14 pshufd $0x39, %xmm15, %xmm15 movl %ebx, 84(%rdi) movl %eax, 68(%rdi) movl %edx, 116(%rdi) movl %ecx, 100(%rdi) movd %xmm12, %ecx movd %xmm13, %ebx movd %xmm14, %eax movd %xmm15, %edx pshufd $0x39, %xmm12, %xmm12 pshufd $0x39, %xmm13, %xmm13 pshufd $0x39, %xmm14, %xmm14 pshufd $0x39, %xmm15, %xmm15 movl %ecx, 104(%rdi) movl %ebx, 88(%rdi) movl %eax, 72(%rdi) movl %edx, 120(%rdi) movd %xmm12, %edx movd %xmm13, %ecx movd %xmm14, %ebx movd %xmm15, %eax movl %edx, 124(%rdi) movl %ecx, 108(%rdi) movl %ebx, 92(%rdi) movl %eax, 76(%rdi) scrypt_core_cleanup ret .text .p2align 5 .globl scrypt_best_throughput .globl _scrypt_best_throughput scrypt_best_throughput: _scrypt_best_throughput: pushq %rbx xorq %rax, %rax cpuid movl $3, %eax cmpl $0x444d4163, %ecx jne scrypt_best_throughput_exit cmpl $0x69746e65, %edx jne scrypt_best_throughput_exit cmpl $0x68747541, %ebx jne scrypt_best_throughput_exit movl $1, %eax cpuid andl $0x0ff00000, %eax movl $3, %eax jnz scrypt_best_throughput_exit movl $1, %eax scrypt_best_throughput_exit: popq %rbx ret .macro xmm_salsa8_core_2way_doubleround movdqa %xmm1, %xmm4 movdqa %xmm9, %xmm6 paddd %xmm0, %xmm4 paddd %xmm8, %xmm6 movdqa %xmm4, %xmm5 movdqa %xmm6, %xmm7 pslld $7, %xmm4 pslld $7, %xmm6 psrld $25, %xmm5 psrld $25, %xmm7 pxor %xmm4, %xmm3 pxor %xmm6, %xmm11 pxor %xmm5, %xmm3 pxor %xmm7, %xmm11 movdqa %xmm0, %xmm4 movdqa %xmm8, %xmm6 paddd %xmm3, %xmm4 paddd %xmm11, %xmm6 movdqa %xmm4, %xmm5 movdqa %xmm6, %xmm7 pslld $9, %xmm4 pslld $9, %xmm6 psrld $23, %xmm5 psrld $23, %xmm7 pxor %xmm4, %xmm2 pxor %xmm6, %xmm10 movdqa %xmm3, %xmm4 movdqa %xmm11, %xmm6 pshufd $0x93, %xmm3, %xmm3 pshufd $0x93, %xmm11, %xmm11 pxor %xmm5, %xmm2 pxor %xmm7, %xmm10 paddd %xmm2, %xmm4 paddd %xmm10, %xmm6 movdqa %xmm4, %xmm5 movdqa %xmm6, %xmm7 pslld $13, %xmm4 pslld $13, %xmm6 psrld $19, %xmm5 psrld $19, %xmm7 pxor %xmm4, %xmm1 pxor %xmm6, %xmm9 movdqa %xmm2, %xmm4 movdqa %xmm10, %xmm6 pshufd $0x4e, %xmm2, %xmm2 pshufd $0x4e, %xmm10, %xmm10 pxor %xmm5, %xmm1 pxor %xmm7, %xmm9 paddd %xmm1, %xmm4 paddd %xmm9, %xmm6 movdqa %xmm4, %xmm5 movdqa %xmm6, %xmm7 pslld $18, %xmm4 pslld $18, %xmm6 psrld $14, %xmm5 psrld $14, %xmm7 
pxor %xmm4, %xmm0 pxor %xmm6, %xmm8 pshufd $0x39, %xmm1, %xmm1 pshufd $0x39, %xmm9, %xmm9 pxor %xmm5, %xmm0 pxor %xmm7, %xmm8 movdqa %xmm3, %xmm4 movdqa %xmm11, %xmm6 paddd %xmm0, %xmm4 paddd %xmm8, %xmm6 movdqa %xmm4, %xmm5 movdqa %xmm6, %xmm7 pslld $7, %xmm4 pslld $7, %xmm6 psrld $25, %xmm5 psrld $25, %xmm7 pxor %xmm4, %xmm1 pxor %xmm6, %xmm9 pxor %xmm5, %xmm1 pxor %xmm7, %xmm9 movdqa %xmm0, %xmm4 movdqa %xmm8, %xmm6 paddd %xmm1, %xmm4 paddd %xmm9, %xmm6 movdqa %xmm4, %xmm5 movdqa %xmm6, %xmm7 pslld $9, %xmm4 pslld $9, %xmm6 psrld $23, %xmm5 psrld $23, %xmm7 pxor %xmm4, %xmm2 pxor %xmm6, %xmm10 movdqa %xmm1, %xmm4 movdqa %xmm9, %xmm6 pshufd $0x93, %xmm1, %xmm1 pshufd $0x93, %xmm9, %xmm9 pxor %xmm5, %xmm2 pxor %xmm7, %xmm10 paddd %xmm2, %xmm4 paddd %xmm10, %xmm6 movdqa %xmm4, %xmm5 movdqa %xmm6, %xmm7 pslld $13, %xmm4 pslld $13, %xmm6 psrld $19, %xmm5 psrld $19, %xmm7 pxor %xmm4, %xmm3 pxor %xmm6, %xmm11 movdqa %xmm2, %xmm4 movdqa %xmm10, %xmm6 pshufd $0x4e, %xmm2, %xmm2 pshufd $0x4e, %xmm10, %xmm10 pxor %xmm5, %xmm3 pxor %xmm7, %xmm11 paddd %xmm3, %xmm4 paddd %xmm11, %xmm6 movdqa %xmm4, %xmm5 movdqa %xmm6, %xmm7 pslld $18, %xmm4 pslld $18, %xmm6 psrld $14, %xmm5 psrld $14, %xmm7 pxor %xmm4, %xmm0 pxor %xmm6, %xmm8 pshufd $0x39, %xmm3, %xmm3 pshufd $0x39, %xmm11, %xmm11 pxor %xmm5, %xmm0 pxor %xmm7, %xmm8 .endm .macro xmm_salsa8_core_2way xmm_salsa8_core_2way_doubleround xmm_salsa8_core_2way_doubleround xmm_salsa8_core_2way_doubleround xmm_salsa8_core_2way_doubleround .endm .text .p2align 5 .globl scrypt_core_2way .globl _scrypt_core_2way scrypt_core_2way: _scrypt_core_2way: pushq %rbx pushq %rbp #if defined(WIN64) subq $176, %rsp movdqa %xmm6, 8(%rsp) movdqa %xmm7, 24(%rsp) movdqa %xmm8, 40(%rsp) movdqa %xmm9, 56(%rsp) movdqa %xmm10, 72(%rsp) movdqa %xmm11, 88(%rsp) movdqa %xmm12, 104(%rsp) movdqa %xmm13, 120(%rsp) movdqa %xmm14, 136(%rsp) movdqa %xmm15, 152(%rsp) pushq %rdi pushq %rsi movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx #endif subq $264, %rsp scrypt_shuffle %rdi, 0, %rsp, 0 scrypt_shuffle %rdi, 64, %rsp, 64 scrypt_shuffle %rsi, 0, %rsp, 128 scrypt_shuffle %rsi, 64, %rsp, 192 movdqa 192(%rsp), %xmm12 movdqa 208(%rsp), %xmm13 movdqa 224(%rsp), %xmm14 movdqa 240(%rsp), %xmm15 movq %rdx, %rbp leaq 262144(%rdx), %rcx scrypt_core_2way_loop1: movdqa 0(%rsp), %xmm0 movdqa 16(%rsp), %xmm1 movdqa 32(%rsp), %xmm2 movdqa 48(%rsp), %xmm3 movdqa 64(%rsp), %xmm4 movdqa 80(%rsp), %xmm5 movdqa 96(%rsp), %xmm6 movdqa 112(%rsp), %xmm7 movdqa 128(%rsp), %xmm8 movdqa 144(%rsp), %xmm9 movdqa 160(%rsp), %xmm10 movdqa 176(%rsp), %xmm11 pxor %xmm4, %xmm0 pxor %xmm5, %xmm1 pxor %xmm6, %xmm2 pxor %xmm7, %xmm3 movdqa %xmm0, 0(%rbp) movdqa %xmm1, 16(%rbp) movdqa %xmm2, 32(%rbp) movdqa %xmm3, 48(%rbp) movdqa %xmm4, 64(%rbp) movdqa %xmm5, 80(%rbp) movdqa %xmm6, 96(%rbp) movdqa %xmm7, 112(%rbp) pxor %xmm12, %xmm8 pxor %xmm13, %xmm9 pxor %xmm14, %xmm10 pxor %xmm15, %xmm11 movdqa %xmm8, 128(%rbp) movdqa %xmm9, 144(%rbp) movdqa %xmm10, 160(%rbp) movdqa %xmm11, 176(%rbp) movdqa %xmm12, 192(%rbp) movdqa %xmm13, 208(%rbp) movdqa %xmm14, 224(%rbp) movdqa %xmm15, 240(%rbp) xmm_salsa8_core_2way paddd 0(%rbp), %xmm0 paddd 16(%rbp), %xmm1 paddd 32(%rbp), %xmm2 paddd 48(%rbp), %xmm3 paddd 128(%rbp), %xmm8 paddd 144(%rbp), %xmm9 paddd 160(%rbp), %xmm10 paddd 176(%rbp), %xmm11 movdqa %xmm0, 0(%rsp) movdqa %xmm1, 16(%rsp) movdqa %xmm2, 32(%rsp) movdqa %xmm3, 48(%rsp) movdqa %xmm8, 128(%rsp) movdqa %xmm9, 144(%rsp) movdqa %xmm10, 160(%rsp) movdqa %xmm11, 176(%rsp) pxor 64(%rsp), %xmm0 pxor 80(%rsp), %xmm1 pxor 96(%rsp), 
%xmm2 pxor 112(%rsp), %xmm3 pxor %xmm12, %xmm8 pxor %xmm13, %xmm9 pxor %xmm14, %xmm10 pxor %xmm15, %xmm11 movdqa %xmm0, 64(%rsp) movdqa %xmm1, 80(%rsp) movdqa %xmm2, 96(%rsp) movdqa %xmm3, 112(%rsp) movdqa %xmm8, %xmm12 movdqa %xmm9, %xmm13 movdqa %xmm10, %xmm14 movdqa %xmm11, %xmm15 xmm_salsa8_core_2way paddd 64(%rsp), %xmm0 paddd 80(%rsp), %xmm1 paddd 96(%rsp), %xmm2 paddd 112(%rsp), %xmm3 paddd %xmm8, %xmm12 paddd %xmm9, %xmm13 paddd %xmm10, %xmm14 paddd %xmm11, %xmm15 movdqa %xmm0, 64(%rsp) movdqa %xmm1, 80(%rsp) movdqa %xmm2, 96(%rsp) movdqa %xmm3, 112(%rsp) addq $256, %rbp cmpq %rcx, %rbp jne scrypt_core_2way_loop1 movq $1024, %rcx scrypt_core_2way_loop2: movdqa 0(%rsp), %xmm0 movdqa 16(%rsp), %xmm1 movdqa 32(%rsp), %xmm2 movdqa 48(%rsp), %xmm3 movdqa 64(%rsp), %xmm4 movdqa 80(%rsp), %xmm5 movdqa 96(%rsp), %xmm6 movdqa 112(%rsp), %xmm7 movdqa 128(%rsp), %xmm8 movdqa 144(%rsp), %xmm9 movdqa 160(%rsp), %xmm10 movdqa 176(%rsp), %xmm11 movd %xmm4, %ebp andl $1023, %ebp shll $8, %ebp pxor 0(%rdx, %rbp), %xmm0 pxor 16(%rdx, %rbp), %xmm1 pxor 32(%rdx, %rbp), %xmm2 pxor 48(%rdx, %rbp), %xmm3 movd %xmm12, %ebx andl $1023, %ebx shll $8, %ebx addl $128, %ebx pxor 0(%rdx, %rbx), %xmm8 pxor 16(%rdx, %rbx), %xmm9 pxor 32(%rdx, %rbx), %xmm10 pxor 48(%rdx, %rbx), %xmm11 pxor %xmm4, %xmm0 pxor %xmm5, %xmm1 pxor %xmm6, %xmm2 pxor %xmm7, %xmm3 pxor %xmm12, %xmm8 pxor %xmm13, %xmm9 pxor %xmm14, %xmm10 pxor %xmm15, %xmm11 movdqa %xmm0, 0(%rsp) movdqa %xmm1, 16(%rsp) movdqa %xmm2, 32(%rsp) movdqa %xmm3, 48(%rsp) movdqa %xmm8, 128(%rsp) movdqa %xmm9, 144(%rsp) movdqa %xmm10, 160(%rsp) movdqa %xmm11, 176(%rsp) xmm_salsa8_core_2way paddd 0(%rsp), %xmm0 paddd 16(%rsp), %xmm1 paddd 32(%rsp), %xmm2 paddd 48(%rsp), %xmm3 paddd 128(%rsp), %xmm8 paddd 144(%rsp), %xmm9 paddd 160(%rsp), %xmm10 paddd 176(%rsp), %xmm11 movdqa %xmm0, 0(%rsp) movdqa %xmm1, 16(%rsp) movdqa %xmm2, 32(%rsp) movdqa %xmm3, 48(%rsp) movdqa %xmm8, 128(%rsp) movdqa %xmm9, 144(%rsp) movdqa %xmm10, 160(%rsp) movdqa %xmm11, 176(%rsp) pxor 64(%rdx, %rbp), %xmm0 pxor 80(%rdx, %rbp), %xmm1 pxor 96(%rdx, %rbp), %xmm2 pxor 112(%rdx, %rbp), %xmm3 pxor 64(%rdx, %rbx), %xmm8 pxor 80(%rdx, %rbx), %xmm9 pxor 96(%rdx, %rbx), %xmm10 pxor 112(%rdx, %rbx), %xmm11 pxor 64(%rsp), %xmm0 pxor 80(%rsp), %xmm1 pxor 96(%rsp), %xmm2 pxor 112(%rsp), %xmm3 pxor %xmm12, %xmm8 pxor %xmm13, %xmm9 pxor %xmm14, %xmm10 pxor %xmm15, %xmm11 movdqa %xmm0, 64(%rsp) movdqa %xmm1, 80(%rsp) movdqa %xmm2, 96(%rsp) movdqa %xmm3, 112(%rsp) movdqa %xmm8, %xmm12 movdqa %xmm9, %xmm13 movdqa %xmm10, %xmm14 movdqa %xmm11, %xmm15 xmm_salsa8_core_2way paddd 64(%rsp), %xmm0 paddd 80(%rsp), %xmm1 paddd 96(%rsp), %xmm2 paddd 112(%rsp), %xmm3 paddd %xmm8, %xmm12 paddd %xmm9, %xmm13 paddd %xmm10, %xmm14 paddd %xmm11, %xmm15 movdqa %xmm0, 64(%rsp) movdqa %xmm1, 80(%rsp) movdqa %xmm2, 96(%rsp) movdqa %xmm3, 112(%rsp) subq $1, %rcx ja scrypt_core_2way_loop2 movdqa %xmm12, 192(%rsp) movdqa %xmm13, 208(%rsp) movdqa %xmm14, 224(%rsp) movdqa %xmm15, 240(%rsp) scrypt_shuffle %rsp, 0, %rdi, 0 scrypt_shuffle %rsp, 64, %rdi, 64 scrypt_shuffle %rsp, 128, %rsi, 0 scrypt_shuffle %rsp, 192, %rsi, 64 addq $264, %rsp #if defined(WIN64) popq %rsi popq %rdi movdqa 8(%rsp), %xmm6 movdqa 24(%rsp), %xmm7 movdqa 40(%rsp), %xmm8 movdqa 56(%rsp), %xmm9 movdqa 72(%rsp), %xmm10 movdqa 88(%rsp), %xmm11 movdqa 104(%rsp), %xmm12 movdqa 120(%rsp), %xmm13 movdqa 136(%rsp), %xmm14 movdqa 152(%rsp), %xmm15 addq $176, %rsp #endif popq %rbp popq %rbx ret .macro xmm_salsa8_core_3way_doubleround movdqa %xmm1, %xmm4 movdqa %xmm9, %xmm6 
movdqa %xmm13, %xmm7 paddd %xmm0, %xmm4 paddd %xmm8, %xmm6 paddd %xmm12, %xmm7 movdqa %xmm4, %xmm5 pslld $7, %xmm4 psrld $25, %xmm5 pxor %xmm4, %xmm3 pxor %xmm5, %xmm3 movdqa %xmm0, %xmm4 movdqa %xmm6, %xmm5 pslld $7, %xmm6 psrld $25, %xmm5 pxor %xmm6, %xmm11 pxor %xmm5, %xmm11 movdqa %xmm8, %xmm6 movdqa %xmm7, %xmm5 pslld $7, %xmm7 psrld $25, %xmm5 pxor %xmm7, %xmm15 pxor %xmm5, %xmm15 movdqa %xmm12, %xmm7 paddd %xmm3, %xmm4 paddd %xmm11, %xmm6 paddd %xmm15, %xmm7 movdqa %xmm4, %xmm5 pslld $9, %xmm4 psrld $23, %xmm5 pxor %xmm4, %xmm2 movdqa %xmm3, %xmm4 pshufd $0x93, %xmm3, %xmm3 pxor %xmm5, %xmm2 movdqa %xmm6, %xmm5 pslld $9, %xmm6 psrld $23, %xmm5 pxor %xmm6, %xmm10 movdqa %xmm11, %xmm6 pshufd $0x93, %xmm11, %xmm11 pxor %xmm5, %xmm10 movdqa %xmm7, %xmm5 pslld $9, %xmm7 psrld $23, %xmm5 pxor %xmm7, %xmm14 movdqa %xmm15, %xmm7 pshufd $0x93, %xmm15, %xmm15 pxor %xmm5, %xmm14 paddd %xmm2, %xmm4 paddd %xmm10, %xmm6 paddd %xmm14, %xmm7 movdqa %xmm4, %xmm5 pslld $13, %xmm4 psrld $19, %xmm5 pxor %xmm4, %xmm1 movdqa %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm2 pxor %xmm5, %xmm1 movdqa %xmm6, %xmm5 pslld $13, %xmm6 psrld $19, %xmm5 pxor %xmm6, %xmm9 movdqa %xmm10, %xmm6 pshufd $0x4e, %xmm10, %xmm10 pxor %xmm5, %xmm9 movdqa %xmm7, %xmm5 pslld $13, %xmm7 psrld $19, %xmm5 pxor %xmm7, %xmm13 movdqa %xmm14, %xmm7 pshufd $0x4e, %xmm14, %xmm14 pxor %xmm5, %xmm13 paddd %xmm1, %xmm4 paddd %xmm9, %xmm6 paddd %xmm13, %xmm7 movdqa %xmm4, %xmm5 pslld $18, %xmm4 psrld $14, %xmm5 pxor %xmm4, %xmm0 pshufd $0x39, %xmm1, %xmm1 pxor %xmm5, %xmm0 movdqa %xmm3, %xmm4 movdqa %xmm6, %xmm5 pslld $18, %xmm6 psrld $14, %xmm5 pxor %xmm6, %xmm8 pshufd $0x39, %xmm9, %xmm9 pxor %xmm5, %xmm8 movdqa %xmm11, %xmm6 movdqa %xmm7, %xmm5 pslld $18, %xmm7 psrld $14, %xmm5 pxor %xmm7, %xmm12 pshufd $0x39, %xmm13, %xmm13 pxor %xmm5, %xmm12 movdqa %xmm15, %xmm7 paddd %xmm0, %xmm4 paddd %xmm8, %xmm6 paddd %xmm12, %xmm7 movdqa %xmm4, %xmm5 pslld $7, %xmm4 psrld $25, %xmm5 pxor %xmm4, %xmm1 pxor %xmm5, %xmm1 movdqa %xmm0, %xmm4 movdqa %xmm6, %xmm5 pslld $7, %xmm6 psrld $25, %xmm5 pxor %xmm6, %xmm9 pxor %xmm5, %xmm9 movdqa %xmm8, %xmm6 movdqa %xmm7, %xmm5 pslld $7, %xmm7 psrld $25, %xmm5 pxor %xmm7, %xmm13 pxor %xmm5, %xmm13 movdqa %xmm12, %xmm7 paddd %xmm1, %xmm4 paddd %xmm9, %xmm6 paddd %xmm13, %xmm7 movdqa %xmm4, %xmm5 pslld $9, %xmm4 psrld $23, %xmm5 pxor %xmm4, %xmm2 movdqa %xmm1, %xmm4 pshufd $0x93, %xmm1, %xmm1 pxor %xmm5, %xmm2 movdqa %xmm6, %xmm5 pslld $9, %xmm6 psrld $23, %xmm5 pxor %xmm6, %xmm10 movdqa %xmm9, %xmm6 pshufd $0x93, %xmm9, %xmm9 pxor %xmm5, %xmm10 movdqa %xmm7, %xmm5 pslld $9, %xmm7 psrld $23, %xmm5 pxor %xmm7, %xmm14 movdqa %xmm13, %xmm7 pshufd $0x93, %xmm13, %xmm13 pxor %xmm5, %xmm14 paddd %xmm2, %xmm4 paddd %xmm10, %xmm6 paddd %xmm14, %xmm7 movdqa %xmm4, %xmm5 pslld $13, %xmm4 psrld $19, %xmm5 pxor %xmm4, %xmm3 movdqa %xmm2, %xmm4 pshufd $0x4e, %xmm2, %xmm2 pxor %xmm5, %xmm3 movdqa %xmm6, %xmm5 pslld $13, %xmm6 psrld $19, %xmm5 pxor %xmm6, %xmm11 movdqa %xmm10, %xmm6 pshufd $0x4e, %xmm10, %xmm10 pxor %xmm5, %xmm11 movdqa %xmm7, %xmm5 pslld $13, %xmm7 psrld $19, %xmm5 pxor %xmm7, %xmm15 movdqa %xmm14, %xmm7 pshufd $0x4e, %xmm14, %xmm14 pxor %xmm5, %xmm15 paddd %xmm3, %xmm4 paddd %xmm11, %xmm6 paddd %xmm15, %xmm7 movdqa %xmm4, %xmm5 pslld $18, %xmm4 psrld $14, %xmm5 pxor %xmm4, %xmm0 pshufd $0x39, %xmm3, %xmm3 pxor %xmm5, %xmm0 movdqa %xmm6, %xmm5 pslld $18, %xmm6 psrld $14, %xmm5 pxor %xmm6, %xmm8 pshufd $0x39, %xmm11, %xmm11 pxor %xmm5, %xmm8 movdqa %xmm7, %xmm5 pslld $18, %xmm7 psrld $14, %xmm5 pxor %xmm7, %xmm12 
pshufd $0x39, %xmm15, %xmm15 pxor %xmm5, %xmm12 .endm .macro xmm_salsa8_core_3way xmm_salsa8_core_3way_doubleround xmm_salsa8_core_3way_doubleround xmm_salsa8_core_3way_doubleround xmm_salsa8_core_3way_doubleround .endm .text .p2align 5 .globl scrypt_core_3way .globl _scrypt_core_3way scrypt_core_3way: _scrypt_core_3way: pushq %rbx pushq %rbp #if defined(WIN64) subq $176, %rsp movdqa %xmm6, 8(%rsp) movdqa %xmm7, 24(%rsp) movdqa %xmm8, 40(%rsp) movdqa %xmm9, 56(%rsp) movdqa %xmm10, 72(%rsp) movdqa %xmm11, 88(%rsp) movdqa %xmm12, 104(%rsp) movdqa %xmm13, 120(%rsp) movdqa %xmm14, 136(%rsp) movdqa %xmm15, 152(%rsp) pushq %rdi pushq %rsi movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx movq %r9, %rcx #endif subq $392, %rsp scrypt_shuffle %rdi, 0, %rsp, 0 scrypt_shuffle %rdi, 64, %rsp, 64 scrypt_shuffle %rsi, 0, %rsp, 128 scrypt_shuffle %rsi, 64, %rsp, 192 scrypt_shuffle %rdx, 0, %rsp, 256 scrypt_shuffle %rdx, 64, %rsp, 320 movdqa 128+64(%rsp), %xmm8 movdqa 128+80(%rsp), %xmm9 movdqa 128+96(%rsp), %xmm10 movdqa 128+112(%rsp), %xmm11 movq %rcx, %rbp leaq 3*131072(%rcx), %rax scrypt_core_3way_loop1: movdqa %xmm8, %xmm12 movdqa %xmm9, %xmm13 movdqa %xmm10, %xmm14 movdqa %xmm11, %xmm15 movdqa 0(%rsp), %xmm0 movdqa 16(%rsp), %xmm1 movdqa 32(%rsp), %xmm2 movdqa 48(%rsp), %xmm3 movdqa 64(%rsp), %xmm4 movdqa 80(%rsp), %xmm5 movdqa 96(%rsp), %xmm6 movdqa 112(%rsp), %xmm7 movdqa 128+0(%rsp), %xmm8 movdqa 128+16(%rsp), %xmm9 movdqa 128+32(%rsp), %xmm10 movdqa 128+48(%rsp), %xmm11 pxor %xmm4, %xmm0 pxor %xmm5, %xmm1 pxor %xmm6, %xmm2 pxor %xmm7, %xmm3 movdqa %xmm0, 0(%rbp) movdqa %xmm1, 16(%rbp) movdqa %xmm2, 32(%rbp) movdqa %xmm3, 48(%rbp) movdqa %xmm4, 64(%rbp) movdqa %xmm5, 80(%rbp) movdqa %xmm6, 96(%rbp) movdqa %xmm7, 112(%rbp) pxor %xmm12, %xmm8 pxor %xmm13, %xmm9 pxor %xmm14, %xmm10 pxor %xmm15, %xmm11 movdqa %xmm8, 128+0(%rbp) movdqa %xmm9, 128+16(%rbp) movdqa %xmm10, 128+32(%rbp) movdqa %xmm11, 128+48(%rbp) movdqa %xmm12, 128+64(%rbp) movdqa %xmm13, 128+80(%rbp) movdqa %xmm14, 128+96(%rbp) movdqa %xmm15, 128+112(%rbp) movdqa 256+0(%rsp), %xmm12 movdqa 256+16(%rsp), %xmm13 movdqa 256+32(%rsp), %xmm14 movdqa 256+48(%rsp), %xmm15 movdqa 256+64(%rsp), %xmm4 movdqa 256+80(%rsp), %xmm5 movdqa 256+96(%rsp), %xmm6 movdqa 256+112(%rsp), %xmm7 pxor %xmm4, %xmm12 pxor %xmm5, %xmm13 pxor %xmm6, %xmm14 pxor %xmm7, %xmm15 movdqa %xmm12, 256+0(%rbp) movdqa %xmm13, 256+16(%rbp) movdqa %xmm14, 256+32(%rbp) movdqa %xmm15, 256+48(%rbp) movdqa %xmm4, 256+64(%rbp) movdqa %xmm5, 256+80(%rbp) movdqa %xmm6, 256+96(%rbp) movdqa %xmm7, 256+112(%rbp) xmm_salsa8_core_3way paddd 0(%rbp), %xmm0 paddd 16(%rbp), %xmm1 paddd 32(%rbp), %xmm2 paddd 48(%rbp), %xmm3 paddd 128+0(%rbp), %xmm8 paddd 128+16(%rbp), %xmm9 paddd 128+32(%rbp), %xmm10 paddd 128+48(%rbp), %xmm11 paddd 256+0(%rbp), %xmm12 paddd 256+16(%rbp), %xmm13 paddd 256+32(%rbp), %xmm14 paddd 256+48(%rbp), %xmm15 movdqa %xmm0, 0(%rsp) movdqa %xmm1, 16(%rsp) movdqa %xmm2, 32(%rsp) movdqa %xmm3, 48(%rsp) movdqa %xmm8, 128+0(%rsp) movdqa %xmm9, 128+16(%rsp) movdqa %xmm10, 128+32(%rsp) movdqa %xmm11, 128+48(%rsp) movdqa %xmm12, 256+0(%rsp) movdqa %xmm13, 256+16(%rsp) movdqa %xmm14, 256+32(%rsp) movdqa %xmm15, 256+48(%rsp) pxor 64(%rsp), %xmm0 pxor 80(%rsp), %xmm1 pxor 96(%rsp), %xmm2 pxor 112(%rsp), %xmm3 pxor 128+64(%rsp), %xmm8 pxor 128+80(%rsp), %xmm9 pxor 128+96(%rsp), %xmm10 pxor 128+112(%rsp), %xmm11 pxor 256+64(%rsp), %xmm12 pxor 256+80(%rsp), %xmm13 pxor 256+96(%rsp), %xmm14 pxor 256+112(%rsp), %xmm15 movdqa %xmm0, 64(%rsp) movdqa %xmm1, 80(%rsp) movdqa %xmm2, 96(%rsp) 
movdqa %xmm3, 112(%rsp) movdqa %xmm8, 128+64(%rsp) movdqa %xmm9, 128+80(%rsp) movdqa %xmm10, 128+96(%rsp) movdqa %xmm11, 128+112(%rsp) movdqa %xmm12, 256+64(%rsp) movdqa %xmm13, 256+80(%rsp) movdqa %xmm14, 256+96(%rsp) movdqa %xmm15, 256+112(%rsp) xmm_salsa8_core_3way paddd 64(%rsp), %xmm0 paddd 80(%rsp), %xmm1 paddd 96(%rsp), %xmm2 paddd 112(%rsp), %xmm3 paddd 128+64(%rsp), %xmm8 paddd 128+80(%rsp), %xmm9 paddd 128+96(%rsp), %xmm10 paddd 128+112(%rsp), %xmm11 paddd 256+64(%rsp), %xmm12 paddd 256+80(%rsp), %xmm13 paddd 256+96(%rsp), %xmm14 paddd 256+112(%rsp), %xmm15 movdqa %xmm0, 64(%rsp) movdqa %xmm1, 80(%rsp) movdqa %xmm2, 96(%rsp) movdqa %xmm3, 112(%rsp) movdqa %xmm8, 128+64(%rsp) movdqa %xmm9, 128+80(%rsp) movdqa %xmm10, 128+96(%rsp) movdqa %xmm11, 128+112(%rsp) movdqa %xmm12, 256+64(%rsp) movdqa %xmm13, 256+80(%rsp) movdqa %xmm14, 256+96(%rsp) movdqa %xmm15, 256+112(%rsp) addq $3*128, %rbp cmpq %rax, %rbp jne scrypt_core_3way_loop1 movq $1024, %r8 .p2align 4 scrypt_core_3way_loop2: movl 64(%rsp), %ebp andl $1023, %ebp leaq (%rbp, %rbp, 2), %rbp movl 128+64(%rsp), %ebx shll $7, %ebp movl 256+64(%rsp), %eax andl $1023, %ebx leaq (%rbx, %rbx, 2), %rbx shll $7, %ebx shll $7, %eax addl $128, %ebx andl $131071, %eax leaq (%rax, %rax, 2), %rax addl $256, %eax movdqa 0(%rsp), %xmm0 movdqa 16(%rsp), %xmm1 movdqa 32(%rsp), %xmm2 movdqa 48(%rsp), %xmm3 movdqa 128+0(%rsp), %xmm8 movdqa 128+16(%rsp), %xmm9 movdqa 128+32(%rsp), %xmm10 movdqa 128+48(%rsp), %xmm11 movdqa 256+0(%rsp), %xmm12 movdqa 256+16(%rsp), %xmm13 movdqa 256+32(%rsp), %xmm14 movdqa 256+48(%rsp), %xmm15 pxor 0(%rcx, %rbp), %xmm0 pxor 16(%rcx, %rbp), %xmm1 pxor 32(%rcx, %rbp), %xmm2 pxor 48(%rcx, %rbp), %xmm3 pxor 0(%rcx, %rbx), %xmm8 pxor 16(%rcx, %rbx), %xmm9 pxor 32(%rcx, %rbx), %xmm10 pxor 48(%rcx, %rbx), %xmm11 pxor 0(%rcx, %rax), %xmm12 pxor 16(%rcx, %rax), %xmm13 pxor 32(%rcx, %rax), %xmm14 pxor 48(%rcx, %rax), %xmm15 pxor 64(%rsp), %xmm0 pxor 80(%rsp), %xmm1 pxor 96(%rsp), %xmm2 pxor 112(%rsp), %xmm3 pxor 128+64(%rsp), %xmm8 pxor 128+80(%rsp), %xmm9 pxor 128+96(%rsp), %xmm10 pxor 128+112(%rsp), %xmm11 pxor 256+64(%rsp), %xmm12 pxor 256+80(%rsp), %xmm13 pxor 256+96(%rsp), %xmm14 pxor 256+112(%rsp), %xmm15 movdqa %xmm0, 0(%rsp) movdqa %xmm1, 16(%rsp) movdqa %xmm2, 32(%rsp) movdqa %xmm3, 48(%rsp) movdqa %xmm8, 128+0(%rsp) movdqa %xmm9, 128+16(%rsp) movdqa %xmm10, 128+32(%rsp) movdqa %xmm11, 128+48(%rsp) movdqa %xmm12, 256+0(%rsp) movdqa %xmm13, 256+16(%rsp) movdqa %xmm14, 256+32(%rsp) movdqa %xmm15, 256+48(%rsp) xmm_salsa8_core_3way paddd 0(%rsp), %xmm0 paddd 16(%rsp), %xmm1 paddd 32(%rsp), %xmm2 paddd 48(%rsp), %xmm3 paddd 128+0(%rsp), %xmm8 paddd 128+16(%rsp), %xmm9 paddd 128+32(%rsp), %xmm10 paddd 128+48(%rsp), %xmm11 paddd 256+0(%rsp), %xmm12 paddd 256+16(%rsp), %xmm13 paddd 256+32(%rsp), %xmm14 paddd 256+48(%rsp), %xmm15 movdqa %xmm0, 0(%rsp) movdqa %xmm1, 16(%rsp) movdqa %xmm2, 32(%rsp) movdqa %xmm3, 48(%rsp) movdqa %xmm8, 128+0(%rsp) movdqa %xmm9, 128+16(%rsp) movdqa %xmm10, 128+32(%rsp) movdqa %xmm11, 128+48(%rsp) movdqa %xmm12, 256+0(%rsp) movdqa %xmm13, 256+16(%rsp) movdqa %xmm14, 256+32(%rsp) movdqa %xmm15, 256+48(%rsp) pxor 64(%rcx, %rbp), %xmm0 pxor 80(%rcx, %rbp), %xmm1 pxor 96(%rcx, %rbp), %xmm2 pxor 112(%rcx, %rbp), %xmm3 pxor 64(%rcx, %rbx), %xmm8 pxor 80(%rcx, %rbx), %xmm9 pxor 96(%rcx, %rbx), %xmm10 pxor 112(%rcx, %rbx), %xmm11 pxor 64(%rcx, %rax), %xmm12 pxor 80(%rcx, %rax), %xmm13 pxor 96(%rcx, %rax), %xmm14 pxor 112(%rcx, %rax), %xmm15 pxor 64(%rsp), %xmm0 pxor 80(%rsp), %xmm1 pxor 96(%rsp), %xmm2 pxor 
112(%rsp), %xmm3
	pxor 128+64(%rsp), %xmm8
	pxor 128+80(%rsp), %xmm9
	pxor 128+96(%rsp), %xmm10
	pxor 128+112(%rsp), %xmm11
	pxor 256+64(%rsp), %xmm12
	pxor 256+80(%rsp), %xmm13
	pxor 256+96(%rsp), %xmm14
	pxor 256+112(%rsp), %xmm15
	movdqa %xmm0, 64(%rsp)
	movdqa %xmm1, 80(%rsp)
	movdqa %xmm2, 96(%rsp)
	movdqa %xmm3, 112(%rsp)
	movdqa %xmm8, 128+64(%rsp)
	movdqa %xmm9, 128+80(%rsp)
	movdqa %xmm10, 128+96(%rsp)
	movdqa %xmm11, 128+112(%rsp)
	movdqa %xmm12, 256+64(%rsp)
	movdqa %xmm13, 256+80(%rsp)
	movdqa %xmm14, 256+96(%rsp)
	movdqa %xmm15, 256+112(%rsp)
	xmm_salsa8_core_3way
	paddd 64(%rsp), %xmm0
	paddd 80(%rsp), %xmm1
	paddd 96(%rsp), %xmm2
	paddd 112(%rsp), %xmm3
	paddd 128+64(%rsp), %xmm8
	paddd 128+80(%rsp), %xmm9
	paddd 128+96(%rsp), %xmm10
	paddd 128+112(%rsp), %xmm11
	paddd 256+64(%rsp), %xmm12
	paddd 256+80(%rsp), %xmm13
	paddd 256+96(%rsp), %xmm14
	paddd 256+112(%rsp), %xmm15
	movdqa %xmm0, 64(%rsp)
	movdqa %xmm1, 80(%rsp)
	movdqa %xmm2, 96(%rsp)
	movdqa %xmm3, 112(%rsp)
	movdqa %xmm8, 128+64(%rsp)
	movdqa %xmm9, 128+80(%rsp)
	movdqa %xmm10, 128+96(%rsp)
	movdqa %xmm11, 128+112(%rsp)
	movdqa %xmm12, 256+64(%rsp)
	movdqa %xmm13, 256+80(%rsp)
	movdqa %xmm14, 256+96(%rsp)
	movdqa %xmm15, 256+112(%rsp)
	subq $1, %r8
	ja scrypt_core_3way_loop2

	scrypt_shuffle %rsp, 0, %rdi, 0
	scrypt_shuffle %rsp, 64, %rdi, 64
	scrypt_shuffle %rsp, 128, %rsi, 0
	scrypt_shuffle %rsp, 192, %rsi, 64
	scrypt_shuffle %rsp, 256, %rdx, 0
	scrypt_shuffle %rsp, 320, %rdx, 64

	addq $392, %rsp
#if defined(WIN64)
	popq %rsi
	popq %rdi
	movdqa 8(%rsp), %xmm6
	movdqa 24(%rsp), %xmm7
	movdqa 40(%rsp), %xmm8
	movdqa 56(%rsp), %xmm9
	movdqa 72(%rsp), %xmm10
	movdqa 88(%rsp), %xmm11
	movdqa 104(%rsp), %xmm12
	movdqa 120(%rsp), %xmm13
	movdqa 136(%rsp), %xmm14
	movdqa 152(%rsp), %xmm15
	addq $176, %rsp
#endif
	popq %rbp
	popq %rbx
	ret

#endif
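
/*
 * Reference: gen_salsa8_core and the xmm_salsa8_core* macros above compute
 * the eight Salsa20 rounds (four double-rounds) used by scrypt's BlockMix;
 * the final feed-forward addition is performed by their callers with paddd.
 * The scrypt_core routines then run the sequential-memory-hard loop with
 * N = 1024 and r = 1 (a 128 KiB scratchpad per lane, indexed by andl $1023).
 * The scalar C sketch below is an assumed equivalent shown for orientation
 * only; it is not code from this file, and the names salsa20_8 and R are
 * purely illustrative.
 *
 *   #include <stdint.h>
 *
 *   #define R(a, b) (((a) << (b)) | ((a) >> (32 - (b))))
 *
 *   static void salsa20_8(uint32_t B[16])
 *   {
 *       uint32_t x[16];
 *       int i;
 *
 *       for (i = 0; i < 16; i++)
 *           x[i] = B[i];
 *       for (i = 0; i < 8; i += 2) {
 *           // operate on columns
 *           x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 9] ^= R(x[ 5]+x[ 1], 7);
 *           x[14] ^= R(x[10]+x[ 6], 7);  x[ 3] ^= R(x[15]+x[11], 7);
 *           x[ 8] ^= R(x[ 4]+x[ 0], 9);  x[13] ^= R(x[ 9]+x[ 5], 9);
 *           x[ 2] ^= R(x[14]+x[10], 9);  x[ 7] ^= R(x[ 3]+x[15], 9);
 *           x[12] ^= R(x[ 8]+x[ 4],13);  x[ 1] ^= R(x[13]+x[ 9],13);
 *           x[ 6] ^= R(x[ 2]+x[14],13);  x[11] ^= R(x[ 7]+x[ 3],13);
 *           x[ 0] ^= R(x[12]+x[ 8],18);  x[ 5] ^= R(x[ 1]+x[13],18);
 *           x[10] ^= R(x[ 6]+x[ 2],18);  x[15] ^= R(x[11]+x[ 7],18);
 *           // operate on rows
 *           x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 6] ^= R(x[ 5]+x[ 4], 7);
 *           x[11] ^= R(x[10]+x[ 9], 7);  x[12] ^= R(x[15]+x[14], 7);
 *           x[ 2] ^= R(x[ 1]+x[ 0], 9);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
 *           x[ 8] ^= R(x[11]+x[10], 9);  x[13] ^= R(x[12]+x[15], 9);
 *           x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 4] ^= R(x[ 7]+x[ 6],13);
 *           x[ 9] ^= R(x[ 8]+x[11],13);  x[14] ^= R(x[13]+x[12],13);
 *           x[ 0] ^= R(x[ 3]+x[ 2],18);  x[ 5] ^= R(x[ 4]+x[ 7],18);
 *           x[10] ^= R(x[ 9]+x[ 8],18);  x[15] ^= R(x[14]+x[13],18);
 *       }
 *       for (i = 0; i < 16; i++)
 *           B[i] += x[i];
 *   }
 */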