cpuminer/scrypt-arm.S

580 lines
11 KiB
ArmAsm

/*
* Copyright 2012 pooler@litecoinpool.org
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version. See COPYING for more details.
*/
#include "cpuminer-config.h"
#if defined(__arm__) && defined(__APCS_32__)
#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \
defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \
defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \
defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \
defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__)
#define __ARM_ARCH_5E_OR_6__
#endif
#if defined(__ARM_ARCH_5E_OR_6__) || defined(__ARM_ARCH_7__) || \
defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \
defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__)
#define __ARM_ARCH_5E_OR_6_OR_7__
#endif
#ifdef __ARM_ARCH_5E_OR_6__
.macro scrypt_shuffle
add lr, r0, #9*4
ldmia r0, {r2-r7}
ldmia lr, {r2, r8-r12, lr}
str r3, [r0, #5*4]
str r5, [r0, #15*4]
str r6, [r0, #12*4]
str r7, [r0, #1*4]
ldr r5, [r0, #7*4]
str r2, [r0, #13*4]
str r8, [r0, #2*4]
strd r4, [r0, #10*4]
str r9, [r0, #7*4]
str r10, [r0, #4*4]
str r11, [r0, #9*4]
str lr, [r0, #3*4]
add r2, r0, #64+0*4
add lr, r0, #64+9*4
ldmia r2, {r2-r7}
ldmia lr, {r2, r8-r12, lr}
str r3, [r0, #64+5*4]
str r5, [r0, #64+15*4]
str r6, [r0, #64+12*4]
str r7, [r0, #64+1*4]
ldr r5, [r0, #64+7*4]
str r2, [r0, #64+13*4]
str r8, [r0, #64+2*4]
strd r4, [r0, #64+10*4]
str r9, [r0, #64+7*4]
str r10, [r0, #64+4*4]
str r11, [r0, #64+9*4]
str lr, [r0, #64+3*4]
.endm
.macro salsa8_core_doubleround_body
add r6, r2, r6
add r7, r3, r7
eor r10, r10, r6, ror #25
add r6, r0, r4
eor r11, r11, r7, ror #25
add r7, r1, r5
strd r10, [sp, #14*4]
eor r12, r12, r6, ror #25
eor lr, lr, r7, ror #25
ldrd r6, [sp, #10*4]
add r2, r10, r2
add r3, r11, r3
eor r6, r6, r2, ror #23
add r2, r12, r0
eor r7, r7, r3, ror #23
add r3, lr, r1
strd r6, [sp, #10*4]
eor r8, r8, r2, ror #23
eor r9, r9, r3, ror #23
ldrd r2, [sp, #6*4]
add r10, r6, r10
add r11, r7, r11
eor r2, r2, r10, ror #19
add r10, r8, r12
eor r3, r3, r11, ror #19
add r11, r9, lr
eor r4, r4, r10, ror #19
eor r5, r5, r11, ror #19
ldrd r10, [sp, #2*4]
add r6, r2, r6
add r7, r3, r7
eor r10, r10, r6, ror #14
add r6, r4, r8
eor r11, r11, r7, ror #14
add r7, r5, r9
eor r0, r0, r6, ror #14
eor r1, r1, r7, ror #14
ldrd r6, [sp, #14*4]
strd r2, [sp, #6*4]
strd r10, [sp, #2*4]
add r6, r11, r6
add r7, r0, r7
eor r4, r4, r6, ror #25
add r6, r1, r12
eor r5, r5, r7, ror #25
add r7, r10, lr
eor r2, r2, r6, ror #25
eor r3, r3, r7, ror #25
strd r2, [sp, #6*4]
add r10, r3, r10
ldrd r6, [sp, #10*4]
add r11, r4, r11
eor r8, r8, r10, ror #23
add r10, r5, r0
eor r9, r9, r11, ror #23
add r11, r2, r1
eor r6, r6, r10, ror #23
eor r7, r7, r11, ror #23
strd r6, [sp, #10*4]
add r2, r7, r2
ldrd r10, [sp, #14*4]
add r3, r8, r3
eor r12, r12, r2, ror #19
add r2, r9, r4
eor lr, lr, r3, ror #19
add r3, r6, r5
eor r10, r10, r2, ror #19
eor r11, r11, r3, ror #19
ldrd r2, [sp, #2*4]
add r6, r11, r6
add r7, r12, r7
eor r0, r0, r6, ror #14
add r6, lr, r8
eor r1, r1, r7, ror #14
add r7, r10, r9
eor r2, r2, r6, ror #14
eor r3, r3, r7, ror #14
.endm
.macro salsa8_core
ldmia sp, {r0-r12, lr}
ldrd r10, [sp, #14*4]
salsa8_core_doubleround_body
ldrd r6, [sp, #6*4]
strd r2, [sp, #2*4]
strd r10, [sp, #14*4]
salsa8_core_doubleround_body
ldrd r6, [sp, #6*4]
strd r2, [sp, #2*4]
strd r10, [sp, #14*4]
salsa8_core_doubleround_body
ldrd r6, [sp, #6*4]
strd r2, [sp, #2*4]
strd r10, [sp, #14*4]
salsa8_core_doubleround_body
stmia sp, {r0-r5}
strd r8, [sp, #8*4]
str r12, [sp, #12*4]
str lr, [sp, #13*4]
strd r10, [sp, #14*4]
.endm
#else
.macro scrypt_shuffle
.endm
.macro salsa8_core_doubleround_body
ldr r8, [sp, #8*4]
add r11, r11, r10
ldr lr, [sp, #13*4]
add r12, r12, r3
eor r2, r2, r11, ror #23
add r11, r4, r0
eor r7, r7, r12, ror #23
add r12, r9, r5
str r9, [sp, #9*4]
eor r8, r8, r11, ror #23
str r10, [sp, #14*4]
eor lr, lr, r12, ror #23
ldr r11, [sp, #11*4]
add r9, lr, r9
ldr r12, [sp, #12*4]
add r10, r2, r10
eor r1, r1, r9, ror #19
add r9, r7, r3
eor r6, r6, r10, ror #19
add r10, r8, r4
str r8, [sp, #8*4]
eor r11, r11, r9, ror #19
str lr, [sp, #13*4]
eor r12, r12, r10, ror #19
ldr r9, [sp, #10*4]
add r8, r12, r8
ldr r10, [sp, #15*4]
add lr, r1, lr
eor r0, r0, r8, ror #14
add r8, r6, r2
eor r5, r5, lr, ror #14
add lr, r11, r7
eor r9, r9, r8, ror #14
ldr r8, [sp, #9*4]
eor r10, r10, lr, ror #14
ldr lr, [sp, #14*4]
add r8, r9, r8
str r9, [sp, #10*4]
add lr, r10, lr
str r10, [sp, #15*4]
eor r11, r11, r8, ror #25
add r8, r0, r3
eor r12, r12, lr, ror #25
add lr, r5, r4
eor r1, r1, r8, ror #25
ldr r8, [sp, #8*4]
eor r6, r6, lr, ror #25
add r9, r11, r9
ldr lr, [sp, #13*4]
add r10, r12, r10
eor r8, r8, r9, ror #23
add r9, r1, r0
eor lr, lr, r10, ror #23
add r10, r6, r5
str r11, [sp, #11*4]
eor r2, r2, r9, ror #23
str r12, [sp, #12*4]
eor r7, r7, r10, ror #23
ldr r9, [sp, #9*4]
add r11, r8, r11
ldr r10, [sp, #14*4]
add r12, lr, r12
eor r9, r9, r11, ror #19
add r11, r2, r1
eor r10, r10, r12, ror #19
add r12, r7, r6
str r8, [sp, #8*4]
eor r3, r3, r11, ror #19
str lr, [sp, #13*4]
eor r4, r4, r12, ror #19
.endm
.macro salsa8_core
ldmia sp, {r0-r7}
ldr r12, [sp, #15*4]
ldr r8, [sp, #11*4]
ldr lr, [sp, #12*4]
ldr r9, [sp, #9*4]
add r8, r8, r12
ldr r11, [sp, #10*4]
add lr, lr, r0
eor r3, r3, r8, ror #25
add r8, r5, r1
ldr r10, [sp, #14*4]
eor r4, r4, lr, ror #25
add lr, r11, r6
eor r9, r9, r8, ror #25
eor r10, r10, lr, ror #25
salsa8_core_doubleround_body
ldr r11, [sp, #10*4]
add r8, r9, r8
ldr r12, [sp, #15*4]
add lr, r10, lr
eor r11, r11, r8, ror #14
add r8, r3, r2
eor r12, r12, lr, ror #14
add lr, r4, r7
eor r0, r0, r8, ror #14
ldr r8, [sp, #11*4]
eor r5, r5, lr, ror #14
ldr lr, [sp, #12*4]
add r8, r8, r12
str r11, [sp, #10*4]
add lr, lr, r0
str r12, [sp, #15*4]
eor r3, r3, r8, ror #25
add r8, r5, r1
eor r4, r4, lr, ror #25
add lr, r11, r6
str r9, [sp, #9*4]
eor r9, r9, r8, ror #25
str r10, [sp, #14*4]
eor r10, r10, lr, ror #25
salsa8_core_doubleround_body
ldr r11, [sp, #10*4]
add r8, r9, r8
ldr r12, [sp, #15*4]
add lr, r10, lr
eor r11, r11, r8, ror #14
add r8, r3, r2
eor r12, r12, lr, ror #14
add lr, r4, r7
eor r0, r0, r8, ror #14
ldr r8, [sp, #11*4]
eor r5, r5, lr, ror #14
ldr lr, [sp, #12*4]
add r8, r8, r12
str r11, [sp, #10*4]
add lr, lr, r0
str r12, [sp, #15*4]
eor r3, r3, r8, ror #25
add r8, r5, r1
eor r4, r4, lr, ror #25
add lr, r11, r6
str r9, [sp, #9*4]
eor r9, r9, r8, ror #25
str r10, [sp, #14*4]
eor r10, r10, lr, ror #25
salsa8_core_doubleround_body
ldr r11, [sp, #10*4]
add r8, r9, r8
ldr r12, [sp, #15*4]
add lr, r10, lr
eor r11, r11, r8, ror #14
add r8, r3, r2
eor r12, r12, lr, ror #14
add lr, r4, r7
eor r0, r0, r8, ror #14
ldr r8, [sp, #11*4]
eor r5, r5, lr, ror #14
ldr lr, [sp, #12*4]
add r8, r8, r12
str r11, [sp, #10*4]
add lr, lr, r0
str r12, [sp, #15*4]
eor r3, r3, r8, ror #25
add r8, r5, r1
eor r4, r4, lr, ror #25
add lr, r11, r6
str r9, [sp, #9*4]
eor r9, r9, r8, ror #25
str r10, [sp, #14*4]
eor r10, r10, lr, ror #25
salsa8_core_doubleround_body
ldr r11, [sp, #10*4]
add r8, r9, r8
ldr r12, [sp, #15*4]
add lr, r10, lr
str r9, [sp, #9*4]
eor r11, r11, r8, ror #14
eor r12, r12, lr, ror #14
add r8, r3, r2
str r10, [sp, #14*4]
add lr, r4, r7
str r11, [sp, #10*4]
eor r0, r0, r8, ror #14
str r12, [sp, #15*4]
eor r5, r5, lr, ror #14
stmia sp, {r0-r7}
.endm
#endif
.macro scrypt_core_macro1a_x4
ldmia r0, {r4-r7}
ldmia lr!, {r8-r11}
stmia r1!, {r4-r7}
stmia r3!, {r8-r11}
eor r4, r4, r8
eor r5, r5, r9
eor r6, r6, r10
eor r7, r7, r11
stmia r0!, {r4-r7}
stmia r12!, {r4-r7}
.endm
.macro scrypt_core_macro1b_x4
ldmia r3!, {r8-r11}
ldmia r2, {r4-r7}
eor r8, r8, r4
eor r9, r9, r5
eor r10, r10, r6
eor r11, r11, r7
ldmia r0, {r4-r7}
stmia r2!, {r8-r11}
eor r4, r4, r8
eor r5, r5, r9
eor r6, r6, r10
eor r7, r7, r11
ldmia r1!, {r8-r11}
eor r4, r4, r8
eor r5, r5, r9
eor r6, r6, r10
eor r7, r7, r11
stmia r0!, {r4-r7}
stmia r12!, {r4-r7}
.endm
.macro scrypt_core_macro2_x4
ldmia r12, {r4-r7}
ldmia r0, {r8-r11}
add r4, r4, r8
add r5, r5, r9
add r6, r6, r10
add r7, r7, r11
stmia r0!, {r4-r7}
ldmia r2, {r8-r11}
eor r4, r4, r8
eor r5, r5, r9
eor r6, r6, r10
eor r7, r7, r11
stmia r2!, {r4-r7}
stmia r12!, {r4-r7}
.endm
.macro scrypt_core_macro3_x4
ldmia r1!, {r4-r7}
ldmia r0, {r8-r11}
add r4, r4, r8
add r5, r5, r9
add r6, r6, r10
add r7, r7, r11
stmia r0!, {r4-r7}
.endm
.macro scrypt_core_macro3_x6
ldmia r1!, {r2-r7}
ldmia r0, {r8-r12, lr}
add r2, r2, r8
add r3, r3, r9
add r4, r4, r10
add r5, r5, r11
add r6, r6, r12
add r7, r7, lr
stmia r0!, {r2-r7}
.endm
.text
.code 32
.align 2
.globl scrypt_core
.globl _scrypt_core
#ifdef __ELF__
.type scrypt_core, %function
#endif
scrypt_core:
_scrypt_core:
stmfd sp!, {r4-r11, lr}
sub sp, sp, #20*4
scrypt_shuffle
str r0, [sp, #16*4]
add r12, r1, #1024*32*4
str r12, [sp, #18*4]
scrypt_core_loop1:
add lr, r0, #16*4
add r3, r1, #16*4
mov r12, sp
scrypt_core_macro1a_x4
scrypt_core_macro1a_x4
scrypt_core_macro1a_x4
scrypt_core_macro1a_x4
str r1, [sp, #17*4]
salsa8_core
ldr r0, [sp, #16*4]
mov r12, sp
add r2, r0, #16*4
scrypt_core_macro2_x4
scrypt_core_macro2_x4
scrypt_core_macro2_x4
scrypt_core_macro2_x4
salsa8_core
ldr r0, [sp, #16*4]
mov r1, sp
add r0, r0, #16*4
scrypt_core_macro3_x6
scrypt_core_macro3_x6
ldr r3, [sp, #17*4]
ldr r12, [sp, #18*4]
scrypt_core_macro3_x4
add r1, r3, #16*4
sub r0, r0, #32*4
cmp r1, r12
bne scrypt_core_loop1
ldr r4, [r0, #16*4]
sub r1, r1, #1024*32*4
str r1, [sp, #17*4]
mov r4, r4, lsl #32-10
mov r12, #1024
add r1, r1, r4, lsr #32-10-7
scrypt_core_loop2:
add r2, r0, #16*4
add r3, r1, #16*4
str r12, [sp, #18*4]
mov r12, sp
#ifdef __ARM_ARCH_5E_OR_6_OR_7__
pld [r1, #24*4]
pld [r1, #8*4]
#endif
scrypt_core_macro1b_x4
scrypt_core_macro1b_x4
scrypt_core_macro1b_x4
scrypt_core_macro1b_x4
salsa8_core
ldr r0, [sp, #16*4]
mov r12, sp
add r2, r0, #16*4
scrypt_core_macro2_x4
scrypt_core_macro2_x4
scrypt_core_macro2_x4
scrypt_core_macro2_x4
salsa8_core
ldr r0, [sp, #16*4]
mov r1, sp
ldr r3, [sp, #17*4]
add r0, r0, #16*4
scrypt_core_macro3_x4
mov r4, r4, lsl #32-10
add r3, r3, r4, lsr #32-10-7
str r3, [sp, #19*4]
#ifdef __ARM_ARCH_5E_OR_6_OR_7__
pld [r3, #16*4]
pld [r3]
#endif
scrypt_core_macro3_x6
scrypt_core_macro3_x6
ldr r12, [sp, #18*4]
sub r0, r0, #32*4
ldr r1, [sp, #19*4]
subs r12, r12, #1
bne scrypt_core_loop2
scrypt_shuffle
add sp, sp, #20*4
#ifdef __thumb__
ldmfd sp!, {r4-r11, lr}
bx lr
#else
ldmfd sp!, {r4-r11, pc}
#endif
#endif