From 9373a5c43386acf1a3d1a796ac03ccb32d901058 Mon Sep 17 00:00:00 2001 From: pooler Date: Tue, 24 Feb 2015 17:06:20 +0100 Subject: [PATCH] Add optimized PowerPC code --- Makefile.am | 3 + README | 2 + configure.ac | 4 + cpu-miner.c | 6 + miner.h | 2 +- scrypt-ppc.S | 1136 ++++++++++++++++++++++++++++++ scrypt.c | 8 +- sha2-ppc.S | 1919 ++++++++++++++++++++++++++++++++++++++++++++++++++ sha2.c | 4 +- 9 files changed, 3081 insertions(+), 3 deletions(-) create mode 100644 scrypt-ppc.S create mode 100644 sha2-ppc.S diff --git a/Makefile.am b/Makefile.am index ca8f9e5..a23a508 100644 --- a/Makefile.am +++ b/Makefile.am @@ -28,6 +28,9 @@ endif if ARCH_ARM minerd_SOURCES += sha2-arm.S scrypt-arm.S endif +if ARCH_PPC +minerd_SOURCES += sha2-ppc.S scrypt-ppc.S +endif endif minerd_LDFLAGS = $(PTHREAD_FLAGS) minerd_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ diff --git a/README b/README index 2dd93ac..192590f 100644 --- a/README +++ b/README @@ -40,6 +40,8 @@ Architecture-specific notes: but the decision whether to use them is made at compile time, based on compiler-defined macros. To use NEON instructions, add "-mfpu=neon" to CFLAGS. + PowerPC: No runtime CPU detection. + To use AltiVec instructions, add "-maltivec" to CFLAGS. x86: The miner checks for SSE2 instructions support at runtime, and uses them if they are available. x86-64: The miner can take advantage of AVX, AVX2 and XOP instructions, diff --git a/configure.ac b/configure.ac index fa5a6ab..8172247 100644 --- a/configure.ac +++ b/configure.ac @@ -48,6 +48,9 @@ case $target in arm*-*-*) have_arm=true ;; + powerpc*-*-*) + have_ppc=true + ;; esac PTHREAD_FLAGS="-pthread" @@ -108,6 +111,7 @@ AM_CONDITIONAL([USE_ASM], [test x$enable_assembly != xno]) AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue]) AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue]) AM_CONDITIONAL([ARCH_ARM], [test x$have_arm = xtrue]) +AM_CONDITIONAL([ARCH_PPC], [test x$have_ppc = xtrue]) if test x$request_jansson = xtrue then diff --git a/cpu-miner.c b/cpu-miner.c index f848b4e..d43500f 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1456,6 +1456,12 @@ static void show_version_and_exit(void) #if defined(__ARM_NEON__) " NEON" #endif +#endif +#if defined(USE_ASM) && (defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)) + " PowerPC" +#if defined(__ALTIVEC__) + " AltiVec" +#endif #endif "\n"); diff --git a/miner.h b/miner.h index cc5adbd..08e1092 100644 --- a/miner.h +++ b/miner.h @@ -137,7 +137,7 @@ void sha256_transform(uint32_t *state, const uint32_t *block, int swap); void sha256d(unsigned char *hash, const unsigned char *data, int len); #ifdef USE_ASM -#if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__) +#if defined(__ARM_NEON__) || defined(__ALTIVEC__) || defined(__i386__) || defined(__x86_64__) #define HAVE_SHA256_4WAY 1 int sha256_use_4way(); void sha256_init_4way(uint32_t *state); diff --git a/scrypt-ppc.S b/scrypt-ppc.S new file mode 100644 index 0000000..6c4efc9 --- /dev/null +++ b/scrypt-ppc.S @@ -0,0 +1,1136 @@ +/* + * Copyright 2014-2015 pooler@litecoinpool.org + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. 
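+ *
+ * This file implements the scrypt core for 32-bit PowerPC. Two variants
+ * are provided and selected at compile time, matching the README note
+ * above: an AltiVec version (__ALTIVEC__) that keeps the 128-byte scrypt
+ * state X in vector registers v8-v15, and a plain integer version that
+ * runs on any PowerPC CPU.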
+ */ + +#include "cpuminer-config.h" + +#if defined(USE_ASM) && (defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)) + +#ifndef __APPLE__ + +#define r0 0 +#define r1 1 +#define r2 2 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r13 13 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define r22 22 +#define r23 23 +#define r24 24 +#define r25 25 +#define r26 26 +#define r27 27 +#define r28 28 +#define r29 29 +#define r30 30 +#define r31 31 + +#ifdef __ALTIVEC__ +#define v0 0 +#define v1 1 +#define v2 2 +#define v3 3 +#define v4 4 +#define v5 5 +#define v6 6 +#define v7 7 +#define v8 8 +#define v9 9 +#define v10 10 +#define v11 11 +#define v12 12 +#define v13 13 +#define v14 14 +#define v15 15 +#define v16 16 +#define v17 17 +#define v18 18 +#define v19 19 +#define v20 20 +#define v21 21 +#define v22 22 +#define v23 23 +#define v24 24 +#define v25 25 +#define v26 26 +#define v27 27 +#define v28 28 +#define v29 29 +#define v30 30 +#define v31 31 +#endif + +#endif + + +#ifdef __ALTIVEC__ + +#ifdef __APPLE__ + .machine ppc7400 +#endif + +.macro salsa8_core_doubleround + vadduwm v4, v0, v1 + vrlw v4, v4, v16 + vxor v3, v3, v4 + + vadduwm v4, v3, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v3 + vrlw v4, v4, v18 + vsldoi v3, v3, v3, 12 + vxor v1, v1, v4 + + vadduwm v4, v1, v2 + vrlw v4, v4, v19 + vsldoi v1, v1, v1, 4 + vxor v0, v0, v4 + + vadduwm v4, v0, v3 + vrlw v4, v4, v16 + vsldoi v2, v2, v2, 8 + vxor v1, v1, v4 + + vadduwm v4, v1, v0 + vrlw v4, v4, v17 + vxor v2, v2, v4 + + vadduwm v4, v2, v1 + vrlw v4, v4, v18 + vsldoi v1, v1, v1, 12 + vxor v3, v3, v4 + + vadduwm v4, v3, v2 + vrlw v4, v4, v19 + vsldoi v3, v3, v3, 4 + vxor v0, v0, v4 + vsldoi v2, v2, v2, 8 +.endm + +.macro salsa8_core + salsa8_core_doubleround + salsa8_core_doubleround + salsa8_core_doubleround + salsa8_core_doubleround +.endm + + .text + .align 2 + .globl scrypt_core + .globl _scrypt_core +#ifdef __ELF__ + .type scrypt_core, %function +#endif +scrypt_core: +_scrypt_core: + stwu r1, -4*4(r1) + mflr r0 + stw r0, 5*4(r1) + mfspr r0, 256 + stw r0, 2*4(r1) + oris r0, r0, 0xffff + ori r0, r0, 0xf000 + mtspr 256, r0 + + li r6, 1*16 + li r7, 2*16 + li r8, 3*16 + li r9, 4*16 + li r10, 5*16 + li r11, 6*16 + li r12, 7*16 + + lvx v8, 0, r3 + lvx v9, r3, r6 + lvx v10, r3, r7 + lvx v11, r3, r8 + lvx v12, r3, r9 + lvx v13, r3, r10 + lvx v14, r3, r11 + lvx v15, r3, r12 + + vxor v0, v0, v0 + vnor v1, v0, v0 + vsldoi v2, v0, v1, 4 + vsldoi v3, v2, v0, 8 + vor v3, v3, v2 + vsldoi v1, v0, v1, 8 + + vor v4, v8, v8 + vsel v8, v8, v9, v3 + vsel v9, v9, v10, v3 + vsel v10, v10, v11, v3 + vsel v11, v11, v4, v3 + vor v4, v8, v8 + vor v5, v9, v9 + vsel v8, v8, v10, v1 + vsel v9, v11, v9, v1 + vsel v10, v10, v4, v1 + vsel v11, v5, v11, v1 + + vor v4, v12, v12 + vsel v12, v12, v13, v3 + vsel v13, v13, v14, v3 + vsel v14, v14, v15, v3 + vsel v15, v15, v4, v3 + vor v4, v12, v12 + vor v5, v13, v13 + vsel v12, v12, v14, v1 + vsel v13, v15, v13, v1 + vsel v14, v14, v4, v1 + vsel v15, v5, v15, v1 + + vspltisw v16, 7 + vspltisw v17, 9 + vspltisw v18, 13 + vadduwm v19, v17, v17 + + mtctr r5 +scrypt_core_loop1: + vxor v8, v8, v12 + stvx v8, 0, r4 + vxor v9, v9, v13 + stvx v9, r4, r6 + vxor v10, v10, v14 + stvx v10, r4, r7 + vxor v11, v11, v15 + stvx v11, r4, r8 + vor v0, v8, v8 + stvx v12, r4, r9 + vor v1, v9, v9 + stvx v13, r4, r10 + vor v2, v10, v10 
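+	/* Loop 1 of scrypt_core: on each of the N iterations, X is written
+	 * to the scratchpad (r4 walks V) and then mixed with Salsa20/8.
+	 * The vxor/stvx pairs fold the second half of X into the first
+	 * (X[0..15] ^= X[16..31]) as it is stored, and v0-v3 take a copy
+	 * of the result for the salsa8_core below. */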
+ stvx v14, r4, r11 + vor v3, v11, v11 + stvx v15, r4, r12 + + salsa8_core + + vadduwm v8, v8, v0 + vadduwm v9, v9, v1 + vadduwm v10, v10, v2 + vadduwm v11, v11, v3 + + vxor v12, v12, v8 + vxor v13, v13, v9 + vxor v14, v14, v10 + vxor v15, v15, v11 + vor v0, v12, v12 + vor v1, v13, v13 + vor v2, v14, v14 + vor v3, v15, v15 + + salsa8_core + + vadduwm v12, v12, v0 + vadduwm v13, v13, v1 + vadduwm v14, v14, v2 + vadduwm v15, v15, v3 + + addi r4, r4, 32*4 + bdnz scrypt_core_loop1 + + stvx v12, 0, r3 + slwi r6, r5, 7 + subf r4, r6, r4 + mtctr r5 + addi r5, r5, -1 + addi r7, r4, 1*16 + addi r8, r4, 2*16 + addi r9, r4, 3*16 +scrypt_core_loop2: + lwz r6, 0(r3) + and r6, r6, r5 + slwi r6, r6, 7 + lvx v0, r4, r6 + vxor v8, v8, v12 + lvx v1, r7, r6 + vxor v9, v9, v13 + lvx v2, r8, r6 + vxor v10, v10, v14 + lvx v3, r9, r6 + vxor v11, v11, v15 + vxor v0, v0, v8 + vxor v1, v1, v9 + vxor v2, v2, v10 + vxor v3, v3, v11 + addi r6, r6, 64 + vor v8, v0, v0 + vor v9, v1, v1 + lvx v5, r4, r6 + vor v10, v2, v2 + lvx v6, r7, r6 + vor v11, v3, v3 + lvx v7, r8, r6 + + salsa8_core + + vadduwm v8, v8, v0 + lvx v0, r9, r6 + vadduwm v9, v9, v1 + vadduwm v10, v10, v2 + vadduwm v11, v11, v3 + + vxor v12, v12, v5 + vxor v13, v13, v6 + vxor v14, v14, v7 + vxor v15, v15, v0 + vxor v12, v12, v8 + vxor v13, v13, v9 + vxor v14, v14, v10 + vxor v15, v15, v11 + vor v0, v12, v12 + vor v1, v13, v13 + vor v2, v14, v14 + vor v3, v15, v15 + + salsa8_core + + vadduwm v12, v12, v0 + stvx v12, 0, r3 + vadduwm v13, v13, v1 + vadduwm v14, v14, v2 + vadduwm v15, v15, v3 + + bdnz scrypt_core_loop2 + + vxor v0, v0, v0 + vnor v1, v0, v0 + vsldoi v2, v0, v1, 4 + vsldoi v3, v2, v0, 8 + vor v3, v3, v2 + vsldoi v1, v0, v1, 8 + + vor v4, v8, v8 + vsel v8, v8, v9, v3 + vsel v9, v9, v10, v3 + vsel v10, v10, v11, v3 + vsel v11, v11, v4, v3 + vor v4, v8, v8 + vor v5, v9, v9 + vsel v8, v8, v10, v1 + vsel v9, v11, v9, v1 + vsel v10, v10, v4, v1 + vsel v11, v5, v11, v1 + + vor v4, v12, v12 + vsel v12, v12, v13, v3 + vsel v13, v13, v14, v3 + vsel v14, v14, v15, v3 + vsel v15, v15, v4, v3 + vor v4, v12, v12 + vor v5, v13, v13 + vsel v12, v12, v14, v1 + vsel v13, v15, v13, v1 + vsel v14, v14, v4, v1 + vsel v15, v5, v15, v1 + + li r6, 1*16 + li r7, 2*16 + li r8, 3*16 + li r9, 4*16 + + stvx v8, 0, r3 + stvx v9, r3, r6 + stvx v10, r3, r7 + stvx v11, r3, r8 + stvx v12, r3, r9 + stvx v13, r3, r10 + stvx v14, r3, r11 + stvx v15, r3, r12 + + lwz r0, 2*4(r1) + mtspr 256, r0 + lwz r0, 5*4(r1) + mtlr r0 + addi r1, r1, 4*4 + blr + +#else /* __ALTIVEC__ */ + +.macro salsa8_core_doubleround + add r0, r16, r28 + add r5, r21, r17 + add r6, r26, r22 + add r7, r31, r27 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r20, r20, r0 + xor r25, r25, r5 + xor r30, r30, r6 + xor r19, r19, r7 + + add r0, r20, r16 + add r5, r25, r21 + add r6, r30, r26 + add r7, r19, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r24, r24, r0 + xor r29, r29, r5 + xor r18, r18, r6 + xor r23, r23, r7 + + add r0, r24, r20 + add r5, r29, r25 + add r6, r18, r30 + add r7, r23, r19 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r28, r28, r0 + xor r17, r17, r5 + xor r22, r22, r6 + xor r27, r27, r7 + + add r0, r28, r24 + add r5, r17, r29 + add r6, r22, r18 + add r7, r27, r23 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 + + add r0, r16, r19 + add r5, r21, r20 + add r6, r26, r25 + add r7, 
r31, r30 + rotlwi r0, r0, 7 + rotlwi r5, r5, 7 + rotlwi r6, r6, 7 + rotlwi r7, r7, 7 + xor r17, r17, r0 + xor r22, r22, r5 + xor r27, r27, r6 + xor r28, r28, r7 + + add r0, r17, r16 + add r5, r22, r21 + add r6, r27, r26 + add r7, r28, r31 + rotlwi r0, r0, 9 + rotlwi r5, r5, 9 + rotlwi r6, r6, 9 + rotlwi r7, r7, 9 + xor r18, r18, r0 + xor r23, r23, r5 + xor r24, r24, r6 + xor r29, r29, r7 + + add r0, r18, r17 + add r5, r23, r22 + add r6, r24, r27 + add r7, r29, r28 + rotlwi r0, r0, 13 + rotlwi r5, r5, 13 + rotlwi r6, r6, 13 + rotlwi r7, r7, 13 + xor r19, r19, r0 + xor r20, r20, r5 + xor r25, r25, r6 + xor r30, r30, r7 + + add r0, r19, r18 + add r5, r20, r23 + add r6, r25, r24 + add r7, r30, r29 + rotlwi r0, r0, 18 + rotlwi r5, r5, 18 + rotlwi r6, r6, 18 + rotlwi r7, r7, 18 + xor r16, r16, r0 + xor r21, r21, r5 + xor r26, r26, r6 + xor r31, r31, r7 +.endm + +.macro salsa8_core + salsa8_core_doubleround + salsa8_core_doubleround + salsa8_core_doubleround + salsa8_core_doubleround +.endm + + .text + .align 2 + .globl scrypt_core + .globl _scrypt_core +#ifdef __ELF__ + .type scrypt_core, %function +#endif +scrypt_core: +_scrypt_core: + stwu r1, -48*4(r1) + mflr r0 + stw r0, 49*4(r1) + stw r5, 2*4(r1) + stw r13, 3*4(r1) + stw r14, 4*4(r1) + stw r15, 5*4(r1) + stw r16, 6*4(r1) + stw r17, 7*4(r1) + stw r18, 8*4(r1) + stw r19, 9*4(r1) + stw r20, 10*4(r1) + stw r21, 11*4(r1) + stw r22, 12*4(r1) + stw r23, 13*4(r1) + stw r24, 14*4(r1) + stw r25, 15*4(r1) + stw r26, 16*4(r1) + stw r27, 17*4(r1) + stw r28, 18*4(r1) + stw r29, 19*4(r1) + stw r30, 20*4(r1) + stw r31, 21*4(r1) + stw r3, 22*4(r1) + + lwz r16, 0*4(r3) + lwz r17, 1*4(r3) + lwz r18, 2*4(r3) + lwz r19, 3*4(r3) + lwz r20, 4*4(r3) + lwz r21, 5*4(r3) + lwz r22, 6*4(r3) + lwz r23, 7*4(r3) + stw r16, 24*4(r1) + stw r17, 25*4(r1) + stw r18, 26*4(r1) + stw r19, 27*4(r1) + stw r20, 28*4(r1) + stw r21, 29*4(r1) + stw r22, 30*4(r1) + stw r23, 31*4(r1) + lwz r24, 8*4(r3) + lwz r25, 9*4(r3) + lwz r26, 10*4(r3) + lwz r27, 11*4(r3) + lwz r28, 12*4(r3) + lwz r29, 13*4(r3) + lwz r30, 14*4(r3) + lwz r31, 15*4(r3) + stw r24, 32*4(r1) + stw r25, 33*4(r1) + stw r26, 34*4(r1) + stw r27, 35*4(r1) + stw r28, 36*4(r1) + stw r29, 37*4(r1) + stw r30, 38*4(r1) + stw r31, 39*4(r1) + lwz r16, 16*4(r3) + lwz r17, 17*4(r3) + lwz r18, 18*4(r3) + lwz r19, 19*4(r3) + lwz r20, 20*4(r3) + lwz r21, 21*4(r3) + lwz r22, 22*4(r3) + lwz r23, 23*4(r3) + stw r16, 40*4(r1) + stw r17, 41*4(r1) + stw r18, 42*4(r1) + stw r19, 43*4(r1) + stw r20, 44*4(r1) + stw r21, 45*4(r1) + stw r22, 46*4(r1) + stw r23, 47*4(r1) + lwz r8, 24*4(r3) + lwz r9, 25*4(r3) + lwz r10, 26*4(r3) + lwz r11, 27*4(r3) + lwz r12, 28*4(r3) + lwz r13, 29*4(r3) + lwz r14, 30*4(r3) + lwz r15, 31*4(r3) + + mtctr r5 +scrypt_core_loop1: + lwz r16, 24*4(r1) + lwz r17, 25*4(r1) + lwz r18, 26*4(r1) + lwz r19, 27*4(r1) + lwz r20, 28*4(r1) + lwz r21, 29*4(r1) + lwz r22, 30*4(r1) + lwz r23, 31*4(r1) + lwz r24, 32*4(r1) + lwz r25, 33*4(r1) + lwz r26, 34*4(r1) + lwz r27, 35*4(r1) + lwz r28, 36*4(r1) + lwz r29, 37*4(r1) + lwz r30, 38*4(r1) + lwz r31, 39*4(r1) + + lwz r0, 40*4(r1) + lwz r5, 41*4(r1) + lwz r6, 42*4(r1) + lwz r7, 43*4(r1) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, r18, r6 + xor r19, r19, r7 + stw r16, 0*4(r4) + stw r17, 1*4(r4) + stw r18, 2*4(r4) + stw r19, 3*4(r4) + stw r0, 16*4(r4) + stw r5, 17*4(r4) + stw r6, 18*4(r4) + stw r7, 19*4(r4) + + lwz r0, 44*4(r1) + lwz r5, 45*4(r1) + lwz r6, 46*4(r1) + lwz r7, 47*4(r1) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + stw r0, 20*4(r4) + stw 
r5, 21*4(r4) + stw r6, 22*4(r4) + stw r7, 23*4(r4) + stw r20, 4*4(r4) + stw r21, 5*4(r4) + stw r22, 6*4(r4) + stw r23, 7*4(r4) + + xor r24, r24, r8 + xor r25, r25, r9 + xor r26, r26, r10 + xor r27, r27, r11 + xor r28, r28, r12 + xor r29, r29, r13 + xor r30, r30, r14 + xor r31, r31, r15 + stw r24, 8*4(r4) + stw r25, 9*4(r4) + stw r26, 10*4(r4) + stw r27, 11*4(r4) + stw r28, 12*4(r4) + stw r29, 13*4(r4) + stw r30, 14*4(r4) + stw r31, 15*4(r4) + stw r8, 24*4(r4) + stw r9, 25*4(r4) + stw r10, 26*4(r4) + stw r11, 27*4(r4) + stw r12, 28*4(r4) + stw r13, 29*4(r4) + stw r14, 30*4(r4) + stw r15, 31*4(r4) + + salsa8_core + + lwz r0, 0*4(r4) + lwz r5, 1*4(r4) + lwz r6, 2*4(r4) + lwz r7, 3*4(r4) + add r16, r16, r0 + add r17, r17, r5 + add r18, r18, r6 + add r19, r19, r7 + lwz r0, 4*4(r4) + lwz r5, 5*4(r4) + lwz r6, 6*4(r4) + lwz r7, 7*4(r4) + add r20, r20, r0 + add r21, r21, r5 + add r22, r22, r6 + add r23, r23, r7 + lwz r0, 8*4(r4) + lwz r5, 9*4(r4) + lwz r6, 10*4(r4) + lwz r7, 11*4(r4) + add r24, r24, r0 + add r25, r25, r5 + add r26, r26, r6 + add r27, r27, r7 + lwz r0, 12*4(r4) + lwz r5, 13*4(r4) + lwz r6, 14*4(r4) + lwz r7, 15*4(r4) + add r28, r28, r0 + add r29, r29, r5 + add r30, r30, r6 + add r31, r31, r7 + + stw r16, 24*4(r1) + stw r17, 25*4(r1) + stw r18, 26*4(r1) + stw r19, 27*4(r1) + stw r20, 28*4(r1) + stw r21, 29*4(r1) + stw r22, 30*4(r1) + stw r23, 31*4(r1) + stw r24, 32*4(r1) + stw r25, 33*4(r1) + stw r26, 34*4(r1) + stw r27, 35*4(r1) + stw r28, 36*4(r1) + stw r29, 37*4(r1) + stw r30, 38*4(r1) + stw r31, 39*4(r1) + + lwz r0, 40*4(r1) + lwz r5, 41*4(r1) + lwz r6, 42*4(r1) + lwz r7, 43*4(r1) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, r18, r6 + xor r19, r19, r7 + lwz r0, 44*4(r1) + lwz r5, 45*4(r1) + lwz r6, 46*4(r1) + lwz r7, 47*4(r1) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + xor r24, r24, r8 + xor r25, r25, r9 + xor r26, r26, r10 + xor r27, r27, r11 + xor r28, r28, r12 + xor r29, r29, r13 + xor r30, r30, r14 + xor r31, r31, r15 + stw r16, 40*4(r1) + stw r17, 41*4(r1) + stw r18, 42*4(r1) + stw r19, 43*4(r1) + mr r8, r24 + mr r9, r25 + mr r10, r26 + mr r11, r27 + stw r20, 44*4(r1) + stw r21, 45*4(r1) + stw r22, 46*4(r1) + stw r23, 47*4(r1) + mr r12, r28 + mr r13, r29 + mr r14, r30 + mr r15, r31 + + salsa8_core + + lwz r0, 40*4(r1) + lwz r5, 41*4(r1) + lwz r6, 42*4(r1) + lwz r7, 43*4(r1) + add r16, r16, r0 + add r17, r17, r5 + add r18, r18, r6 + add r19, r19, r7 + lwz r0, 44*4(r1) + lwz r5, 45*4(r1) + lwz r6, 46*4(r1) + lwz r7, 47*4(r1) + add r20, r20, r0 + add r21, r21, r5 + add r22, r22, r6 + add r23, r23, r7 + add r8, r8, r24 + add r9, r9, r25 + add r10, r10, r26 + add r11, r11, r27 + stw r16, 40*4(r1) + stw r17, 41*4(r1) + stw r18, 42*4(r1) + stw r19, 43*4(r1) + add r12, r12, r28 + add r13, r13, r29 + add r14, r14, r30 + add r15, r15, r31 + stw r20, 44*4(r1) + stw r21, 45*4(r1) + stw r22, 46*4(r1) + stw r23, 47*4(r1) + + addi r4, r4, 32*4 + bdnz scrypt_core_loop1 + + lwz r5, 2*4(r1) + slwi r3, r5, 7 + subf r4, r3, r4 + mtctr r5 + addi r5, r5, -1 + stw r5, 2*4(r1) +scrypt_core_loop2: + and r3, r16, r5 + slwi r3, r3, 7 + add r3, r3, r4 + mr r0, r16 + mr r5, r17 + mr r6, r18 + mr r7, r19 + lwz r16, 24*4(r1) + lwz r17, 25*4(r1) + lwz r18, 26*4(r1) + lwz r19, 27*4(r1) + lwz r20, 28*4(r1) + lwz r21, 29*4(r1) + lwz r22, 30*4(r1) + lwz r23, 31*4(r1) + lwz r24, 32*4(r1) + lwz r25, 33*4(r1) + lwz r26, 34*4(r1) + lwz r27, 35*4(r1) + lwz r28, 36*4(r1) + lwz r29, 37*4(r1) + lwz r30, 38*4(r1) + lwz r31, 39*4(r1) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, 
r18, r6 + xor r19, r19, r7 + lwz r0, 44*4(r1) + lwz r5, 45*4(r1) + lwz r6, 46*4(r1) + lwz r7, 47*4(r1) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + xor r24, r24, r8 + xor r25, r25, r9 + xor r26, r26, r10 + xor r27, r27, r11 + xor r28, r28, r12 + xor r29, r29, r13 + xor r30, r30, r14 + xor r31, r31, r15 + + lwz r0, 0*4(r3) + lwz r5, 1*4(r3) + lwz r6, 2*4(r3) + lwz r7, 3*4(r3) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, r18, r6 + xor r19, r19, r7 + lwz r0, 4*4(r3) + lwz r5, 5*4(r3) + lwz r6, 6*4(r3) + lwz r7, 7*4(r3) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + lwz r0, 8*4(r3) + lwz r5, 9*4(r3) + lwz r6, 10*4(r3) + lwz r7, 11*4(r3) + xor r24, r24, r0 + xor r25, r25, r5 + xor r26, r26, r6 + xor r27, r27, r7 + lwz r0, 12*4(r3) + lwz r5, 13*4(r3) + lwz r6, 14*4(r3) + lwz r7, 15*4(r3) + xor r28, r28, r0 + xor r29, r29, r5 + xor r30, r30, r6 + xor r31, r31, r7 + + stw r16, 24*4(r1) + stw r17, 25*4(r1) + stw r18, 26*4(r1) + stw r19, 27*4(r1) + stw r20, 28*4(r1) + stw r21, 29*4(r1) + stw r22, 30*4(r1) + stw r23, 31*4(r1) + stw r24, 32*4(r1) + stw r25, 33*4(r1) + stw r26, 34*4(r1) + stw r27, 35*4(r1) + stw r28, 36*4(r1) + stw r29, 37*4(r1) + stw r30, 38*4(r1) + stw r31, 39*4(r1) + + salsa8_core + + lwz r0, 24*4(r1) + lwz r5, 25*4(r1) + lwz r6, 26*4(r1) + lwz r7, 27*4(r1) + add r16, r16, r0 + add r17, r17, r5 + add r18, r18, r6 + add r19, r19, r7 + lwz r0, 28*4(r1) + lwz r5, 29*4(r1) + lwz r6, 30*4(r1) + lwz r7, 31*4(r1) + add r20, r20, r0 + add r21, r21, r5 + add r22, r22, r6 + add r23, r23, r7 + lwz r0, 32*4(r1) + lwz r5, 33*4(r1) + lwz r6, 34*4(r1) + lwz r7, 35*4(r1) + add r24, r24, r0 + add r25, r25, r5 + add r26, r26, r6 + add r27, r27, r7 + lwz r0, 36*4(r1) + lwz r5, 37*4(r1) + lwz r6, 38*4(r1) + lwz r7, 39*4(r1) + add r28, r28, r0 + add r29, r29, r5 + add r30, r30, r6 + add r31, r31, r7 + + stw r16, 24*4(r1) + stw r17, 25*4(r1) + stw r18, 26*4(r1) + stw r19, 27*4(r1) + stw r20, 28*4(r1) + stw r21, 29*4(r1) + stw r22, 30*4(r1) + stw r23, 31*4(r1) + stw r24, 32*4(r1) + stw r25, 33*4(r1) + stw r26, 34*4(r1) + stw r27, 35*4(r1) + stw r28, 36*4(r1) + stw r29, 37*4(r1) + stw r30, 38*4(r1) + stw r31, 39*4(r1) + + lwz r0, 16*4(r3) + lwz r5, 17*4(r3) + lwz r6, 18*4(r3) + lwz r7, 19*4(r3) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, r18, r6 + xor r19, r19, r7 + lwz r0, 20*4(r3) + lwz r5, 21*4(r3) + lwz r6, 22*4(r3) + lwz r7, 23*4(r3) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + lwz r0, 24*4(r3) + lwz r5, 25*4(r3) + lwz r6, 26*4(r3) + lwz r7, 27*4(r3) + xor r24, r24, r0 + xor r25, r25, r5 + xor r26, r26, r6 + xor r27, r27, r7 + lwz r0, 28*4(r3) + lwz r5, 29*4(r3) + lwz r6, 30*4(r3) + lwz r7, 31*4(r3) + xor r28, r28, r0 + xor r29, r29, r5 + xor r30, r30, r6 + xor r31, r31, r7 + + lwz r0, 40*4(r1) + lwz r5, 41*4(r1) + lwz r6, 42*4(r1) + lwz r7, 43*4(r1) + xor r16, r16, r0 + xor r17, r17, r5 + xor r18, r18, r6 + xor r19, r19, r7 + lwz r0, 44*4(r1) + lwz r5, 45*4(r1) + lwz r6, 46*4(r1) + lwz r7, 47*4(r1) + xor r20, r20, r0 + xor r21, r21, r5 + xor r22, r22, r6 + xor r23, r23, r7 + xor r24, r24, r8 + xor r25, r25, r9 + xor r26, r26, r10 + xor r27, r27, r11 + xor r28, r28, r12 + xor r29, r29, r13 + xor r30, r30, r14 + xor r31, r31, r15 + stw r16, 40*4(r1) + stw r17, 41*4(r1) + stw r18, 42*4(r1) + stw r19, 43*4(r1) + mr r8, r24 + mr r9, r25 + mr r10, r26 + mr r11, r27 + stw r20, 44*4(r1) + stw r21, 45*4(r1) + stw r22, 46*4(r1) + stw r23, 47*4(r1) + mr r12, r28 + mr r13, r29 + mr r14, r30 + mr r15, r31 + + 
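/* second Salsa20/8 of the block mix (X[16..31]) */ 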
salsa8_core + + lwz r0, 40*4(r1) + lwz r5, 41*4(r1) + lwz r6, 42*4(r1) + lwz r7, 43*4(r1) + add r16, r16, r0 + add r17, r17, r5 + add r18, r18, r6 + add r19, r19, r7 + lwz r0, 44*4(r1) + lwz r5, 45*4(r1) + lwz r6, 46*4(r1) + lwz r7, 47*4(r1) + add r20, r20, r0 + add r21, r21, r5 + add r22, r22, r6 + add r23, r23, r7 + lwz r5, 2*4(r1) + add r8, r8, r24 + add r9, r9, r25 + add r10, r10, r26 + add r11, r11, r27 + add r12, r12, r28 + add r13, r13, r29 + add r14, r14, r30 + add r15, r15, r31 + stw r16, 40*4(r1) + stw r17, 41*4(r1) + stw r18, 42*4(r1) + stw r19, 43*4(r1) + stw r20, 44*4(r1) + stw r21, 45*4(r1) + stw r22, 46*4(r1) + stw r23, 47*4(r1) + bdnz scrypt_core_loop2 + + lwz r3, 22*4(r1) + + lwz r16, 24*4(r1) + lwz r17, 25*4(r1) + lwz r18, 26*4(r1) + lwz r19, 27*4(r1) + lwz r20, 28*4(r1) + lwz r21, 29*4(r1) + lwz r22, 30*4(r1) + lwz r23, 31*4(r1) + stw r16, 0*4(r3) + stw r17, 1*4(r3) + stw r18, 2*4(r3) + stw r19, 3*4(r3) + stw r20, 4*4(r3) + stw r21, 5*4(r3) + stw r22, 6*4(r3) + stw r23, 7*4(r3) + lwz r24, 32*4(r1) + lwz r25, 33*4(r1) + lwz r26, 34*4(r1) + lwz r27, 35*4(r1) + lwz r28, 36*4(r1) + lwz r29, 37*4(r1) + lwz r30, 38*4(r1) + lwz r31, 39*4(r1) + stw r24, 8*4(r3) + stw r25, 9*4(r3) + stw r26, 10*4(r3) + stw r27, 11*4(r3) + stw r28, 12*4(r3) + stw r29, 13*4(r3) + stw r30, 14*4(r3) + stw r31, 15*4(r3) + lwz r16, 40*4(r1) + lwz r17, 41*4(r1) + lwz r18, 42*4(r1) + lwz r19, 43*4(r1) + lwz r20, 44*4(r1) + lwz r21, 45*4(r1) + lwz r22, 46*4(r1) + lwz r23, 47*4(r1) + stw r16, 16*4(r3) + stw r17, 17*4(r3) + stw r18, 18*4(r3) + stw r19, 19*4(r3) + stw r20, 20*4(r3) + stw r21, 21*4(r3) + stw r22, 22*4(r3) + stw r23, 23*4(r3) + stw r8, 24*4(r3) + stw r9, 25*4(r3) + stw r10, 26*4(r3) + stw r11, 27*4(r3) + stw r12, 28*4(r3) + stw r13, 29*4(r3) + stw r14, 30*4(r3) + stw r15, 31*4(r3) + + lwz r13, 3*4(r1) + lwz r14, 4*4(r1) + lwz r15, 5*4(r1) + lwz r16, 6*4(r1) + lwz r17, 7*4(r1) + lwz r18, 8*4(r1) + lwz r19, 9*4(r1) + lwz r20, 10*4(r1) + lwz r21, 11*4(r1) + lwz r22, 12*4(r1) + lwz r23, 13*4(r1) + lwz r24, 14*4(r1) + lwz r25, 15*4(r1) + lwz r26, 16*4(r1) + lwz r27, 17*4(r1) + lwz r28, 18*4(r1) + lwz r29, 19*4(r1) + lwz r30, 20*4(r1) + lwz r31, 21*4(r1) + lwz r0, 49*4(r1) + mtlr r0 + addi r1, r1, 48*4 + blr + +#endif /* __ALTIVEC__ */ + +#endif diff --git a/scrypt.c b/scrypt.c index f75123e..b8dce87 100644 --- a/scrypt.c +++ b/scrypt.c @@ -409,6 +409,12 @@ void scrypt_core(uint32_t *X, uint32_t *V, int N); void scrypt_core_3way(uint32_t *X, uint32_t *V, int N); #endif +#elif defined(USE_ASM) && (defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)) + +#define SCRYPT_MAX_WAYS 4 +#define scrypt_best_throughput() 1 +void scrypt_core(uint32_t *X, uint32_t *V, int N); + #else static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) @@ -513,7 +519,7 @@ static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output, uint32_t *midstate, unsigned char *scratchpad, int N) { uint32_t tstate[8], ostate[8]; - uint32_t X[32]; + uint32_t X[32] __attribute__((aligned(128))); uint32_t *V; V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); diff --git a/sha2-ppc.S b/sha2-ppc.S new file mode 100644 index 0000000..803fd77 --- /dev/null +++ b/sha2-ppc.S @@ -0,0 +1,1919 @@ +/* + * Copyright 2014-2015 pooler@litecoinpool.org + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later 
version. See COPYING for more details. + */ + +#include "cpuminer-config.h" + +#if defined(USE_ASM) && (defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)) + +#ifdef __APPLE__ + +#define HI(name) ha16(name) +#define LO(name) lo16(name) + +#else + +#define HI(name) name@ha +#define LO(name) name@l + +#define r0 0 +#define r1 1 +#define r2 2 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r13 13 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define r22 22 +#define r23 23 +#define r24 24 +#define r25 25 +#define r26 26 +#define r27 27 +#define r28 28 +#define r29 29 +#define r30 30 +#define r31 31 + +#ifdef __ALTIVEC__ +#define v0 0 +#define v1 1 +#define v2 2 +#define v3 3 +#define v4 4 +#define v5 5 +#define v6 6 +#define v7 7 +#define v8 8 +#define v9 9 +#define v10 10 +#define v11 11 +#define v12 12 +#define v13 13 +#define v14 14 +#define v15 15 +#define v16 16 +#define v17 17 +#define v18 18 +#define v19 19 +#define v20 20 +#define v21 21 +#define v22 22 +#define v23 23 +#define v24 24 +#define v25 25 +#define v26 26 +#define v27 27 +#define v28 28 +#define v29 29 +#define v30 30 +#define v31 31 +#endif + +#endif + + + .data + .align 2 +sha256_h: + .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + + .data + .align 2 +sha256_k: + .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + + +.macro sha256_extend_doubleround i, rw, wo, ra, rb, ry, rz + lwz r14, \wo+(\i+1)*4(\rw) + rotrwi r12, \ry, 17 + rotrwi r13, \ry, 19 + add r11, r11, \ra + xor r12, r12, r13 + srwi r13, \ry, 10 + rotrwi \ra, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + add r12, r12, r11 + xor \ra, \ra, r13 + srwi r13, r14, 3 + lwz r11, \wo+(\i+2)*4(\rw) + xor \ra, \ra, r13 + rotrwi r13, \rz, 19 + add \ra, \ra, r12 + + rotrwi r12, \rz, 17 + add r14, r14, \rb + xor r12, r12, r13 + srwi r13, \rz, 10 + rotrwi \rb, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + stw \ra, \wo+(\i+16)*4(\rw) + xor \rb, \rb, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor \rb, \rb, r13 + add \rb, \rb, r14 + stw \rb, \wo+(\i+17)*4(\rw) +.endm + + +.macro sha256_main_round i, rk, rw, wo, ra, rb, rc, rd, re, rf, rg, rh + lwz r12, \wo+(\i)*4(\rw) + and r13, \rf, \re + andc r14, \rg, \re + lwz r15, (\i)*4(\rk) + or r14, r14, r13 + rotrwi r13, \re, 5 + add \rh, \rh, r14 + xor r14, \re, r13 + rotrwi r13, \re, 19 + add \rh, \rh, r12 + xor r14, r14, r13 + add \rh, \rh, r15 + rotrwi r13, r14, 6 + xor r15, \ra, \rb + add \rh, \rh, r13 + + rotrwi r13, \ra, 11 + and r15, r15, \rc + xor 
r12, \ra, r13 + rotrwi r13, \ra, 20 + and r14, \ra, \rb + xor r12, r12, r13 + xor r14, r14, r15 + rotrwi r13, r12, 2 + add r15, \rh, r14 + add \rh, \rh, \rd + add \rd, r15, r13 +.endm + +.macro sha256_main_quadround i, rk, rw, wo + sha256_main_round \i+0, \rk, \rw, \wo, r4, r5, r6, r7, r8, r9, r10, r11 + sha256_main_round \i+1, \rk, \rw, \wo, r7, r4, r5, r6, r11, r8, r9, r10 + sha256_main_round \i+2, \rk, \rw, \wo, r6, r7, r4, r5, r10, r11, r8, r9 + sha256_main_round \i+3, \rk, \rw, \wo, r5, r6, r7, r4, r9, r10, r11, r8 +.endm + + + .text + .align 2 + .globl sha256_transform + .globl _sha256_transform +#ifdef __ELF__ + .type sha256_transform, %function +#endif +sha256_transform: +_sha256_transform: + stwu r1, -72*4(r1) + cmpwi 0, r5, 0 + stw r13, 2*4(r1) + stw r14, 3*4(r1) + stw r15, 4*4(r1) + stw r16, 5*4(r1) + + bne 0, sha256_transform_swap + + lwz r11, 0*4(r4) + lwz r14, 1*4(r4) + lwz r15, 2*4(r4) + lwz r7, 3*4(r4) + lwz r8, 4*4(r4) + lwz r9, 5*4(r4) + lwz r10, 6*4(r4) + lwz r0, 7*4(r4) + lwz r12, 8*4(r4) + lwz r13, 9*4(r4) + lwz r5, 10*4(r4) + lwz r6, 11*4(r4) + stw r11, 8*4+0*4(r1) + stw r14, 8*4+1*4(r1) + stw r15, 8*4+2*4(r1) + stw r7, 8*4+3*4(r1) + stw r8, 8*4+4*4(r1) + stw r9, 8*4+5*4(r1) + stw r10, 8*4+6*4(r1) + stw r0, 8*4+7*4(r1) + stw r12, 8*4+8*4(r1) + stw r13, 8*4+9*4(r1) + stw r5, 8*4+10*4(r1) + stw r6, 8*4+11*4(r1) + lwz r7, 12*4(r4) + lwz r8, 13*4(r4) + lwz r9, 14*4(r4) + lwz r10, 15*4(r4) + mr r4, r13 + stw r7, 8*4+12*4(r1) + stw r8, 8*4+13*4(r1) + stw r9, 8*4+14*4(r1) + stw r10, 8*4+15*4(r1) + b sha256_transform_extend + +sha256_transform_swap: + li r13, 1*4 + li r14, 2*4 + li r15, 3*4 + lwbrx r11, 0, r4 + lwbrx r7, r4, r13 + lwbrx r8, r4, r14 + lwbrx r9, r4, r15 + addi r4, r4, 4*4 + stw r11, 8*4+0*4(r1) + stw r7, 8*4+1*4(r1) + stw r8, 8*4+2*4(r1) + stw r9, 8*4+3*4(r1) + lwbrx r7, 0, r4 + lwbrx r8, r4, r13 + lwbrx r9, r4, r14 + lwbrx r10, r4, r15 + addi r4, r4, 4*4 + stw r7, 8*4+4*4(r1) + stw r8, 8*4+5*4(r1) + stw r9, 8*4+6*4(r1) + stw r10, 8*4+7*4(r1) + lwbrx r8, 0, r4 + lwbrx r12, r4, r13 + lwbrx r5, r4, r14 + lwbrx r6, r4, r15 + addi r4, r4, 4*4 + stw r8, 8*4+8*4(r1) + stw r12, 8*4+9*4(r1) + stw r5, 8*4+10*4(r1) + stw r6, 8*4+11*4(r1) + lwbrx r7, 0, r4 + lwbrx r8, r4, r13 + lwbrx r9, r4, r14 + lwbrx r10, r4, r15 + mr r4, r12 + stw r7, 8*4+12*4(r1) + stw r8, 8*4+13*4(r1) + stw r9, 8*4+14*4(r1) + stw r10, 8*4+15*4(r1) + +sha256_transform_extend: + sha256_extend_doubleround 0, r1, 8*4, r4, r5, r9, r10 + sha256_extend_doubleround 2, r1, 8*4, r6, r7, r4, r5 + sha256_extend_doubleround 4, r1, 8*4, r8, r9, r6, r7 + sha256_extend_doubleround 6, r1, 8*4, r10, r4, r8, r9 + sha256_extend_doubleround 8, r1, 8*4, r5, r6, r10, r4 + sha256_extend_doubleround 10, r1, 8*4, r7, r8, r5, r6 + sha256_extend_doubleround 12, r1, 8*4, r9, r10, r7, r8 + sha256_extend_doubleround 14, r1, 8*4, r4, r5, r9, r10 + sha256_extend_doubleround 16, r1, 8*4, r6, r7, r4, r5 + sha256_extend_doubleround 18, r1, 8*4, r8, r9, r6, r7 + sha256_extend_doubleround 20, r1, 8*4, r10, r4, r8, r9 + sha256_extend_doubleround 22, r1, 8*4, r5, r6, r10, r4 + sha256_extend_doubleround 24, r1, 8*4, r7, r8, r5, r6 + sha256_extend_doubleround 26, r1, 8*4, r9, r10, r7, r8 + sha256_extend_doubleround 28, r1, 8*4, r4, r5, r9, r10 + sha256_extend_doubleround 30, r1, 8*4, r6, r7, r4, r5 + sha256_extend_doubleround 32, r1, 8*4, r8, r9, r6, r7 + sha256_extend_doubleround 34, r1, 8*4, r10, r4, r8, r9 + sha256_extend_doubleround 36, r1, 8*4, r5, r6, r10, r4 + sha256_extend_doubleround 38, r1, 8*4, r7, r8, r5, r6 + 
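/* schedule words W[56..63] */ 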
sha256_extend_doubleround 40, r1, 8*4, r9, r10, r7, r8 + sha256_extend_doubleround 42, r1, 8*4, r4, r5, r9, r10 + sha256_extend_doubleround 44, r1, 8*4, r6, r7, r4, r5 + sha256_extend_doubleround 46, r1, 8*4, r8, r9, r6, r7 + + lwz r4, 0*4(r3) + lwz r5, 1*4(r3) + lwz r6, 2*4(r3) + lwz r7, 3*4(r3) + lwz r8, 4*4(r3) + lwz r9, 5*4(r3) + lwz r10, 6*4(r3) + lwz r11, 7*4(r3) + lis r16, HI(sha256_k) + addi r16, r16, LO(sha256_k) + sha256_main_quadround 0, r16, r1, 8*4 + sha256_main_quadround 4, r16, r1, 8*4 + sha256_main_quadround 8, r16, r1, 8*4 + sha256_main_quadround 12, r16, r1, 8*4 + sha256_main_quadround 16, r16, r1, 8*4 + sha256_main_quadround 20, r16, r1, 8*4 + sha256_main_quadround 24, r16, r1, 8*4 + sha256_main_quadround 28, r16, r1, 8*4 + sha256_main_quadround 32, r16, r1, 8*4 + sha256_main_quadround 36, r16, r1, 8*4 + sha256_main_quadround 40, r16, r1, 8*4 + sha256_main_quadround 44, r16, r1, 8*4 + sha256_main_quadround 48, r16, r1, 8*4 + sha256_main_quadround 52, r16, r1, 8*4 + sha256_main_quadround 56, r16, r1, 8*4 + sha256_main_quadround 60, r16, r1, 8*4 + + lwz r12, 0*4(r3) + lwz r13, 1*4(r3) + lwz r14, 2*4(r3) + lwz r15, 3*4(r3) + add r4, r4, r12 + add r5, r5, r13 + add r6, r6, r14 + add r7, r7, r15 + stw r4, 0*4(r3) + stw r5, 1*4(r3) + stw r6, 2*4(r3) + stw r7, 3*4(r3) + lwz r12, 4*4(r3) + lwz r13, 5*4(r3) + lwz r14, 6*4(r3) + lwz r15, 7*4(r3) + add r8, r8, r12 + add r9, r9, r13 + add r10, r10, r14 + add r11, r11, r15 + stw r8, 4*4(r3) + stw r9, 5*4(r3) + stw r10, 6*4(r3) + stw r11, 7*4(r3) + + lwz r13, 2*4(r1) + lwz r14, 3*4(r1) + lwz r15, 4*4(r1) + lwz r16, 5*4(r1) + addi r1, r1, 72*4 + blr + + + .text + .align 2 + .globl sha256d_ms + .globl _sha256d_ms +#ifdef __ELF__ + .type sha256d_ms, %function +#endif +sha256d_ms: +_sha256d_ms: + stwu r1, -72*4(r1) + stw r13, 2*4(r1) + stw r14, 3*4(r1) + stw r15, 4*4(r1) + stw r16, 5*4(r1) + stw r17, 6*4(r1) + stw r18, 7*4(r1) + + mr r17, r4 + mr r18, r5 + mr r16, r6 + + lwz r14, 3*4(r17) + lwz r6, 18*4(r17) + lwz r7, 19*4(r17) + + rotrwi r12, r14, 7 + rotrwi r13, r14, 18 + stw r6, 8*4+18*4(r1) + xor r12, r12, r13 + srwi r13, r14, 3 + stw r7, 8*4+19*4(r1) + xor r12, r12, r13 + lwz r8, 20*4(r17) + add r6, r6, r12 + lwz r10, 22*4(r17) + add r7, r7, r14 + stw r6, 18*4(r17) + + rotrwi r12, r6, 17 + rotrwi r13, r6, 19 + stw r7, 19*4(r17) + xor r12, r12, r13 + srwi r13, r6, 10 + stw r8, 8*4+20*4(r1) + xor r12, r12, r13 + lwz r4, 23*4(r17) + add r8, r8, r12 + lwz r5, 24*4(r17) + + rotrwi r9, r7, 17 + rotrwi r13, r7, 19 + stw r8, 20*4(r17) + xor r9, r9, r13 + srwi r13, r7, 10 + stw r10, 8*4+21*4(r1) + xor r9, r9, r13 + stw r4, 8*4+22*4(r1) + + rotrwi r12, r8, 17 + rotrwi r13, r8, 19 + stw r9, 21*4(r17) + xor r12, r12, r13 + srwi r13, r8, 10 + stw r5, 8*4+23*4(r1) + xor r12, r12, r13 + rotrwi r14, r9, 17 + rotrwi r13, r9, 19 + add r10, r10, r12 + lwz r11, 30*4(r17) + + xor r14, r14, r13 + srwi r13, r9, 10 + stw r10, 22*4(r17) + xor r14, r14, r13 + stw r11, 8*4+24*4(r1) + add r4, r4, r14 + + rotrwi r12, r10, 17 + rotrwi r13, r10, 19 + stw r4, 23*4(r17) + xor r12, r12, r13 + srwi r13, r10, 10 + rotrwi r14, r4, 17 + xor r12, r12, r13 + rotrwi r13, r4, 19 + xor r14, r14, r13 + srwi r13, r4, 10 + add r5, r5, r12 + xor r14, r14, r13 + stw r5, 24*4(r17) + add r6, r6, r14 + + rotrwi r12, r5, 17 + rotrwi r13, r5, 19 + stw r6, 25*4(r17) + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r14, r6, 17 + xor r12, r12, r13 + rotrwi r13, r6, 19 + xor r14, r14, r13 + srwi r13, r6, 10 + add r7, r7, r12 + xor r14, r14, r13 + stw r7, 26*4(r17) + add r8, r8, r14 + + 
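/* sigma1(W[26]) goes into W[28] */ 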
rotrwi r12, r7, 17 + rotrwi r13, r7, 19 + stw r8, 27*4(r17) + xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r14, r8, 17 + xor r12, r12, r13 + rotrwi r13, r8, 19 + xor r14, r14, r13 + srwi r13, r8, 10 + add r9, r9, r12 + xor r14, r14, r13 + stw r9, 28*4(r17) + add r10, r10, r14 + + lwz r14, 31*4(r17) + rotrwi r12, r9, 17 + rotrwi r13, r9, 19 + stw r10, 29*4(r17) + xor r12, r12, r13 + srwi r13, r9, 10 + stw r14, 8*4+25*4(r1) + xor r12, r12, r13 + add r11, r11, r12 + add r5, r5, r14 + rotrwi r12, r10, 17 + rotrwi r13, r10, 19 + add r4, r4, r11 + + lwz r11, 16*4(r17) + xor r12, r12, r13 + srwi r13, r10, 10 + stw r4, 30*4(r17) + xor r12, r12, r13 + add r5, r5, r12 + stw r5, 31*4(r17) + + sha256_extend_doubleround 16, r17, 0, r6, r7, r4, r5 + sha256_extend_doubleround 18, r17, 0, r8, r9, r6, r7 + sha256_extend_doubleround 20, r17, 0, r10, r4, r8, r9 + sha256_extend_doubleround 22, r17, 0, r5, r6, r10, r4 + sha256_extend_doubleround 24, r17, 0, r7, r8, r5, r6 + sha256_extend_doubleround 26, r17, 0, r9, r10, r7, r8 + sha256_extend_doubleround 28, r17, 0, r4, r5, r9, r10 + sha256_extend_doubleround 30, r17, 0, r6, r7, r4, r5 + sha256_extend_doubleround 32, r17, 0, r8, r9, r6, r7 + sha256_extend_doubleround 34, r17, 0, r10, r4, r8, r9 + sha256_extend_doubleround 36, r17, 0, r5, r6, r10, r4 + sha256_extend_doubleround 38, r17, 0, r7, r8, r5, r6 + sha256_extend_doubleround 40, r17, 0, r9, r10, r7, r8 + sha256_extend_doubleround 42, r17, 0, r4, r5, r9, r10 + sha256_extend_doubleround 44, r17, 0, r6, r7, r4, r5 + sha256_extend_doubleround 46, r17, 0, r8, r9, r6, r7 + + lwz r4, 0*4(r16) + lwz r9, 1*4(r16) + lwz r10, 2*4(r16) + lwz r11, 3*4(r16) + lwz r8, 4*4(r16) + lwz r5, 5*4(r16) + lwz r6, 6*4(r16) + lwz r7, 7*4(r16) + lis r16, HI(sha256_k) + addi r16, r16, LO(sha256_k) + + sha256_main_round 3, r16, r17, 0, r5, r6, r7, r4, r9, r10, r11, r8 + sha256_main_quadround 4, r16, r17, 0 + sha256_main_quadround 8, r16, r17, 0 + sha256_main_quadround 12, r16, r17, 0 + sha256_main_quadround 16, r16, r17, 0 + sha256_main_quadround 20, r16, r17, 0 + sha256_main_quadround 24, r16, r17, 0 + sha256_main_quadround 28, r16, r17, 0 + sha256_main_quadround 32, r16, r17, 0 + sha256_main_quadround 36, r16, r17, 0 + sha256_main_quadround 40, r16, r17, 0 + sha256_main_quadround 44, r16, r17, 0 + sha256_main_quadround 48, r16, r17, 0 + sha256_main_quadround 52, r16, r17, 0 + sha256_main_quadround 56, r16, r17, 0 + sha256_main_quadround 60, r16, r17, 0 + + lwz r12, 0*4(r18) + lwz r13, 1*4(r18) + lwz r14, 2*4(r18) + lwz r15, 3*4(r18) + add r4, r4, r12 + add r5, r5, r13 + add r6, r6, r14 + add r7, r7, r15 + stw r4, 8*4+0*4(r1) + stw r5, 8*4+1*4(r1) + stw r6, 8*4+2*4(r1) + stw r7, 8*4+3*4(r1) + lwz r12, 4*4(r18) + lwz r13, 5*4(r18) + lwz r14, 6*4(r18) + lwz r15, 7*4(r18) + add r8, r8, r12 + add r9, r9, r13 + add r10, r10, r14 + add r11, r11, r15 + stw r8, 8*4+4*4(r1) + stw r9, 8*4+5*4(r1) + stw r10, 8*4+6*4(r1) + stw r11, 8*4+7*4(r1) + + lwz r4, 8*4+18*4(r1) + lwz r5, 8*4+19*4(r1) + lwz r6, 8*4+20*4(r1) + lwz r7, 8*4+21*4(r1) + lwz r8, 8*4+22*4(r1) + lwz r9, 8*4+23*4(r1) + lwz r10, 8*4+24*4(r1) + lwz r11, 8*4+25*4(r1) + stw r4, 18*4(r17) + stw r5, 19*4(r17) + stw r6, 20*4(r17) + stw r7, 22*4(r17) + stw r8, 23*4(r17) + stw r9, 24*4(r17) + stw r10, 30*4(r17) + stw r11, 31*4(r17) + + lis r8, 0x8000 + li r9, 0 + li r10, 0x0100 + + lwz r14, 8*4+1*4(r1) + lwz r4, 8*4+0*4(r1) + + lwz r11, 8*4+2*4(r1) + rotrwi r12, r14, 7 + rotrwi r13, r14, 18 + + stw r8, 8*4+8*4(r1) + stw r9, 8*4+9*4(r1) + stw r9, 8*4+10*4(r1) + stw r9, 8*4+11*4(r1) + stw 
r9, 8*4+12*4(r1) + stw r9, 8*4+13*4(r1) + stw r9, 8*4+14*4(r1) + stw r10, 8*4+15*4(r1) + + xor r12, r12, r13 + srwi r13, r14, 3 + addis r5, r14, 0x00a0 + xor r12, r12, r13 + rotrwi r14, r11, 7 + rotrwi r13, r11, 18 + add r4, r4, r12 + xor r14, r14, r13 + srwi r13, r11, 3 + stw r4, 8*4+16*4(r1) + xor r14, r14, r13 + rotrwi r12, r4, 17 + rotrwi r13, r4, 19 + add r5, r5, r14 + lwz r14, 8*4+3*4(r1) + + stw r5, 8*4+17*4(r1) + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r6, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + xor r6, r6, r13 + srwi r13, r14, 3 + add r11, r11, r12 + xor r6, r6, r13 + rotrwi r12, r5, 17 + rotrwi r13, r5, 19 + add r6, r6, r11 + lwz r11, 8*4+4*4(r1) + + stw r6, 8*4+18*4(r1) + xor r12, r12, r13 + srwi r13, r5, 10 + rotrwi r7, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + xor r7, r7, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r7, r7, r13 + rotrwi r12, r6, 17 + rotrwi r13, r6, 19 + add r7, r7, r14 + lwz r14, 8*4+5*4(r1) + + stw r7, 8*4+19*4(r1) + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r8, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + xor r8, r8, r13 + srwi r13, r14, 3 + add r11, r11, r12 + xor r8, r8, r13 + rotrwi r12, r7, 17 + rotrwi r13, r7, 19 + add r8, r8, r11 + lwz r11, 8*4+6*4(r1) + + stw r8, 8*4+20*4(r1) + xor r12, r12, r13 + srwi r13, r7, 10 + rotrwi r9, r11, 7 + xor r12, r12, r13 + rotrwi r13, r11, 18 + xor r9, r9, r13 + srwi r13, r11, 3 + add r14, r14, r12 + xor r9, r9, r13 + rotrwi r12, r8, 17 + rotrwi r13, r8, 19 + add r9, r9, r14 + lwz r14, 8*4+7*4(r1) + + stw r9, 8*4+21*4(r1) + xor r12, r12, r13 + srwi r13, r8, 10 + rotrwi r10, r14, 7 + xor r12, r12, r13 + rotrwi r13, r14, 18 + xor r10, r10, r13 + srwi r13, r14, 3 + add r11, r11, r12 + xor r10, r10, r13 + rotrwi r12, r9, 17 + rotrwi r13, r9, 19 + addi r11, r11, 0x0100 + add r14, r14, r4 + add r10, r10, r11 + + xor r12, r12, r13 + srwi r13, r9, 10 + stw r10, 8*4+22*4(r1) + addis r14, r14, 0x1100 + xor r12, r12, r13 + add r14, r14, r12 + rotrwi r12, r10, 17 + rotrwi r13, r10, 19 + addi r4, r14, 0x2000 + xor r12, r12, r13 + srwi r13, r10, 10 + stw r4, 8*4+23*4(r1) + addis r5, r5, 0x8000 + xor r12, r12, r13 + add r5, r5, r12 + + rotrwi r12, r4, 17 + rotrwi r13, r4, 19 + stw r5, 8*4+24*4(r1) + xor r12, r12, r13 + srwi r13, r4, 10 + rotrwi r11, r5, 17 + xor r12, r12, r13 + rotrwi r13, r5, 19 + xor r11, r11, r13 + srwi r13, r5, 10 + add r6, r6, r12 + xor r11, r11, r13 + stw r6, 8*4+25*4(r1) + add r7, r7, r11 + + rotrwi r12, r6, 17 + rotrwi r13, r6, 19 + stw r7, 8*4+26*4(r1) + xor r12, r12, r13 + srwi r13, r6, 10 + rotrwi r11, r7, 17 + xor r12, r12, r13 + rotrwi r13, r7, 19 + xor r11, r11, r13 + srwi r13, r7, 10 + add r8, r8, r12 + xor r11, r11, r13 + stw r8, 8*4+27*4(r1) + add r9, r9, r11 + + rotrwi r14, r8, 17 + rotrwi r13, r8, 19 + rotrwi r12, r9, 17 + stw r9, 8*4+28*4(r1) + addis r4, r4, 0x0040 + xor r14, r14, r13 + rotrwi r13, r9, 19 + xor r12, r12, r13 + srwi r13, r8, 10 + xor r14, r14, r13 + srwi r13, r9, 10 + xor r12, r12, r13 + addi r4, r4, 0x0022 + add r10, r10, r14 + add r4, r4, r12 + lwz r11, 8*4+16*4(r1) + + addi r5, r5, 0x0100 + stw r4, 8*4+30*4(r1) + rotrwi r14, r11, 7 + stw r10, 8*4+29*4(r1) + rotrwi r13, r11, 18 + rotrwi r12, r10, 17 + xor r14, r14, r13 + rotrwi r13, r10, 19 + xor r12, r12, r13 + srwi r13, r11, 3 + xor r14, r14, r13 + srwi r13, r10, 10 + xor r12, r12, r13 + add r5, r5, r14 + add r5, r5, r12 + stw r5, 8*4+31*4(r1) + + sha256_extend_doubleround 16, r1, 8*4, r6, r7, r4, r5 + sha256_extend_doubleround 18, r1, 8*4, r8, r9, r6, r7 + sha256_extend_doubleround 20, 
r1, 8*4, r10, r4, r8, r9 + sha256_extend_doubleround 22, r1, 8*4, r5, r6, r10, r4 + sha256_extend_doubleround 24, r1, 8*4, r7, r8, r5, r6 + sha256_extend_doubleround 26, r1, 8*4, r9, r10, r7, r8 + sha256_extend_doubleround 28, r1, 8*4, r4, r5, r9, r10 + sha256_extend_doubleround 30, r1, 8*4, r6, r7, r4, r5 + sha256_extend_doubleround 32, r1, 8*4, r8, r9, r6, r7 + sha256_extend_doubleround 34, r1, 8*4, r10, r4, r8, r9 + sha256_extend_doubleround 36, r1, 8*4, r5, r6, r10, r4 + sha256_extend_doubleround 38, r1, 8*4, r7, r8, r5, r6 + sha256_extend_doubleround 40, r1, 8*4, r9, r10, r7, r8 + sha256_extend_doubleround 42, r1, 8*4, r4, r5, r9, r10 + + lis r18, HI(sha256_h) + addi r18, r18, LO(sha256_h) + + lwz r14, 8*4+(44+1)*4(r1) + rotrwi r12, r4, 17 + rotrwi r13, r4, 19 + add r15, r11, r6 + rotrwi r6, r14, 7 + rotrwi r11, r14, 18 + xor r12, r12, r13 + xor r6, r6, r11 + + lwz r8, 4*4(r18) + lwz r9, 5*4(r18) + lwz r10, 6*4(r18) + lwz r11, 7*4(r18) + + srwi r13, r4, 10 + srwi r14, r14, 3 + xor r12, r12, r13 + xor r6, r6, r14 + add r12, r12, r15 + add r6, r6, r12 + stw r6, 8*4+(44+16)*4(r1) + + lwz r4, 0*4(r18) + lwz r5, 1*4(r18) + lwz r6, 2*4(r18) + lwz r7, 3*4(r18) + + sha256_main_quadround 0, r16, r1, 8*4 + sha256_main_quadround 4, r16, r1, 8*4 + sha256_main_quadround 8, r16, r1, 8*4 + sha256_main_quadround 12, r16, r1, 8*4 + sha256_main_quadround 16, r16, r1, 8*4 + sha256_main_quadround 20, r16, r1, 8*4 + sha256_main_quadround 24, r16, r1, 8*4 + sha256_main_quadround 28, r16, r1, 8*4 + sha256_main_quadround 32, r16, r1, 8*4 + sha256_main_quadround 36, r16, r1, 8*4 + sha256_main_quadround 40, r16, r1, 8*4 + sha256_main_quadround 44, r16, r1, 8*4 + sha256_main_quadround 48, r16, r1, 8*4 + sha256_main_quadround 52, r16, r1, 8*4 + sha256_main_round 56, r16, r1, 8*4, r4, r5, r6, r7, r8, r9, r10, r11 + +.macro sha256_main_round_red i, rk, rw, wo, rd, re, rf, rg, rh + lwz r12, \wo+(\i)*4(\rw) + and r15, \rf, \re + andc r14, \rg, \re + add \rh, \rh, \rd + or r14, r14, r15 + lwz r15, (\i)*4(\rk) + rotrwi r13, \re, 5 + add \rh, \rh, r14 + xor r14, \re, r13 + rotrwi r13, \re, 19 + add \rh, \rh, r12 + xor r14, r14, r13 + add \rh, \rh, r15 + rotrwi r13, r14, 6 + add \rh, \rh, r13 +.endm + + sha256_main_round_red 57, r16, r1, 8*4, r6, r11, r8, r9, r10 + sha256_main_round_red 58, r16, r1, 8*4, r5, r10, r11, r8, r9 + sha256_main_round_red 59, r16, r1, 8*4, r4, r9, r10, r11, r8 + lwz r5, 7*4(r18) + sha256_main_round_red 60, r16, r1, 8*4, r7, r8, r9, r10, r11 + + add r11, r11, r5 + stw r11, 7*4(r3) + + lwz r13, 2*4(r1) + lwz r14, 3*4(r1) + lwz r15, 4*4(r1) + lwz r16, 5*4(r1) + lwz r17, 6*4(r1) + lwz r18, 7*4(r1) + addi r1, r1, 72*4 + blr + + +#ifdef __ALTIVEC__ + +#ifdef __APPLE__ + .machine ppc7400 +#endif + + .data + .align 4 +sha256_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + + .data + .align 4 +sha256_4k: + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 
0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + + .data + .align 4 +sha256d_4preext2: + .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 + .long 0x11002000, 0x11002000, 0x11002000, 0x11002000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x00400022, 0x00400022, 0x00400022, 0x00400022 + + .data + .align 4 +br_perm: + .long 0x03020100, 0x07060504, 0x0b0a0908, 
0x0f0e0d0c + + +.macro sha256_4way_extend_setup + vspltisw v0, 10 + vspltisw v1, -7 + vspltisw v16, 3 + vspltisw v17, 15 + vspltisw v18, 14 + vspltisw v19, 13 +.endm + +.macro sha256_4way_extend_doubleround i, rw, va, vb, vy, vz + lvx v14, \rw, r7 + vrlw v12, \vy, v17 + vrlw v13, \vy, v19 + vadduwm v11, v11, \va + vxor v12, v12, v13 + vsrw v13, \vy, v0 + vrlw \va, v14, v1 + vxor v12, v12, v13 + vrlw v13, v14, v18 + vadduwm v12, v12, v11 + vxor \va, \va, v13 + vsrw v13, v14, v16 + lvx v11, \rw, r8 + vxor \va, \va, v13 + vrlw v13, \vz, v19 + vadduwm \va, \va, v12 + + vrlw v12, \vz, v17 + vadduwm v14, v14, \vb + vxor v12, v12, v13 + vsrw v13, \vz, v0 + vrlw \vb, v11, v1 + vxor v12, v12, v13 + vrlw v13, v11, v18 + stvx \va, \rw, r10 + vxor \vb, \vb, v13 + vsrw v13, v11, v16 + vadduwm v14, v14, v12 + vxor \vb, \vb, v13 + vadduwm \vb, \vb, v14 + stvx \vb, \rw, r11 + addi \rw, \rw, 2*16 +.endm + + +.macro sha256_4way_main_setup + vspltisw v2, 12 + vspltisw v3, -5 + vspltisw v16, -6 + vspltisw v17, -11 + vspltisw v18, -2 +.endm + +.macro sha256_4way_main_round i, rk, rw, va, vb, vc, vd, ve, vf, vg, vh + li r6, (\i)*16 + lvx v12, \rw, r6 + vand v13, \vf, \ve + vandc v14, \vg, \ve + lvx v15, \rk, r6 + vor v14, v14, v13 + vrlw v13, \ve, v3 + vadduwm \vh, \vh, v14 + vxor v14, \ve, v13 + vrlw v13, \ve, v19 + vadduwm \vh, \vh, v12 + vxor v14, v14, v13 + vadduwm \vh, \vh, v15 + vrlw v13, v14, v16 + vxor v15, \va, \vb + vadduwm \vh, \vh, v13 + + vrlw v13, \va, v17 + vand v15, v15, \vc + vxor v12, \va, v13 + vrlw v13, \va, v2 + vand v14, \va, \vb + vxor v12, v12, v13 + vxor v14, v14, v15 + vrlw v13, v12, v18 + vadduwm v15, \vh, v14 + vadduwm \vh, \vh, \vd + vadduwm \vd, v15, v13 +.endm + +.macro sha256_4way_main_quadround i, rk, rw + sha256_4way_main_round \i+0, \rk, \rw, v4, v5, v6, v7, v8, v9, v10, v11 + sha256_4way_main_round \i+1, \rk, \rw, v7, v4, v5, v6, v11, v8, v9, v10 + sha256_4way_main_round \i+2, \rk, \rw, v6, v7, v4, v5, v10, v11, v8, v9 + sha256_4way_main_round \i+3, \rk, \rw, v5, v6, v7, v4, v9, v10, v11, v8 +.endm + + + .text + .align 2 + .globl sha256_init_4way + .globl _sha256_init_4way +#ifdef __ELF__ + .type sha256_init_4way, %function +#endif +sha256_init_4way: +_sha256_init_4way: + mfspr r0, 256 + oris r12, r0, 0xff00 + mtspr 256, r12 + + lis r4, HI(sha256_4h) + addi r4, r4, LO(sha256_4h) + li r5, 1*16 + li r6, 2*16 + li r7, 3*16 + li r8, 4*16 + li r9, 5*16 + li r10, 6*16 + li r11, 7*16 + lvx v0, 0, r4 + lvx v1, r4, r5 + lvx v2, r4, r6 + lvx v3, r4, r7 + lvx v4, r4, r8 + lvx v5, r4, r9 + lvx v6, r4, r10 + lvx v7, r4, r11 + stvx v0, 0, r3 + stvx v1, r3, r5 + stvx v2, r3, r6 + stvx v3, r3, r7 + stvx v4, r3, r8 + stvx v5, r3, r9 + stvx v6, r3, r10 + stvx v7, r3, r11 + + mtspr 256, r0 + blr + + + .text + .align 2 + .globl sha256_transform_4way + .globl _sha256_transform_4way +#ifdef __ELF__ + .type sha256_transform_4way, %function +#endif +sha256_transform_4way: +_sha256_transform_4way: + mfspr r0, 256 + oris r12, r0, 0xffff + ori r12, r12, 0xf000 + mtspr 256, r12 + + andi. 
r6, r1, 15 + cmpwi 0, r5, 0 + li r7, -(4*4+64*16) + subf r6, r6, r7 + stwux r1, r1, r6 + + li r7, 1*16 + li r8, 2*16 + li r9, 3*16 + li r10, 4*16 + li r11, 5*16 + li r12, 6*16 + li r6, 7*16 + + bne 0, sha256_transform_4way_swap + + lvx v11, 0, r4 + lvx v1, r4, r7 + lvx v2, r4, r8 + lvx v3, r4, r9 + lvx v4, r4, r10 + lvx v5, r4, r11 + lvx v6, r4, r12 + lvx v7, r4, r6 + addi r5, r1, 4*4 + stvx v11, 0, r5 + stvx v1, r5, r7 + stvx v2, r5, r8 + stvx v3, r5, r9 + stvx v4, r5, r10 + stvx v5, r5, r11 + stvx v6, r5, r12 + stvx v7, r5, r6 + addi r4, r4, 8*16 + lvx v0, 0, r4 + lvx v4, r4, r7 + lvx v5, r4, r8 + lvx v6, r4, r9 + lvx v7, r4, r10 + lvx v8, r4, r11 + lvx v9, r4, r12 + lvx v10, r4, r6 + addi r4, r1, 4*4+8*16 + stvx v0, 0, r4 + stvx v4, r4, r7 + stvx v5, r4, r8 + stvx v6, r4, r9 + stvx v7, r4, r10 + stvx v8, r4, r11 + stvx v9, r4, r12 + stvx v10, r4, r6 + b sha256_transform_4way_extend + +sha256_transform_4way_swap: + lis r5, HI(br_perm) + addi r5, r5, LO(br_perm) + lvx v19, 0, r5 + + lvx v11, 0, r4 + lvx v1, r4, r7 + lvx v2, r4, r8 + lvx v3, r4, r9 + lvx v4, r4, r10 + lvx v5, r4, r11 + lvx v6, r4, r12 + lvx v7, r4, r6 + vperm v11, v11, v11, v19 + vperm v1, v1, v1, v19 + vperm v2, v2, v2, v19 + vperm v3, v3, v3, v19 + vperm v4, v4, v4, v19 + vperm v5, v5, v5, v19 + vperm v6, v6, v6, v19 + vperm v7, v7, v7, v19 + addi r5, r1, 4*4 + stvx v11, 0, r5 + stvx v1, r5, r7 + stvx v2, r5, r8 + stvx v3, r5, r9 + stvx v4, r5, r10 + stvx v5, r5, r11 + stvx v6, r5, r12 + stvx v7, r5, r6 + addi r4, r4, 8*16 + lvx v0, 0, r4 + lvx v4, r4, r7 + lvx v5, r4, r8 + lvx v6, r4, r9 + lvx v7, r4, r10 + lvx v8, r4, r11 + lvx v9, r4, r12 + lvx v10, r4, r6 + vperm v0, v0, v0, v19 + vperm v4, v4, v4, v19 + vperm v5, v5, v5, v19 + vperm v6, v6, v6, v19 + vperm v7, v7, v7, v19 + vperm v8, v8, v8, v19 + vperm v9, v9, v9, v19 + vperm v10, v10, v10, v19 + addi r4, r1, 4*4+8*16 + stvx v0, 0, r4 + stvx v4, r4, r7 + stvx v5, r4, r8 + stvx v6, r4, r9 + stvx v7, r4, r10 + stvx v8, r4, r11 + stvx v9, r4, r12 + stvx v10, r4, r6 + +sha256_transform_4way_extend: + li r10, 16*16 + li r11, 17*16 + sha256_4way_extend_setup + sha256_4way_extend_doubleround 0, r5, v4, v5, v9, v10 + sha256_4way_extend_doubleround 2, r5, v6, v7, v4, v5 + sha256_4way_extend_doubleround 4, r5, v8, v9, v6, v7 + sha256_4way_extend_doubleround 6, r5, v10, v4, v8, v9 + sha256_4way_extend_doubleround 8, r5, v5, v6, v10, v4 + sha256_4way_extend_doubleround 10, r5, v7, v8, v5, v6 + sha256_4way_extend_doubleround 12, r5, v9, v10, v7, v8 + sha256_4way_extend_doubleround 14, r5, v4, v5, v9, v10 + sha256_4way_extend_doubleround 16, r5, v6, v7, v4, v5 + sha256_4way_extend_doubleround 18, r5, v8, v9, v6, v7 + sha256_4way_extend_doubleround 20, r5, v10, v4, v8, v9 + sha256_4way_extend_doubleround 22, r5, v5, v6, v10, v4 + sha256_4way_extend_doubleround 24, r5, v7, v8, v5, v6 + sha256_4way_extend_doubleround 26, r5, v9, v10, v7, v8 + sha256_4way_extend_doubleround 28, r5, v4, v5, v9, v10 + sha256_4way_extend_doubleround 30, r5, v6, v7, v4, v5 + sha256_4way_extend_doubleround 32, r5, v8, v9, v6, v7 + sha256_4way_extend_doubleround 34, r5, v10, v4, v8, v9 + sha256_4way_extend_doubleround 36, r5, v5, v6, v10, v4 + sha256_4way_extend_doubleround 38, r5, v7, v8, v5, v6 + sha256_4way_extend_doubleround 40, r5, v9, v10, v7, v8 + sha256_4way_extend_doubleround 42, r5, v4, v5, v9, v10 + sha256_4way_extend_doubleround 44, r5, v6, v7, v4, v5 + sha256_4way_extend_doubleround 46, r5, v8, v9, v6, v7 + + addi r11, r3, 4*16 + lvx v4, 0, r3 + lvx v5, r3, r7 + lvx v6, r3, r8 + lvx v7, r3, 
r9 + lvx v8, 0, r11 + lvx v9, r11, r7 + lvx v10, r11, r8 + lvx v11, r11, r9 + lis r12, HI(sha256_4k) + addi r12, r12, LO(sha256_4k) + addi r5, r1, 4*4 + sha256_4way_main_setup + sha256_4way_main_quadround 0, r12, r5 + sha256_4way_main_quadround 4, r12, r5 + sha256_4way_main_quadround 8, r12, r5 + sha256_4way_main_quadround 12, r12, r5 + sha256_4way_main_quadround 16, r12, r5 + sha256_4way_main_quadround 20, r12, r5 + sha256_4way_main_quadround 24, r12, r5 + sha256_4way_main_quadround 28, r12, r5 + sha256_4way_main_quadround 32, r12, r5 + sha256_4way_main_quadround 36, r12, r5 + sha256_4way_main_quadround 40, r12, r5 + sha256_4way_main_quadround 44, r12, r5 + sha256_4way_main_quadround 48, r12, r5 + sha256_4way_main_quadround 52, r12, r5 + sha256_4way_main_quadround 56, r12, r5 + sha256_4way_main_quadround 60, r12, r5 + + lvx v12, 0, r3 + lvx v13, r3, r7 + lvx v14, r3, r8 + lvx v15, r3, r9 + lvx v16, 0, r11 + lvx v17, r11, r7 + lvx v18, r11, r8 + lvx v19, r11, r9 + vadduwm v4, v4, v12 + vadduwm v5, v5, v13 + vadduwm v6, v6, v14 + vadduwm v7, v7, v15 + vadduwm v8, v8, v16 + vadduwm v9, v9, v17 + vadduwm v10, v10, v18 + vadduwm v11, v11, v19 + stvx v4, 0, r3 + stvx v5, r3, r7 + stvx v6, r3, r8 + stvx v7, r3, r9 + stvx v8, 0, r11 + stvx v9, r11, r7 + stvx v10, r11, r8 + stvx v11, r11, r9 + + lwz r1, 0(r1) + mtspr 256, r0 + blr + + + .text + .align 2 + .globl sha256d_ms_4way + .globl _sha256d_ms_4way +#ifdef __ELF__ + .type sha256d_ms_4way, %function +#endif +sha256d_ms_4way: +_sha256d_ms_4way: + mfspr r0, 256 + oris r12, r0, 0xffff + ori r12, r12, 0xf000 + mtspr 256, r12 + + andi. r12, r1, 15 + li r11, -(4*4+64*16) + subf r12, r12, r11 + stwux r1, r1, r12 + + li r7, 1*16 + li r8, 2*16 + li r9, 3*16 + li r10, 16*16 + li r11, 17*16 + + sha256_4way_extend_setup + + addi r4, r4, 2*16 + addi r12, r1, 4*4+18*16 + lvx v14, r4, r7 + lvx v6, r4, r10 + lvx v7, r4, r11 + + vrlw v12, v14, v1 + vrlw v13, v14, v18 + stvx v6, 0, r12 + vxor v12, v12, v13 + vsrw v13, v14, v16 + stvx v7, r12, r7 + vxor v12, v12, v13 + vadduwm v6, v6, v12 + vadduwm v7, v7, v14 + stvx v6, r4, r10 + + vrlw v12, v6, v17 + vrlw v13, v6, v19 + stvx v7, r4, r11 + addi r4, r4, 18*16 + lvx v8, 0, r4 + vxor v12, v12, v13 + vsrw v13, v6, v0 + stvx v8, r12, r8 + vxor v12, v12, v13 + vadduwm v8, v8, v12 + + vrlw v9, v7, v17 + vrlw v13, v7, v19 + stvx v8, 0, r4 + vxor v9, v9, v13 + vsrw v13, v7, v0 + vxor v9, v9, v13 + + vrlw v12, v8, v17 + vrlw v13, v8, v19 + stvx v9, r4, r7 + vxor v12, v12, v13 + vsrw v13, v8, v0 + lvx v10, r4, r8 + lvx v4, r4, r9 + vxor v12, v12, v13 + stvx v10, r12, r9 + addi r12, r12, 4*16 + stvx v4, 0, r12 + vrlw v14, v9, v17 + vrlw v13, v9, v19 + vadduwm v10, v10, v12 + + vxor v14, v14, v13 + vsrw v13, v9, v0 + stvx v10, r4, r8 + vxor v14, v14, v13 + vadduwm v4, v4, v14 + + vrlw v12, v10, v17 + vrlw v13, v10, v19 + stvx v4, r4, r9 + vxor v12, v12, v13 + vsrw v13, v10, v0 + vrlw v14, v4, v17 + vxor v12, v12, v13 + vrlw v13, v4, v19 + addi r4, r4, 4*16 + lvx v5, 0, r4 + vxor v14, v14, v13 + stvx v5, r12, r7 + vsrw v13, v4, v0 + vadduwm v5, v5, v12 + vxor v14, v14, v13 + stvx v5, 0, r4 + vadduwm v6, v6, v14 + + vrlw v12, v5, v17 + vrlw v13, v5, v19 + stvx v6, r4, r7 + vxor v12, v12, v13 + vsrw v13, v5, v0 + vrlw v14, v6, v17 + vxor v12, v12, v13 + vrlw v13, v6, v19 + vxor v14, v14, v13 + vsrw v13, v6, v0 + vadduwm v7, v7, v12 + vxor v14, v14, v13 + stvx v7, r4, r8 + vadduwm v8, v8, v14 + + vrlw v12, v7, v17 + vrlw v13, v7, v19 + stvx v8, r4, r9 + vxor v12, v12, v13 + vsrw v13, v7, v0 + vrlw v14, v8, v17 + vxor v12, v12, 
+	.text
+	.align 2
+	.globl sha256d_ms_4way
+	.globl _sha256d_ms_4way
+#ifdef __ELF__
+	.type sha256d_ms_4way, %function
+#endif
+sha256d_ms_4way:
+_sha256d_ms_4way:
+	mfspr r0, 256
+	oris r12, r0, 0xffff
+	ori r12, r12, 0xf000
+	mtspr 256, r12
+
+	andi. r12, r1, 15
+	li r11, -(4*4+64*16)
+	subf r12, r12, r11
+	stwux r1, r1, r12
+
+	li r7, 1*16
+	li r8, 2*16
+	li r9, 3*16
+	li r10, 16*16
+	li r11, 17*16
+
+	sha256_4way_extend_setup
+
+	addi r4, r4, 2*16
+	addi r12, r1, 4*4+18*16
+	lvx v14, r4, r7
+	lvx v6, r4, r10
+	lvx v7, r4, r11
+
+	vrlw v12, v14, v1
+	vrlw v13, v14, v18
+	stvx v6, 0, r12
+	vxor v12, v12, v13
+	vsrw v13, v14, v16
+	stvx v7, r12, r7
+	vxor v12, v12, v13
+	vadduwm v6, v6, v12
+	vadduwm v7, v7, v14
+	stvx v6, r4, r10
+
+	vrlw v12, v6, v17
+	vrlw v13, v6, v19
+	stvx v7, r4, r11
+	addi r4, r4, 18*16
+	lvx v8, 0, r4
+	vxor v12, v12, v13
+	vsrw v13, v6, v0
+	stvx v8, r12, r8
+	vxor v12, v12, v13
+	vadduwm v8, v8, v12
+
+	vrlw v9, v7, v17
+	vrlw v13, v7, v19
+	stvx v8, 0, r4
+	vxor v9, v9, v13
+	vsrw v13, v7, v0
+	vxor v9, v9, v13
+
+	vrlw v12, v8, v17
+	vrlw v13, v8, v19
+	stvx v9, r4, r7
+	vxor v12, v12, v13
+	vsrw v13, v8, v0
+	lvx v10, r4, r8
+	lvx v4, r4, r9
+	vxor v12, v12, v13
+	stvx v10, r12, r9
+	addi r12, r12, 4*16
+	stvx v4, 0, r12
+	vrlw v14, v9, v17
+	vrlw v13, v9, v19
+	vadduwm v10, v10, v12
+
+	vxor v14, v14, v13
+	vsrw v13, v9, v0
+	stvx v10, r4, r8
+	vxor v14, v14, v13
+	vadduwm v4, v4, v14
+
+	vrlw v12, v10, v17
+	vrlw v13, v10, v19
+	stvx v4, r4, r9
+	vxor v12, v12, v13
+	vsrw v13, v10, v0
+	vrlw v14, v4, v17
+	vxor v12, v12, v13
+	vrlw v13, v4, v19
+	addi r4, r4, 4*16
+	lvx v5, 0, r4
+	vxor v14, v14, v13
+	stvx v5, r12, r7
+	vsrw v13, v4, v0
+	vadduwm v5, v5, v12
+	vxor v14, v14, v13
+	stvx v5, 0, r4
+	vadduwm v6, v6, v14
+
+	vrlw v12, v5, v17
+	vrlw v13, v5, v19
+	stvx v6, r4, r7
+	vxor v12, v12, v13
+	vsrw v13, v5, v0
+	vrlw v14, v6, v17
+	vxor v12, v12, v13
+	vrlw v13, v6, v19
+	vxor v14, v14, v13
+	vsrw v13, v6, v0
+	vadduwm v7, v7, v12
+	vxor v14, v14, v13
+	stvx v7, r4, r8
+	vadduwm v8, v8, v14
+
+	vrlw v12, v7, v17
+	vrlw v13, v7, v19
+	stvx v8, r4, r9
+	vxor v12, v12, v13
+	vsrw v13, v7, v0
+	vrlw v14, v8, v17
+	vxor v12, v12, v13
+	vrlw v13, v8, v19
+	vxor v14, v14, v13
+	vsrw v13, v8, v0
+	vadduwm v9, v9, v12
+	vxor v14, v14, v13
+	addi r4, r4, 4*16
+	stvx v9, 0, r4
+	vadduwm v10, v10, v14
+
+	vrlw v12, v9, v17
+	vrlw v13, v9, v19
+	stvx v10, r4, r7
+	vxor v12, v12, v13
+	vsrw v13, v9, v0
+	lvx v11, r4, r8
+	lvx v14, r4, r9
+	stvx v11, r12, r8
+	stvx v14, r12, r9
+	vxor v12, v12, v13
+	vadduwm v11, v11, v12
+	vadduwm v5, v5, v14
+	vrlw v12, v10, v17
+	vrlw v13, v10, v19
+	vadduwm v4, v4, v11
+
+	vxor v12, v12, v13
+	vsrw v13, v10, v0
+	stvx v4, r4, r8
+	vxor v12, v12, v13
+	vadduwm v5, v5, v12
+	stvx v5, r4, r9
+	addi r4, r4, -12*16
+	lvx v11, 0, r4
+
+	sha256_4way_extend_doubleround 16, r4, v6, v7, v4, v5
+	sha256_4way_extend_doubleround 18, r4, v8, v9, v6, v7
+	sha256_4way_extend_doubleround 20, r4, v10, v4, v8, v9
+	sha256_4way_extend_doubleround 22, r4, v5, v6, v10, v4
+	sha256_4way_extend_doubleround 24, r4, v7, v8, v5, v6
+	sha256_4way_extend_doubleround 26, r4, v9, v10, v7, v8
+	sha256_4way_extend_doubleround 28, r4, v4, v5, v9, v10
+	sha256_4way_extend_doubleround 30, r4, v6, v7, v4, v5
+	sha256_4way_extend_doubleround 32, r4, v8, v9, v6, v7
+	sha256_4way_extend_doubleround 34, r4, v10, v4, v8, v9
+	sha256_4way_extend_doubleround 36, r4, v5, v6, v10, v4
+	sha256_4way_extend_doubleround 38, r4, v7, v8, v5, v6
+	sha256_4way_extend_doubleround 40, r4, v9, v10, v7, v8
+	sha256_4way_extend_doubleround 42, r4, v4, v5, v9, v10
+	sha256_4way_extend_doubleround 44, r4, v6, v7, v4, v5
+	sha256_4way_extend_doubleround 46, r4, v8, v9, v6, v7
+	addi r4, r4, -48*16
+
+	lvx v4, 0, r6
+	lvx v9, r6, r7
+	lvx v10, r6, r8
+	lvx v11, r6, r9
+	addi r12, r6, 4*16
+	lvx v8, 0, r12
+	lvx v5, r12, r7
+	lvx v6, r12, r8
+	lvx v7, r12, r9
+	lis r12, HI(sha256_4k)
+	addi r12, r12, LO(sha256_4k)
+	sha256_4way_main_setup
+	sha256_4way_main_round 3, r12, r4, v5, v6, v7, v4, v9, v10, v11, v8
+	sha256_4way_main_quadround 4, r12, r4
+	sha256_4way_main_quadround 8, r12, r4
+	sha256_4way_main_quadround 12, r12, r4
+	sha256_4way_main_quadround 16, r12, r4
+	sha256_4way_main_quadround 20, r12, r4
+	sha256_4way_main_quadround 24, r12, r4
+	sha256_4way_main_quadround 28, r12, r4
+	sha256_4way_main_quadround 32, r12, r4
+	sha256_4way_main_quadround 36, r12, r4
+	sha256_4way_main_quadround 40, r12, r4
+	sha256_4way_main_quadround 44, r12, r4
+	sha256_4way_main_quadround 48, r12, r4
+	sha256_4way_main_quadround 52, r12, r4
+	sha256_4way_main_quadround 56, r12, r4
+	sha256_4way_main_quadround 60, r12, r4
+
+	lvx v12, 0, r5
+	lvx v13, r5, r7
+	lvx v14, r5, r8
+	lvx v15, r5, r9
+	addi r12, r5, 4*16
+	lvx v16, 0, r12
+	lvx v17, r12, r7
+	lvx v18, r12, r8
+	lvx v19, r12, r9
+	vadduwm v4, v4, v12
+	vadduwm v5, v5, v13
+	vadduwm v6, v6, v14
+	vadduwm v7, v7, v15
+	vadduwm v8, v8, v16
+	vadduwm v9, v9, v17
+	vadduwm v10, v10, v18
+	vadduwm v11, v11, v19
+	addi r12, r1, 4*4
+	stvx v4, 0, r12
+	stvx v5, r12, r7
+	stvx v6, r12, r8
+	stvx v7, r12, r9
+	addi r12, r12, 4*16
+	stvx v8, 0, r12
+	stvx v9, r12, r7
+	stvx v10, r12, r8
+	stvx v11, r12, r9
+
+	addi r12, r1, 4*4+18*16
+	lvx v4, 0, r12
+	lvx v5, r12, r7
+	lvx v6, r12, r8
+	lvx v7, r12, r9
+	addi r12, r12, 4*16
+	lvx v8, 0, r12
+	lvx v9, r12, r7
+	lvx v10, r12, r8
+	lvx v11, r12, r9
+	addi r12, r4, 18*16
+	stvx v4, 0, r12
+	stvx v5, r12, r7
+	stvx v6, r12, r8
+	addi r12, r4, 22*16
+	stvx v7, 0, r12
+	stvx v8, r12, r7
+	stvx v9, r12, r8
+	addi r12, r4, 30*16
+	stvx v10, 0, r12
+	stvx v11, r12, r7
+
+	addi r4, r1, 4*4
+
+	sha256_4way_extend_setup
+
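+/*
+ * Second hash of sha256d: the 8-word result computed above becomes the
+ * message. Words 8-15 of the 16-word block are the fixed padding for a
+ * 32-byte message (leading 1 bit, zeros, bit length 0x100), built below,
+ * and sha256d_4preext2 holds extension terms that are precomputable
+ * because those padding words never change.
+ */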
+	lis r12, HI(sha256d_4preext2)
+	addi r12, r12, LO(sha256d_4preext2)
+	lvx v2, 0, r12
+
+	vxor v9, v9, v9
+	vspltisw v3, 1
+	lvx v4, r12, r8
+	vsldoi v3, v3, v3, 1
+	addi r5, r1, 4*4+8*16
+	stvx v4, 0, r5
+	stvx v9, r5, r7
+	stvx v9, r5, r8
+	stvx v9, r5, r9
+	addi r5, r5, 4*16
+	stvx v9, 0, r5
+	stvx v9, r5, r7
+	stvx v9, r5, r8
+	stvx v3, r5, r9
+
+	lvx v4, 0, r4
+	lvx v14, r4, r7
+
+	lvx v11, r4, r8
+	vrlw v12, v14, v1
+	vrlw v13, v14, v18
+
+	vxor v12, v12, v13
+	vsrw v13, v14, v16
+	vadduwm v5, v14, v2
+	vxor v12, v12, v13
+	vrlw v14, v11, v1
+	vrlw v13, v11, v18
+	vadduwm v4, v4, v12
+	vxor v14, v14, v13
+	vsrw v13, v11, v16
+	stvx v4, r4, r10
+	vxor v14, v14, v13
+	vrlw v12, v4, v17
+	vrlw v13, v4, v19
+	vadduwm v5, v5, v14
+
+	stvx v5, r4, r11
+	addi r4, r4, 2*16
+	lvx v14, r4, r7
+	vxor v12, v12, v13
+	vsrw v13, v4, v0
+	vrlw v6, v14, v1
+	vxor v12, v12, v13
+	vrlw v13, v14, v18
+	vxor v6, v6, v13
+	vsrw v13, v14, v16
+	vadduwm v11, v11, v12
+	vxor v6, v6, v13
+	vrlw v12, v5, v17
+	vrlw v13, v5, v19
+	vadduwm v6, v6, v11
+	lvx v11, r4, r8
+
+	stvx v6, r4, r10
+	vxor v12, v12, v13
+	vsrw v13, v5, v0
+	vrlw v7, v11, v1
+	vxor v12, v12, v13
+	vrlw v13, v11, v18
+	vxor v7, v7, v13
+	vsrw v13, v11, v16
+	vadduwm v14, v14, v12
+	vxor v7, v7, v13
+	vrlw v12, v6, v17
+	vrlw v13, v6, v19
+	vadduwm v7, v7, v14
+
+	stvx v7, r4, r11
+	addi r4, r4, 2*16
+	lvx v14, r4, r7
+	vxor v12, v12, v13
+	vsrw v13, v6, v0
+	vrlw v8, v14, v1
+	vxor v12, v12, v13
+	vrlw v13, v14, v18
+	vxor v8, v8, v13
+	vsrw v13, v14, v16
+	vadduwm v11, v11, v12
+	vxor v8, v8, v13
+	vrlw v12, v7, v17
+	vrlw v13, v7, v19
+	vadduwm v8, v8, v11
+	lvx v11, r4, r8
+
+	stvx v8, r4, r10
+	vxor v12, v12, v13
+	vsrw v13, v7, v0
+	vrlw v9, v11, v1
+	vxor v12, v12, v13
+	vrlw v13, v11, v18
+	vxor v9, v9, v13
+	vsrw v13, v11, v16
+	vadduwm v14, v14, v12
+	vxor v9, v9, v13
+	vrlw v12, v8, v17
+	vrlw v13, v8, v19
+	vadduwm v9, v9, v14
+
+	stvx v9, r4, r11
+	addi r4, r4, 2*16
+	lvx v14, r4, r7
+	vxor v12, v12, v13
+	vsrw v13, v8, v0
+	vrlw v10, v14, v1
+	vxor v12, v12, v13
+	vrlw v13, v14, v18
+	vxor v10, v10, v13
+	vsrw v13, v14, v16
+	vadduwm v11, v11, v12
+	vxor v10, v10, v13
+	vrlw v12, v9, v17
+	vrlw v13, v9, v19
+	vadduwm v11, v11, v3
+	vadduwm v14, v14, v4
+	vadduwm v10, v10, v11
+
+	lvx v2, r12, r7
+	vxor v12, v12, v13
+	vsrw v13, v9, v0
+	stvx v10, r4, r10
+	vxor v12, v12, v13
+	vadduwm v14, v14, v12
+	vrlw v12, v10, v17
+	vrlw v13, v10, v19
+	vadduwm v4, v14, v2
+	lvx v2, r12, r8
+	vxor v12, v12, v13
+	vsrw v13, v10, v0
+	stvx v4, r4, r11
+	vadduwm v5, v5, v2
+	vxor v12, v12, v13
+	vadduwm v5, v5, v12
+
+	vrlw v12, v4, v17
+	vrlw v13, v4, v19
+	addi r4, r4, 2*16
+	stvx v5, r4, r10
+	vxor v12, v12, v13
+	vsrw v13, v4, v0
+	vrlw v11, v5, v17
+	vxor v12, v12, v13
+	vrlw v13, v5, v19
+	vxor v11, v11, v13
+	vsrw v13, v5, v0
+	vadduwm v6, v6, v12
+	vxor v11, v11, v13
+	stvx v6, r4, r11
+	vadduwm v7, v7, v11
+
+	vrlw v12, v6, v17
+	vrlw v13, v6, v19
+	addi r4, r4, 2*16
+	stvx v7, r4, r10
+	vxor v12, v12, v13
+	vsrw v13, v6, v0
+	vrlw v11, v7, v17
+	vxor v12, v12, v13
+	vrlw v13, v7, v19
+	vxor v11, v11, v13
+	vsrw v13, v7, v0
+	vadduwm v8, v8, v12
+	vxor v11, v11, v13
+	stvx v8, r4, r11
+	vadduwm v9, v9, v11
+
+	lvx v2, r12, r9
+	vrlw v14, v8, v17
+	vrlw v13, v8, v19
+	vrlw v12, v9, v17
+	addi r4, r4, 2*16
+	stvx v9, r4, r10
+	vxor v14, v14, v13
+	vrlw v13, v9, v19
+	vxor v12, v12, v13
+	vsrw v13, v8, v0
+	vxor v14, v14, v13
+	vsrw v13, v9, v0
+	vxor v12, v12, v13
+	vadduwm v4, v4, v2
+	vadduwm v10, v10, v14
+	vadduwm v4, v4, v12
+	stvx v10, r4, r11
+	addi r4, r4, 2*16
+	lvx v11, r4, r8
+
+	vadduwm v5, v5, v3
+	stvx v4, r4, r10
+	vrlw v14, v11, v1
+	vrlw v13, v11, v18
+	vrlw v12, v10, v17
+	vxor v14, v14, v13
+	vrlw v13, v10, v19
+	vxor v12, v12, v13
+	vsrw v13, v11, v16
+	vxor v14, v14, v13
+	vsrw v13, v10, v0
+	vxor v12, v12, v13
+	vadduwm v5, v5, v14
+	vadduwm v5, v5, v12
+	stvx v5, r4, r11
+	addi r4, r4, 2*16
+
+	sha256_4way_extend_doubleround 16, r4, v6, v7, v4, v5
+	sha256_4way_extend_doubleround 18, r4, v8, v9, v6, v7
+	sha256_4way_extend_doubleround 20, r4, v10, v4, v8, v9
+	sha256_4way_extend_doubleround 22, r4, v5, v6, v10, v4
+	sha256_4way_extend_doubleround 24, r4, v7, v8, v5, v6
+	sha256_4way_extend_doubleround 26, r4, v9, v10, v7, v8
+	sha256_4way_extend_doubleround 28, r4, v4, v5, v9, v10
+	sha256_4way_extend_doubleround 30, r4, v6, v7, v4, v5
+	sha256_4way_extend_doubleround 32, r4, v8, v9, v6, v7
+	sha256_4way_extend_doubleround 34, r4, v10, v4, v8, v9
+	sha256_4way_extend_doubleround 36, r4, v5, v6, v10, v4
+	sha256_4way_extend_doubleround 38, r4, v7, v8, v5, v6
+	sha256_4way_extend_doubleround 40, r4, v9, v10, v7, v8
+	sha256_4way_extend_doubleround 42, r4, v4, v5, v9, v10
+
+	lvx v14, r4, r7
+	vrlw v12, v4, v17
+	vrlw v13, v4, v19
+	vadduwm v15, v11, v6
+	vrlw v6, v14, v1
+	vrlw v11, v14, v18
+	vxor v12, v12, v13
+	vxor v6, v6, v11
+	vsrw v13, v4, v0
+	vsrw v14, v14, v16
+	vxor v12, v12, v13
+	vxor v6, v6, v14
+	vadduwm v12, v12, v15
+	vadduwm v6, v6, v12
+	stvx v6, r4, r10
+	addi r4, r4, -44*16
+
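+/*
+ * Final compression, starting from the standard initial state sha256_4h.
+ * Rounds 57-60 use the reduced round defined below, as only the state
+ * word that ends up in hash[7] still needs to be computed.
+ */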
+	lis r5, HI(sha256_4h)
+	addi r5, r5, LO(sha256_4h)
+	lvx v4, 0, r5
+	lvx v5, r5, r7
+	lvx v6, r5, r8
+	lvx v7, r5, r9
+	addi r12, r5, 4*16
+	lvx v8, 0, r12
+	lvx v9, r12, r7
+	lvx v10, r12, r8
+	lvx v11, r12, r9
+	lis r12, HI(sha256_4k)
+	addi r12, r12, LO(sha256_4k)
+	sha256_4way_main_setup
+	sha256_4way_main_quadround 0, r12, r4
+	sha256_4way_main_quadround 4, r12, r4
+	sha256_4way_main_quadround 8, r12, r4
+	sha256_4way_main_quadround 12, r12, r4
+	sha256_4way_main_quadround 16, r12, r4
+	sha256_4way_main_quadround 20, r12, r4
+	sha256_4way_main_quadround 24, r12, r4
+	sha256_4way_main_quadround 28, r12, r4
+	sha256_4way_main_quadround 32, r12, r4
+	sha256_4way_main_quadround 36, r12, r4
+	sha256_4way_main_quadround 40, r12, r4
+	sha256_4way_main_quadround 44, r12, r4
+	sha256_4way_main_quadround 48, r12, r4
+	sha256_4way_main_quadround 52, r12, r4
+	sha256_4way_main_round 56, r12, r4, v4, v5, v6, v7, v8, v9, v10, v11
+
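+/*
+ * Reduced final round: accumulates d + T1 into \vh to produce the new e
+ * value only, skipping the Sigma0/Maj half of a normal round, which can
+ * no longer affect the single word stored below.
+ */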
+.macro sha256_4way_main_round_red i, rk, rw, vd, ve, vf, vg, vh
+	li r6, (\i)*16
+	vand v15, \vf, \ve
+	vandc v14, \vg, \ve
+	lvx v12, \rw, r6
+	vadduwm \vh, \vh, \vd
+	vor v14, v14, v15
+	lvx v15, \rk, r6
+	vrlw v13, \ve, v3
+	vadduwm \vh, \vh, v14
+	vxor v14, \ve, v13
+	vrlw v13, \ve, v19
+	vadduwm \vh, \vh, v12
+	vxor v14, v14, v13
+	vadduwm \vh, \vh, v15
+	vrlw v13, v14, v16
+	vadduwm \vh, \vh, v13
+.endm
+
+	sha256_4way_main_round_red 57, r12, r4, v6, v11, v8, v9, v10
+	sha256_4way_main_round_red 58, r12, r4, v5, v10, v11, v8, v9
+	sha256_4way_main_round_red 59, r12, r4, v4, v9, v10, v11, v8
+	sha256_4way_main_round_red 60, r12, r4, v7, v8, v9, v10, v11
+
+	li r12, 7*16
+	lvx v19, r5, r12
+	vadduwm v11, v11, v19
+	stvx v11, r3, r12
+
+	lwz r1, 0(r1)
+	mtspr 256, r0
+	blr
+
+
+	.text
+	.align 2
+	.globl sha256_use_4way
+	.globl _sha256_use_4way
+#ifdef __ELF__
+	.type sha256_use_4way, %function
+#endif
+sha256_use_4way:
+_sha256_use_4way:
+	li r3, 1
+	blr
+
+#endif /* __ALTIVEC__ */
+
+#endif
diff --git a/sha2.c b/sha2.c
index 367efda..4bd86d2 100644
--- a/sha2.c
+++ b/sha2.c
@@ -14,7 +14,9 @@
 #include <string.h>
 #include <inttypes.h>
 
-#if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__)
+#if defined(USE_ASM) && \
+	((defined(__arm__) && defined(__APCS_32__)) || \
+	 (defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)))
 #define EXTERN_SHA256
 #endif
 