Merge #13191: Specialized double-SHA256 with 64 byte inputs with SSE4.1 and AVX2

4defdfab94 [MOVEONLY] Move unused Merkle branch code to tests (Pieter Wuille)
4437d6e1f3 8-way AVX2 implementation for double SHA256 on 64-byte inputs (Pieter Wuille)
230294bf5f 4-way SSE4.1 implementation for double SHA256 on 64-byte inputs (Pieter Wuille)
1f0e7ca09c Use SHA256D64 in Merkle root computation (Pieter Wuille)
d0c9632883 Specialized double sha256 for 64 byte inputs (Pieter Wuille)
57f34630fb Refactor SHA256 code (Pieter Wuille)
0df017889b Benchmark Merkle root computation (Pieter Wuille)

Pull request description:

  This introduces a framework for specialized double-SHA256 with 64 byte inputs. 4 different implementations are provided:
  * Generic C++ (reusing the normal SHA256 code)
  * Specialized C++ for 64-byte inputs, but no special instructions
  * 4-way using SSE4.1 intrinsics
  * 8-way using AVX2 intrinsics

  On my own system (AVX2 capable), I get these benchmarks for computing the Merkle root of 9001 leaves (supported lengths / special instructions / parallellism):
  * 7.2 ms with varsize/naive/1way (master, non-SSE4 hardware)
  * 5.8 ms with size64/naive/1way (this PR, non-SSE4 capable systems)
  * 4.8 ms with varsize/SSE4/1way (master, SSE4 hardware)
  * 2.9 ms with size64/SSE4/4way (this PR, SSE4 hardware)
  * 1.1 ms with size64/AVX2/8way (this PR, AVX2 hardware)

Tree-SHA512: efa32d48b32820d9ce788ead4eb583949265be8c2e5f538c94bc914e92d131a57f8c1ee26c6f998e81fb0e30675d4e2eddc3360bcf632676249036018cff343e
This commit is contained in:
Wladimir J. van der Laan 2018-06-04 09:11:18 +02:00
commit 0de7cc848e
No known key found for this signature in database
GPG key ID: 1E4AED62986CD25D
16 changed files with 1344 additions and 201 deletions

View file

@ -312,6 +312,8 @@ fi
# be compiled with them, rather that specific objects/libs may use them after checking for runtime
# compatibility.
AX_CHECK_COMPILE_FLAG([-msse4.2],[[SSE42_CXXFLAGS="-msse4.2"]],,[[$CXXFLAG_WERROR]])
AX_CHECK_COMPILE_FLAG([-msse4.1],[[SSE41_CXXFLAGS="-msse4.1"]],,[[$CXXFLAG_WERROR]])
AX_CHECK_COMPILE_FLAG([-mavx -mavx2],[[AVX2_CXXFLAGS="-mavx -mavx2"]],,[[$CXXFLAG_WERROR]])
TEMP_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $SSE42_CXXFLAGS"
@ -335,6 +337,44 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
)
CXXFLAGS="$TEMP_CXXFLAGS"
TEMP_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $SSE41_CXXFLAGS"
AC_MSG_CHECKING(for SSE4.1 intrinsics)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <stdint.h>
#if defined(_MSC_VER)
#include <immintrin.h>
#elif defined(__GNUC__)
#include <x86intrin.h>
#endif
]],[[
__m128i l = _mm_set1_epi32(0);
return _mm_extract_epi32(l, 3);
]])],
[ AC_MSG_RESULT(yes); enable_sse41=yes; AC_DEFINE(ENABLE_SSE41, 1, [Define this symbol to build code that uses SSE4.1 intrinsics]) ],
[ AC_MSG_RESULT(no)]
)
CXXFLAGS="$TEMP_CXXFLAGS"
TEMP_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS $AVX2_CXXFLAGS"
AC_MSG_CHECKING(for AVX2 intrinsics)
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
#include <stdint.h>
#if defined(_MSC_VER)
#include <immintrin.h>
#elif defined(__GNUC__) && defined(__AVX2__)
#include <x86intrin.h>
#endif
]],[[
__m256i l = _mm256_set1_epi32(0);
return _mm256_extract_epi32(l, 7);
]])],
[ AC_MSG_RESULT(yes); enable_avx2=yes; AC_DEFINE(ENABLE_AVX2, 1, [Define this symbol to build code that uses AVX2 intrinsics]) ],
[ AC_MSG_RESULT(no)]
)
CXXFLAGS="$TEMP_CXXFLAGS"
CPPFLAGS="$CPPFLAGS -DHAVE_BUILD_INFO -D__STDC_FORMAT_MACROS"
AC_ARG_WITH([utils],
@ -1253,6 +1293,8 @@ AM_CONDITIONAL([USE_LCOV],[test x$use_lcov = xyes])
AM_CONDITIONAL([GLIBC_BACK_COMPAT],[test x$use_glibc_compat = xyes])
AM_CONDITIONAL([HARDEN],[test x$use_hardening = xyes])
AM_CONDITIONAL([ENABLE_HWCRC32],[test x$enable_hwcrc32 = xyes])
AM_CONDITIONAL([ENABLE_SSE41],[test x$enable_sse41 = xyes])
AM_CONDITIONAL([ENABLE_AVX2],[test x$enable_avx2 = xyes])
AM_CONDITIONAL([USE_ASM],[test x$use_asm = xyes])
AC_DEFINE(CLIENT_VERSION_MAJOR, _CLIENT_VERSION_MAJOR, [Major version])
@ -1295,6 +1337,8 @@ AC_SUBST(PIE_FLAGS)
AC_SUBST(SANITIZER_CXXFLAGS)
AC_SUBST(SANITIZER_LDFLAGS)
AC_SUBST(SSE42_CXXFLAGS)
AC_SUBST(SSE41_CXXFLAGS)
AC_SUBST(AVX2_CXXFLAGS)
AC_SUBST(LIBTOOL_APP_LDFLAGS)
AC_SUBST(USE_UPNP)
AC_SUBST(USE_QRCODE)

View file

@ -30,6 +30,8 @@ LIBBITCOIN_CONSENSUS=libbitcoin_consensus.a
LIBBITCOIN_CLI=libbitcoin_cli.a
LIBBITCOIN_UTIL=libbitcoin_util.a
LIBBITCOIN_CRYPTO=crypto/libbitcoin_crypto.a
LIBBITCOIN_CRYPTO_SSE41=crypto/libbitcoin_crypto_sse41.a
LIBBITCOIN_CRYPTO_AVX2=crypto/libbitcoin_crypto_avx2.a
LIBBITCOINQT=qt/libbitcoinqt.a
LIBSECP256K1=secp256k1/libsecp256k1.la
@ -50,6 +52,8 @@ $(LIBSECP256K1): $(wildcard secp256k1/src/*) $(wildcard secp256k1/include/*)
# But to build the less dependent modules first, we manually select their order here:
EXTRA_LIBRARIES += \
$(LIBBITCOIN_CRYPTO) \
$(LIBBITCOIN_CRYPTO_SSE41) \
$(LIBBITCOIN_CRYPTO_AVX2) \
$(LIBBITCOIN_UTIL) \
$(LIBBITCOIN_COMMON) \
$(LIBBITCOIN_CONSENSUS) \
@ -289,6 +293,22 @@ if USE_ASM
crypto_libbitcoin_crypto_a_SOURCES += crypto/sha256_sse4.cpp
endif
crypto_libbitcoin_crypto_sse41_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
crypto_libbitcoin_crypto_sse41_a_CPPFLAGS = $(AM_CPPFLAGS)
if ENABLE_SSE41
crypto_libbitcoin_crypto_sse41_a_CXXFLAGS += $(SSE41_CXXFLAGS)
crypto_libbitcoin_crypto_sse41_a_CPPFLAGS += -DENABLE_SSE41
endif
crypto_libbitcoin_crypto_sse41_a_SOURCES = crypto/sha256_sse41.cpp
crypto_libbitcoin_crypto_avx2_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
crypto_libbitcoin_crypto_avx2_a_CPPFLAGS = $(AM_CPPFLAGS)
if ENABLE_AVX2
crypto_libbitcoin_crypto_avx2_a_CXXFLAGS += $(AVX2_CXXFLAGS)
crypto_libbitcoin_crypto_avx2_a_CPPFLAGS += -DENABLE_AVX2
endif
crypto_libbitcoin_crypto_avx2_a_SOURCES = crypto/sha256_avx2.cpp
# consensus: shared between all executables that validate any consensus rules.
libbitcoin_consensus_a_CPPFLAGS = $(AM_CPPFLAGS) $(BITCOIN_INCLUDES)
libbitcoin_consensus_a_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
@ -411,6 +431,8 @@ bitcoind_LDADD = \
$(LIBBITCOIN_ZMQ) \
$(LIBBITCOIN_CONSENSUS) \
$(LIBBITCOIN_CRYPTO) \
$(LIBBITCOIN_CRYPTO_SSE41) \
$(LIBBITCOIN_CRYPTO_AVX2) \
$(LIBLEVELDB) \
$(LIBLEVELDB_SSE42) \
$(LIBMEMENV) \
@ -432,7 +454,9 @@ bitcoin_cli_LDADD = \
$(LIBBITCOIN_CLI) \
$(LIBUNIVALUE) \
$(LIBBITCOIN_UTIL) \
$(LIBBITCOIN_CRYPTO)
$(LIBBITCOIN_CRYPTO) \
$(LIBBITCOIN_CRYPTO_SSE41) \
$(LIBBITCOIN_CRYPTO_AVX2)
bitcoin_cli_LDADD += $(BOOST_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(EVENT_LIBS)
#
@ -453,6 +477,8 @@ bitcoin_tx_LDADD = \
$(LIBBITCOIN_UTIL) \
$(LIBBITCOIN_CONSENSUS) \
$(LIBBITCOIN_CRYPTO) \
$(LIBBITCOIN_CRYPTO_SSE41) \
$(LIBBITCOIN_CRYPTO_AVX2) \
$(LIBSECP256K1)
bitcoin_tx_LDADD += $(BOOST_LIBS) $(CRYPTO_LIBS)

View file

@ -21,6 +21,7 @@ bench_bench_bitcoin_SOURCES = \
bench/rollingbloom.cpp \
bench/crypto_hash.cpp \
bench/ccoins_caching.cpp \
bench/merkle_root.cpp \
bench/mempool_eviction.cpp \
bench/verify_script.cpp \
bench/base58.cpp \
@ -38,6 +39,8 @@ bench_bench_bitcoin_LDADD = \
$(LIBBITCOIN_UTIL) \
$(LIBBITCOIN_CONSENSUS) \
$(LIBBITCOIN_CRYPTO) \
$(LIBBITCOIN_CRYPTO_SSE41) \
$(LIBBITCOIN_CRYPTO_AVX2) \
$(LIBLEVELDB) \
$(LIBLEVELDB_SSE42) \
$(LIBMEMENV) \

View file

@ -408,7 +408,7 @@ endif
if ENABLE_ZMQ
qt_bitcoin_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
endif
qt_bitcoin_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) \
qt_bitcoin_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBBITCOIN_CRYPTO_SSE41) $(LIBBITCOIN_CRYPTO_AVX2) $(LIBUNIVALUE) $(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) \
$(BOOST_LIBS) $(QT_LIBS) $(QT_DBUS_LIBS) $(QR_LIBS) $(PROTOBUF_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
$(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)
qt_bitcoin_qt_LDFLAGS = $(RELDFLAGS) $(AM_LDFLAGS) $(QT_LDFLAGS) $(LIBTOOL_APP_LDFLAGS)

View file

@ -62,7 +62,7 @@ endif
if ENABLE_ZMQ
qt_test_test_bitcoin_qt_LDADD += $(LIBBITCOIN_ZMQ) $(ZMQ_LIBS)
endif
qt_test_test_bitcoin_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) $(LIBLEVELDB) \
qt_test_test_bitcoin_qt_LDADD += $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBBITCOIN_CRYPTO_SSE41) $(LIBBITCOIN_CRYPTO_AVX2) $(LIBUNIVALUE) $(LIBLEVELDB) \
$(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(QT_DBUS_LIBS) $(QT_TEST_LIBS) $(QT_LIBS) \
$(QR_LIBS) $(PROTOBUF_LIBS) $(BDB_LIBS) $(SSL_LIBS) $(CRYPTO_LIBS) $(MINIUPNPC_LIBS) $(LIBSECP256K1) \
$(EVENT_PTHREADS_LIBS) $(EVENT_LIBS)

View file

@ -109,7 +109,8 @@ test_test_bitcoin_LDADD =
if ENABLE_WALLET
test_test_bitcoin_LDADD += $(LIBBITCOIN_WALLET)
endif
test_test_bitcoin_LDADD += $(LIBBITCOIN_SERVER) $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBUNIVALUE) \
test_test_bitcoin_LDADD += $(LIBBITCOIN_SERVER) $(LIBBITCOIN_CLI) $(LIBBITCOIN_COMMON) $(LIBBITCOIN_UTIL) $(LIBBITCOIN_CONSENSUS) $(LIBBITCOIN_CRYPTO) $(LIBBITCOIN_CRYPTO_SSE41) $(LIBBITCOIN_CRYPTO_AVX2) $(LIBUNIVALUE) \
$(LIBLEVELDB) $(LIBLEVELDB_SSE42) $(LIBMEMENV) $(BOOST_LIBS) $(BOOST_UNIT_TEST_FRAMEWORK_LIB) $(LIBSECP256K1) $(EVENT_LIBS) $(EVENT_PTHREADS_LIBS)
test_test_bitcoin_CXXFLAGS = $(AM_CXXFLAGS) $(PIE_FLAGS)
@ -134,6 +135,8 @@ test_test_bitcoin_fuzzy_LDADD = \
$(LIBBITCOIN_UTIL) \
$(LIBBITCOIN_CONSENSUS) \
$(LIBBITCOIN_CRYPTO) \
$(LIBBITCOIN_CRYPTO_SSE41) \
$(LIBBITCOIN_CRYPTO_AVX2) \
$(LIBSECP256K1)
test_test_bitcoin_fuzzy_LDADD += $(BOOST_LIBS) $(CRYPTO_LIBS)

View file

@ -52,6 +52,14 @@ static void SHA256_32b(benchmark::State& state)
}
}
static void SHA256D64_1024(benchmark::State& state)
{
std::vector<uint8_t> in(64 * 1024, 0);
while (state.KeepRunning()) {
SHA256D64(in.data(), in.data(), 1024);
}
}
static void SHA512(benchmark::State& state)
{
uint8_t hash[CSHA512::OUTPUT_SIZE];
@ -94,5 +102,6 @@ BENCHMARK(SHA512, 330);
BENCHMARK(SHA256_32b, 4700 * 1000);
BENCHMARK(SipHash_32b, 40 * 1000 * 1000);
BENCHMARK(SHA256D64_1024, 7400);
BENCHMARK(FastRandom_32bit, 110 * 1000 * 1000);
BENCHMARK(FastRandom_1bit, 440 * 1000 * 1000);

26
src/bench/merkle_root.cpp Normal file
View file

@ -0,0 +1,26 @@
// Copyright (c) 2016 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
#include "bench.h"
#include "uint256.h"
#include "random.h"
#include "consensus/merkle.h"
static void MerkleRoot(benchmark::State& state)
{
FastRandomContext rng(true);
std::vector<uint256> leaves;
leaves.resize(9001);
for (auto& item : leaves) {
item = rng.rand256();
}
while (state.KeepRunning()) {
bool mutation = false;
uint256 hash = ComputeMerkleRoot(std::vector<uint256>(leaves), &mutation);
leaves[mutation] = hash;
}
}
BENCHMARK(MerkleRoot, 800);

View file

@ -42,118 +42,26 @@
root.
*/
/* This implements a constant-space merkle root/path calculator, limited to 2^32 leaves. */
static void MerkleComputation(const std::vector<uint256>& leaves, uint256* proot, bool* pmutated, uint32_t branchpos, std::vector<uint256>* pbranch) {
if (pbranch) pbranch->clear();
if (leaves.size() == 0) {
if (pmutated) *pmutated = false;
if (proot) *proot = uint256();
return;
}
bool mutated = false;
// count is the number of leaves processed so far.
uint32_t count = 0;
// inner is an array of eagerly computed subtree hashes, indexed by tree
// level (0 being the leaves).
// For example, when count is 25 (11001 in binary), inner[4] is the hash of
// the first 16 leaves, inner[3] of the next 8 leaves, and inner[0] equal to
// the last leaf. The other inner entries are undefined.
uint256 inner[32];
// Which position in inner is a hash that depends on the matching leaf.
int matchlevel = -1;
// First process all leaves into 'inner' values.
while (count < leaves.size()) {
uint256 h = leaves[count];
bool matchh = count == branchpos;
count++;
int level;
// For each of the lower bits in count that are 0, do 1 step. Each
// corresponds to an inner value that existed before processing the
// current leaf, and each needs a hash to combine it.
for (level = 0; !(count & (((uint32_t)1) << level)); level++) {
if (pbranch) {
if (matchh) {
pbranch->push_back(inner[level]);
} else if (matchlevel == level) {
pbranch->push_back(h);
matchh = true;
uint256 ComputeMerkleRoot(std::vector<uint256> hashes, bool* mutated) {
bool mutation = false;
while (hashes.size() > 1) {
if (mutated) {
for (size_t pos = 0; pos + 1 < hashes.size(); pos += 2) {
if (hashes[pos] == hashes[pos + 1]) mutation = true;
}
}
mutated |= (inner[level] == h);
CHash256().Write(inner[level].begin(), 32).Write(h.begin(), 32).Finalize(h.begin());
if (hashes.size() & 1) {
hashes.push_back(hashes.back());
}
// Store the resulting hash at inner position level.
inner[level] = h;
if (matchh) {
matchlevel = level;
SHA256D64(hashes[0].begin(), hashes[0].begin(), hashes.size() / 2);
hashes.resize(hashes.size() / 2);
}
}
// Do a final 'sweep' over the rightmost branch of the tree to process
// odd levels, and reduce everything to a single top value.
// Level is the level (counted from the bottom) up to which we've sweeped.
int level = 0;
// As long as bit number level in count is zero, skip it. It means there
// is nothing left at this level.
while (!(count & (((uint32_t)1) << level))) {
level++;
}
uint256 h = inner[level];
bool matchh = matchlevel == level;
while (count != (((uint32_t)1) << level)) {
// If we reach this point, h is an inner value that is not the top.
// We combine it with itself (Bitcoin's special rule for odd levels in
// the tree) to produce a higher level one.
if (pbranch && matchh) {
pbranch->push_back(h);
}
CHash256().Write(h.begin(), 32).Write(h.begin(), 32).Finalize(h.begin());
// Increment count to the value it would have if two entries at this
// level had existed.
count += (((uint32_t)1) << level);
level++;
// And propagate the result upwards accordingly.
while (!(count & (((uint32_t)1) << level))) {
if (pbranch) {
if (matchh) {
pbranch->push_back(inner[level]);
} else if (matchlevel == level) {
pbranch->push_back(h);
matchh = true;
}
}
CHash256().Write(inner[level].begin(), 32).Write(h.begin(), 32).Finalize(h.begin());
level++;
}
}
// Return result.
if (pmutated) *pmutated = mutated;
if (proot) *proot = h;
if (mutated) *mutated = mutation;
if (hashes.size() == 0) return uint256();
return hashes[0];
}
uint256 ComputeMerkleRoot(const std::vector<uint256>& leaves, bool* mutated) {
uint256 hash;
MerkleComputation(leaves, &hash, mutated, -1, nullptr);
return hash;
}
std::vector<uint256> ComputeMerkleBranch(const std::vector<uint256>& leaves, uint32_t position) {
std::vector<uint256> ret;
MerkleComputation(leaves, nullptr, nullptr, position, &ret);
return ret;
}
uint256 ComputeMerkleRootFromBranch(const uint256& leaf, const std::vector<uint256>& vMerkleBranch, uint32_t nIndex) {
uint256 hash = leaf;
for (std::vector<uint256>::const_iterator it = vMerkleBranch.begin(); it != vMerkleBranch.end(); ++it) {
if (nIndex & 1) {
hash = Hash(BEGIN(*it), END(*it), BEGIN(hash), END(hash));
} else {
hash = Hash(BEGIN(hash), END(hash), BEGIN(*it), END(*it));
}
nIndex >>= 1;
}
return hash;
}
uint256 BlockMerkleRoot(const CBlock& block, bool* mutated)
{
@ -162,7 +70,7 @@ uint256 BlockMerkleRoot(const CBlock& block, bool* mutated)
for (size_t s = 0; s < block.vtx.size(); s++) {
leaves[s] = block.vtx[s]->GetHash();
}
return ComputeMerkleRoot(leaves, mutated);
return ComputeMerkleRoot(std::move(leaves), mutated);
}
uint256 BlockWitnessMerkleRoot(const CBlock& block, bool* mutated)
@ -173,15 +81,6 @@ uint256 BlockWitnessMerkleRoot(const CBlock& block, bool* mutated)
for (size_t s = 1; s < block.vtx.size(); s++) {
leaves[s] = block.vtx[s]->GetWitnessHash();
}
return ComputeMerkleRoot(leaves, mutated);
return ComputeMerkleRoot(std::move(leaves), mutated);
}
std::vector<uint256> BlockMerkleBranch(const CBlock& block, uint32_t position)
{
std::vector<uint256> leaves;
leaves.resize(block.vtx.size());
for (size_t s = 0; s < block.vtx.size(); s++) {
leaves[s] = block.vtx[s]->GetHash();
}
return ComputeMerkleBranch(leaves, position);
}

View file

@ -12,9 +12,7 @@
#include <primitives/block.h>
#include <uint256.h>
uint256 ComputeMerkleRoot(const std::vector<uint256>& leaves, bool* mutated = nullptr);
std::vector<uint256> ComputeMerkleBranch(const std::vector<uint256>& leaves, uint32_t position);
uint256 ComputeMerkleRootFromBranch(const uint256& leaf, const std::vector<uint256>& branch, uint32_t position);
uint256 ComputeMerkleRoot(std::vector<uint256> hashes, bool* mutated = nullptr);
/*
* Compute the Merkle root of the transactions in a block.
@ -28,11 +26,4 @@ uint256 BlockMerkleRoot(const CBlock& block, bool* mutated = nullptr);
*/
uint256 BlockWitnessMerkleRoot(const CBlock& block, bool* mutated = nullptr);
/*
* Compute the Merkle branch for the tree of transactions in a block, for a
* given position.
* This can be verified using ComputeMerkleRootFromBranch.
*/
std::vector<uint256> BlockMerkleBranch(const CBlock& block, uint32_t position);
#endif // BITCOIN_CONSENSUS_MERKLE_H

View file

@ -19,6 +19,16 @@ void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks);
#endif
#endif
namespace sha256d64_sse41
{
void Transform_4way(unsigned char* out, const unsigned char* in);
}
namespace sha256d64_avx2
{
void Transform_8way(unsigned char* out, const unsigned char* in);
}
// Internal implementation code.
namespace
{
@ -33,9 +43,9 @@ uint32_t inline sigma0(uint32_t x) { return (x >> 7 | x << 25) ^ (x >> 18 | x <<
uint32_t inline sigma1(uint32_t x) { return (x >> 17 | x << 15) ^ (x >> 19 | x << 13) ^ (x >> 10); }
/** One round of SHA-256. */
void inline Round(uint32_t a, uint32_t b, uint32_t c, uint32_t& d, uint32_t e, uint32_t f, uint32_t g, uint32_t& h, uint32_t k, uint32_t w)
void inline Round(uint32_t a, uint32_t b, uint32_t c, uint32_t& d, uint32_t e, uint32_t f, uint32_t g, uint32_t& h, uint32_t k)
{
uint32_t t1 = h + Sigma1(e) + Ch(e, f, g) + k + w;
uint32_t t1 = h + Sigma1(e) + Ch(e, f, g) + k;
uint32_t t2 = Sigma0(a) + Maj(a, b, c);
d += t1;
h = t1 + t2;
@ -61,73 +71,73 @@ void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4], f = s[5], g = s[6], h = s[7];
uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
Round(a, b, c, d, e, f, g, h, 0x428a2f98, w0 = ReadBE32(chunk + 0));
Round(h, a, b, c, d, e, f, g, 0x71374491, w1 = ReadBE32(chunk + 4));
Round(g, h, a, b, c, d, e, f, 0xb5c0fbcf, w2 = ReadBE32(chunk + 8));
Round(f, g, h, a, b, c, d, e, 0xe9b5dba5, w3 = ReadBE32(chunk + 12));
Round(e, f, g, h, a, b, c, d, 0x3956c25b, w4 = ReadBE32(chunk + 16));
Round(d, e, f, g, h, a, b, c, 0x59f111f1, w5 = ReadBE32(chunk + 20));
Round(c, d, e, f, g, h, a, b, 0x923f82a4, w6 = ReadBE32(chunk + 24));
Round(b, c, d, e, f, g, h, a, 0xab1c5ed5, w7 = ReadBE32(chunk + 28));
Round(a, b, c, d, e, f, g, h, 0xd807aa98, w8 = ReadBE32(chunk + 32));
Round(h, a, b, c, d, e, f, g, 0x12835b01, w9 = ReadBE32(chunk + 36));
Round(g, h, a, b, c, d, e, f, 0x243185be, w10 = ReadBE32(chunk + 40));
Round(f, g, h, a, b, c, d, e, 0x550c7dc3, w11 = ReadBE32(chunk + 44));
Round(e, f, g, h, a, b, c, d, 0x72be5d74, w12 = ReadBE32(chunk + 48));
Round(d, e, f, g, h, a, b, c, 0x80deb1fe, w13 = ReadBE32(chunk + 52));
Round(c, d, e, f, g, h, a, b, 0x9bdc06a7, w14 = ReadBE32(chunk + 56));
Round(b, c, d, e, f, g, h, a, 0xc19bf174, w15 = ReadBE32(chunk + 60));
Round(a, b, c, d, e, f, g, h, 0x428a2f98 + (w0 = ReadBE32(chunk + 0)));
Round(h, a, b, c, d, e, f, g, 0x71374491 + (w1 = ReadBE32(chunk + 4)));
Round(g, h, a, b, c, d, e, f, 0xb5c0fbcf + (w2 = ReadBE32(chunk + 8)));
Round(f, g, h, a, b, c, d, e, 0xe9b5dba5 + (w3 = ReadBE32(chunk + 12)));
Round(e, f, g, h, a, b, c, d, 0x3956c25b + (w4 = ReadBE32(chunk + 16)));
Round(d, e, f, g, h, a, b, c, 0x59f111f1 + (w5 = ReadBE32(chunk + 20)));
Round(c, d, e, f, g, h, a, b, 0x923f82a4 + (w6 = ReadBE32(chunk + 24)));
Round(b, c, d, e, f, g, h, a, 0xab1c5ed5 + (w7 = ReadBE32(chunk + 28)));
Round(a, b, c, d, e, f, g, h, 0xd807aa98 + (w8 = ReadBE32(chunk + 32)));
Round(h, a, b, c, d, e, f, g, 0x12835b01 + (w9 = ReadBE32(chunk + 36)));
Round(g, h, a, b, c, d, e, f, 0x243185be + (w10 = ReadBE32(chunk + 40)));
Round(f, g, h, a, b, c, d, e, 0x550c7dc3 + (w11 = ReadBE32(chunk + 44)));
Round(e, f, g, h, a, b, c, d, 0x72be5d74 + (w12 = ReadBE32(chunk + 48)));
Round(d, e, f, g, h, a, b, c, 0x80deb1fe + (w13 = ReadBE32(chunk + 52)));
Round(c, d, e, f, g, h, a, b, 0x9bdc06a7 + (w14 = ReadBE32(chunk + 56)));
Round(b, c, d, e, f, g, h, a, 0xc19bf174 + (w15 = ReadBE32(chunk + 60)));
Round(a, b, c, d, e, f, g, h, 0xe49b69c1, w0 += sigma1(w14) + w9 + sigma0(w1));
Round(h, a, b, c, d, e, f, g, 0xefbe4786, w1 += sigma1(w15) + w10 + sigma0(w2));
Round(g, h, a, b, c, d, e, f, 0x0fc19dc6, w2 += sigma1(w0) + w11 + sigma0(w3));
Round(f, g, h, a, b, c, d, e, 0x240ca1cc, w3 += sigma1(w1) + w12 + sigma0(w4));
Round(e, f, g, h, a, b, c, d, 0x2de92c6f, w4 += sigma1(w2) + w13 + sigma0(w5));
Round(d, e, f, g, h, a, b, c, 0x4a7484aa, w5 += sigma1(w3) + w14 + sigma0(w6));
Round(c, d, e, f, g, h, a, b, 0x5cb0a9dc, w6 += sigma1(w4) + w15 + sigma0(w7));
Round(b, c, d, e, f, g, h, a, 0x76f988da, w7 += sigma1(w5) + w0 + sigma0(w8));
Round(a, b, c, d, e, f, g, h, 0x983e5152, w8 += sigma1(w6) + w1 + sigma0(w9));
Round(h, a, b, c, d, e, f, g, 0xa831c66d, w9 += sigma1(w7) + w2 + sigma0(w10));
Round(g, h, a, b, c, d, e, f, 0xb00327c8, w10 += sigma1(w8) + w3 + sigma0(w11));
Round(f, g, h, a, b, c, d, e, 0xbf597fc7, w11 += sigma1(w9) + w4 + sigma0(w12));
Round(e, f, g, h, a, b, c, d, 0xc6e00bf3, w12 += sigma1(w10) + w5 + sigma0(w13));
Round(d, e, f, g, h, a, b, c, 0xd5a79147, w13 += sigma1(w11) + w6 + sigma0(w14));
Round(c, d, e, f, g, h, a, b, 0x06ca6351, w14 += sigma1(w12) + w7 + sigma0(w15));
Round(b, c, d, e, f, g, h, a, 0x14292967, w15 += sigma1(w13) + w8 + sigma0(w0));
Round(a, b, c, d, e, f, g, h, 0xe49b69c1 + (w0 += sigma1(w14) + w9 + sigma0(w1)));
Round(h, a, b, c, d, e, f, g, 0xefbe4786 + (w1 += sigma1(w15) + w10 + sigma0(w2)));
Round(g, h, a, b, c, d, e, f, 0x0fc19dc6 + (w2 += sigma1(w0) + w11 + sigma0(w3)));
Round(f, g, h, a, b, c, d, e, 0x240ca1cc + (w3 += sigma1(w1) + w12 + sigma0(w4)));
Round(e, f, g, h, a, b, c, d, 0x2de92c6f + (w4 += sigma1(w2) + w13 + sigma0(w5)));
Round(d, e, f, g, h, a, b, c, 0x4a7484aa + (w5 += sigma1(w3) + w14 + sigma0(w6)));
Round(c, d, e, f, g, h, a, b, 0x5cb0a9dc + (w6 += sigma1(w4) + w15 + sigma0(w7)));
Round(b, c, d, e, f, g, h, a, 0x76f988da + (w7 += sigma1(w5) + w0 + sigma0(w8)));
Round(a, b, c, d, e, f, g, h, 0x983e5152 + (w8 += sigma1(w6) + w1 + sigma0(w9)));
Round(h, a, b, c, d, e, f, g, 0xa831c66d + (w9 += sigma1(w7) + w2 + sigma0(w10)));
Round(g, h, a, b, c, d, e, f, 0xb00327c8 + (w10 += sigma1(w8) + w3 + sigma0(w11)));
Round(f, g, h, a, b, c, d, e, 0xbf597fc7 + (w11 += sigma1(w9) + w4 + sigma0(w12)));
Round(e, f, g, h, a, b, c, d, 0xc6e00bf3 + (w12 += sigma1(w10) + w5 + sigma0(w13)));
Round(d, e, f, g, h, a, b, c, 0xd5a79147 + (w13 += sigma1(w11) + w6 + sigma0(w14)));
Round(c, d, e, f, g, h, a, b, 0x06ca6351 + (w14 += sigma1(w12) + w7 + sigma0(w15)));
Round(b, c, d, e, f, g, h, a, 0x14292967 + (w15 += sigma1(w13) + w8 + sigma0(w0)));
Round(a, b, c, d, e, f, g, h, 0x27b70a85, w0 += sigma1(w14) + w9 + sigma0(w1));
Round(h, a, b, c, d, e, f, g, 0x2e1b2138, w1 += sigma1(w15) + w10 + sigma0(w2));
Round(g, h, a, b, c, d, e, f, 0x4d2c6dfc, w2 += sigma1(w0) + w11 + sigma0(w3));
Round(f, g, h, a, b, c, d, e, 0x53380d13, w3 += sigma1(w1) + w12 + sigma0(w4));
Round(e, f, g, h, a, b, c, d, 0x650a7354, w4 += sigma1(w2) + w13 + sigma0(w5));
Round(d, e, f, g, h, a, b, c, 0x766a0abb, w5 += sigma1(w3) + w14 + sigma0(w6));
Round(c, d, e, f, g, h, a, b, 0x81c2c92e, w6 += sigma1(w4) + w15 + sigma0(w7));
Round(b, c, d, e, f, g, h, a, 0x92722c85, w7 += sigma1(w5) + w0 + sigma0(w8));
Round(a, b, c, d, e, f, g, h, 0xa2bfe8a1, w8 += sigma1(w6) + w1 + sigma0(w9));
Round(h, a, b, c, d, e, f, g, 0xa81a664b, w9 += sigma1(w7) + w2 + sigma0(w10));
Round(g, h, a, b, c, d, e, f, 0xc24b8b70, w10 += sigma1(w8) + w3 + sigma0(w11));
Round(f, g, h, a, b, c, d, e, 0xc76c51a3, w11 += sigma1(w9) + w4 + sigma0(w12));
Round(e, f, g, h, a, b, c, d, 0xd192e819, w12 += sigma1(w10) + w5 + sigma0(w13));
Round(d, e, f, g, h, a, b, c, 0xd6990624, w13 += sigma1(w11) + w6 + sigma0(w14));
Round(c, d, e, f, g, h, a, b, 0xf40e3585, w14 += sigma1(w12) + w7 + sigma0(w15));
Round(b, c, d, e, f, g, h, a, 0x106aa070, w15 += sigma1(w13) + w8 + sigma0(w0));
Round(a, b, c, d, e, f, g, h, 0x27b70a85 + (w0 += sigma1(w14) + w9 + sigma0(w1)));
Round(h, a, b, c, d, e, f, g, 0x2e1b2138 + (w1 += sigma1(w15) + w10 + sigma0(w2)));
Round(g, h, a, b, c, d, e, f, 0x4d2c6dfc + (w2 += sigma1(w0) + w11 + sigma0(w3)));
Round(f, g, h, a, b, c, d, e, 0x53380d13 + (w3 += sigma1(w1) + w12 + sigma0(w4)));
Round(e, f, g, h, a, b, c, d, 0x650a7354 + (w4 += sigma1(w2) + w13 + sigma0(w5)));
Round(d, e, f, g, h, a, b, c, 0x766a0abb + (w5 += sigma1(w3) + w14 + sigma0(w6)));
Round(c, d, e, f, g, h, a, b, 0x81c2c92e + (w6 += sigma1(w4) + w15 + sigma0(w7)));
Round(b, c, d, e, f, g, h, a, 0x92722c85 + (w7 += sigma1(w5) + w0 + sigma0(w8)));
Round(a, b, c, d, e, f, g, h, 0xa2bfe8a1 + (w8 += sigma1(w6) + w1 + sigma0(w9)));
Round(h, a, b, c, d, e, f, g, 0xa81a664b + (w9 += sigma1(w7) + w2 + sigma0(w10)));
Round(g, h, a, b, c, d, e, f, 0xc24b8b70 + (w10 += sigma1(w8) + w3 + sigma0(w11)));
Round(f, g, h, a, b, c, d, e, 0xc76c51a3 + (w11 += sigma1(w9) + w4 + sigma0(w12)));
Round(e, f, g, h, a, b, c, d, 0xd192e819 + (w12 += sigma1(w10) + w5 + sigma0(w13)));
Round(d, e, f, g, h, a, b, c, 0xd6990624 + (w13 += sigma1(w11) + w6 + sigma0(w14)));
Round(c, d, e, f, g, h, a, b, 0xf40e3585 + (w14 += sigma1(w12) + w7 + sigma0(w15)));
Round(b, c, d, e, f, g, h, a, 0x106aa070 + (w15 += sigma1(w13) + w8 + sigma0(w0)));
Round(a, b, c, d, e, f, g, h, 0x19a4c116, w0 += sigma1(w14) + w9 + sigma0(w1));
Round(h, a, b, c, d, e, f, g, 0x1e376c08, w1 += sigma1(w15) + w10 + sigma0(w2));
Round(g, h, a, b, c, d, e, f, 0x2748774c, w2 += sigma1(w0) + w11 + sigma0(w3));
Round(f, g, h, a, b, c, d, e, 0x34b0bcb5, w3 += sigma1(w1) + w12 + sigma0(w4));
Round(e, f, g, h, a, b, c, d, 0x391c0cb3, w4 += sigma1(w2) + w13 + sigma0(w5));
Round(d, e, f, g, h, a, b, c, 0x4ed8aa4a, w5 += sigma1(w3) + w14 + sigma0(w6));
Round(c, d, e, f, g, h, a, b, 0x5b9cca4f, w6 += sigma1(w4) + w15 + sigma0(w7));
Round(b, c, d, e, f, g, h, a, 0x682e6ff3, w7 += sigma1(w5) + w0 + sigma0(w8));
Round(a, b, c, d, e, f, g, h, 0x748f82ee, w8 += sigma1(w6) + w1 + sigma0(w9));
Round(h, a, b, c, d, e, f, g, 0x78a5636f, w9 += sigma1(w7) + w2 + sigma0(w10));
Round(g, h, a, b, c, d, e, f, 0x84c87814, w10 += sigma1(w8) + w3 + sigma0(w11));
Round(f, g, h, a, b, c, d, e, 0x8cc70208, w11 += sigma1(w9) + w4 + sigma0(w12));
Round(e, f, g, h, a, b, c, d, 0x90befffa, w12 += sigma1(w10) + w5 + sigma0(w13));
Round(d, e, f, g, h, a, b, c, 0xa4506ceb, w13 += sigma1(w11) + w6 + sigma0(w14));
Round(c, d, e, f, g, h, a, b, 0xbef9a3f7, w14 + sigma1(w12) + w7 + sigma0(w15));
Round(b, c, d, e, f, g, h, a, 0xc67178f2, w15 + sigma1(w13) + w8 + sigma0(w0));
Round(a, b, c, d, e, f, g, h, 0x19a4c116 + (w0 += sigma1(w14) + w9 + sigma0(w1)));
Round(h, a, b, c, d, e, f, g, 0x1e376c08 + (w1 += sigma1(w15) + w10 + sigma0(w2)));
Round(g, h, a, b, c, d, e, f, 0x2748774c + (w2 += sigma1(w0) + w11 + sigma0(w3)));
Round(f, g, h, a, b, c, d, e, 0x34b0bcb5 + (w3 += sigma1(w1) + w12 + sigma0(w4)));
Round(e, f, g, h, a, b, c, d, 0x391c0cb3 + (w4 += sigma1(w2) + w13 + sigma0(w5)));
Round(d, e, f, g, h, a, b, c, 0x4ed8aa4a + (w5 += sigma1(w3) + w14 + sigma0(w6)));
Round(c, d, e, f, g, h, a, b, 0x5b9cca4f + (w6 += sigma1(w4) + w15 + sigma0(w7)));
Round(b, c, d, e, f, g, h, a, 0x682e6ff3 + (w7 += sigma1(w5) + w0 + sigma0(w8)));
Round(a, b, c, d, e, f, g, h, 0x748f82ee + (w8 += sigma1(w6) + w1 + sigma0(w9)));
Round(h, a, b, c, d, e, f, g, 0x78a5636f + (w9 += sigma1(w7) + w2 + sigma0(w10)));
Round(g, h, a, b, c, d, e, f, 0x84c87814 + (w10 += sigma1(w8) + w3 + sigma0(w11)));
Round(f, g, h, a, b, c, d, e, 0x8cc70208 + (w11 += sigma1(w9) + w4 + sigma0(w12)));
Round(e, f, g, h, a, b, c, d, 0x90befffa + (w12 += sigma1(w10) + w5 + sigma0(w13)));
Round(d, e, f, g, h, a, b, c, 0xa4506ceb + (w13 += sigma1(w11) + w6 + sigma0(w14)));
Round(c, d, e, f, g, h, a, b, 0xbef9a3f7 + (w14 + sigma1(w12) + w7 + sigma0(w15)));
Round(b, c, d, e, f, g, h, a, 0xc67178f2 + (w15 + sigma1(w13) + w8 + sigma0(w0)));
s[0] += a;
s[1] += b;
@ -141,9 +151,300 @@ void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
}
}
void TransformD64(unsigned char* out, const unsigned char* in)
{
// Transform 1
uint32_t a = 0x6a09e667ul;
uint32_t b = 0xbb67ae85ul;
uint32_t c = 0x3c6ef372ul;
uint32_t d = 0xa54ff53aul;
uint32_t e = 0x510e527ful;
uint32_t f = 0x9b05688cul;
uint32_t g = 0x1f83d9abul;
uint32_t h = 0x5be0cd19ul;
uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
Round(a, b, c, d, e, f, g, h, 0x428a2f98ul + (w0 = ReadBE32(in + 0)));
Round(h, a, b, c, d, e, f, g, 0x71374491ul + (w1 = ReadBE32(in + 4)));
Round(g, h, a, b, c, d, e, f, 0xb5c0fbcful + (w2 = ReadBE32(in + 8)));
Round(f, g, h, a, b, c, d, e, 0xe9b5dba5ul + (w3 = ReadBE32(in + 12)));
Round(e, f, g, h, a, b, c, d, 0x3956c25bul + (w4 = ReadBE32(in + 16)));
Round(d, e, f, g, h, a, b, c, 0x59f111f1ul + (w5 = ReadBE32(in + 20)));
Round(c, d, e, f, g, h, a, b, 0x923f82a4ul + (w6 = ReadBE32(in + 24)));
Round(b, c, d, e, f, g, h, a, 0xab1c5ed5ul + (w7 = ReadBE32(in + 28)));
Round(a, b, c, d, e, f, g, h, 0xd807aa98ul + (w8 = ReadBE32(in + 32)));
Round(h, a, b, c, d, e, f, g, 0x12835b01ul + (w9 = ReadBE32(in + 36)));
Round(g, h, a, b, c, d, e, f, 0x243185beul + (w10 = ReadBE32(in + 40)));
Round(f, g, h, a, b, c, d, e, 0x550c7dc3ul + (w11 = ReadBE32(in + 44)));
Round(e, f, g, h, a, b, c, d, 0x72be5d74ul + (w12 = ReadBE32(in + 48)));
Round(d, e, f, g, h, a, b, c, 0x80deb1feul + (w13 = ReadBE32(in + 52)));
Round(c, d, e, f, g, h, a, b, 0x9bdc06a7ul + (w14 = ReadBE32(in + 56)));
Round(b, c, d, e, f, g, h, a, 0xc19bf174ul + (w15 = ReadBE32(in + 60)));
Round(a, b, c, d, e, f, g, h, 0xe49b69c1ul + (w0 += sigma1(w14) + w9 + sigma0(w1)));
Round(h, a, b, c, d, e, f, g, 0xefbe4786ul + (w1 += sigma1(w15) + w10 + sigma0(w2)));
Round(g, h, a, b, c, d, e, f, 0x0fc19dc6ul + (w2 += sigma1(w0) + w11 + sigma0(w3)));
Round(f, g, h, a, b, c, d, e, 0x240ca1ccul + (w3 += sigma1(w1) + w12 + sigma0(w4)));
Round(e, f, g, h, a, b, c, d, 0x2de92c6ful + (w4 += sigma1(w2) + w13 + sigma0(w5)));
Round(d, e, f, g, h, a, b, c, 0x4a7484aaul + (w5 += sigma1(w3) + w14 + sigma0(w6)));
Round(c, d, e, f, g, h, a, b, 0x5cb0a9dcul + (w6 += sigma1(w4) + w15 + sigma0(w7)));
Round(b, c, d, e, f, g, h, a, 0x76f988daul + (w7 += sigma1(w5) + w0 + sigma0(w8)));
Round(a, b, c, d, e, f, g, h, 0x983e5152ul + (w8 += sigma1(w6) + w1 + sigma0(w9)));
Round(h, a, b, c, d, e, f, g, 0xa831c66dul + (w9 += sigma1(w7) + w2 + sigma0(w10)));
Round(g, h, a, b, c, d, e, f, 0xb00327c8ul + (w10 += sigma1(w8) + w3 + sigma0(w11)));
Round(f, g, h, a, b, c, d, e, 0xbf597fc7ul + (w11 += sigma1(w9) + w4 + sigma0(w12)));
Round(e, f, g, h, a, b, c, d, 0xc6e00bf3ul + (w12 += sigma1(w10) + w5 + sigma0(w13)));
Round(d, e, f, g, h, a, b, c, 0xd5a79147ul + (w13 += sigma1(w11) + w6 + sigma0(w14)));
Round(c, d, e, f, g, h, a, b, 0x06ca6351ul + (w14 += sigma1(w12) + w7 + sigma0(w15)));
Round(b, c, d, e, f, g, h, a, 0x14292967ul + (w15 += sigma1(w13) + w8 + sigma0(w0)));
Round(a, b, c, d, e, f, g, h, 0x27b70a85ul + (w0 += sigma1(w14) + w9 + sigma0(w1)));
Round(h, a, b, c, d, e, f, g, 0x2e1b2138ul + (w1 += sigma1(w15) + w10 + sigma0(w2)));
Round(g, h, a, b, c, d, e, f, 0x4d2c6dfcul + (w2 += sigma1(w0) + w11 + sigma0(w3)));
Round(f, g, h, a, b, c, d, e, 0x53380d13ul + (w3 += sigma1(w1) + w12 + sigma0(w4)));
Round(e, f, g, h, a, b, c, d, 0x650a7354ul + (w4 += sigma1(w2) + w13 + sigma0(w5)));
Round(d, e, f, g, h, a, b, c, 0x766a0abbul + (w5 += sigma1(w3) + w14 + sigma0(w6)));
Round(c, d, e, f, g, h, a, b, 0x81c2c92eul + (w6 += sigma1(w4) + w15 + sigma0(w7)));
Round(b, c, d, e, f, g, h, a, 0x92722c85ul + (w7 += sigma1(w5) + w0 + sigma0(w8)));
Round(a, b, c, d, e, f, g, h, 0xa2bfe8a1ul + (w8 += sigma1(w6) + w1 + sigma0(w9)));
Round(h, a, b, c, d, e, f, g, 0xa81a664bul + (w9 += sigma1(w7) + w2 + sigma0(w10)));
Round(g, h, a, b, c, d, e, f, 0xc24b8b70ul + (w10 += sigma1(w8) + w3 + sigma0(w11)));
Round(f, g, h, a, b, c, d, e, 0xc76c51a3ul + (w11 += sigma1(w9) + w4 + sigma0(w12)));
Round(e, f, g, h, a, b, c, d, 0xd192e819ul + (w12 += sigma1(w10) + w5 + sigma0(w13)));
Round(d, e, f, g, h, a, b, c, 0xd6990624ul + (w13 += sigma1(w11) + w6 + sigma0(w14)));
Round(c, d, e, f, g, h, a, b, 0xf40e3585ul + (w14 += sigma1(w12) + w7 + sigma0(w15)));
Round(b, c, d, e, f, g, h, a, 0x106aa070ul + (w15 += sigma1(w13) + w8 + sigma0(w0)));
Round(a, b, c, d, e, f, g, h, 0x19a4c116ul + (w0 += sigma1(w14) + w9 + sigma0(w1)));
Round(h, a, b, c, d, e, f, g, 0x1e376c08ul + (w1 += sigma1(w15) + w10 + sigma0(w2)));
Round(g, h, a, b, c, d, e, f, 0x2748774cul + (w2 += sigma1(w0) + w11 + sigma0(w3)));
Round(f, g, h, a, b, c, d, e, 0x34b0bcb5ul + (w3 += sigma1(w1) + w12 + sigma0(w4)));
Round(e, f, g, h, a, b, c, d, 0x391c0cb3ul + (w4 += sigma1(w2) + w13 + sigma0(w5)));
Round(d, e, f, g, h, a, b, c, 0x4ed8aa4aul + (w5 += sigma1(w3) + w14 + sigma0(w6)));
Round(c, d, e, f, g, h, a, b, 0x5b9cca4ful + (w6 += sigma1(w4) + w15 + sigma0(w7)));
Round(b, c, d, e, f, g, h, a, 0x682e6ff3ul + (w7 += sigma1(w5) + w0 + sigma0(w8)));
Round(a, b, c, d, e, f, g, h, 0x748f82eeul + (w8 += sigma1(w6) + w1 + sigma0(w9)));
Round(h, a, b, c, d, e, f, g, 0x78a5636ful + (w9 += sigma1(w7) + w2 + sigma0(w10)));
Round(g, h, a, b, c, d, e, f, 0x84c87814ul + (w10 += sigma1(w8) + w3 + sigma0(w11)));
Round(f, g, h, a, b, c, d, e, 0x8cc70208ul + (w11 += sigma1(w9) + w4 + sigma0(w12)));
Round(e, f, g, h, a, b, c, d, 0x90befffaul + (w12 += sigma1(w10) + w5 + sigma0(w13)));
Round(d, e, f, g, h, a, b, c, 0xa4506cebul + (w13 += sigma1(w11) + w6 + sigma0(w14)));
Round(c, d, e, f, g, h, a, b, 0xbef9a3f7ul + (w14 + sigma1(w12) + w7 + sigma0(w15)));
Round(b, c, d, e, f, g, h, a, 0xc67178f2ul + (w15 + sigma1(w13) + w8 + sigma0(w0)));
a += 0x6a09e667ul;
b += 0xbb67ae85ul;
c += 0x3c6ef372ul;
d += 0xa54ff53aul;
e += 0x510e527ful;
f += 0x9b05688cul;
g += 0x1f83d9abul;
h += 0x5be0cd19ul;
uint32_t t0 = a, t1 = b, t2 = c, t3 = d, t4 = e, t5 = f, t6 = g, t7 = h;
// Transform 2
Round(a, b, c, d, e, f, g, h, 0xc28a2f98ul);
Round(h, a, b, c, d, e, f, g, 0x71374491ul);
Round(g, h, a, b, c, d, e, f, 0xb5c0fbcful);
Round(f, g, h, a, b, c, d, e, 0xe9b5dba5ul);
Round(e, f, g, h, a, b, c, d, 0x3956c25bul);
Round(d, e, f, g, h, a, b, c, 0x59f111f1ul);
Round(c, d, e, f, g, h, a, b, 0x923f82a4ul);
Round(b, c, d, e, f, g, h, a, 0xab1c5ed5ul);
Round(a, b, c, d, e, f, g, h, 0xd807aa98ul);
Round(h, a, b, c, d, e, f, g, 0x12835b01ul);
Round(g, h, a, b, c, d, e, f, 0x243185beul);
Round(f, g, h, a, b, c, d, e, 0x550c7dc3ul);
Round(e, f, g, h, a, b, c, d, 0x72be5d74ul);
Round(d, e, f, g, h, a, b, c, 0x80deb1feul);
Round(c, d, e, f, g, h, a, b, 0x9bdc06a7ul);
Round(b, c, d, e, f, g, h, a, 0xc19bf374ul);
Round(a, b, c, d, e, f, g, h, 0x649b69c1ul);
Round(h, a, b, c, d, e, f, g, 0xf0fe4786ul);
Round(g, h, a, b, c, d, e, f, 0x0fe1edc6ul);
Round(f, g, h, a, b, c, d, e, 0x240cf254ul);
Round(e, f, g, h, a, b, c, d, 0x4fe9346ful);
Round(d, e, f, g, h, a, b, c, 0x6cc984beul);
Round(c, d, e, f, g, h, a, b, 0x61b9411eul);
Round(b, c, d, e, f, g, h, a, 0x16f988faul);
Round(a, b, c, d, e, f, g, h, 0xf2c65152ul);
Round(h, a, b, c, d, e, f, g, 0xa88e5a6dul);
Round(g, h, a, b, c, d, e, f, 0xb019fc65ul);
Round(f, g, h, a, b, c, d, e, 0xb9d99ec7ul);
Round(e, f, g, h, a, b, c, d, 0x9a1231c3ul);
Round(d, e, f, g, h, a, b, c, 0xe70eeaa0ul);
Round(c, d, e, f, g, h, a, b, 0xfdb1232bul);
Round(b, c, d, e, f, g, h, a, 0xc7353eb0ul);
Round(a, b, c, d, e, f, g, h, 0x3069bad5ul);
Round(h, a, b, c, d, e, f, g, 0xcb976d5ful);
Round(g, h, a, b, c, d, e, f, 0x5a0f118ful);
Round(f, g, h, a, b, c, d, e, 0xdc1eeefdul);
Round(e, f, g, h, a, b, c, d, 0x0a35b689ul);
Round(d, e, f, g, h, a, b, c, 0xde0b7a04ul);
Round(c, d, e, f, g, h, a, b, 0x58f4ca9dul);
Round(b, c, d, e, f, g, h, a, 0xe15d5b16ul);
Round(a, b, c, d, e, f, g, h, 0x007f3e86ul);
Round(h, a, b, c, d, e, f, g, 0x37088980ul);
Round(g, h, a, b, c, d, e, f, 0xa507ea32ul);
Round(f, g, h, a, b, c, d, e, 0x6fab9537ul);
Round(e, f, g, h, a, b, c, d, 0x17406110ul);
Round(d, e, f, g, h, a, b, c, 0x0d8cd6f1ul);
Round(c, d, e, f, g, h, a, b, 0xcdaa3b6dul);
Round(b, c, d, e, f, g, h, a, 0xc0bbbe37ul);
Round(a, b, c, d, e, f, g, h, 0x83613bdaul);
Round(h, a, b, c, d, e, f, g, 0xdb48a363ul);
Round(g, h, a, b, c, d, e, f, 0x0b02e931ul);
Round(f, g, h, a, b, c, d, e, 0x6fd15ca7ul);
Round(e, f, g, h, a, b, c, d, 0x521afacaul);
Round(d, e, f, g, h, a, b, c, 0x31338431ul);
Round(c, d, e, f, g, h, a, b, 0x6ed41a95ul);
Round(b, c, d, e, f, g, h, a, 0x6d437890ul);
Round(a, b, c, d, e, f, g, h, 0xc39c91f2ul);
Round(h, a, b, c, d, e, f, g, 0x9eccabbdul);
Round(g, h, a, b, c, d, e, f, 0xb5c9a0e6ul);
Round(f, g, h, a, b, c, d, e, 0x532fb63cul);
Round(e, f, g, h, a, b, c, d, 0xd2c741c6ul);
Round(d, e, f, g, h, a, b, c, 0x07237ea3ul);
Round(c, d, e, f, g, h, a, b, 0xa4954b68ul);
Round(b, c, d, e, f, g, h, a, 0x4c191d76ul);
w0 = t0 + a;
w1 = t1 + b;
w2 = t2 + c;
w3 = t3 + d;
w4 = t4 + e;
w5 = t5 + f;
w6 = t6 + g;
w7 = t7 + h;
// Transform 3
a = 0x6a09e667ul;
b = 0xbb67ae85ul;
c = 0x3c6ef372ul;
d = 0xa54ff53aul;
e = 0x510e527ful;
f = 0x9b05688cul;
g = 0x1f83d9abul;
h = 0x5be0cd19ul;
Round(a, b, c, d, e, f, g, h, 0x428a2f98ul + w0);
Round(h, a, b, c, d, e, f, g, 0x71374491ul + w1);
Round(g, h, a, b, c, d, e, f, 0xb5c0fbcful + w2);
Round(f, g, h, a, b, c, d, e, 0xe9b5dba5ul + w3);
Round(e, f, g, h, a, b, c, d, 0x3956c25bul + w4);
Round(d, e, f, g, h, a, b, c, 0x59f111f1ul + w5);
Round(c, d, e, f, g, h, a, b, 0x923f82a4ul + w6);
Round(b, c, d, e, f, g, h, a, 0xab1c5ed5ul + w7);
Round(a, b, c, d, e, f, g, h, 0x5807aa98ul);
Round(h, a, b, c, d, e, f, g, 0x12835b01ul);
Round(g, h, a, b, c, d, e, f, 0x243185beul);
Round(f, g, h, a, b, c, d, e, 0x550c7dc3ul);
Round(e, f, g, h, a, b, c, d, 0x72be5d74ul);
Round(d, e, f, g, h, a, b, c, 0x80deb1feul);
Round(c, d, e, f, g, h, a, b, 0x9bdc06a7ul);
Round(b, c, d, e, f, g, h, a, 0xc19bf274ul);
Round(a, b, c, d, e, f, g, h, 0xe49b69c1ul + (w0 += sigma0(w1)));
Round(h, a, b, c, d, e, f, g, 0xefbe4786ul + (w1 += 0xa00000ul + sigma0(w2)));
Round(g, h, a, b, c, d, e, f, 0x0fc19dc6ul + (w2 += sigma1(w0) + sigma0(w3)));
Round(f, g, h, a, b, c, d, e, 0x240ca1ccul + (w3 += sigma1(w1) + sigma0(w4)));
Round(e, f, g, h, a, b, c, d, 0x2de92c6ful + (w4 += sigma1(w2) + sigma0(w5)));
Round(d, e, f, g, h, a, b, c, 0x4a7484aaul + (w5 += sigma1(w3) + sigma0(w6)));
Round(c, d, e, f, g, h, a, b, 0x5cb0a9dcul + (w6 += sigma1(w4) + 0x100ul + sigma0(w7)));
Round(b, c, d, e, f, g, h, a, 0x76f988daul + (w7 += sigma1(w5) + w0 + 0x11002000ul));
Round(a, b, c, d, e, f, g, h, 0x983e5152ul + (w8 = 0x80000000ul + sigma1(w6) + w1));
Round(h, a, b, c, d, e, f, g, 0xa831c66dul + (w9 = sigma1(w7) + w2));
Round(g, h, a, b, c, d, e, f, 0xb00327c8ul + (w10 = sigma1(w8) + w3));
Round(f, g, h, a, b, c, d, e, 0xbf597fc7ul + (w11 = sigma1(w9) + w4));
Round(e, f, g, h, a, b, c, d, 0xc6e00bf3ul + (w12 = sigma1(w10) + w5));
Round(d, e, f, g, h, a, b, c, 0xd5a79147ul + (w13 = sigma1(w11) + w6));
Round(c, d, e, f, g, h, a, b, 0x06ca6351ul + (w14 = sigma1(w12) + w7 + 0x400022ul));
Round(b, c, d, e, f, g, h, a, 0x14292967ul + (w15 = 0x100ul + sigma1(w13) + w8 + sigma0(w0)));
Round(a, b, c, d, e, f, g, h, 0x27b70a85ul + (w0 += sigma1(w14) + w9 + sigma0(w1)));
Round(h, a, b, c, d, e, f, g, 0x2e1b2138ul + (w1 += sigma1(w15) + w10 + sigma0(w2)));
Round(g, h, a, b, c, d, e, f, 0x4d2c6dfcul + (w2 += sigma1(w0) + w11 + sigma0(w3)));
Round(f, g, h, a, b, c, d, e, 0x53380d13ul + (w3 += sigma1(w1) + w12 + sigma0(w4)));
Round(e, f, g, h, a, b, c, d, 0x650a7354ul + (w4 += sigma1(w2) + w13 + sigma0(w5)));
Round(d, e, f, g, h, a, b, c, 0x766a0abbul + (w5 += sigma1(w3) + w14 + sigma0(w6)));
Round(c, d, e, f, g, h, a, b, 0x81c2c92eul + (w6 += sigma1(w4) + w15 + sigma0(w7)));
Round(b, c, d, e, f, g, h, a, 0x92722c85ul + (w7 += sigma1(w5) + w0 + sigma0(w8)));
Round(a, b, c, d, e, f, g, h, 0xa2bfe8a1ul + (w8 += sigma1(w6) + w1 + sigma0(w9)));
Round(h, a, b, c, d, e, f, g, 0xa81a664bul + (w9 += sigma1(w7) + w2 + sigma0(w10)));
Round(g, h, a, b, c, d, e, f, 0xc24b8b70ul + (w10 += sigma1(w8) + w3 + sigma0(w11)));
Round(f, g, h, a, b, c, d, e, 0xc76c51a3ul + (w11 += sigma1(w9) + w4 + sigma0(w12)));
Round(e, f, g, h, a, b, c, d, 0xd192e819ul + (w12 += sigma1(w10) + w5 + sigma0(w13)));
Round(d, e, f, g, h, a, b, c, 0xd6990624ul + (w13 += sigma1(w11) + w6 + sigma0(w14)));
Round(c, d, e, f, g, h, a, b, 0xf40e3585ul + (w14 += sigma1(w12) + w7 + sigma0(w15)));
Round(b, c, d, e, f, g, h, a, 0x106aa070ul + (w15 += sigma1(w13) + w8 + sigma0(w0)));
Round(a, b, c, d, e, f, g, h, 0x19a4c116ul + (w0 += sigma1(w14) + w9 + sigma0(w1)));
Round(h, a, b, c, d, e, f, g, 0x1e376c08ul + (w1 += sigma1(w15) + w10 + sigma0(w2)));
Round(g, h, a, b, c, d, e, f, 0x2748774cul + (w2 += sigma1(w0) + w11 + sigma0(w3)));
Round(f, g, h, a, b, c, d, e, 0x34b0bcb5ul + (w3 += sigma1(w1) + w12 + sigma0(w4)));
Round(e, f, g, h, a, b, c, d, 0x391c0cb3ul + (w4 += sigma1(w2) + w13 + sigma0(w5)));
Round(d, e, f, g, h, a, b, c, 0x4ed8aa4aul + (w5 += sigma1(w3) + w14 + sigma0(w6)));
Round(c, d, e, f, g, h, a, b, 0x5b9cca4ful + (w6 += sigma1(w4) + w15 + sigma0(w7)));
Round(b, c, d, e, f, g, h, a, 0x682e6ff3ul + (w7 += sigma1(w5) + w0 + sigma0(w8)));
Round(a, b, c, d, e, f, g, h, 0x748f82eeul + (w8 += sigma1(w6) + w1 + sigma0(w9)));
Round(h, a, b, c, d, e, f, g, 0x78a5636ful + (w9 += sigma1(w7) + w2 + sigma0(w10)));
Round(g, h, a, b, c, d, e, f, 0x84c87814ul + (w10 += sigma1(w8) + w3 + sigma0(w11)));
Round(f, g, h, a, b, c, d, e, 0x8cc70208ul + (w11 += sigma1(w9) + w4 + sigma0(w12)));
Round(e, f, g, h, a, b, c, d, 0x90befffaul + (w12 += sigma1(w10) + w5 + sigma0(w13)));
Round(d, e, f, g, h, a, b, c, 0xa4506cebul + (w13 += sigma1(w11) + w6 + sigma0(w14)));
Round(c, d, e, f, g, h, a, b, 0xbef9a3f7ul + (w14 + sigma1(w12) + w7 + sigma0(w15)));
Round(b, c, d, e, f, g, h, a, 0xc67178f2ul + (w15 + sigma1(w13) + w8 + sigma0(w0)));
// Output
WriteBE32(out + 0, a + 0x6a09e667ul);
WriteBE32(out + 4, b + 0xbb67ae85ul);
WriteBE32(out + 8, c + 0x3c6ef372ul);
WriteBE32(out + 12, d + 0xa54ff53aul);
WriteBE32(out + 16, e + 0x510e527ful);
WriteBE32(out + 20, f + 0x9b05688cul);
WriteBE32(out + 24, g + 0x1f83d9abul);
WriteBE32(out + 28, h + 0x5be0cd19ul);
}
} // namespace sha256
typedef void (*TransformType)(uint32_t*, const unsigned char*, size_t);
typedef void (*TransformD64Type)(unsigned char*, const unsigned char*);
template<TransformType tr>
void TransformD64Wrapper(unsigned char* out, const unsigned char* in)
{
uint32_t s[8];
static const unsigned char padding1[64] = {
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0
};
unsigned char buffer2[64] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0
};
sha256::Initialize(s);
tr(s, in, 1);
tr(s, padding1, 1);
WriteBE32(buffer2 + 0, s[0]);
WriteBE32(buffer2 + 4, s[1]);
WriteBE32(buffer2 + 8, s[2]);
WriteBE32(buffer2 + 12, s[3]);
WriteBE32(buffer2 + 16, s[4]);
WriteBE32(buffer2 + 20, s[5]);
WriteBE32(buffer2 + 24, s[6]);
WriteBE32(buffer2 + 28, s[7]);
sha256::Initialize(s);
tr(s, buffer2, 1);
WriteBE32(out + 0, s[0]);
WriteBE32(out + 4, s[1]);
WriteBE32(out + 8, s[2]);
WriteBE32(out + 12, s[3]);
WriteBE32(out + 16, s[4]);
WriteBE32(out + 20, s[5]);
WriteBE32(out + 24, s[6]);
WriteBE32(out + 28, s[7]);
}
bool SelfTest(TransformType tr) {
static const unsigned char in1[65] = {0, 0x80};
@ -173,22 +474,47 @@ bool SelfTest(TransformType tr) {
}
TransformType Transform = sha256::Transform;
TransformD64Type TransformD64 = sha256::TransformD64;
TransformD64Type TransformD64_4way = nullptr;
TransformD64Type TransformD64_8way = nullptr;
#if defined(USE_ASM) && (defined(__x86_64__) || defined(__amd64__))
// We can't use cpuid.h's __get_cpuid as it does not support subleafs.
void inline cpuid(uint32_t leaf, uint32_t subleaf, uint32_t& a, uint32_t& b, uint32_t& c, uint32_t& d)
{
__asm__ ("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(leaf), "2"(subleaf));
}
#endif
} // namespace
std::string SHA256AutoDetect()
{
std::string ret = "standard";
#if defined(USE_ASM) && (defined(__x86_64__) || defined(__amd64__))
uint32_t eax, ebx, ecx, edx;
if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) && (ecx >> 19) & 1) {
cpuid(1, 0, eax, ebx, ecx, edx);
if ((ecx >> 19) & 1) {
Transform = sha256_sse4::Transform;
assert(SelfTest(Transform));
return "sse4";
TransformD64 = TransformD64Wrapper<sha256_sse4::Transform>;
#if defined(ENABLE_SSE41) && !defined(BUILD_BITCOIN_INTERNAL)
TransformD64_4way = sha256d64_sse41::Transform_4way;
ret = "sse4(1way+4way)";
#if defined(ENABLE_AVX2) && !defined(BUILD_BITCOIN_INTERNAL)
cpuid(7, 0, eax, ebx, ecx, edx);
if ((ebx >> 5) & 1) {
TransformD64_8way = sha256d64_avx2::Transform_8way;
ret += ",avx2(8way)";
}
#endif
#else
ret = "sse4";
#endif
}
#endif
assert(SelfTest(Transform));
return "standard";
return ret;
}
////// SHA-256
@ -247,3 +573,29 @@ CSHA256& CSHA256::Reset()
sha256::Initialize(s);
return *this;
}
void SHA256D64(unsigned char* out, const unsigned char* in, size_t blocks)
{
if (TransformD64_8way) {
while (blocks >= 8) {
TransformD64_8way(out, in);
out += 256;
in += 512;
blocks -= 8;
}
}
if (TransformD64_4way) {
while (blocks >= 4) {
TransformD64_4way(out, in);
out += 128;
in += 256;
blocks -= 4;
}
}
while (blocks) {
TransformD64(out, in);
out += 32;
in += 64;
--blocks;
}
}

View file

@ -31,4 +31,11 @@ public:
*/
std::string SHA256AutoDetect();
/** Compute multiple double-SHA256's of 64-byte blobs.
* output: pointer to a blocks*32 byte output buffer
* input: pointer to a blocks*64 byte input buffer
* blocks: the number of hashes to compute.
*/
void SHA256D64(unsigned char* output, const unsigned char* input, size_t blocks);
#endif // BITCOIN_CRYPTO_SHA256_H

329
src/crypto/sha256_avx2.cpp Normal file
View file

@ -0,0 +1,329 @@
#ifdef ENABLE_AVX2
#include <stdint.h>
#if defined(_MSC_VER)
#include <immintrin.h>
#elif defined(__GNUC__)
#include <x86intrin.h>
#endif
#include "crypto/sha256.h"
#include "crypto/common.h"
namespace sha256d64_avx2 {
namespace {
__m256i inline K(uint32_t x) { return _mm256_set1_epi32(x); }
__m256i inline Add(__m256i x, __m256i y) { return _mm256_add_epi32(x, y); }
__m256i inline Add(__m256i x, __m256i y, __m256i z) { return Add(Add(x, y), z); }
__m256i inline Add(__m256i x, __m256i y, __m256i z, __m256i w) { return Add(Add(x, y), Add(z, w)); }
__m256i inline Add(__m256i x, __m256i y, __m256i z, __m256i w, __m256i v) { return Add(Add(x, y, z), Add(w, v)); }
__m256i inline Inc(__m256i& x, __m256i y) { x = Add(x, y); return x; }
__m256i inline Inc(__m256i& x, __m256i y, __m256i z) { x = Add(x, y, z); return x; }
__m256i inline Inc(__m256i& x, __m256i y, __m256i z, __m256i w) { x = Add(x, y, z, w); return x; }
__m256i inline Xor(__m256i x, __m256i y) { return _mm256_xor_si256(x, y); }
__m256i inline Xor(__m256i x, __m256i y, __m256i z) { return Xor(Xor(x, y), z); }
__m256i inline Or(__m256i x, __m256i y) { return _mm256_or_si256(x, y); }
__m256i inline And(__m256i x, __m256i y) { return _mm256_and_si256(x, y); }
__m256i inline ShR(__m256i x, int n) { return _mm256_srli_epi32(x, n); }
__m256i inline ShL(__m256i x, int n) { return _mm256_slli_epi32(x, n); }
__m256i inline Ch(__m256i x, __m256i y, __m256i z) { return Xor(z, And(x, Xor(y, z))); }
__m256i inline Maj(__m256i x, __m256i y, __m256i z) { return Or(And(x, y), And(z, Or(x, y))); }
__m256i inline Sigma0(__m256i x) { return Xor(Or(ShR(x, 2), ShL(x, 30)), Or(ShR(x, 13), ShL(x, 19)), Or(ShR(x, 22), ShL(x, 10))); }
__m256i inline Sigma1(__m256i x) { return Xor(Or(ShR(x, 6), ShL(x, 26)), Or(ShR(x, 11), ShL(x, 21)), Or(ShR(x, 25), ShL(x, 7))); }
__m256i inline sigma0(__m256i x) { return Xor(Or(ShR(x, 7), ShL(x, 25)), Or(ShR(x, 18), ShL(x, 14)), ShR(x, 3)); }
__m256i inline sigma1(__m256i x) { return Xor(Or(ShR(x, 17), ShL(x, 15)), Or(ShR(x, 19), ShL(x, 13)), ShR(x, 10)); }
/** One round of SHA-256. */
void inline __attribute__((always_inline)) Round(__m256i a, __m256i b, __m256i c, __m256i& d, __m256i e, __m256i f, __m256i g, __m256i& h, __m256i k)
{
__m256i t1 = Add(h, Sigma1(e), Ch(e, f, g), k);
__m256i t2 = Add(Sigma0(a), Maj(a, b, c));
d = Add(d, t1);
h = Add(t1, t2);
}
__m256i inline Read8(const unsigned char* chunk, int offset) {
__m256i ret = _mm256_set_epi32(
ReadLE32(chunk + 0 + offset),
ReadLE32(chunk + 64 + offset),
ReadLE32(chunk + 128 + offset),
ReadLE32(chunk + 192 + offset),
ReadLE32(chunk + 256 + offset),
ReadLE32(chunk + 320 + offset),
ReadLE32(chunk + 384 + offset),
ReadLE32(chunk + 448 + offset)
);
return _mm256_shuffle_epi8(ret, _mm256_set_epi32(0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL, 0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL));
}
void inline Write8(unsigned char* out, int offset, __m256i v) {
v = _mm256_shuffle_epi8(v, _mm256_set_epi32(0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL, 0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL));
WriteLE32(out + 0 + offset, _mm256_extract_epi32(v, 7));
WriteLE32(out + 32 + offset, _mm256_extract_epi32(v, 6));
WriteLE32(out + 64 + offset, _mm256_extract_epi32(v, 5));
WriteLE32(out + 96 + offset, _mm256_extract_epi32(v, 4));
WriteLE32(out + 128 + offset, _mm256_extract_epi32(v, 3));
WriteLE32(out + 160 + offset, _mm256_extract_epi32(v, 2));
WriteLE32(out + 192 + offset, _mm256_extract_epi32(v, 1));
WriteLE32(out + 224 + offset, _mm256_extract_epi32(v, 0));
}
}
void Transform_8way(unsigned char* out, const unsigned char* in)
{
// Transform 1
__m256i a = K(0x6a09e667ul);
__m256i b = K(0xbb67ae85ul);
__m256i c = K(0x3c6ef372ul);
__m256i d = K(0xa54ff53aul);
__m256i e = K(0x510e527ful);
__m256i f = K(0x9b05688cul);
__m256i g = K(0x1f83d9abul);
__m256i h = K(0x5be0cd19ul);
__m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
Round(a, b, c, d, e, f, g, h, Add(K(0x428a2f98ul), w0 = Read8(in, 0)));
Round(h, a, b, c, d, e, f, g, Add(K(0x71374491ul), w1 = Read8(in, 4)));
Round(g, h, a, b, c, d, e, f, Add(K(0xb5c0fbcful), w2 = Read8(in, 8)));
Round(f, g, h, a, b, c, d, e, Add(K(0xe9b5dba5ul), w3 = Read8(in, 12)));
Round(e, f, g, h, a, b, c, d, Add(K(0x3956c25bul), w4 = Read8(in, 16)));
Round(d, e, f, g, h, a, b, c, Add(K(0x59f111f1ul), w5 = Read8(in, 20)));
Round(c, d, e, f, g, h, a, b, Add(K(0x923f82a4ul), w6 = Read8(in, 24)));
Round(b, c, d, e, f, g, h, a, Add(K(0xab1c5ed5ul), w7 = Read8(in, 28)));
Round(a, b, c, d, e, f, g, h, Add(K(0xd807aa98ul), w8 = Read8(in, 32)));
Round(h, a, b, c, d, e, f, g, Add(K(0x12835b01ul), w9 = Read8(in, 36)));
Round(g, h, a, b, c, d, e, f, Add(K(0x243185beul), w10 = Read8(in, 40)));
Round(f, g, h, a, b, c, d, e, Add(K(0x550c7dc3ul), w11 = Read8(in, 44)));
Round(e, f, g, h, a, b, c, d, Add(K(0x72be5d74ul), w12 = Read8(in, 48)));
Round(d, e, f, g, h, a, b, c, Add(K(0x80deb1feul), w13 = Read8(in, 52)));
Round(c, d, e, f, g, h, a, b, Add(K(0x9bdc06a7ul), w14 = Read8(in, 56)));
Round(b, c, d, e, f, g, h, a, Add(K(0xc19bf174ul), w15 = Read8(in, 60)));
Round(a, b, c, d, e, f, g, h, Add(K(0xe49b69c1ul), Inc(w0, sigma1(w14), w9, sigma0(w1))));
Round(h, a, b, c, d, e, f, g, Add(K(0xefbe4786ul), Inc(w1, sigma1(w15), w10, sigma0(w2))));
Round(g, h, a, b, c, d, e, f, Add(K(0x0fc19dc6ul), Inc(w2, sigma1(w0), w11, sigma0(w3))));
Round(f, g, h, a, b, c, d, e, Add(K(0x240ca1ccul), Inc(w3, sigma1(w1), w12, sigma0(w4))));
Round(e, f, g, h, a, b, c, d, Add(K(0x2de92c6ful), Inc(w4, sigma1(w2), w13, sigma0(w5))));
Round(d, e, f, g, h, a, b, c, Add(K(0x4a7484aaul), Inc(w5, sigma1(w3), w14, sigma0(w6))));
Round(c, d, e, f, g, h, a, b, Add(K(0x5cb0a9dcul), Inc(w6, sigma1(w4), w15, sigma0(w7))));
Round(b, c, d, e, f, g, h, a, Add(K(0x76f988daul), Inc(w7, sigma1(w5), w0, sigma0(w8))));
Round(a, b, c, d, e, f, g, h, Add(K(0x983e5152ul), Inc(w8, sigma1(w6), w1, sigma0(w9))));
Round(h, a, b, c, d, e, f, g, Add(K(0xa831c66dul), Inc(w9, sigma1(w7), w2, sigma0(w10))));
Round(g, h, a, b, c, d, e, f, Add(K(0xb00327c8ul), Inc(w10, sigma1(w8), w3, sigma0(w11))));
Round(f, g, h, a, b, c, d, e, Add(K(0xbf597fc7ul), Inc(w11, sigma1(w9), w4, sigma0(w12))));
Round(e, f, g, h, a, b, c, d, Add(K(0xc6e00bf3ul), Inc(w12, sigma1(w10), w5, sigma0(w13))));
Round(d, e, f, g, h, a, b, c, Add(K(0xd5a79147ul), Inc(w13, sigma1(w11), w6, sigma0(w14))));
Round(c, d, e, f, g, h, a, b, Add(K(0x06ca6351ul), Inc(w14, sigma1(w12), w7, sigma0(w15))));
Round(b, c, d, e, f, g, h, a, Add(K(0x14292967ul), Inc(w15, sigma1(w13), w8, sigma0(w0))));
Round(a, b, c, d, e, f, g, h, Add(K(0x27b70a85ul), Inc(w0, sigma1(w14), w9, sigma0(w1))));
Round(h, a, b, c, d, e, f, g, Add(K(0x2e1b2138ul), Inc(w1, sigma1(w15), w10, sigma0(w2))));
Round(g, h, a, b, c, d, e, f, Add(K(0x4d2c6dfcul), Inc(w2, sigma1(w0), w11, sigma0(w3))));
Round(f, g, h, a, b, c, d, e, Add(K(0x53380d13ul), Inc(w3, sigma1(w1), w12, sigma0(w4))));
Round(e, f, g, h, a, b, c, d, Add(K(0x650a7354ul), Inc(w4, sigma1(w2), w13, sigma0(w5))));
Round(d, e, f, g, h, a, b, c, Add(K(0x766a0abbul), Inc(w5, sigma1(w3), w14, sigma0(w6))));
Round(c, d, e, f, g, h, a, b, Add(K(0x81c2c92eul), Inc(w6, sigma1(w4), w15, sigma0(w7))));
Round(b, c, d, e, f, g, h, a, Add(K(0x92722c85ul), Inc(w7, sigma1(w5), w0, sigma0(w8))));
Round(a, b, c, d, e, f, g, h, Add(K(0xa2bfe8a1ul), Inc(w8, sigma1(w6), w1, sigma0(w9))));
Round(h, a, b, c, d, e, f, g, Add(K(0xa81a664bul), Inc(w9, sigma1(w7), w2, sigma0(w10))));
Round(g, h, a, b, c, d, e, f, Add(K(0xc24b8b70ul), Inc(w10, sigma1(w8), w3, sigma0(w11))));
Round(f, g, h, a, b, c, d, e, Add(K(0xc76c51a3ul), Inc(w11, sigma1(w9), w4, sigma0(w12))));
Round(e, f, g, h, a, b, c, d, Add(K(0xd192e819ul), Inc(w12, sigma1(w10), w5, sigma0(w13))));
Round(d, e, f, g, h, a, b, c, Add(K(0xd6990624ul), Inc(w13, sigma1(w11), w6, sigma0(w14))));
Round(c, d, e, f, g, h, a, b, Add(K(0xf40e3585ul), Inc(w14, sigma1(w12), w7, sigma0(w15))));
Round(b, c, d, e, f, g, h, a, Add(K(0x106aa070ul), Inc(w15, sigma1(w13), w8, sigma0(w0))));
Round(a, b, c, d, e, f, g, h, Add(K(0x19a4c116ul), Inc(w0, sigma1(w14), w9, sigma0(w1))));
Round(h, a, b, c, d, e, f, g, Add(K(0x1e376c08ul), Inc(w1, sigma1(w15), w10, sigma0(w2))));
Round(g, h, a, b, c, d, e, f, Add(K(0x2748774cul), Inc(w2, sigma1(w0), w11, sigma0(w3))));
Round(f, g, h, a, b, c, d, e, Add(K(0x34b0bcb5ul), Inc(w3, sigma1(w1), w12, sigma0(w4))));
Round(e, f, g, h, a, b, c, d, Add(K(0x391c0cb3ul), Inc(w4, sigma1(w2), w13, sigma0(w5))));
Round(d, e, f, g, h, a, b, c, Add(K(0x4ed8aa4aul), Inc(w5, sigma1(w3), w14, sigma0(w6))));
Round(c, d, e, f, g, h, a, b, Add(K(0x5b9cca4ful), Inc(w6, sigma1(w4), w15, sigma0(w7))));
Round(b, c, d, e, f, g, h, a, Add(K(0x682e6ff3ul), Inc(w7, sigma1(w5), w0, sigma0(w8))));
Round(a, b, c, d, e, f, g, h, Add(K(0x748f82eeul), Inc(w8, sigma1(w6), w1, sigma0(w9))));
Round(h, a, b, c, d, e, f, g, Add(K(0x78a5636ful), Inc(w9, sigma1(w7), w2, sigma0(w10))));
Round(g, h, a, b, c, d, e, f, Add(K(0x84c87814ul), Inc(w10, sigma1(w8), w3, sigma0(w11))));
Round(f, g, h, a, b, c, d, e, Add(K(0x8cc70208ul), Inc(w11, sigma1(w9), w4, sigma0(w12))));
Round(e, f, g, h, a, b, c, d, Add(K(0x90befffaul), Inc(w12, sigma1(w10), w5, sigma0(w13))));
Round(d, e, f, g, h, a, b, c, Add(K(0xa4506cebul), Inc(w13, sigma1(w11), w6, sigma0(w14))));
Round(c, d, e, f, g, h, a, b, Add(K(0xbef9a3f7ul), Inc(w14, sigma1(w12), w7, sigma0(w15))));
Round(b, c, d, e, f, g, h, a, Add(K(0xc67178f2ul), Inc(w15, sigma1(w13), w8, sigma0(w0))));
a = Add(a, K(0x6a09e667ul));
b = Add(b, K(0xbb67ae85ul));
c = Add(c, K(0x3c6ef372ul));
d = Add(d, K(0xa54ff53aul));
e = Add(e, K(0x510e527ful));
f = Add(f, K(0x9b05688cul));
g = Add(g, K(0x1f83d9abul));
h = Add(h, K(0x5be0cd19ul));
__m256i t0 = a, t1 = b, t2 = c, t3 = d, t4 = e, t5 = f, t6 = g, t7 = h;
// Transform 2
Round(a, b, c, d, e, f, g, h, K(0xc28a2f98ul));
Round(h, a, b, c, d, e, f, g, K(0x71374491ul));
Round(g, h, a, b, c, d, e, f, K(0xb5c0fbcful));
Round(f, g, h, a, b, c, d, e, K(0xe9b5dba5ul));
Round(e, f, g, h, a, b, c, d, K(0x3956c25bul));
Round(d, e, f, g, h, a, b, c, K(0x59f111f1ul));
Round(c, d, e, f, g, h, a, b, K(0x923f82a4ul));
Round(b, c, d, e, f, g, h, a, K(0xab1c5ed5ul));
Round(a, b, c, d, e, f, g, h, K(0xd807aa98ul));
Round(h, a, b, c, d, e, f, g, K(0x12835b01ul));
Round(g, h, a, b, c, d, e, f, K(0x243185beul));
Round(f, g, h, a, b, c, d, e, K(0x550c7dc3ul));
Round(e, f, g, h, a, b, c, d, K(0x72be5d74ul));
Round(d, e, f, g, h, a, b, c, K(0x80deb1feul));
Round(c, d, e, f, g, h, a, b, K(0x9bdc06a7ul));
Round(b, c, d, e, f, g, h, a, K(0xc19bf374ul));
Round(a, b, c, d, e, f, g, h, K(0x649b69c1ul));
Round(h, a, b, c, d, e, f, g, K(0xf0fe4786ul));
Round(g, h, a, b, c, d, e, f, K(0x0fe1edc6ul));
Round(f, g, h, a, b, c, d, e, K(0x240cf254ul));
Round(e, f, g, h, a, b, c, d, K(0x4fe9346ful));
Round(d, e, f, g, h, a, b, c, K(0x6cc984beul));
Round(c, d, e, f, g, h, a, b, K(0x61b9411eul));
Round(b, c, d, e, f, g, h, a, K(0x16f988faul));
Round(a, b, c, d, e, f, g, h, K(0xf2c65152ul));
Round(h, a, b, c, d, e, f, g, K(0xa88e5a6dul));
Round(g, h, a, b, c, d, e, f, K(0xb019fc65ul));
Round(f, g, h, a, b, c, d, e, K(0xb9d99ec7ul));
Round(e, f, g, h, a, b, c, d, K(0x9a1231c3ul));
Round(d, e, f, g, h, a, b, c, K(0xe70eeaa0ul));
Round(c, d, e, f, g, h, a, b, K(0xfdb1232bul));
Round(b, c, d, e, f, g, h, a, K(0xc7353eb0ul));
Round(a, b, c, d, e, f, g, h, K(0x3069bad5ul));
Round(h, a, b, c, d, e, f, g, K(0xcb976d5ful));
Round(g, h, a, b, c, d, e, f, K(0x5a0f118ful));
Round(f, g, h, a, b, c, d, e, K(0xdc1eeefdul));
Round(e, f, g, h, a, b, c, d, K(0x0a35b689ul));
Round(d, e, f, g, h, a, b, c, K(0xde0b7a04ul));
Round(c, d, e, f, g, h, a, b, K(0x58f4ca9dul));
Round(b, c, d, e, f, g, h, a, K(0xe15d5b16ul));
Round(a, b, c, d, e, f, g, h, K(0x007f3e86ul));
Round(h, a, b, c, d, e, f, g, K(0x37088980ul));
Round(g, h, a, b, c, d, e, f, K(0xa507ea32ul));
Round(f, g, h, a, b, c, d, e, K(0x6fab9537ul));
Round(e, f, g, h, a, b, c, d, K(0x17406110ul));
Round(d, e, f, g, h, a, b, c, K(0x0d8cd6f1ul));
Round(c, d, e, f, g, h, a, b, K(0xcdaa3b6dul));
Round(b, c, d, e, f, g, h, a, K(0xc0bbbe37ul));
Round(a, b, c, d, e, f, g, h, K(0x83613bdaul));
Round(h, a, b, c, d, e, f, g, K(0xdb48a363ul));
Round(g, h, a, b, c, d, e, f, K(0x0b02e931ul));
Round(f, g, h, a, b, c, d, e, K(0x6fd15ca7ul));
Round(e, f, g, h, a, b, c, d, K(0x521afacaul));
Round(d, e, f, g, h, a, b, c, K(0x31338431ul));
Round(c, d, e, f, g, h, a, b, K(0x6ed41a95ul));
Round(b, c, d, e, f, g, h, a, K(0x6d437890ul));
Round(a, b, c, d, e, f, g, h, K(0xc39c91f2ul));
Round(h, a, b, c, d, e, f, g, K(0x9eccabbdul));
Round(g, h, a, b, c, d, e, f, K(0xb5c9a0e6ul));
Round(f, g, h, a, b, c, d, e, K(0x532fb63cul));
Round(e, f, g, h, a, b, c, d, K(0xd2c741c6ul));
Round(d, e, f, g, h, a, b, c, K(0x07237ea3ul));
Round(c, d, e, f, g, h, a, b, K(0xa4954b68ul));
Round(b, c, d, e, f, g, h, a, K(0x4c191d76ul));
w0 = Add(t0, a);
w1 = Add(t1, b);
w2 = Add(t2, c);
w3 = Add(t3, d);
w4 = Add(t4, e);
w5 = Add(t5, f);
w6 = Add(t6, g);
w7 = Add(t7, h);
// Transform 3
a = K(0x6a09e667ul);
b = K(0xbb67ae85ul);
c = K(0x3c6ef372ul);
d = K(0xa54ff53aul);
e = K(0x510e527ful);
f = K(0x9b05688cul);
g = K(0x1f83d9abul);
h = K(0x5be0cd19ul);
Round(a, b, c, d, e, f, g, h, Add(K(0x428a2f98ul), w0));
Round(h, a, b, c, d, e, f, g, Add(K(0x71374491ul), w1));
Round(g, h, a, b, c, d, e, f, Add(K(0xb5c0fbcful), w2));
Round(f, g, h, a, b, c, d, e, Add(K(0xe9b5dba5ul), w3));
Round(e, f, g, h, a, b, c, d, Add(K(0x3956c25bul), w4));
Round(d, e, f, g, h, a, b, c, Add(K(0x59f111f1ul), w5));
Round(c, d, e, f, g, h, a, b, Add(K(0x923f82a4ul), w6));
Round(b, c, d, e, f, g, h, a, Add(K(0xab1c5ed5ul), w7));
Round(a, b, c, d, e, f, g, h, K(0x5807aa98ul));
Round(h, a, b, c, d, e, f, g, K(0x12835b01ul));
Round(g, h, a, b, c, d, e, f, K(0x243185beul));
Round(f, g, h, a, b, c, d, e, K(0x550c7dc3ul));
Round(e, f, g, h, a, b, c, d, K(0x72be5d74ul));
Round(d, e, f, g, h, a, b, c, K(0x80deb1feul));
Round(c, d, e, f, g, h, a, b, K(0x9bdc06a7ul));
Round(b, c, d, e, f, g, h, a, K(0xc19bf274ul));
Round(a, b, c, d, e, f, g, h, Add(K(0xe49b69c1ul), Inc(w0, sigma0(w1))));
Round(h, a, b, c, d, e, f, g, Add(K(0xefbe4786ul), Inc(w1, K(0xa00000ul), sigma0(w2))));
Round(g, h, a, b, c, d, e, f, Add(K(0x0fc19dc6ul), Inc(w2, sigma1(w0), sigma0(w3))));
Round(f, g, h, a, b, c, d, e, Add(K(0x240ca1ccul), Inc(w3, sigma1(w1), sigma0(w4))));
Round(e, f, g, h, a, b, c, d, Add(K(0x2de92c6ful), Inc(w4, sigma1(w2), sigma0(w5))));
Round(d, e, f, g, h, a, b, c, Add(K(0x4a7484aaul), Inc(w5, sigma1(w3), sigma0(w6))));
Round(c, d, e, f, g, h, a, b, Add(K(0x5cb0a9dcul), Inc(w6, sigma1(w4), K(0x100ul), sigma0(w7))));
Round(b, c, d, e, f, g, h, a, Add(K(0x76f988daul), Inc(w7, sigma1(w5), w0, K(0x11002000ul))));
Round(a, b, c, d, e, f, g, h, Add(K(0x983e5152ul), w8 = Add(K(0x80000000ul), sigma1(w6), w1)));
Round(h, a, b, c, d, e, f, g, Add(K(0xa831c66dul), w9 = Add(sigma1(w7), w2)));
Round(g, h, a, b, c, d, e, f, Add(K(0xb00327c8ul), w10 = Add(sigma1(w8), w3)));
Round(f, g, h, a, b, c, d, e, Add(K(0xbf597fc7ul), w11 = Add(sigma1(w9), w4)));
Round(e, f, g, h, a, b, c, d, Add(K(0xc6e00bf3ul), w12 = Add(sigma1(w10), w5)));
Round(d, e, f, g, h, a, b, c, Add(K(0xd5a79147ul), w13 = Add(sigma1(w11), w6)));
Round(c, d, e, f, g, h, a, b, Add(K(0x06ca6351ul), w14 = Add(sigma1(w12), w7, K(0x400022ul))));
Round(b, c, d, e, f, g, h, a, Add(K(0x14292967ul), w15 = Add(K(0x100ul), sigma1(w13), w8, sigma0(w0))));
Round(a, b, c, d, e, f, g, h, Add(K(0x27b70a85ul), Inc(w0, sigma1(w14), w9, sigma0(w1))));
Round(h, a, b, c, d, e, f, g, Add(K(0x2e1b2138ul), Inc(w1, sigma1(w15), w10, sigma0(w2))));
Round(g, h, a, b, c, d, e, f, Add(K(0x4d2c6dfcul), Inc(w2, sigma1(w0), w11, sigma0(w3))));
Round(f, g, h, a, b, c, d, e, Add(K(0x53380d13ul), Inc(w3, sigma1(w1), w12, sigma0(w4))));
Round(e, f, g, h, a, b, c, d, Add(K(0x650a7354ul), Inc(w4, sigma1(w2), w13, sigma0(w5))));
Round(d, e, f, g, h, a, b, c, Add(K(0x766a0abbul), Inc(w5, sigma1(w3), w14, sigma0(w6))));
Round(c, d, e, f, g, h, a, b, Add(K(0x81c2c92eul), Inc(w6, sigma1(w4), w15, sigma0(w7))));
Round(b, c, d, e, f, g, h, a, Add(K(0x92722c85ul), Inc(w7, sigma1(w5), w0, sigma0(w8))));
Round(a, b, c, d, e, f, g, h, Add(K(0xa2bfe8a1ul), Inc(w8, sigma1(w6), w1, sigma0(w9))));
Round(h, a, b, c, d, e, f, g, Add(K(0xa81a664bul), Inc(w9, sigma1(w7), w2, sigma0(w10))));
Round(g, h, a, b, c, d, e, f, Add(K(0xc24b8b70ul), Inc(w10, sigma1(w8), w3, sigma0(w11))));
Round(f, g, h, a, b, c, d, e, Add(K(0xc76c51a3ul), Inc(w11, sigma1(w9), w4, sigma0(w12))));
Round(e, f, g, h, a, b, c, d, Add(K(0xd192e819ul), Inc(w12, sigma1(w10), w5, sigma0(w13))));
Round(d, e, f, g, h, a, b, c, Add(K(0xd6990624ul), Inc(w13, sigma1(w11), w6, sigma0(w14))));
Round(c, d, e, f, g, h, a, b, Add(K(0xf40e3585ul), Inc(w14, sigma1(w12), w7, sigma0(w15))));
Round(b, c, d, e, f, g, h, a, Add(K(0x106aa070ul), Inc(w15, sigma1(w13), w8, sigma0(w0))));
Round(a, b, c, d, e, f, g, h, Add(K(0x19a4c116ul), Inc(w0, sigma1(w14), w9, sigma0(w1))));
Round(h, a, b, c, d, e, f, g, Add(K(0x1e376c08ul), Inc(w1, sigma1(w15), w10, sigma0(w2))));
Round(g, h, a, b, c, d, e, f, Add(K(0x2748774cul), Inc(w2, sigma1(w0), w11, sigma0(w3))));
Round(f, g, h, a, b, c, d, e, Add(K(0x34b0bcb5ul), Inc(w3, sigma1(w1), w12, sigma0(w4))));
Round(e, f, g, h, a, b, c, d, Add(K(0x391c0cb3ul), Inc(w4, sigma1(w2), w13, sigma0(w5))));
Round(d, e, f, g, h, a, b, c, Add(K(0x4ed8aa4aul), Inc(w5, sigma1(w3), w14, sigma0(w6))));
Round(c, d, e, f, g, h, a, b, Add(K(0x5b9cca4ful), Inc(w6, sigma1(w4), w15, sigma0(w7))));
Round(b, c, d, e, f, g, h, a, Add(K(0x682e6ff3ul), Inc(w7, sigma1(w5), w0, sigma0(w8))));
Round(a, b, c, d, e, f, g, h, Add(K(0x748f82eeul), Inc(w8, sigma1(w6), w1, sigma0(w9))));
Round(h, a, b, c, d, e, f, g, Add(K(0x78a5636ful), Inc(w9, sigma1(w7), w2, sigma0(w10))));
Round(g, h, a, b, c, d, e, f, Add(K(0x84c87814ul), Inc(w10, sigma1(w8), w3, sigma0(w11))));
Round(f, g, h, a, b, c, d, e, Add(K(0x8cc70208ul), Inc(w11, sigma1(w9), w4, sigma0(w12))));
Round(e, f, g, h, a, b, c, d, Add(K(0x90befffaul), Inc(w12, sigma1(w10), w5, sigma0(w13))));
Round(d, e, f, g, h, a, b, c, Add(K(0xa4506cebul), Inc(w13, sigma1(w11), w6, sigma0(w14))));
Round(c, d, e, f, g, h, a, b, Add(K(0xbef9a3f7ul), w14, sigma1(w12), w7, sigma0(w15)));
Round(b, c, d, e, f, g, h, a, Add(K(0xc67178f2ul), w15, sigma1(w13), w8, sigma0(w0)));
// Output
Write8(out, 0, Add(a, K(0x6a09e667ul)));
Write8(out, 4, Add(b, K(0xbb67ae85ul)));
Write8(out, 8, Add(c, K(0x3c6ef372ul)));
Write8(out, 12, Add(d, K(0xa54ff53aul)));
Write8(out, 16, Add(e, K(0x510e527ful)));
Write8(out, 20, Add(f, K(0x9b05688cul)));
Write8(out, 24, Add(g, K(0x1f83d9abul)));
Write8(out, 28, Add(h, K(0x5be0cd19ul)));
}
}
#endif

321
src/crypto/sha256_sse41.cpp Normal file
View file

@ -0,0 +1,321 @@
#ifdef ENABLE_SSE41
#include <stdint.h>
#if defined(_MSC_VER)
#include <immintrin.h>
#elif defined(__GNUC__)
#include <x86intrin.h>
#endif
#include "crypto/sha256.h"
#include "crypto/common.h"
namespace sha256d64_sse41 {
namespace {
__m128i inline K(uint32_t x) { return _mm_set1_epi32(x); }
__m128i inline Add(__m128i x, __m128i y) { return _mm_add_epi32(x, y); }
__m128i inline Add(__m128i x, __m128i y, __m128i z) { return Add(Add(x, y), z); }
__m128i inline Add(__m128i x, __m128i y, __m128i z, __m128i w) { return Add(Add(x, y), Add(z, w)); }
__m128i inline Add(__m128i x, __m128i y, __m128i z, __m128i w, __m128i v) { return Add(Add(x, y, z), Add(w, v)); }
__m128i inline Inc(__m128i& x, __m128i y) { x = Add(x, y); return x; }
__m128i inline Inc(__m128i& x, __m128i y, __m128i z) { x = Add(x, y, z); return x; }
__m128i inline Inc(__m128i& x, __m128i y, __m128i z, __m128i w) { x = Add(x, y, z, w); return x; }
__m128i inline Xor(__m128i x, __m128i y) { return _mm_xor_si128(x, y); }
__m128i inline Xor(__m128i x, __m128i y, __m128i z) { return Xor(Xor(x, y), z); }
__m128i inline Or(__m128i x, __m128i y) { return _mm_or_si128(x, y); }
__m128i inline And(__m128i x, __m128i y) { return _mm_and_si128(x, y); }
__m128i inline ShR(__m128i x, int n) { return _mm_srli_epi32(x, n); }
__m128i inline ShL(__m128i x, int n) { return _mm_slli_epi32(x, n); }
__m128i inline Ch(__m128i x, __m128i y, __m128i z) { return Xor(z, And(x, Xor(y, z))); }
__m128i inline Maj(__m128i x, __m128i y, __m128i z) { return Or(And(x, y), And(z, Or(x, y))); }
__m128i inline Sigma0(__m128i x) { return Xor(Or(ShR(x, 2), ShL(x, 30)), Or(ShR(x, 13), ShL(x, 19)), Or(ShR(x, 22), ShL(x, 10))); }
__m128i inline Sigma1(__m128i x) { return Xor(Or(ShR(x, 6), ShL(x, 26)), Or(ShR(x, 11), ShL(x, 21)), Or(ShR(x, 25), ShL(x, 7))); }
__m128i inline sigma0(__m128i x) { return Xor(Or(ShR(x, 7), ShL(x, 25)), Or(ShR(x, 18), ShL(x, 14)), ShR(x, 3)); }
__m128i inline sigma1(__m128i x) { return Xor(Or(ShR(x, 17), ShL(x, 15)), Or(ShR(x, 19), ShL(x, 13)), ShR(x, 10)); }
/** One round of SHA-256. */
void inline __attribute__((always_inline)) Round(__m128i a, __m128i b, __m128i c, __m128i& d, __m128i e, __m128i f, __m128i g, __m128i& h, __m128i k)
{
__m128i t1 = Add(h, Sigma1(e), Ch(e, f, g), k);
__m128i t2 = Add(Sigma0(a), Maj(a, b, c));
d = Add(d, t1);
h = Add(t1, t2);
}
__m128i inline Read4(const unsigned char* chunk, int offset) {
__m128i ret = _mm_set_epi32(
ReadLE32(chunk + 0 + offset),
ReadLE32(chunk + 64 + offset),
ReadLE32(chunk + 128 + offset),
ReadLE32(chunk + 192 + offset)
);
return _mm_shuffle_epi8(ret, _mm_set_epi32(0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL));
}
void inline Write4(unsigned char* out, int offset, __m128i v) {
v = _mm_shuffle_epi8(v, _mm_set_epi32(0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL));
WriteLE32(out + 0 + offset, _mm_extract_epi32(v, 3));
WriteLE32(out + 32 + offset, _mm_extract_epi32(v, 2));
WriteLE32(out + 64 + offset, _mm_extract_epi32(v, 1));
WriteLE32(out + 96 + offset, _mm_extract_epi32(v, 0));
}
}
void Transform_4way(unsigned char* out, const unsigned char* in)
{
// Transform 1
__m128i a = K(0x6a09e667ul);
__m128i b = K(0xbb67ae85ul);
__m128i c = K(0x3c6ef372ul);
__m128i d = K(0xa54ff53aul);
__m128i e = K(0x510e527ful);
__m128i f = K(0x9b05688cul);
__m128i g = K(0x1f83d9abul);
__m128i h = K(0x5be0cd19ul);
__m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
Round(a, b, c, d, e, f, g, h, Add(K(0x428a2f98ul), w0 = Read4(in, 0)));
Round(h, a, b, c, d, e, f, g, Add(K(0x71374491ul), w1 = Read4(in, 4)));
Round(g, h, a, b, c, d, e, f, Add(K(0xb5c0fbcful), w2 = Read4(in, 8)));
Round(f, g, h, a, b, c, d, e, Add(K(0xe9b5dba5ul), w3 = Read4(in, 12)));
Round(e, f, g, h, a, b, c, d, Add(K(0x3956c25bul), w4 = Read4(in, 16)));
Round(d, e, f, g, h, a, b, c, Add(K(0x59f111f1ul), w5 = Read4(in, 20)));
Round(c, d, e, f, g, h, a, b, Add(K(0x923f82a4ul), w6 = Read4(in, 24)));
Round(b, c, d, e, f, g, h, a, Add(K(0xab1c5ed5ul), w7 = Read4(in, 28)));
Round(a, b, c, d, e, f, g, h, Add(K(0xd807aa98ul), w8 = Read4(in, 32)));
Round(h, a, b, c, d, e, f, g, Add(K(0x12835b01ul), w9 = Read4(in, 36)));
Round(g, h, a, b, c, d, e, f, Add(K(0x243185beul), w10 = Read4(in, 40)));
Round(f, g, h, a, b, c, d, e, Add(K(0x550c7dc3ul), w11 = Read4(in, 44)));
Round(e, f, g, h, a, b, c, d, Add(K(0x72be5d74ul), w12 = Read4(in, 48)));
Round(d, e, f, g, h, a, b, c, Add(K(0x80deb1feul), w13 = Read4(in, 52)));
Round(c, d, e, f, g, h, a, b, Add(K(0x9bdc06a7ul), w14 = Read4(in, 56)));
Round(b, c, d, e, f, g, h, a, Add(K(0xc19bf174ul), w15 = Read4(in, 60)));
Round(a, b, c, d, e, f, g, h, Add(K(0xe49b69c1ul), Inc(w0, sigma1(w14), w9, sigma0(w1))));
Round(h, a, b, c, d, e, f, g, Add(K(0xefbe4786ul), Inc(w1, sigma1(w15), w10, sigma0(w2))));
Round(g, h, a, b, c, d, e, f, Add(K(0x0fc19dc6ul), Inc(w2, sigma1(w0), w11, sigma0(w3))));
Round(f, g, h, a, b, c, d, e, Add(K(0x240ca1ccul), Inc(w3, sigma1(w1), w12, sigma0(w4))));
Round(e, f, g, h, a, b, c, d, Add(K(0x2de92c6ful), Inc(w4, sigma1(w2), w13, sigma0(w5))));
Round(d, e, f, g, h, a, b, c, Add(K(0x4a7484aaul), Inc(w5, sigma1(w3), w14, sigma0(w6))));
Round(c, d, e, f, g, h, a, b, Add(K(0x5cb0a9dcul), Inc(w6, sigma1(w4), w15, sigma0(w7))));
Round(b, c, d, e, f, g, h, a, Add(K(0x76f988daul), Inc(w7, sigma1(w5), w0, sigma0(w8))));
Round(a, b, c, d, e, f, g, h, Add(K(0x983e5152ul), Inc(w8, sigma1(w6), w1, sigma0(w9))));
Round(h, a, b, c, d, e, f, g, Add(K(0xa831c66dul), Inc(w9, sigma1(w7), w2, sigma0(w10))));
Round(g, h, a, b, c, d, e, f, Add(K(0xb00327c8ul), Inc(w10, sigma1(w8), w3, sigma0(w11))));
Round(f, g, h, a, b, c, d, e, Add(K(0xbf597fc7ul), Inc(w11, sigma1(w9), w4, sigma0(w12))));
Round(e, f, g, h, a, b, c, d, Add(K(0xc6e00bf3ul), Inc(w12, sigma1(w10), w5, sigma0(w13))));
Round(d, e, f, g, h, a, b, c, Add(K(0xd5a79147ul), Inc(w13, sigma1(w11), w6, sigma0(w14))));
Round(c, d, e, f, g, h, a, b, Add(K(0x06ca6351ul), Inc(w14, sigma1(w12), w7, sigma0(w15))));
Round(b, c, d, e, f, g, h, a, Add(K(0x14292967ul), Inc(w15, sigma1(w13), w8, sigma0(w0))));
Round(a, b, c, d, e, f, g, h, Add(K(0x27b70a85ul), Inc(w0, sigma1(w14), w9, sigma0(w1))));
Round(h, a, b, c, d, e, f, g, Add(K(0x2e1b2138ul), Inc(w1, sigma1(w15), w10, sigma0(w2))));
Round(g, h, a, b, c, d, e, f, Add(K(0x4d2c6dfcul), Inc(w2, sigma1(w0), w11, sigma0(w3))));
Round(f, g, h, a, b, c, d, e, Add(K(0x53380d13ul), Inc(w3, sigma1(w1), w12, sigma0(w4))));
Round(e, f, g, h, a, b, c, d, Add(K(0x650a7354ul), Inc(w4, sigma1(w2), w13, sigma0(w5))));
Round(d, e, f, g, h, a, b, c, Add(K(0x766a0abbul), Inc(w5, sigma1(w3), w14, sigma0(w6))));
Round(c, d, e, f, g, h, a, b, Add(K(0x81c2c92eul), Inc(w6, sigma1(w4), w15, sigma0(w7))));
Round(b, c, d, e, f, g, h, a, Add(K(0x92722c85ul), Inc(w7, sigma1(w5), w0, sigma0(w8))));
Round(a, b, c, d, e, f, g, h, Add(K(0xa2bfe8a1ul), Inc(w8, sigma1(w6), w1, sigma0(w9))));
Round(h, a, b, c, d, e, f, g, Add(K(0xa81a664bul), Inc(w9, sigma1(w7), w2, sigma0(w10))));
Round(g, h, a, b, c, d, e, f, Add(K(0xc24b8b70ul), Inc(w10, sigma1(w8), w3, sigma0(w11))));
Round(f, g, h, a, b, c, d, e, Add(K(0xc76c51a3ul), Inc(w11, sigma1(w9), w4, sigma0(w12))));
Round(e, f, g, h, a, b, c, d, Add(K(0xd192e819ul), Inc(w12, sigma1(w10), w5, sigma0(w13))));
Round(d, e, f, g, h, a, b, c, Add(K(0xd6990624ul), Inc(w13, sigma1(w11), w6, sigma0(w14))));
Round(c, d, e, f, g, h, a, b, Add(K(0xf40e3585ul), Inc(w14, sigma1(w12), w7, sigma0(w15))));
Round(b, c, d, e, f, g, h, a, Add(K(0x106aa070ul), Inc(w15, sigma1(w13), w8, sigma0(w0))));
Round(a, b, c, d, e, f, g, h, Add(K(0x19a4c116ul), Inc(w0, sigma1(w14), w9, sigma0(w1))));
Round(h, a, b, c, d, e, f, g, Add(K(0x1e376c08ul), Inc(w1, sigma1(w15), w10, sigma0(w2))));
Round(g, h, a, b, c, d, e, f, Add(K(0x2748774cul), Inc(w2, sigma1(w0), w11, sigma0(w3))));
Round(f, g, h, a, b, c, d, e, Add(K(0x34b0bcb5ul), Inc(w3, sigma1(w1), w12, sigma0(w4))));
Round(e, f, g, h, a, b, c, d, Add(K(0x391c0cb3ul), Inc(w4, sigma1(w2), w13, sigma0(w5))));
Round(d, e, f, g, h, a, b, c, Add(K(0x4ed8aa4aul), Inc(w5, sigma1(w3), w14, sigma0(w6))));
Round(c, d, e, f, g, h, a, b, Add(K(0x5b9cca4ful), Inc(w6, sigma1(w4), w15, sigma0(w7))));
Round(b, c, d, e, f, g, h, a, Add(K(0x682e6ff3ul), Inc(w7, sigma1(w5), w0, sigma0(w8))));
Round(a, b, c, d, e, f, g, h, Add(K(0x748f82eeul), Inc(w8, sigma1(w6), w1, sigma0(w9))));
Round(h, a, b, c, d, e, f, g, Add(K(0x78a5636ful), Inc(w9, sigma1(w7), w2, sigma0(w10))));
Round(g, h, a, b, c, d, e, f, Add(K(0x84c87814ul), Inc(w10, sigma1(w8), w3, sigma0(w11))));
Round(f, g, h, a, b, c, d, e, Add(K(0x8cc70208ul), Inc(w11, sigma1(w9), w4, sigma0(w12))));
Round(e, f, g, h, a, b, c, d, Add(K(0x90befffaul), Inc(w12, sigma1(w10), w5, sigma0(w13))));
Round(d, e, f, g, h, a, b, c, Add(K(0xa4506cebul), Inc(w13, sigma1(w11), w6, sigma0(w14))));
Round(c, d, e, f, g, h, a, b, Add(K(0xbef9a3f7ul), Inc(w14, sigma1(w12), w7, sigma0(w15))));
Round(b, c, d, e, f, g, h, a, Add(K(0xc67178f2ul), Inc(w15, sigma1(w13), w8, sigma0(w0))));
a = Add(a, K(0x6a09e667ul));
b = Add(b, K(0xbb67ae85ul));
c = Add(c, K(0x3c6ef372ul));
d = Add(d, K(0xa54ff53aul));
e = Add(e, K(0x510e527ful));
f = Add(f, K(0x9b05688cul));
g = Add(g, K(0x1f83d9abul));
h = Add(h, K(0x5be0cd19ul));
__m128i t0 = a, t1 = b, t2 = c, t3 = d, t4 = e, t5 = f, t6 = g, t7 = h;
// Transform 2
Round(a, b, c, d, e, f, g, h, K(0xc28a2f98ul));
Round(h, a, b, c, d, e, f, g, K(0x71374491ul));
Round(g, h, a, b, c, d, e, f, K(0xb5c0fbcful));
Round(f, g, h, a, b, c, d, e, K(0xe9b5dba5ul));
Round(e, f, g, h, a, b, c, d, K(0x3956c25bul));
Round(d, e, f, g, h, a, b, c, K(0x59f111f1ul));
Round(c, d, e, f, g, h, a, b, K(0x923f82a4ul));
Round(b, c, d, e, f, g, h, a, K(0xab1c5ed5ul));
Round(a, b, c, d, e, f, g, h, K(0xd807aa98ul));
Round(h, a, b, c, d, e, f, g, K(0x12835b01ul));
Round(g, h, a, b, c, d, e, f, K(0x243185beul));
Round(f, g, h, a, b, c, d, e, K(0x550c7dc3ul));
Round(e, f, g, h, a, b, c, d, K(0x72be5d74ul));
Round(d, e, f, g, h, a, b, c, K(0x80deb1feul));
Round(c, d, e, f, g, h, a, b, K(0x9bdc06a7ul));
Round(b, c, d, e, f, g, h, a, K(0xc19bf374ul));
Round(a, b, c, d, e, f, g, h, K(0x649b69c1ul));
Round(h, a, b, c, d, e, f, g, K(0xf0fe4786ul));
Round(g, h, a, b, c, d, e, f, K(0x0fe1edc6ul));
Round(f, g, h, a, b, c, d, e, K(0x240cf254ul));
Round(e, f, g, h, a, b, c, d, K(0x4fe9346ful));
Round(d, e, f, g, h, a, b, c, K(0x6cc984beul));
Round(c, d, e, f, g, h, a, b, K(0x61b9411eul));
Round(b, c, d, e, f, g, h, a, K(0x16f988faul));
Round(a, b, c, d, e, f, g, h, K(0xf2c65152ul));
Round(h, a, b, c, d, e, f, g, K(0xa88e5a6dul));
Round(g, h, a, b, c, d, e, f, K(0xb019fc65ul));
Round(f, g, h, a, b, c, d, e, K(0xb9d99ec7ul));
Round(e, f, g, h, a, b, c, d, K(0x9a1231c3ul));
Round(d, e, f, g, h, a, b, c, K(0xe70eeaa0ul));
Round(c, d, e, f, g, h, a, b, K(0xfdb1232bul));
Round(b, c, d, e, f, g, h, a, K(0xc7353eb0ul));
Round(a, b, c, d, e, f, g, h, K(0x3069bad5ul));
Round(h, a, b, c, d, e, f, g, K(0xcb976d5ful));
Round(g, h, a, b, c, d, e, f, K(0x5a0f118ful));
Round(f, g, h, a, b, c, d, e, K(0xdc1eeefdul));
Round(e, f, g, h, a, b, c, d, K(0x0a35b689ul));
Round(d, e, f, g, h, a, b, c, K(0xde0b7a04ul));
Round(c, d, e, f, g, h, a, b, K(0x58f4ca9dul));
Round(b, c, d, e, f, g, h, a, K(0xe15d5b16ul));
Round(a, b, c, d, e, f, g, h, K(0x007f3e86ul));
Round(h, a, b, c, d, e, f, g, K(0x37088980ul));
Round(g, h, a, b, c, d, e, f, K(0xa507ea32ul));
Round(f, g, h, a, b, c, d, e, K(0x6fab9537ul));
Round(e, f, g, h, a, b, c, d, K(0x17406110ul));
Round(d, e, f, g, h, a, b, c, K(0x0d8cd6f1ul));
Round(c, d, e, f, g, h, a, b, K(0xcdaa3b6dul));
Round(b, c, d, e, f, g, h, a, K(0xc0bbbe37ul));
Round(a, b, c, d, e, f, g, h, K(0x83613bdaul));
Round(h, a, b, c, d, e, f, g, K(0xdb48a363ul));
Round(g, h, a, b, c, d, e, f, K(0x0b02e931ul));
Round(f, g, h, a, b, c, d, e, K(0x6fd15ca7ul));
Round(e, f, g, h, a, b, c, d, K(0x521afacaul));
Round(d, e, f, g, h, a, b, c, K(0x31338431ul));
Round(c, d, e, f, g, h, a, b, K(0x6ed41a95ul));
Round(b, c, d, e, f, g, h, a, K(0x6d437890ul));
Round(a, b, c, d, e, f, g, h, K(0xc39c91f2ul));
Round(h, a, b, c, d, e, f, g, K(0x9eccabbdul));
Round(g, h, a, b, c, d, e, f, K(0xb5c9a0e6ul));
Round(f, g, h, a, b, c, d, e, K(0x532fb63cul));
Round(e, f, g, h, a, b, c, d, K(0xd2c741c6ul));
Round(d, e, f, g, h, a, b, c, K(0x07237ea3ul));
Round(c, d, e, f, g, h, a, b, K(0xa4954b68ul));
Round(b, c, d, e, f, g, h, a, K(0x4c191d76ul));
w0 = Add(t0, a);
w1 = Add(t1, b);
w2 = Add(t2, c);
w3 = Add(t3, d);
w4 = Add(t4, e);
w5 = Add(t5, f);
w6 = Add(t6, g);
w7 = Add(t7, h);
// Transform 3
a = K(0x6a09e667ul);
b = K(0xbb67ae85ul);
c = K(0x3c6ef372ul);
d = K(0xa54ff53aul);
e = K(0x510e527ful);
f = K(0x9b05688cul);
g = K(0x1f83d9abul);
h = K(0x5be0cd19ul);
Round(a, b, c, d, e, f, g, h, Add(K(0x428a2f98ul), w0));
Round(h, a, b, c, d, e, f, g, Add(K(0x71374491ul), w1));
Round(g, h, a, b, c, d, e, f, Add(K(0xb5c0fbcful), w2));
Round(f, g, h, a, b, c, d, e, Add(K(0xe9b5dba5ul), w3));
Round(e, f, g, h, a, b, c, d, Add(K(0x3956c25bul), w4));
Round(d, e, f, g, h, a, b, c, Add(K(0x59f111f1ul), w5));
Round(c, d, e, f, g, h, a, b, Add(K(0x923f82a4ul), w6));
Round(b, c, d, e, f, g, h, a, Add(K(0xab1c5ed5ul), w7));
Round(a, b, c, d, e, f, g, h, K(0x5807aa98ul));
Round(h, a, b, c, d, e, f, g, K(0x12835b01ul));
Round(g, h, a, b, c, d, e, f, K(0x243185beul));
Round(f, g, h, a, b, c, d, e, K(0x550c7dc3ul));
Round(e, f, g, h, a, b, c, d, K(0x72be5d74ul));
Round(d, e, f, g, h, a, b, c, K(0x80deb1feul));
Round(c, d, e, f, g, h, a, b, K(0x9bdc06a7ul));
Round(b, c, d, e, f, g, h, a, K(0xc19bf274ul));
Round(a, b, c, d, e, f, g, h, Add(K(0xe49b69c1ul), Inc(w0, sigma0(w1))));
Round(h, a, b, c, d, e, f, g, Add(K(0xefbe4786ul), Inc(w1, K(0xa00000ul), sigma0(w2))));
Round(g, h, a, b, c, d, e, f, Add(K(0x0fc19dc6ul), Inc(w2, sigma1(w0), sigma0(w3))));
Round(f, g, h, a, b, c, d, e, Add(K(0x240ca1ccul), Inc(w3, sigma1(w1), sigma0(w4))));
Round(e, f, g, h, a, b, c, d, Add(K(0x2de92c6ful), Inc(w4, sigma1(w2), sigma0(w5))));
Round(d, e, f, g, h, a, b, c, Add(K(0x4a7484aaul), Inc(w5, sigma1(w3), sigma0(w6))));
Round(c, d, e, f, g, h, a, b, Add(K(0x5cb0a9dcul), Inc(w6, sigma1(w4), K(0x100ul), sigma0(w7))));
Round(b, c, d, e, f, g, h, a, Add(K(0x76f988daul), Inc(w7, sigma1(w5), w0, K(0x11002000ul))));
Round(a, b, c, d, e, f, g, h, Add(K(0x983e5152ul), w8 = Add(K(0x80000000ul), sigma1(w6), w1)));
Round(h, a, b, c, d, e, f, g, Add(K(0xa831c66dul), w9 = Add(sigma1(w7), w2)));
Round(g, h, a, b, c, d, e, f, Add(K(0xb00327c8ul), w10 = Add(sigma1(w8), w3)));
Round(f, g, h, a, b, c, d, e, Add(K(0xbf597fc7ul), w11 = Add(sigma1(w9), w4)));
Round(e, f, g, h, a, b, c, d, Add(K(0xc6e00bf3ul), w12 = Add(sigma1(w10), w5)));
Round(d, e, f, g, h, a, b, c, Add(K(0xd5a79147ul), w13 = Add(sigma1(w11), w6)));
Round(c, d, e, f, g, h, a, b, Add(K(0x06ca6351ul), w14 = Add(sigma1(w12), w7, K(0x400022ul))));
Round(b, c, d, e, f, g, h, a, Add(K(0x14292967ul), w15 = Add(K(0x100ul), sigma1(w13), w8, sigma0(w0))));
Round(a, b, c, d, e, f, g, h, Add(K(0x27b70a85ul), Inc(w0, sigma1(w14), w9, sigma0(w1))));
Round(h, a, b, c, d, e, f, g, Add(K(0x2e1b2138ul), Inc(w1, sigma1(w15), w10, sigma0(w2))));
Round(g, h, a, b, c, d, e, f, Add(K(0x4d2c6dfcul), Inc(w2, sigma1(w0), w11, sigma0(w3))));
Round(f, g, h, a, b, c, d, e, Add(K(0x53380d13ul), Inc(w3, sigma1(w1), w12, sigma0(w4))));
Round(e, f, g, h, a, b, c, d, Add(K(0x650a7354ul), Inc(w4, sigma1(w2), w13, sigma0(w5))));
Round(d, e, f, g, h, a, b, c, Add(K(0x766a0abbul), Inc(w5, sigma1(w3), w14, sigma0(w6))));
Round(c, d, e, f, g, h, a, b, Add(K(0x81c2c92eul), Inc(w6, sigma1(w4), w15, sigma0(w7))));
Round(b, c, d, e, f, g, h, a, Add(K(0x92722c85ul), Inc(w7, sigma1(w5), w0, sigma0(w8))));
Round(a, b, c, d, e, f, g, h, Add(K(0xa2bfe8a1ul), Inc(w8, sigma1(w6), w1, sigma0(w9))));
Round(h, a, b, c, d, e, f, g, Add(K(0xa81a664bul), Inc(w9, sigma1(w7), w2, sigma0(w10))));
Round(g, h, a, b, c, d, e, f, Add(K(0xc24b8b70ul), Inc(w10, sigma1(w8), w3, sigma0(w11))));
Round(f, g, h, a, b, c, d, e, Add(K(0xc76c51a3ul), Inc(w11, sigma1(w9), w4, sigma0(w12))));
Round(e, f, g, h, a, b, c, d, Add(K(0xd192e819ul), Inc(w12, sigma1(w10), w5, sigma0(w13))));
Round(d, e, f, g, h, a, b, c, Add(K(0xd6990624ul), Inc(w13, sigma1(w11), w6, sigma0(w14))));
Round(c, d, e, f, g, h, a, b, Add(K(0xf40e3585ul), Inc(w14, sigma1(w12), w7, sigma0(w15))));
Round(b, c, d, e, f, g, h, a, Add(K(0x106aa070ul), Inc(w15, sigma1(w13), w8, sigma0(w0))));
Round(a, b, c, d, e, f, g, h, Add(K(0x19a4c116ul), Inc(w0, sigma1(w14), w9, sigma0(w1))));
Round(h, a, b, c, d, e, f, g, Add(K(0x1e376c08ul), Inc(w1, sigma1(w15), w10, sigma0(w2))));
Round(g, h, a, b, c, d, e, f, Add(K(0x2748774cul), Inc(w2, sigma1(w0), w11, sigma0(w3))));
Round(f, g, h, a, b, c, d, e, Add(K(0x34b0bcb5ul), Inc(w3, sigma1(w1), w12, sigma0(w4))));
Round(e, f, g, h, a, b, c, d, Add(K(0x391c0cb3ul), Inc(w4, sigma1(w2), w13, sigma0(w5))));
Round(d, e, f, g, h, a, b, c, Add(K(0x4ed8aa4aul), Inc(w5, sigma1(w3), w14, sigma0(w6))));
Round(c, d, e, f, g, h, a, b, Add(K(0x5b9cca4ful), Inc(w6, sigma1(w4), w15, sigma0(w7))));
Round(b, c, d, e, f, g, h, a, Add(K(0x682e6ff3ul), Inc(w7, sigma1(w5), w0, sigma0(w8))));
Round(a, b, c, d, e, f, g, h, Add(K(0x748f82eeul), Inc(w8, sigma1(w6), w1, sigma0(w9))));
Round(h, a, b, c, d, e, f, g, Add(K(0x78a5636ful), Inc(w9, sigma1(w7), w2, sigma0(w10))));
Round(g, h, a, b, c, d, e, f, Add(K(0x84c87814ul), Inc(w10, sigma1(w8), w3, sigma0(w11))));
Round(f, g, h, a, b, c, d, e, Add(K(0x8cc70208ul), Inc(w11, sigma1(w9), w4, sigma0(w12))));
Round(e, f, g, h, a, b, c, d, Add(K(0x90befffaul), Inc(w12, sigma1(w10), w5, sigma0(w13))));
Round(d, e, f, g, h, a, b, c, Add(K(0xa4506cebul), Inc(w13, sigma1(w11), w6, sigma0(w14))));
Round(c, d, e, f, g, h, a, b, Add(K(0xbef9a3f7ul), w14, sigma1(w12), w7, sigma0(w15)));
Round(b, c, d, e, f, g, h, a, Add(K(0xc67178f2ul), w15, sigma1(w13), w8, sigma0(w0)));
// Output
Write4(out, 0, Add(a, K(0x6a09e667ul)));
Write4(out, 4, Add(b, K(0xbb67ae85ul)));
Write4(out, 8, Add(c, K(0x3c6ef372ul)));
Write4(out, 12, Add(d, K(0xa54ff53aul)));
Write4(out, 16, Add(e, K(0x510e527ful)));
Write4(out, 20, Add(f, K(0x9b05688cul)));
Write4(out, 24, Add(g, K(0x1f83d9abul)));
Write4(out, 28, Add(h, K(0x5be0cd19ul)));
}
}
#endif

View file

@ -546,4 +546,20 @@ BOOST_AUTO_TEST_CASE(countbits_tests)
}
}
BOOST_AUTO_TEST_CASE(sha256d64)
{
for (int i = 0; i <= 32; ++i) {
unsigned char in[64 * 32];
unsigned char out1[32 * 32], out2[32 * 32];
for (int j = 0; j < 64 * i; ++j) {
in[j] = InsecureRandBits(8);
}
for (int j = 0; j < i; ++j) {
CHash256().Write(in + 64 * j, 64).Finalize(out1 + 32 * j);
}
SHA256D64(out2, in, i);
BOOST_CHECK(memcmp(out1, out2, 32 * i) == 0);
}
}
BOOST_AUTO_TEST_SUITE_END()

View file

@ -9,6 +9,123 @@
BOOST_FIXTURE_TEST_SUITE(merkle_tests, TestingSetup)
static uint256 ComputeMerkleRootFromBranch(const uint256& leaf, const std::vector<uint256>& vMerkleBranch, uint32_t nIndex) {
uint256 hash = leaf;
for (std::vector<uint256>::const_iterator it = vMerkleBranch.begin(); it != vMerkleBranch.end(); ++it) {
if (nIndex & 1) {
hash = Hash(BEGIN(*it), END(*it), BEGIN(hash), END(hash));
} else {
hash = Hash(BEGIN(hash), END(hash), BEGIN(*it), END(*it));
}
nIndex >>= 1;
}
return hash;
}
/* This implements a constant-space merkle root/path calculator, limited to 2^32 leaves. */
static void MerkleComputation(const std::vector<uint256>& leaves, uint256* proot, bool* pmutated, uint32_t branchpos, std::vector<uint256>* pbranch) {
if (pbranch) pbranch->clear();
if (leaves.size() == 0) {
if (pmutated) *pmutated = false;
if (proot) *proot = uint256();
return;
}
bool mutated = false;
// count is the number of leaves processed so far.
uint32_t count = 0;
// inner is an array of eagerly computed subtree hashes, indexed by tree
// level (0 being the leaves).
// For example, when count is 25 (11001 in binary), inner[4] is the hash of
// the first 16 leaves, inner[3] of the next 8 leaves, and inner[0] equal to
// the last leaf. The other inner entries are undefined.
uint256 inner[32];
// Which position in inner is a hash that depends on the matching leaf.
int matchlevel = -1;
// First process all leaves into 'inner' values.
while (count < leaves.size()) {
uint256 h = leaves[count];
bool matchh = count == branchpos;
count++;
int level;
// For each of the lower bits in count that are 0, do 1 step. Each
// corresponds to an inner value that existed before processing the
// current leaf, and each needs a hash to combine it.
for (level = 0; !(count & (((uint32_t)1) << level)); level++) {
if (pbranch) {
if (matchh) {
pbranch->push_back(inner[level]);
} else if (matchlevel == level) {
pbranch->push_back(h);
matchh = true;
}
}
mutated |= (inner[level] == h);
CHash256().Write(inner[level].begin(), 32).Write(h.begin(), 32).Finalize(h.begin());
}
// Store the resulting hash at inner position level.
inner[level] = h;
if (matchh) {
matchlevel = level;
}
}
// Do a final 'sweep' over the rightmost branch of the tree to process
// odd levels, and reduce everything to a single top value.
// Level is the level (counted from the bottom) up to which we've sweeped.
int level = 0;
// As long as bit number level in count is zero, skip it. It means there
// is nothing left at this level.
while (!(count & (((uint32_t)1) << level))) {
level++;
}
uint256 h = inner[level];
bool matchh = matchlevel == level;
while (count != (((uint32_t)1) << level)) {
// If we reach this point, h is an inner value that is not the top.
// We combine it with itself (Bitcoin's special rule for odd levels in
// the tree) to produce a higher level one.
if (pbranch && matchh) {
pbranch->push_back(h);
}
CHash256().Write(h.begin(), 32).Write(h.begin(), 32).Finalize(h.begin());
// Increment count to the value it would have if two entries at this
// level had existed.
count += (((uint32_t)1) << level);
level++;
// And propagate the result upwards accordingly.
while (!(count & (((uint32_t)1) << level))) {
if (pbranch) {
if (matchh) {
pbranch->push_back(inner[level]);
} else if (matchlevel == level) {
pbranch->push_back(h);
matchh = true;
}
}
CHash256().Write(inner[level].begin(), 32).Write(h.begin(), 32).Finalize(h.begin());
level++;
}
}
// Return result.
if (pmutated) *pmutated = mutated;
if (proot) *proot = h;
}
static std::vector<uint256> ComputeMerkleBranch(const std::vector<uint256>& leaves, uint32_t position) {
std::vector<uint256> ret;
MerkleComputation(leaves, nullptr, nullptr, position, &ret);
return ret;
}
static std::vector<uint256> BlockMerkleBranch(const CBlock& block, uint32_t position)
{
std::vector<uint256> leaves;
leaves.resize(block.vtx.size());
for (size_t s = 0; s < block.vtx.size(); s++) {
leaves[s] = block.vtx[s]->GetHash();
}
return ComputeMerkleBranch(leaves, position);
}
// Older version of the merkle root computation code, for comparison.
static uint256 BlockBuildMerkleTree(const CBlock& block, bool* fMutated, std::vector<uint256>& vMerkleTree)
{