Use RdSeed when available, and reduce RdRand load

This introduces support for autodetecting and using the RdSeed instruction.

In addition:
* In SeedFast, only 64 bits of entropy are generated through RdRand (256 was relatively slow).
* In SeedStartup, 256 bits of entropy are generated, using RdSeed (preferably) or RdRand (otherwise).
This commit is contained in:
Pieter Wuille 2019-01-24 18:40:02 -08:00
parent 72bd4ab867
commit 1435fabc19
2 changed files with 137 additions and 40 deletions

View file

@ -78,25 +78,119 @@ static inline int64_t GetPerformanceCounter() noexcept
} }
#if defined(__x86_64__) || defined(__amd64__) || defined(__i386__) #if defined(__x86_64__) || defined(__amd64__) || defined(__i386__)
static bool rdrand_supported = false; static bool g_rdrand_supported = false;
static bool g_rdseed_supported = false;
static constexpr uint32_t CPUID_F1_ECX_RDRAND = 0x40000000; static constexpr uint32_t CPUID_F1_ECX_RDRAND = 0x40000000;
static constexpr uint32_t CPUID_F7_EBX_RDSEED = 0x00040000;
#ifdef bit_RDRND
static_assert(CPUID_F1_ECX_RDRAND == bit_RDRND, "Unexpected value for bit_RDRND");
#endif
#ifdef bit_RDSEED
static_assert(CPUID_F7_EBX_RDSEED == bit_RDSEED, "Unexpected value for bit_RDSEED");
#endif
/** Execute CPUID for (leaf, subleaf) and store the resulting EAX, EBX, ECX, EDX in a, b, c, d.
 *
 * If leaf lies beyond the highest leaf the CPU reports for its range (the basic range, or the
 * extended range for leaves >= 0x80000000), all four outputs are set to 0 instead. Without this
 * check, some CPUs answer an out-of-range query with the data of their highest basic leaf, which
 * callers would misread as feature bits of the leaf they asked for. This is the same range check
 * GCC's __get_cpuid/__get_cpuid_count perform; like them, it treats leaves outside those two
 * ranges as unsupported.
 */
static void inline GetCPUID(uint32_t leaf, uint32_t subleaf, uint32_t& a, uint32_t& b, uint32_t& c, uint32_t& d)
{
    // Leaf 0 (basic range) / 0x80000000 (extended range) returns the highest supported leaf
    // of its range in EAX.
    const uint32_t range_base = leaf & 0x80000000;
    const uint32_t zero_subleaf = 0;
    uint32_t max_leaf = 0, ignored_b = 0, ignored_c = 0, ignored_d = 0;
    // We can't use __get_cpuid as it doesn't support subleafs.
#ifdef __GNUC__
    __cpuid_count(range_base, zero_subleaf, max_leaf, ignored_b, ignored_c, ignored_d);
#else
    __asm__ ("cpuid" : "=a"(max_leaf), "=b"(ignored_b), "=c"(ignored_c), "=d"(ignored_d) : "0"(range_base), "2"(zero_subleaf));
#endif
    if (leaf > max_leaf) {
        // Unsupported leaf: report "no features" rather than unrelated register contents.
        a = 0;
        b = 0;
        c = 0;
        d = 0;
        return;
    }
#ifdef __GNUC__
    __cpuid_count(leaf, subleaf, a, b, c, d);
#else
    __asm__ ("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(leaf), "2"(subleaf));
#endif
}
static void InitHardwareRand() static void InitHardwareRand()
{ {
uint32_t eax, ebx, ecx, edx; uint32_t eax, ebx, ecx, edx;
if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) && (ecx & CPUID_F1_ECX_RDRAND)) { GetCPUID(1, 0, eax, ebx, ecx, edx);
rdrand_supported = true; if (ecx & CPUID_F1_ECX_RDRAND) {
g_rdrand_supported = true;
}
GetCPUID(7, 0, eax, ebx, ecx, edx);
if (ebx & CPUID_F7_EBX_RDSEED) {
g_rdseed_supported = true;
} }
} }
static void ReportHardwareRand() static void ReportHardwareRand()
{ {
if (rdrand_supported) { // This must be done in a separate function, as HWRandInit() may be indirectly called
// This must be done in a separate function, as HWRandInit() may be indirectly called // from global constructors, before logging is initialized.
// from global constructors, before logging is initialized. if (g_rdseed_supported) {
LogPrintf("Using RdSeed as additional entropy source\n");
}
if (g_rdrand_supported) {
LogPrintf("Using RdRand as an additional entropy source\n"); LogPrintf("Using RdRand as an additional entropy source\n");
} }
} }
/** Obtain 64 bits from the rdrand instruction.
 *
 * The caller is responsible for making sure RdRand is supported before calling this.
 */
static uint64_t GetRdRand() noexcept
{
    // RdRand can (very rarely) fail, so each read is attempted at most 10 times. Whatever the
    // last attempt left in the output register is returned, whether or not it succeeded.
#ifdef __i386__
    uint8_t success;
    uint32_t low, high;
    int attempts_left = 10;
    // Low 32 bits.
    do {
        __asm__ volatile (".byte 0x0f, 0xc7, 0xf0; setc %1" : "=a"(low), "=q"(success) :: "cc"); // rdrand %eax
    } while (!success && --attempts_left > 0);
    // High 32 bits, with a fresh attempt budget.
    attempts_left = 10;
    do {
        __asm__ volatile (".byte 0x0f, 0xc7, 0xf0; setc %1" : "=a"(high), "=q"(success) :: "cc"); // rdrand %eax
    } while (!success && --attempts_left > 0);
    return low | (((uint64_t)high) << 32);
#elif defined(__x86_64__) || defined(__amd64__)
    uint8_t success;
    uint64_t value;
    int attempts_left = 10;
    do {
        __asm__ volatile (".byte 0x48, 0x0f, 0xc7, 0xf0; setc %1" : "=a"(value), "=q"(success) :: "cc"); // rdrand %rax
    } while (!success && --attempts_left > 0);
    return value;
#else
#error "RdRand is only supported on x86 and x86_64"
#endif
}
/** Obtain 64 bits from the rdseed instruction.
 *
 * The caller is responsible for making sure RdSeed is supported before calling this.
 */
static uint64_t GetRdSeed() noexcept
{
    // RdSeed may fail when the HW RNG is overloaded. There is no retry limit: after each failed
    // attempt a pause is issued and the read is tried again until it succeeds.
#ifdef __i386__
    uint8_t success;
    uint32_t low, high;
    // Low 32 bits.
    while (true) {
        __asm__ volatile (".byte 0x0f, 0xc7, 0xf8; setc %1" : "=a"(low), "=q"(success) :: "cc"); // rdseed %eax
        if (success) break;
        __asm__ volatile ("pause");
    }
    // High 32 bits.
    while (true) {
        __asm__ volatile (".byte 0x0f, 0xc7, 0xf8; setc %1" : "=a"(high), "=q"(success) :: "cc"); // rdseed %eax
        if (success) break;
        __asm__ volatile ("pause");
    }
    return low | (((uint64_t)high) << 32);
#elif defined(__x86_64__) || defined(__amd64__)
    uint8_t success;
    uint64_t value;
    while (true) {
        __asm__ volatile (".byte 0x48, 0x0f, 0xc7, 0xf8; setc %1" : "=a"(value), "=q"(success) :: "cc"); // rdseed %rax
        if (success) break;
        __asm__ volatile ("pause");
    }
    return value;
#else
#error "RdSeed is only supported on x86 and x86_64"
#endif
}
#else #else
/* Access to other hardware random number generators could be added here later, /* Access to other hardware random number generators could be added here later,
* assuming it is sufficiently fast (in the order of a few hundred CPU cycles). * assuming it is sufficiently fast (in the order of a few hundred CPU cycles).
@ -107,40 +201,40 @@ static void InitHardwareRand() {}
static void ReportHardwareRand() {} static void ReportHardwareRand() {}
#endif #endif
static bool GetHardwareRand(unsigned char* ent32) noexcept { /** Add 64 bits of entropy gathered from hardware to hasher. Do nothing if not supported. */
static void SeedHardwareFast(CSHA512& hasher) noexcept {
#if defined(__x86_64__) || defined(__amd64__) || defined(__i386__) #if defined(__x86_64__) || defined(__amd64__) || defined(__i386__)
if (rdrand_supported) { if (g_rdrand_supported) {
uint8_t ok; uint64_t out = GetRdRand();
// Not all assemblers support the rdrand instruction, write it in hex. hasher.Write((const unsigned char*)&out, sizeof(out));
#ifdef __i386__ return;
for (int iter = 0; iter < 4; ++iter) { }
uint32_t r1, r2; #endif
__asm__ volatile (".byte 0x0f, 0xc7, 0xf0;" // rdrand %eax }
".byte 0x0f, 0xc7, 0xf2;" // rdrand %edx
"setc %2" : /** Add 256 bits of entropy gathered from hardware to hasher. Do nothing if not supported. */
"=a"(r1), "=d"(r2), "=q"(ok) :: "cc"); static void SeedHardwareSlow(CSHA512& hasher) noexcept {
if (!ok) return false; #if defined(__x86_64__) || defined(__amd64__) || defined(__i386__)
WriteLE32(ent32 + 8 * iter, r1); // When we want 256 bits of entropy, prefer RdSeed over RdRand, as it's
WriteLE32(ent32 + 8 * iter + 4, r2); // guaranteed to produce independent randomness on every call.
} if (g_rdseed_supported) {
#else for (int i = 0; i < 4; ++i) {
uint64_t r1, r2, r3, r4; uint64_t out = GetRdSeed();
__asm__ volatile (".byte 0x48, 0x0f, 0xc7, 0xf0, " // rdrand %rax hasher.Write((const unsigned char*)&out, sizeof(out));
"0x48, 0x0f, 0xc7, 0xf3, " // rdrand %rbx }
"0x48, 0x0f, 0xc7, 0xf1, " // rdrand %rcx return;
"0x48, 0x0f, 0xc7, 0xf2; " // rdrand %rdx }
"setc %4" : // When falling back to RdRand, XOR the result of 1024 results.
"=a"(r1), "=b"(r2), "=c"(r3), "=d"(r4), "=q"(ok) :: "cc"); // This guarantees a reseeding occurs between each.
if (!ok) return false; if (g_rdrand_supported) {
WriteLE64(ent32, r1); for (int i = 0; i < 4; ++i) {
WriteLE64(ent32 + 8, r2); uint64_t out = 0;
WriteLE64(ent32 + 16, r3); for (int j = 0; j < 1024; ++j) out ^= GetRdRand();
WriteLE64(ent32 + 24, r4); hasher.Write((const unsigned char*)&out, sizeof(out));
#endif }
return true; return;
} }
#endif #endif
return false;
} }
static void RandAddSeedPerfmon(CSHA512& hasher) static void RandAddSeedPerfmon(CSHA512& hasher)
@ -407,8 +501,7 @@ static void SeedFast(CSHA512& hasher) noexcept
hasher.Write((const unsigned char*)&ptr, sizeof(ptr)); hasher.Write((const unsigned char*)&ptr, sizeof(ptr));
// Hardware randomness is very fast when available; use it always. // Hardware randomness is very fast when available; use it always.
bool have_hw_rand = GetHardwareRand(buffer); SeedHardwareFast(hasher);
if (have_hw_rand) hasher.Write(buffer, sizeof(buffer));
// High-precision timestamp // High-precision timestamp
SeedTimestamp(hasher); SeedTimestamp(hasher);
@ -460,6 +553,9 @@ static void SeedStartup(CSHA512& hasher) noexcept
RAND_screen(); RAND_screen();
#endif #endif
// Gather 256 bits of hardware randomness, if available
SeedHardwareSlow(hasher);
// Everything that the 'slow' seeder includes. // Everything that the 'slow' seeder includes.
SeedSlow(hasher); SeedSlow(hasher);

View file

@ -24,7 +24,7 @@
* perform 'fast' seeding, consisting of mixing in: * perform 'fast' seeding, consisting of mixing in:
* - A stack pointer (indirectly committing to calling thread and call stack) * - A stack pointer (indirectly committing to calling thread and call stack)
* - A high-precision timestamp (rdtsc when available, c++ high_resolution_clock otherwise) * - A high-precision timestamp (rdtsc when available, c++ high_resolution_clock otherwise)
* - Hardware RNG (rdrand) when available. * - 64 bits from the hardware RNG (rdrand) when available.
* These entropy sources are very fast, and only designed to protect against situations * These entropy sources are very fast, and only designed to protect against situations
* where a VM state restore/copy results in multiple systems with the same randomness. * where a VM state restore/copy results in multiple systems with the same randomness.
* FastRandomContext on the other hand does not protect against this once created, but * FastRandomContext on the other hand does not protect against this once created, but
@ -48,6 +48,7 @@
* *
* On first use of the RNG (regardless of what function is called first), all entropy * On first use of the RNG (regardless of what function is called first), all entropy
* sources used in the 'slow' seeder are included, but also: * sources used in the 'slow' seeder are included, but also:
* - 256 bits from the hardware RNG (rdseed or rdrand) when available.
* - (On Windows) Performance monitoring data from the OS. * - (On Windows) Performance monitoring data from the OS.
* - (On Windows) Through OpenSSL, the screen contents. * - (On Windows) Through OpenSSL, the screen contents.
* *