Merge pull request #6064

f46a680 Better mruset unit test (Pieter Wuille)
d4d5022 Use ring buffer of set iterators instead of deque of copies in mruset (Pieter Wuille)
d81cff3 Replace mruset setAddrKnown with CRollingBloomFilter addrKnown (Gavin Andresen)
69a5f8b Rolling bloom filter class (Gavin Andresen)
This commit is contained in:
Wladimir J. van der Laan 2015-05-01 11:52:09 +02:00
commit b46e7c24e5
No known key found for this signature in database
GPG key ID: 74810B012346C9A6
8 changed files with 252 additions and 115 deletions

View file

@ -21,22 +21,33 @@
using namespace std; using namespace std;
CBloomFilter::CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweakIn, unsigned char nFlagsIn) : CBloomFilter::CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweakIn, unsigned char nFlagsIn) :
/** /**
* The ideal size for a bloom filter with a given number of elements and false positive rate is: * The ideal size for a bloom filter with a given number of elements and false positive rate is:
* - nElements * log(fp rate) / ln(2)^2 * - nElements * log(fp rate) / ln(2)^2
* We ignore filter parameters which will create a bloom filter larger than the protocol limits * We ignore filter parameters which will create a bloom filter larger than the protocol limits
*/ */
vData(min((unsigned int)(-1 / LN2SQUARED * nElements * log(nFPRate)), MAX_BLOOM_FILTER_SIZE * 8) / 8), vData(min((unsigned int)(-1 / LN2SQUARED * nElements * log(nFPRate)), MAX_BLOOM_FILTER_SIZE * 8) / 8),
/** /**
* The ideal number of hash functions is filter size * ln(2) / number of elements * The ideal number of hash functions is filter size * ln(2) / number of elements
* Again, we ignore filter parameters which will create a bloom filter with more hash functions than the protocol limits * Again, we ignore filter parameters which will create a bloom filter with more hash functions than the protocol limits
* See https://en.wikipedia.org/wiki/Bloom_filter for an explanation of these formulas * See https://en.wikipedia.org/wiki/Bloom_filter for an explanation of these formulas
*/ */
isFull(false), isFull(false),
isEmpty(false), isEmpty(false),
nHashFuncs(min((unsigned int)(vData.size() * 8 / nElements * LN2), MAX_HASH_FUNCS)), nHashFuncs(min((unsigned int)(vData.size() * 8 / nElements * LN2), MAX_HASH_FUNCS)),
nTweak(nTweakIn), nTweak(nTweakIn),
nFlags(nFlagsIn) nFlags(nFlagsIn)
{
}
// Private constructor used by CRollingBloomFilter
CBloomFilter::CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweakIn) :
vData((unsigned int)(-1 / LN2SQUARED * nElements * log(nFPRate)) / 8),
isFull(false),
isEmpty(true),
nHashFuncs((unsigned int)(vData.size() * 8 / nElements * LN2)),
nTweak(nTweakIn),
nFlags(BLOOM_UPDATE_NONE)
{ {
} }
@ -197,3 +208,43 @@ void CBloomFilter::UpdateEmptyFull()
isFull = full; isFull = full;
isEmpty = empty; isEmpty = empty;
} }
/**
 * Construct a rolling bloom filter.
 *
 * Two underlying bloom filters are used, each sized for 2 * nElements
 * entries. insert() clears them alternately, one every nElements
 * insertions, so at any moment at least one of them still holds the
 * most recent nElements insertions.
 *
 * @param nElements number of most-recent insertions to keep track of
 * @param fpRate    false-positive rate handed to both underlying filters
 * @param nTweak    tweak value handed to both underlying filters
 */
CRollingBloomFilter::CRollingBloomFilter(unsigned int nElements, double fpRate, unsigned int nTweak) :
    // Listed in the same order as the members are declared in the class.
    nBloomSize(nElements * 2),
    nInsertions(0),
    b1(nElements * 2, fpRate, nTweak),
    b2(nElements * 2, fpRate, nTweak)
{
    // All state is set up by the initializer list above.
}
/**
 * Record vKey in the rolling filter.
 *
 * The key is always added to both underlying filters. Before adding,
 * b1 is wiped at the start of a cycle and b2 is wiped at the half-way
 * point, which is what makes old entries eventually drop out.
 */
void CRollingBloomFilter::insert(const std::vector<unsigned char>& vKey)
{
    const unsigned int nHalfCycle = nBloomSize / 2;

    // Staggered reset: b1 at position 0, b2 at the middle of the cycle.
    if (nInsertions == 0)
        b1.clear();
    else if (nInsertions == nHalfCycle)
        b2.clear();

    b1.insert(vKey);
    b2.insert(vKey);

    // Advance the position within the cycle, wrapping back to 0 at nBloomSize.
    const unsigned int nNext = nInsertions + 1;
    nInsertions = (nNext == nBloomSize) ? 0 : nNext;
}
bool CRollingBloomFilter::contains(const std::vector<unsigned char>& vKey) const
{
if (nInsertions < nBloomSize / 2) {
return b2.contains(vKey);
}
return b1.contains(vKey);
}
/**
 * Forget everything: restart the insertion cycle and wipe both
 * underlying filters.
 */
void CRollingBloomFilter::clear()
{
    nInsertions = 0;
    b2.clear();
    b1.clear();
}

View file

@ -53,6 +53,10 @@ private:
unsigned int Hash(unsigned int nHashNum, const std::vector<unsigned char>& vDataToHash) const; unsigned int Hash(unsigned int nHashNum, const std::vector<unsigned char>& vDataToHash) const;
// Private constructor for CRollingBloomFilter, no restrictions on size
CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweak);
friend class CRollingBloomFilter;
public: public:
/** /**
* Creates a new bloom filter which will provide the given fp rate when filled with the given number of elements * Creates a new bloom filter which will provide the given fp rate when filled with the given number of elements
@ -97,4 +101,28 @@ public:
void UpdateEmptyFull(); void UpdateEmptyFull();
}; };
/**
 * RollingBloomFilter is a probabilistic "keep track of most recently inserted" set.
 * Construct it with the number of items to keep track of, and a false-positive rate.
 *
 * contains(item) will always return true if item was one of the last N things
 * insert()'ed ... but may also return true for items that were not inserted.
 *
 * Internally it holds two CBloomFilter instances (b1 and b2) plus a counter
 * that tracks the position within the current insertion cycle.
 */
class CRollingBloomFilter
{
public:
// nElements: number of most-recent items to remember.
// nFPRate / nTweak: forwarded to the two underlying CBloomFilter objects.
CRollingBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweak);
// Add vKey to the set of recently seen items.
void insert(const std::vector<unsigned char>& vKey);
// True if vKey was recently inserted; may also be a false positive.
bool contains(const std::vector<unsigned char>& vKey) const;
// Empty both underlying filters and restart the insertion cycle.
void clear();
private:
// Length of one insertion cycle (the constructor sets this to 2 * nElements).
unsigned int nBloomSize;
// Position within the current cycle; insert() wraps it back to 0 at nBloomSize.
unsigned int nInsertions;
// The two underlying filters, cleared alternately by insert().
CBloomFilter b1, b2;
};
#endif // BITCOIN_BLOOM_H #endif // BITCOIN_BLOOM_H

View file

@ -3995,7 +3995,7 @@ bool static ProcessMessage(CNode* pfrom, string strCommand, CDataStream& vRecv,
{ {
LOCK(cs_vNodes); LOCK(cs_vNodes);
// Use deterministic randomness to send to the same nodes for 24 hours // Use deterministic randomness to send to the same nodes for 24 hours
// at a time so the setAddrKnowns of the chosen nodes prevent repeats // at a time so the addrKnowns of the chosen nodes prevent repeats
static uint256 hashSalt; static uint256 hashSalt;
if (hashSalt.IsNull()) if (hashSalt.IsNull())
hashSalt = GetRandHash(); hashSalt = GetRandHash();
@ -4779,9 +4779,9 @@ bool SendMessages(CNode* pto, bool fSendTrickle)
LOCK(cs_vNodes); LOCK(cs_vNodes);
BOOST_FOREACH(CNode* pnode, vNodes) BOOST_FOREACH(CNode* pnode, vNodes)
{ {
// Periodically clear setAddrKnown to allow refresh broadcasts // Periodically clear addrKnown to allow refresh broadcasts
if (nLastRebroadcast) if (nLastRebroadcast)
pnode->setAddrKnown.clear(); pnode->addrKnown.clear();
// Rebroadcast our address // Rebroadcast our address
AdvertizeLocal(pnode); AdvertizeLocal(pnode);
@ -4799,9 +4799,9 @@ bool SendMessages(CNode* pto, bool fSendTrickle)
vAddr.reserve(pto->vAddrToSend.size()); vAddr.reserve(pto->vAddrToSend.size());
BOOST_FOREACH(const CAddress& addr, pto->vAddrToSend) BOOST_FOREACH(const CAddress& addr, pto->vAddrToSend)
{ {
// returns true if wasn't already contained in the set if (!pto->addrKnown.contains(addr.GetKey()))
if (pto->setAddrKnown.insert(addr).second)
{ {
pto->addrKnown.insert(addr.GetKey());
vAddr.push_back(addr); vAddr.push_back(addr);
// receiver rejects addr messages larger than 1000 // receiver rejects addr messages larger than 1000
if (vAddr.size() >= 1000) if (vAddr.size() >= 1000)

View file

@ -1,12 +1,12 @@
// Copyright (c) 2012 The Bitcoin Core developers // Copyright (c) 2012-2015 The Bitcoin Core developers
// Distributed under the MIT software license, see the accompanying // Distributed under the MIT software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php. // file COPYING or http://www.opensource.org/licenses/mit-license.php.
#ifndef BITCOIN_MRUSET_H #ifndef BITCOIN_MRUSET_H
#define BITCOIN_MRUSET_H #define BITCOIN_MRUSET_H
#include <deque>
#include <set> #include <set>
#include <vector>
#include <utility> #include <utility>
/** STL-like set container that only keeps the most recent N elements. */ /** STL-like set container that only keeps the most recent N elements. */
@ -22,11 +22,13 @@ public:
protected: protected:
std::set<T> set; std::set<T> set;
std::deque<T> queue; std::vector<iterator> order;
size_type nMaxSize; size_type first_used;
size_type first_unused;
const size_type nMaxSize;
public: public:
mruset(size_type nMaxSizeIn = 0) { nMaxSize = nMaxSizeIn; } mruset(size_type nMaxSizeIn = 1) : nMaxSize(nMaxSizeIn) { clear(); }
iterator begin() const { return set.begin(); } iterator begin() const { return set.begin(); }
iterator end() const { return set.end(); } iterator end() const { return set.end(); }
size_type size() const { return set.size(); } size_type size() const { return set.size(); }
@ -36,7 +38,9 @@ public:
void clear() void clear()
{ {
set.clear(); set.clear();
queue.clear(); order.assign(nMaxSize, set.end());
first_used = 0;
first_unused = 0;
} }
bool inline friend operator==(const mruset<T>& a, const mruset<T>& b) { return a.set == b.set; } bool inline friend operator==(const mruset<T>& a, const mruset<T>& b) { return a.set == b.set; }
bool inline friend operator==(const mruset<T>& a, const std::set<T>& b) { return a.set == b; } bool inline friend operator==(const mruset<T>& a, const std::set<T>& b) { return a.set == b; }
@ -45,25 +49,17 @@ public:
{ {
std::pair<iterator, bool> ret = set.insert(x); std::pair<iterator, bool> ret = set.insert(x);
if (ret.second) { if (ret.second) {
if (nMaxSize && queue.size() == nMaxSize) { if (set.size() == nMaxSize + 1) {
set.erase(queue.front()); set.erase(order[first_used]);
queue.pop_front(); order[first_used] = set.end();
if (++first_used == nMaxSize) first_used = 0;
} }
queue.push_back(x); order[first_unused] = ret.first;
if (++first_unused == nMaxSize) first_unused = 0;
} }
return ret; return ret;
} }
size_type max_size() const { return nMaxSize; } size_type max_size() const { return nMaxSize; }
size_type max_size(size_type s)
{
if (s)
while (queue.size() > s) {
set.erase(queue.front());
queue.pop_front();
}
nMaxSize = s;
return nMaxSize;
}
}; };
#endif // BITCOIN_MRUSET_H #endif // BITCOIN_MRUSET_H

View file

@ -1905,7 +1905,10 @@ bool CAddrDB::Read(CAddrMan& addr)
unsigned int ReceiveFloodSize() { return 1000*GetArg("-maxreceivebuffer", 5*1000); } unsigned int ReceiveFloodSize() { return 1000*GetArg("-maxreceivebuffer", 5*1000); }
unsigned int SendBufferSize() { return 1000*GetArg("-maxsendbuffer", 1*1000); } unsigned int SendBufferSize() { return 1000*GetArg("-maxsendbuffer", 1*1000); }
CNode::CNode(SOCKET hSocketIn, CAddress addrIn, std::string addrNameIn, bool fInboundIn) : ssSend(SER_NETWORK, INIT_PROTO_VERSION), setAddrKnown(5000) CNode::CNode(SOCKET hSocketIn, CAddress addrIn, std::string addrNameIn, bool fInboundIn) :
ssSend(SER_NETWORK, INIT_PROTO_VERSION),
addrKnown(5000, 0.001, insecure_rand()),
setInventoryKnown(SendBufferSize() / 1000)
{ {
nServices = 0; nServices = 0;
hSocket = hSocketIn; hSocket = hSocketIn;
@ -1934,7 +1937,6 @@ CNode::CNode(SOCKET hSocketIn, CAddress addrIn, std::string addrNameIn, bool fIn
nStartingHeight = -1; nStartingHeight = -1;
fGetAddr = false; fGetAddr = false;
fRelayTxes = false; fRelayTxes = false;
setInventoryKnown.max_size(SendBufferSize() / 1000);
pfilter = new CBloomFilter(); pfilter = new CBloomFilter();
nPingNonceSent = 0; nPingNonceSent = 0;
nPingUsecStart = 0; nPingUsecStart = 0;

View file

@ -300,7 +300,7 @@ public:
// flood relay // flood relay
std::vector<CAddress> vAddrToSend; std::vector<CAddress> vAddrToSend;
mruset<CAddress> setAddrKnown; CRollingBloomFilter addrKnown;
bool fGetAddr; bool fGetAddr;
std::set<uint256> setKnown; std::set<uint256> setKnown;
@ -380,7 +380,7 @@ public:
void AddAddressKnown(const CAddress& addr) void AddAddressKnown(const CAddress& addr)
{ {
setAddrKnown.insert(addr); addrKnown.insert(addr.GetKey());
} }
void PushAddress(const CAddress& addr) void PushAddress(const CAddress& addr)
@ -388,7 +388,7 @@ public:
// Known checking here is only to save space from duplicates. // Known checking here is only to save space from duplicates.
// SendMessages will filter it again for knowns that were added // SendMessages will filter it again for knowns that were added
// after addresses were pushed. // after addresses were pushed.
if (addr.IsValid() && !setAddrKnown.count(addr)) { if (addr.IsValid() && !addrKnown.contains(addr.GetKey())) {
if (vAddrToSend.size() >= MAX_ADDR_TO_SEND) { if (vAddrToSend.size() >= MAX_ADDR_TO_SEND) {
vAddrToSend[insecure_rand() % vAddrToSend.size()] = addr; vAddrToSend[insecure_rand() % vAddrToSend.size()] = addr;
} else { } else {

View file

@ -8,6 +8,7 @@
#include "clientversion.h" #include "clientversion.h"
#include "key.h" #include "key.h"
#include "merkleblock.h" #include "merkleblock.h"
#include "random.h"
#include "serialize.h" #include "serialize.h"
#include "streams.h" #include "streams.h"
#include "uint256.h" #include "uint256.h"
@ -459,4 +460,81 @@ BOOST_AUTO_TEST_CASE(merkle_block_4_test_update_none)
BOOST_CHECK(!filter.contains(COutPoint(uint256S("0x02981fa052f0481dbc5868f4fc2166035a10f27a03cfd2de67326471df5bc041"), 0))); BOOST_CHECK(!filter.contains(COutPoint(uint256S("0x02981fa052f0481dbc5868f4fc2166035a10f27a03cfd2de67326471df5bc041"), 0)));
} }
static std::vector<unsigned char> RandomData()
{
uint256 r = GetRandHash();
return std::vector<unsigned char>(r.begin(), r.end());
}
// Exercises CRollingBloomFilter: recent entries must always be found,
// the false-positive count must stay in a plausible range, clear() must
// forget entries, and old entries must eventually roll out of the filter.
BOOST_AUTO_TEST_CASE(rolling_bloom)
{
// last-100-entry, 1% false positive:
CRollingBloomFilter rb1(100, 0.01, 0);
// Overfill:
static const int DATASIZE=399;
std::vector<unsigned char> data[DATASIZE];
for (int i = 0; i < DATASIZE; i++) {
data[i] = RandomData();
rb1.insert(data[i]);
}
// Last 100 guaranteed to be remembered:
for (int i = 299; i < DATASIZE; i++) {
BOOST_CHECK(rb1.contains(data[i]));
}
// false positive rate is 1%, so we should get about 100 hits if
// testing 10,000 random keys. We get worst-case false positive
// behavior when the filter is as full as possible, which is
// when the number of insertions is one less than an integer
// multiple of nElements*2 (here DATASIZE = 399 = 2*200 - 1).
unsigned int nHits = 0;
for (int i = 0; i < 10000; i++) {
if (rb1.contains(RandomData()))
++nHits;
}
// Run test_bitcoin with --log_level=message to see BOOST_TEST_MESSAGEs:
BOOST_TEST_MESSAGE("RollingBloomFilter got " << nHits << " false positives (~100 expected)");
// Insanely unlikely to get a fp count outside this range:
BOOST_CHECK(nHits > 25);
BOOST_CHECK(nHits < 175);
// clear() must make the filter forget even the most recent entry:
BOOST_CHECK(rb1.contains(data[DATASIZE-1]));
rb1.clear();
BOOST_CHECK(!rb1.contains(data[DATASIZE-1]));
// Now roll through data, make sure last 100 entries
// are always remembered:
for (int i = 0; i < DATASIZE; i++) {
if (i >= 100)
BOOST_CHECK(rb1.contains(data[i-100]));
rb1.insert(data[i]);
}
// Insert 999 more random entries:
for (int i = 0; i < 999; i++) {
rb1.insert(RandomData());
}
// Sanity check to make sure the filter isn't just filling up:
nHits = 0;
for (int i = 0; i < DATASIZE; i++) {
if (rb1.contains(data[i]))
++nHits;
}
// Expect about 5 false positives, more than 100 means
// something is definitely broken.
BOOST_TEST_MESSAGE("RollingBloomFilter got " << nHits << " false positives (~5 expected)");
BOOST_CHECK(nHits < 100);
// last-1000-entry, 0.1% false positive:
CRollingBloomFilter rb2(1000, 0.001, 0);
for (int i = 0; i < DATASIZE; i++) {
rb2.insert(data[i]);
}
// ... room for all of them:
for (int i = 0; i < DATASIZE; i++) {
BOOST_CHECK(rb2.contains(data[i]));
}
}
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()

View file

@ -17,83 +17,65 @@
using namespace std; using namespace std;
class mrutester
{
private:
mruset<int> mru;
std::set<int> set;
public:
mrutester() { mru.max_size(MAX_SIZE); }
int size() const { return set.size(); }
void insert(int n)
{
mru.insert(n);
set.insert(n);
BOOST_CHECK(mru == set);
}
};
BOOST_FIXTURE_TEST_SUITE(mruset_tests, BasicTestingSetup) BOOST_FIXTURE_TEST_SUITE(mruset_tests, BasicTestingSetup)
// Test that an mruset behaves like a set, as long as no more than MAX_SIZE elements are in it BOOST_AUTO_TEST_CASE(mruset_test)
BOOST_AUTO_TEST_CASE(mruset_like_set)
{ {
// The mruset being tested.
mruset<int> mru(5000);
for (int nTest=0; nTest<NUM_TESTS; nTest++) // Run the test 10 times.
{ for (int test = 0; test < 10; test++) {
mrutester tester; // Reset mru.
while (tester.size() < MAX_SIZE) mru.clear();
tester.insert(GetRandInt(2 * MAX_SIZE));
}
} // A deque + set to simulate the mruset.
std::deque<int> rep;
std::set<int> all;
// Test that an mruset's size never exceeds its max_size // Insert 10000 random integers below 15000.
BOOST_AUTO_TEST_CASE(mruset_limited_size) for (int j=0; j<10000; j++) {
{ int add = GetRandInt(15000);
for (int nTest=0; nTest<NUM_TESTS; nTest++) mru.insert(add);
{
mruset<int> mru(MAX_SIZE); // Add the number to rep/all as well.
for (int nAction=0; nAction<3*MAX_SIZE; nAction++) if (all.count(add) == 0) {
{ all.insert(add);
int n = GetRandInt(2 * MAX_SIZE); rep.push_back(add);
mru.insert(n); if (all.size() == 5001) {
BOOST_CHECK(mru.size() <= MAX_SIZE); all.erase(rep.front());
rep.pop_front();
}
}
// Do a full comparison between mru and the simulated mru every 1000 and every 5001 elements.
if (j % 1000 == 0 || j % 5001 == 0) {
mruset<int> mru2 = mru; // Also try making a copy
// Check that all elements that should be in there, are in there.
BOOST_FOREACH(int x, rep) {
BOOST_CHECK(mru.count(x));
BOOST_CHECK(mru2.count(x));
}
// Check that all elements that are in there, should be in there.
BOOST_FOREACH(int x, mru) {
BOOST_CHECK(all.count(x));
}
// Check that all elements that are in there, should be in there.
BOOST_FOREACH(int x, mru2) {
BOOST_CHECK(all.count(x));
}
for (int t = 0; t < 10; t++) {
int r = GetRandInt(15000);
BOOST_CHECK(all.count(r) == mru.count(r));
BOOST_CHECK(all.count(r) == mru2.count(r));
}
}
} }
} }
} }
// 16-bit permutation function
int static permute(int n)
{
// hexadecimals of pi; verified to be linearly independent
static const int table[16] = {0x243F, 0x6A88, 0x85A3, 0x08D3, 0x1319, 0x8A2E, 0x0370, 0x7344,
0xA409, 0x3822, 0x299F, 0x31D0, 0x082E, 0xFA98, 0xEC4E, 0x6C89};
int ret = 0;
for (int bit=0; bit<16; bit++)
if (n & (1<<bit))
ret ^= table[bit];
return ret;
}
// Test that an mruset acts like a moving window, if no duplicate elements are added
BOOST_AUTO_TEST_CASE(mruset_window)
{
mruset<int> mru(MAX_SIZE);
for (int n=0; n<10*MAX_SIZE; n++)
{
mru.insert(permute(n));
set<int> tester;
for (int m=max(0,n-MAX_SIZE+1); m<=n; m++)
tester.insert(permute(m));
BOOST_CHECK(mru == tester);
}
}
BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END()