Rolling bloom filter class
For when you need to keep track of the last N items you've seen, and can tolerate some false positives.
Rebased-by: Pieter Wuille <pieter.wuille@gmail.com>
This commit is contained in:
parent
8a10000222
commit
69a5f8be0a
3 changed files with 173 additions and 16 deletions
|
@ -21,22 +21,33 @@
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
/**
 * Build a bloom filter sized for nElements entries at false-positive rate
 * nFPRate, with both derived parameters capped at the protocol limits.
 *
 * Sizing formulas (see https://en.wikipedia.org/wiki/Bloom_filter):
 *  - filter size in bits  = -nElements * ln(nFPRate) / ln(2)^2,
 *    capped at MAX_BLOOM_FILTER_SIZE * 8 and then converted to bytes;
 *  - number of hash funcs = filter size in bits * ln(2) / nElements,
 *    capped at MAX_HASH_FUNCS.
 * Requests that would exceed a cap are reduced to the cap rather than rejected.
 *
 * nHashFuncs is computed from vData.size(), so it depends on vData having
 * been initialized first.
 *
 * NOTE(review): the hash-count expression performs an integer division by
 * nElements; presumably callers never pass 0 -- confirm.
 */
CBloomFilter::CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweakIn, unsigned char nFlagsIn)
    : vData(std::min(static_cast<unsigned int>(-1 / LN2SQUARED * nElements * log(nFPRate)), MAX_BLOOM_FILTER_SIZE * 8) / 8),
      isFull(false),
      isEmpty(false),
      nHashFuncs(std::min(static_cast<unsigned int>(vData.size() * 8 / nElements * LN2), MAX_HASH_FUNCS)),
      nTweak(nTweakIn),
      nFlags(nFlagsIn)
{
}
|
||||||
|
|
||||||
|
// Private constructor used by CRollingBloomFilter
|
||||||
|
CBloomFilter::CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweakIn) :
|
||||||
|
vData((unsigned int)(-1 / LN2SQUARED * nElements * log(nFPRate)) / 8),
|
||||||
|
isFull(false),
|
||||||
|
isEmpty(true),
|
||||||
|
nHashFuncs((unsigned int)(vData.size() * 8 / nElements * LN2)),
|
||||||
|
nTweak(nTweakIn),
|
||||||
|
nFlags(BLOOM_UPDATE_NONE)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -197,3 +208,43 @@ void CBloomFilter::UpdateEmptyFull()
|
||||||
isFull = full;
|
isFull = full;
|
||||||
isEmpty = empty;
|
isEmpty = empty;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Set up a rolling filter that remembers the most recent nElements insertions.
 *
 * Two underlying bloom filters are created, each dimensioned for
 * 2 * nElements entries. They are filled in parallel and cleared in a
 * staggered fashion, one every nElements insertions, so that at any moment
 * at least one of them still holds the last nElements inserted keys.
 */
CRollingBloomFilter::CRollingBloomFilter(unsigned int nElements, double fpRate, unsigned int nTweak)
    : nBloomSize(nElements * 2),
      nInsertions(0),
      b1(nElements * 2, fpRate, nTweak),
      b2(nElements * 2, fpRate, nTweak)
{
}
|
||||||
|
|
||||||
|
void CRollingBloomFilter::insert(const std::vector<unsigned char>& vKey)
|
||||||
|
{
|
||||||
|
if (nInsertions == 0) {
|
||||||
|
b1.clear();
|
||||||
|
} else if (nInsertions == nBloomSize / 2) {
|
||||||
|
b2.clear();
|
||||||
|
}
|
||||||
|
b1.insert(vKey);
|
||||||
|
b2.insert(vKey);
|
||||||
|
if (++nInsertions == nBloomSize) {
|
||||||
|
nInsertions = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool CRollingBloomFilter::contains(const std::vector<unsigned char>& vKey) const
|
||||||
|
{
|
||||||
|
if (nInsertions < nBloomSize / 2) {
|
||||||
|
return b2.contains(vKey);
|
||||||
|
}
|
||||||
|
return b1.contains(vKey);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CRollingBloomFilter::clear()
|
||||||
|
{
|
||||||
|
b1.clear();
|
||||||
|
b2.clear();
|
||||||
|
nInsertions = 0;
|
||||||
|
}
|
||||||
|
|
28
src/bloom.h
28
src/bloom.h
|
@ -53,6 +53,10 @@ private:
|
||||||
|
|
||||||
unsigned int Hash(unsigned int nHashNum, const std::vector<unsigned char>& vDataToHash) const;
|
unsigned int Hash(unsigned int nHashNum, const std::vector<unsigned char>& vDataToHash) const;
|
||||||
|
|
||||||
|
// Private constructor for CRollingBloomFilter, no restrictions on size
|
||||||
|
CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweak);
|
||||||
|
friend class CRollingBloomFilter;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
/**
|
/**
|
||||||
* Creates a new bloom filter which will provide the given fp rate when filled with the given number of elements
|
* Creates a new bloom filter which will provide the given fp rate when filled with the given number of elements
|
||||||
|
@ -97,4 +101,28 @@ public:
|
||||||
void UpdateEmptyFull();
|
void UpdateEmptyFull();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* RollingBloomFilter is a probabilistic "keep track of most recently inserted" set.
|
||||||
|
* Construct it with the number of items to keep track of, and a false-positive rate.
|
||||||
|
*
|
||||||
|
* contains(item) will always return true if item was one of the last N things
|
||||||
|
* insert()'ed ... but may also return true for items that were not inserted.
|
||||||
|
*/
|
||||||
|
class CRollingBloomFilter
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
CRollingBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweak);
|
||||||
|
|
||||||
|
void insert(const std::vector<unsigned char>& vKey);
|
||||||
|
bool contains(const std::vector<unsigned char>& vKey) const;
|
||||||
|
|
||||||
|
void clear();
|
||||||
|
|
||||||
|
private:
|
||||||
|
unsigned int nBloomSize;
|
||||||
|
unsigned int nInsertions;
|
||||||
|
CBloomFilter b1, b2;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
#endif // BITCOIN_BLOOM_H
|
#endif // BITCOIN_BLOOM_H
|
||||||
|
|
|
@ -8,6 +8,7 @@
|
||||||
#include "clientversion.h"
|
#include "clientversion.h"
|
||||||
#include "key.h"
|
#include "key.h"
|
||||||
#include "merkleblock.h"
|
#include "merkleblock.h"
|
||||||
|
#include "random.h"
|
||||||
#include "serialize.h"
|
#include "serialize.h"
|
||||||
#include "streams.h"
|
#include "streams.h"
|
||||||
#include "uint256.h"
|
#include "uint256.h"
|
||||||
|
@ -459,4 +460,81 @@ BOOST_AUTO_TEST_CASE(merkle_block_4_test_update_none)
|
||||||
BOOST_CHECK(!filter.contains(COutPoint(uint256S("0x02981fa052f0481dbc5868f4fc2166035a10f27a03cfd2de67326471df5bc041"), 0)));
|
BOOST_CHECK(!filter.contains(COutPoint(uint256S("0x02981fa052f0481dbc5868f4fc2166035a10f27a03cfd2de67326471df5bc041"), 0)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static std::vector<unsigned char> RandomData()
|
||||||
|
{
|
||||||
|
uint256 r = GetRandHash();
|
||||||
|
return std::vector<unsigned char>(r.begin(), r.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Exercises CRollingBloomFilter: retention of the most recent entries,
// observed false-positive rate, clear(), and behaviour while rolling.
BOOST_AUTO_TEST_CASE(rolling_bloom)
{
    // last-100-entry, 1% false positive:
    static const int WINDOW = 100;
    CRollingBloomFilter rb1(WINDOW, 0.01, 0);

    // Overfill:
    static const int DATASIZE = 399;
    std::vector<unsigned char> data[DATASIZE];
    for (int i = 0; i < DATASIZE; i++) {
        data[i] = RandomData();
        rb1.insert(data[i]);
    }
    // Last 100 guaranteed to be remembered:
    for (int i = DATASIZE - WINDOW; i < DATASIZE; i++) {
        BOOST_CHECK(rb1.contains(data[i]));
    }

    // false positive rate is 1%, so we should get about 100 hits if
    // testing 10,000 random keys. We get worst-case false positive
    // behavior when the filter is as full as possible, which is
    // when the number of insertions is one less than an integer
    // multiple of nElements*2 (here 399 = 2 * 200 - 1).
    unsigned int nHits = 0;
    for (int i = 0; i < 10000; i++) {
        if (rb1.contains(RandomData()))
            ++nHits;
    }
    // Run test_bitcoin with --log_level=message to see BOOST_TEST_MESSAGEs:
    BOOST_TEST_MESSAGE("RollingBloomFilter got " << nHits << " false positives (~100 expected)");

    // Insanely unlikely to get a fp count outside this range:
    BOOST_CHECK(nHits > 25);
    BOOST_CHECK(nHits < 175);

    // clear() must forget even the most recently inserted entry:
    BOOST_CHECK(rb1.contains(data[DATASIZE-1]));
    rb1.clear();
    BOOST_CHECK(!rb1.contains(data[DATASIZE-1]));

    // Now roll through data, make sure last 100 entries
    // are always remembered:
    for (int i = 0; i < DATASIZE; i++) {
        if (i >= WINDOW)
            BOOST_CHECK(rb1.contains(data[i-WINDOW]));
        rb1.insert(data[i]);
    }

    // Insert 999 more random entries:
    for (int i = 0; i < 999; i++) {
        rb1.insert(RandomData());
    }
    // Sanity check to make sure the filter isn't just filling up:
    nHits = 0;
    for (int i = 0; i < DATASIZE; i++) {
        if (rb1.contains(data[i]))
            ++nHits;
    }
    // Expect about 5 false positives, more than 100 means
    // something is definitely broken.
    BOOST_TEST_MESSAGE("RollingBloomFilter got " << nHits << " false positives (~5 expected)");
    BOOST_CHECK(nHits < 100);

    // last-1000-entry, 0.1% false positive (fpRate = 0.001):
    CRollingBloomFilter rb2(1000, 0.001, 0);
    for (int i = 0; i < DATASIZE; i++) {
        rb2.insert(data[i]);
    }
    // ... room for all of them:
    for (int i = 0; i < DATASIZE; i++) {
        BOOST_CHECK(rb2.contains(data[i]));
    }
}
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
BOOST_AUTO_TEST_SUITE_END()
|
||||||
|
|
Loading…
Reference in a new issue