Rolling bloom filter class

For when you need to keep track of the last N items you've seen, and can tolerate some false positives. Rebased-by: Pieter Wuille <pieter.wuille@gmail.com>
2015-04-24 13:14:45 -04:00 · 2015-04-24 13:14:45 -04:00 · 69a5f8be0a
commit 69a5f8be0a
parent 8a10000222
3 changed files with 173 additions and 16 deletions
--- a/src/bloom.cpp
+++ b/src/bloom.cpp
@ -21,22 +21,33 @@
 using namespace std;
 CBloomFilter::CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweakIn, unsigned char nFlagsIn) :
-/**
+    /**
- * The ideal size for a bloom filter with a given number of elements and false positive rate is:
+     * The ideal size for a bloom filter with a given number of elements and false positive rate is:
- * - nElements * log(fp rate) / ln(2)^2
+     * - nElements * log(fp rate) / ln(2)^2
- * We ignore filter parameters which will create a bloom filter larger than the protocol limits
+     * We ignore filter parameters which will create a bloom filter larger than the protocol limits
- */
+     */
-vData(min((unsigned int)(-1  / LN2SQUARED * nElements * log(nFPRate)), MAX_BLOOM_FILTER_SIZE * 8) / 8),
+    vData(min((unsigned int)(-1  / LN2SQUARED * nElements * log(nFPRate)), MAX_BLOOM_FILTER_SIZE * 8) / 8),
-/**
+    /**
- * The ideal number of hash functions is filter size * ln(2) / number of elements
+     * The ideal number of hash functions is filter size * ln(2) / number of elements
- * Again, we ignore filter parameters which will create a bloom filter with more hash functions than the protocol limits
+     * Again, we ignore filter parameters which will create a bloom filter with more hash functions than the protocol limits
- * See https://en.wikipedia.org/wiki/Bloom_filter for an explanation of these formulas
+     * See https://en.wikipedia.org/wiki/Bloom_filter for an explanation of these formulas
- */
+     */
-isFull(false),
+    isFull(false),
-isEmpty(false),
+    isEmpty(false),
-nHashFuncs(min((unsigned int)(vData.size() * 8 / nElements * LN2), MAX_HASH_FUNCS)),
+    nHashFuncs(min((unsigned int)(vData.size() * 8 / nElements * LN2), MAX_HASH_FUNCS)),
-nTweak(nTweakIn),
+    nTweak(nTweakIn),
-nFlags(nFlagsIn)
+    nFlags(nFlagsIn)
 {
 }
 /**
  * Private constructor used by CRollingBloomFilter.
  *
  * Sizes the bit array and the hash-function count with the same formulas as
  * the public constructor, but without clamping either value to
  * MAX_BLOOM_FILTER_SIZE / MAX_HASH_FUNCS. The filter starts with isEmpty set
  * and isFull cleared, and its flags are fixed to BLOOM_UPDATE_NONE.
  *
  * NOTE(review): nElements is used as a divisor when computing nHashFuncs;
  * callers are presumably expected to pass a non-zero value.
  */
 CBloomFilter::CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweakIn) :
    // Byte count: ideal number of bits (-n * ln(p) / ln(2)^2) divided by 8.
    vData(static_cast<unsigned int>(-1 / LN2SQUARED * nElements * log(nFPRate)) / 8),
    isFull(false),
    isEmpty(true),
    // Hash function count: (bits / elements) * ln(2), truncated.
    nHashFuncs(static_cast<unsigned int>(vData.size() * 8 / nElements * LN2)),
    nTweak(nTweakIn),
    nFlags(BLOOM_UPDATE_NONE)
 {
 }
@ -197,3 +208,43 @@ void CBloomFilter::UpdateEmptyFull()
    isFull = full;
    isEmpty = empty;
 }
 /**
  * Build a rolling filter out of two ordinary bloom filters, each sized for
  * 2 * nElements entries at the requested false-positive rate.
  *
  * The two filters are filled in parallel and cleared alternately, one of
  * them every nElements insertions, so that at any moment at least one of
  * them still holds the last nElements inserted keys.
  */
 CRollingBloomFilter::CRollingBloomFilter(unsigned int nElements, double fpRate, unsigned int nTweak) :
    nBloomSize(nElements * 2),
    nInsertions(0),
    b1(nElements * 2, fpRate, nTweak),
    b2(nElements * 2, fpRate, nTweak)
 {
 }
 /**
  * Add vKey to the set of recently seen items.
  *
  * The key is always written to both underlying filters. Before writing,
  * b1 is wiped at the start of each cycle of nBloomSize insertions and b2
  * is wiped at the cycle's midpoint, which keeps the two filters staggered.
  */
 void CRollingBloomFilter::insert(const std::vector<unsigned char>& vKey)
 {
    const unsigned int nHalfCycle = nBloomSize / 2;

    // Staggered reset: at most one of the two filters is wiped per insertion.
    if (nInsertions == 0)
        b1.clear();
    else if (nInsertions == nHalfCycle)
        b2.clear();

    b1.insert(vKey);
    b2.insert(vKey);

    // Advance the position within the cycle, wrapping back to the start.
    ++nInsertions;
    if (nInsertions == nBloomSize)
        nInsertions = 0;
 }
 bool CRollingBloomFilter::contains(const std::vector<unsigned char>& vKey) const
 {
    if (nInsertions < nBloomSize / 2) {
        return b2.contains(vKey);
    }
    return b1.contains(vKey);
 }
 /**
  * Forget everything: rewind the insertion counter to the start of a cycle
  * and wipe both underlying filters.
  */
 void CRollingBloomFilter::clear()
 {
    nInsertions = 0;
    b2.clear();
    b1.clear();
 }
--- a/src/bloom.h
+++ b/src/bloom.h
@ -53,6 +53,10 @@ private:
    unsigned int Hash(unsigned int nHashNum, const std::vector<unsigned char>& vDataToHash) const;
    // Private constructor for CRollingBloomFilter, no restrictions on size:
    // unlike the public constructor, its definition does not clamp the filter
    // size or the number of hash functions to the protocol limits.
    CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweak);
    // Grants CRollingBloomFilter access to the private constructor above.
    friend class CRollingBloomFilter;
 public:
    /**
     * Creates a new bloom filter which will provide the given fp rate when filled with the given number of elements
@ -97,4 +101,28 @@ public:
    void UpdateEmptyFull();
 };
 /**
 * RollingBloomFilter is a probabilistic "keep track of most recently inserted" set.
 * Construct it with the number of items to keep track of, and a false-positive rate.
 *
 * contains(item) will always return true if item was one of the last N things
 * insert()'ed ... but may also return true for items that were not inserted.
 *
 * Internally it is built from two CBloomFilter instances that are filled
 * together and cleared alternately.
 */
 class CRollingBloomFilter
 {
 public:
    // nElements: number of most recent items to remember (N above).
    // nFPRate:   false-positive rate passed to the underlying filters.
    // nTweak:    tweak value passed to the underlying filters.
    CRollingBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweak);
    // Record vKey as seen.
    void insert(const std::vector<unsigned char>& vKey);
    // True if vKey was among the last N inserted keys; may also be true
    // for keys that were never inserted (false positive).
    bool contains(const std::vector<unsigned char>& vKey) const;
    // Forget all previously inserted keys.
    void clear();
 private:
    // Length of one insertion cycle; set to 2 * nElements by the constructor.
    unsigned int nBloomSize;
    // Position within the current cycle; wraps to 0 on reaching nBloomSize.
    unsigned int nInsertions;
    // The two underlying filters, cleared at staggered points of the cycle.
    CBloomFilter b1, b2;
 };
 #endif // BITCOIN_BLOOM_H
--- a/src/test/bloom_tests.cpp
+++ b/src/test/bloom_tests.cpp
@ -8,6 +8,7 @@
 #include "clientversion.h"
 #include "key.h"
 #include "merkleblock.h"
 #include "random.h"
 #include "serialize.h"
 #include "streams.h"
 #include "uint256.h"
@ -459,4 +460,81 @@ BOOST_AUTO_TEST_CASE(merkle_block_4_test_update_none)
    BOOST_CHECK(!filter.contains(COutPoint(uint256S("0x02981fa052f0481dbc5868f4fc2166035a10f27a03cfd2de67326471df5bc041"), 0)));
 }
 static std::vector<unsigned char> RandomData()
 {
    uint256 r = GetRandHash();
    return std::vector<unsigned char>(r.begin(), r.end());
 }
 // Exercises CRollingBloomFilter: recent items are always remembered, the
 // false-positive rate stays near the configured value, clear() forgets
 // everything, and old items are eventually dropped.
 BOOST_AUTO_TEST_CASE(rolling_bloom)
 {
    // last-100-entry, 1% false positive:
    CRollingBloomFilter rb1(100, 0.01, 0);
    // Overfill: insert 399 random keys, far more than the 100 it tracks.
    static const int DATASIZE=399;
    std::vector<unsigned char> data[DATASIZE];
    for (int i = 0; i < DATASIZE; i++) {
        data[i] = RandomData();
        rb1.insert(data[i]);
    }
    // Last 100 guaranteed to be remembered:
    for (int i = 299; i < DATASIZE; i++) {
        BOOST_CHECK(rb1.contains(data[i]));
    }
    // false positive rate is 1%, so we should get about 100 hits if
    // testing 10,000 random keys. We get worst-case false positive
    // behavior when the filter is as full as possible, which is
    // when we've inserted one minus an integer multiple of nElement*2
    // (here 399 = 2 * 200 - 1).
    unsigned int nHits = 0;
    for (int i = 0; i < 10000; i++) {
        if (rb1.contains(RandomData()))
            ++nHits;
    }
    // Run test_bitcoin with --log_level=message to see BOOST_TEST_MESSAGEs:
    BOOST_TEST_MESSAGE("RollingBloomFilter got " << nHits << " false positives (~100 expected)");
    // Insanely unlikely to get a fp count outside this range:
    BOOST_CHECK(nHits > 25);
    BOOST_CHECK(nHits < 175);
    // clear() must make even the most recently inserted key disappear:
    BOOST_CHECK(rb1.contains(data[DATASIZE-1]));
    rb1.clear();
    BOOST_CHECK(!rb1.contains(data[DATASIZE-1]));
    // Now roll through data, make sure last 100 entries
    // are always remembered:
    for (int i = 0; i < DATASIZE; i++) {
        if (i >= 100)
            BOOST_CHECK(rb1.contains(data[i-100]));
        rb1.insert(data[i]);
    }
    // Insert 999 more random entries:
    for (int i = 0; i < 999; i++) {
        rb1.insert(RandomData());
    }
    // Sanity check to make sure the filter isn't just filling up:
    // the original data should by now have rolled out of the filter.
    nHits = 0;
    for (int i = 0; i < DATASIZE; i++) {
        if (rb1.contains(data[i]))
            ++nHits;
    }
    // Expect about 5 false positives, more than 100 means
    // something is definitely broken.
    BOOST_TEST_MESSAGE("RollingBloomFilter got " << nHits << " false positives (~5 expected)");
    BOOST_CHECK(nHits < 100);
    // last-1000-entry, 0.1% false positive:
    CRollingBloomFilter rb2(1000, 0.001, 0);
    for (int i = 0; i < DATASIZE; i++) {
        rb2.insert(data[i]);
    }
    // ... room for all of them:
    for (int i = 0; i < DATASIZE; i++) {
        BOOST_CHECK(rb2.contains(data[i]));
    }
 }
 BOOST_AUTO_TEST_SUITE_END()