gcs: add package for building and using Golomb-coded set filters

This commit is contained in:
Alex 2017-01-12 13:25:31 -07:00 committed by Olaoluwa Osuntokun
parent 06f32abe07
commit 3069dcf175
7 changed files with 616 additions and 1 deletions

View file

@ -1,6 +1,7 @@
ISC License ISC License
Copyright (c) 2013-2016 The btcsuite developers Copyright (c) 2013-2017 The btcsuite developers
Copyright (c) 2016-2017 The Lightning Network Developers
Permission to use, copy, modify, and distribute this software for any Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above purpose with or without fee is hereby granted, provided that the above

24
gcs/README.md Normal file
View file

@ -0,0 +1,24 @@
gcs
==========
[![Build Status](http://img.shields.io/travis/btcsuite/btcutil.svg)](https://travis-ci.org/btcsuite/btcutil)
[![ISC License](http://img.shields.io/badge/license-ISC-blue.svg)](http://copyfree.org)
[![GoDoc](https://godoc.org/github.com/btcsuite/btcutil/gcs?status.png)](http://godoc.org/github.com/btcsuite/btcutil/gcs)
Package gcs provides an API for building and using a Golomb-coded set filter
similar to that described [here](http://giovanni.bajo.it/post/47119962313/golomb-coded-sets-smaller-than-bloom-filters).
A comprehensive suite of tests is provided to ensure proper functionality.
## Installation and Updating
```bash
$ go get -u github.com/btcsuite/btcutil/gcs
```
## License
Package gcs is licensed under the [copyfree](http://copyfree.org) ISC
License.

24
gcs/doc.go Normal file
View file

@ -0,0 +1,24 @@
// Copyright (c) 2016-2017 The btcsuite developers
// Copyright (c) 2016-2017 The Lightning Network Developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.
/*
Package gcs provides an API for building and using a Golomb-coded set filter.
Golomb-Coded Set
A Golomb-coded set is a probabilistic data structure used similarly to a Bloom
filter. A filter uses constant-size overhead plus on average n+2 bits per
item added to the filter, where 2^-n is the desired false positive (collision)
probability.
GCS use in Bitcoin
GCS filters are a proposed mechanism for storing and transmitting per-block
filters in Bitcoin. The usage is intended to be the inverse of Bloom filters:
a full node would send an SPV node the GCS filter for a block, which the SPV
node would check against its list of relevant items. The suggested collision
probability for Bitcoin use is 2^-20.
*/
package gcs

280
gcs/gcs.go Normal file
View file

@ -0,0 +1,280 @@
// Copyright (c) 2016-2017 The btcsuite developers
// Copyright (c) 2016-2017 The Lightning Network Developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.
package gcs
import (
"fmt"
"io"
"sort"
"github.com/aead/siphash"
"github.com/kkdai/bstream"
)
// Inspired by https://github.com/rasky/gcs
var (
	// ErrNTooBig is returned when the number of items handed to the filter
	// builder cannot be represented as a uint32.
	ErrNTooBig = fmt.Errorf("N is too big to fit in uint32")

	// ErrPTooBig is returned when P, the exponent of the `1/2**P`
	// collision probability, is larger than the filter supports.
	ErrPTooBig = fmt.Errorf("P is too big to fit in uint32")

	// ErrNoData is returned when an empty slice of items is supplied.
	ErrNoData = fmt.Errorf("No data provided")
)
// KeySize is the length, in bytes, of the key material that must be supplied
// to the SipHash keyed hash function.
const KeySize = 16
// gcsFilter is an immutable set-membership filter. It is built from a list
// of data elements, can be serialized and deserialized, and is queried with
// Match and MatchAny. The serialized form is a Golomb Coded Set (GCS); it
// deliberately omits N and P so callers can encode that metadata however
// they like. Elements are hashed with SipHash, a keyed function: the key
// used to build the filter is not part of the serialized form and must be
// supplied again when querying.
type gcsFilter struct {
	// n is the number of elements the filter was built from.
	n uint32

	// p is the collision probability exponent: false positives occur
	// with probability 1/2**p.
	p uint8

	// modulusP caches 2**p, the Golomb coding parameter.
	modulusP uint64

	// modulusNP caches n * 2**p, the range element hashes are reduced to.
	modulusNP uint64

	// filterData is the Golomb-coded bitstream of sorted hash deltas.
	filterData []byte
}
// BuildGCSFilter builds a new GCS filter with a collision probability of
// `1/(2**P)`, keyed with `key`, containing every `[]byte` in `data` as a
// member of the set.
//
// It returns ErrNoData when data is empty, ErrNTooBig when data holds more
// items than fit in a uint32, and ErrPTooBig when P is greater than 32.
func BuildGCSFilter(P uint8, key [KeySize]byte,
	data [][]byte) (*gcsFilter, error) {

	// Some initial parameter checks: make sure we have data from which to
	// build the filter, and make sure our parameters will fit the hash
	// function we're using.
	if len(data) == 0 {
		return nil, ErrNoData
	}
	// Compare as uint64: the bound does not fit in an int on 32-bit
	// platforms, where an untyped comparison against len() would not
	// even compile.
	if uint64(len(data)) > (1<<32)-1 {
		return nil, ErrNTooBig
	}
	if P > 32 {
		return nil, ErrPTooBig
	}

	// Create the filter object and insert metadata.
	f := gcsFilter{
		n: uint32(len(data)),
		p: P,
	}
	f.modulusP = uint64(1) << f.p
	f.modulusNP = uint64(f.n) * f.modulusP

	// Hash every element into the range [0, N * 2**P) and sort the
	// resulting values so that only the gaps between them need encoding.
	values := make(uint64Slice, 0, len(data))
	for _, d := range data {
		values = append(values, siphash.Sum64(d, &key)%f.modulusNP)
	}
	sort.Sort(values)

	// Write the sorted list of values into the filter bitstream,
	// compressing the deltas using Golomb coding.
	b := bstream.NewBStreamWriter(0)
	var lastValue uint64
	for _, v := range values {
		// Split the gap to the previous value into a multiple of
		// 2**P (quotient) and the P-bit remainder.
		delta := v - lastValue
		quotient := delta / f.modulusP
		remainder := delta % f.modulusP
		lastValue = v

		// Write the quotient in unary (that many 1 bits followed by
		// a terminating 0 bit).
		for ; quotient > 0; quotient-- {
			b.WriteBit(true)
		}
		b.WriteBit(false)

		// Write the remainder as a big-endian integer with enough bits
		// to represent the appropriate collision probability.
		b.WriteBits(remainder, int(f.p))
	}

	// Copy the bitstream into the filter object and return the object.
	f.filterData = b.Bytes()
	return &f, nil
}
// FromBytes reconstructs a GCS filter from its element count N, its
// collision probability exponent P, and the serialized filter data d as
// produced by Bytes(). The data is copied, so later changes to d do not
// affect the returned filter. ErrPTooBig is returned when P exceeds 32.
func FromBytes(N uint32, P uint8, d []byte) (*gcsFilter, error) {
	// Reject parameters the filter cannot represent.
	if P > 32 {
		return nil, ErrPTooBig
	}

	// Derive the cached moduli from the supplied metadata.
	modulusP := uint64(1) << P

	// Take a private copy of the serialized filter.
	serialized := make([]byte, len(d))
	copy(serialized, d)

	return &gcsFilter{
		n:          N,
		p:          P,
		modulusP:   modulusP,
		modulusNP:  uint64(N) * modulusP,
		filterData: serialized,
	}, nil
}
// Bytes returns a copy of the filter's serialized form. The serialization
// contains only the Golomb-coded data: N and P are available through their
// own accessors, and the SipHash key is never stored in the filter.
func (f *gcsFilter) Bytes() []byte {
	// Hand out a fresh slice so callers cannot alter the filter's data.
	return append(make([]byte, 0, len(f.filterData)), f.filterData...)
}
// P reports the filter's collision probability as the exponent of a negative
// power of two: a filter with collision probability `1/2**20` reports 20.
func (f *gcsFilter) P() uint8 {
	return f.p
}
// N reports the number of elements recorded in the filter's metadata, i.e.
// the size of the data set the filter was built from.
func (f *gcsFilter) N() uint32 {
	return f.n
}
// Match checks whether a []byte value is likely (within collision
// probability) to be a member of the set represented by the filter. The key
// must be the one the filter was built with.
//
// It returns (false, nil) when the value is not found, including when the
// filter describes zero elements, and a non-nil error only if reading the
// filter data fails for a reason other than reaching its end.
func (f *gcsFilter) Match(key [KeySize]byte, data []byte) (bool, error) {
	// A filter with N == 0 has a zero modulus: nothing can match, and
	// reducing the hash by it would panic with a division by zero.
	if f.modulusNP == 0 {
		return false, nil
	}

	// Create a filter bitstream over a copy of the data (Bytes copies),
	// so the reader never touches the stored filter.
	b := bstream.NewBStreamReader(f.Bytes())

	// Hash our search term with the same parameters as the filter.
	term := siphash.Sum64(data, &key) % f.modulusNP

	// Walk the sorted values stored in the filter. The first stored value
	// is always read before comparing, so a term that hashes to 0 is
	// checked against the filter contents like any other term.
	var lastValue uint64
	for {
		// Read the difference between previous and new value
		// from bitstream.
		delta, err := f.readFullUint64(b)
		if err != nil {
			if err == io.EOF {
				return false, nil
			}
			return false, err
		}

		// Add the previous value to it.
		lastValue += delta
		switch {
		case lastValue == term:
			return true, nil
		case lastValue > term:
			// Values are sorted, so the term cannot appear later.
			return false, nil
		}
	}
}
// MatchAny checks whether any []byte value in data is likely (within
// collision probability) to be a member of the set represented by the
// filter. It is faster than calling Match() for each value individually
// because the filter is decoded only once.
//
// It returns ErrNoData when data is empty, (false, nil) when nothing
// matches, including when the filter describes zero elements, and a non-nil
// error only if reading the filter data fails for a reason other than
// reaching its end.
func (f *gcsFilter) MatchAny(key [KeySize]byte, data [][]byte) (bool, error) {
	// Basic sanity check.
	if len(data) == 0 {
		return false, ErrNoData
	}

	// A filter with N == 0 has a zero modulus: nothing can match, and
	// reducing the hashes by it would panic with a division by zero.
	if f.modulusNP == 0 {
		return false, nil
	}

	// Create a filter bitstream over a copy of the data (Bytes copies),
	// so the reader never touches the stored filter.
	b := bstream.NewBStreamReader(f.Bytes())

	// Create an uncompressed, sorted filter of the search values.
	values := make(uint64Slice, 0, len(data))
	for _, d := range data {
		values = append(values, siphash.Sum64(d, &key)%f.modulusNP)
	}
	sort.Sort(values)

	// Prime the walk with the first value actually stored in the filter.
	// Starting from an implicit 0 instead would report a match for any
	// search value that hashes to 0, whether or not the filter holds it.
	filterValue, err := f.readFullUint64(b)
	if err != nil {
		if err == io.EOF {
			return false, nil
		}
		return false, err
	}

	// Zip down the two sorted sequences, advancing whichever is behind,
	// until a value matches or one of the sequences runs out.
	i := 0
	for {
		switch {
		case filterValue == values[i]:
			return true, nil

		case filterValue > values[i]:
			// Advance the search values, or give up if none remain.
			i++
			if i == len(values) {
				return false, nil
			}

		default:
			// Advance the filter we're searching, or give up if
			// we're at its end because nothing matched.
			delta, err := f.readFullUint64(b)
			if err != nil {
				if err == io.EOF {
					return false, nil
				}
				return false, err
			}
			filterValue += delta
		}
	}
}
// readFullUint64 decodes one Golomb-coded value from b: a unary-coded
// multiple of the filter's P modulus (`2**P`) followed by a big-endian P-bit
// remainder. Any error from the bitstream is returned unchanged with a zero
// value.
func (f *gcsFilter) readFullUint64(b *bstream.BStream) (uint64, error) {
	// The quotient is the number of 1 bits preceding the first 0 bit.
	var quotient uint64
	for {
		bit, err := b.ReadBit()
		if err != nil {
			return 0, err
		}
		if !bit {
			break
		}
		quotient++
	}

	// The remainder occupies the next P bits.
	remainder, err := b.ReadBits(int(f.p))
	if err != nil {
		return 0, err
	}

	// Recombine the multiple and the remainder.
	return quotient*f.modulusP + remainder, nil
}

211
gcs/gcs_test.go Normal file
View file

@ -0,0 +1,211 @@
// Copyright (c) 2016-2017 The btcsuite developers
// Copyright (c) 2016-2017 The Lightning Network Developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.
package gcs_test
import (
"encoding/binary"
"math/rand"
"testing"
"github.com/btcsuite/btcutil/gcs"
)
var (
	// err is shared by the tests and benchmarks that assign to it with
	// `=`, so they need not declare their own.
	err error

	// P is the collision probability exponent used throughout the tests
	// (a probability of 1/2**20).
	P uint8 = 20

	// filter and filter2 persist between tests. Because the gcsFilter
	// type is not exported, they are declared through an interface that
	// lists the methods under test.
	filter, filter2 interface {
		Match([gcs.KeySize]byte, []byte) (bool, error)
		MatchAny([gcs.KeySize]byte, [][]byte) (bool, error)
		N() uint32
		P() uint8
		Bytes() []byte
	}

	// key must be the same for building and for querying the filters.
	key [gcs.KeySize]byte

	// contents is the set of values the filter is built from.
	contents = [][]byte{
		[]byte("Alex"),
		[]byte("Bob"),
		[]byte("Charlie"),
		[]byte("Dick"),
		[]byte("Ed"),
		[]byte("Frank"),
		[]byte("George"),
		[]byte("Harry"),
		[]byte("Ilya"),
		[]byte("John"),
		[]byte("Kevin"),
		[]byte("Larry"),
		[]byte("Michael"),
		[]byte("Nate"),
		[]byte("Owen"),
		[]byte("Paul"),
		[]byte("Quentin"),
	}

	// contents2 is the list of values used to query a filter with
	// MatchAny(); none of them appear in contents.
	contents2 = [][]byte{
		[]byte("Alice"),
		[]byte("Betty"),
		[]byte("Charmaine"),
		[]byte("Donna"),
		[]byte("Edith"),
		[]byte("Faina"),
		[]byte("Georgia"),
		[]byte("Hannah"),
		[]byte("Ilsbeth"),
		[]byte("Jennifer"),
		[]byte("Kayla"),
		[]byte("Lena"),
		[]byte("Michelle"),
		[]byte("Natalie"),
		[]byte("Ophelia"),
		[]byte("Peggy"),
		[]byte("Queenie"),
	}
)
// TestGCSFilterBuild fills the shared key from math/rand and builds the
// shared test filter from contents. For Bitcoin use, deterministic filter
// generation is desired, so a deterministically derived key would be
// required there instead.
func TestGCSFilterBuild(t *testing.T) {
	// Fill the key four bytes at a time.
	for off := 0; off < gcs.KeySize; off += 4 {
		binary.BigEndian.PutUint32(key[off:], rand.Uint32())
	}

	if filter, err = gcs.BuildGCSFilter(P, key, contents); err != nil {
		t.Fatalf("Filter build failed: %s", err.Error())
	}
}
// TestGCSFilterCopy deserializes and serializes a filter to create a copy.
func TestGCSFilterCopy(t *testing.T) {
filter2, err = gcs.FromBytes(filter.N(), P, filter.Bytes())
if err != nil {
t.Fatalf("Filter copy failed: %s", err.Error())
}
}
// TestGCSFilterMetadata verifies that P and N are recorded correctly in the
// built filter and that the copied filter carries the same values.
func TestGCSFilterMetadata(t *testing.T) {
	// Cases are evaluated in order and the first failing one aborts the
	// test.
	switch {
	case filter.P() != P:
		t.Fatal("P not correctly stored in filter metadata")
	case filter.N() != uint32(len(contents)):
		t.Fatal("N not correctly stored in filter metadata")
	case filter.P() != filter2.P():
		t.Fatal("P doesn't match between copied filters")
	case filter.N() != filter2.N():
		t.Fatal("N doesn't match between copied filters")
	}
}
// TestGCSFilterMatch checks that both the built and copied filters match
// correctly, logging any false positives without failing on them.
func TestGCSFilterMatch(t *testing.T) {
match, err := filter.Match(key, []byte("Nate"))
if err != nil {
t.Fatalf("Filter match failed: %s", err.Error())
}
if !match {
t.Fatal("Filter didn't match when it should have!")
}
match, err = filter2.Match(key, []byte("Nate"))
if err != nil {
t.Fatalf("Filter match failed: %s", err.Error())
}
if !match {
t.Fatal("Filter didn't match when it should have!")
}
match, err = filter.Match(key, []byte("Quentin"))
if err != nil {
t.Fatalf("Filter match failed: %s", err.Error())
}
if !match {
t.Fatal("Filter didn't match when it should have!")
}
match, err = filter2.Match(key, []byte("Quentin"))
if err != nil {
t.Fatalf("Filter match failed: %s", err.Error())
}
if !match {
t.Fatal("Filter didn't match when it should have!")
}
match, err = filter.Match(key, []byte("Nates"))
if err != nil {
t.Fatalf("Filter match failed: %s", err.Error())
}
if match {
t.Logf("False positive match, should be 1 in 2**%d!", P)
}
match, err = filter2.Match(key, []byte("Nates"))
if err != nil {
t.Fatalf("Filter match failed: %s", err.Error())
}
if match {
t.Logf("False positive match, should be 1 in 2**%d!", P)
}
match, err = filter.Match(key, []byte("Quentins"))
if err != nil {
t.Fatalf("Filter match failed: %s", err.Error())
}
if match {
t.Logf("False positive match, should be 1 in 2**%d!", P)
}
match, err = filter2.Match(key, []byte("Quentins"))
if err != nil {
t.Fatalf("Filter match failed: %s", err.Error())
}
if match {
t.Logf("False positive match, should be 1 in 2**%d!", P)
}
}
// TestGCSFilterMatchAny checks that both the built and copied filters match
// a list correctly, logging any false positives without failing on them.
func TestGCSFilterMatchAny(t *testing.T) {
match, err := filter.MatchAny(key, contents2)
if err != nil {
t.Fatalf("Filter match any failed: %s", err.Error())
}
if match {
t.Logf("False positive match, should be 1 in 2**%d!", P)
}
match, err = filter2.MatchAny(key, contents2)
if err != nil {
t.Fatalf("Filter match any failed: %s", err.Error())
}
if match {
t.Logf("False positive match, should be 1 in 2**%d!", P)
}
contents2 = append(contents2, []byte("Nate"))
match, err = filter.MatchAny(key, contents2)
if err != nil {
t.Fatalf("Filter match any failed: %s", err.Error())
}
if !match {
t.Fatal("Filter didn't match any when it should have!")
}
match, err = filter2.MatchAny(key, contents2)
if err != nil {
t.Fatalf("Filter match any failed: %s", err.Error())
}
if !match {
t.Fatal("Filter didn't match any when it should have!")
}
}

49
gcs/gcsbench_test.go Normal file
View file

@ -0,0 +1,49 @@
// Copyright (c) 2016-2017 The btcsuite developers
// Copyright (c) 2016-2017 The Lightning Network Developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.
package gcs_test
import (
"encoding/binary"
"math/rand"
"testing"
"github.com/btcsuite/btcutil/gcs"
)
// BenchmarkGCSFilterBuild measures how long it takes to build a filter from
// contents. Filling the key from math/rand is excluded from the timing.
func BenchmarkGCSFilterBuild(b *testing.B) {
	b.StopTimer()
	for off := 0; off < gcs.KeySize; off += 4 {
		binary.BigEndian.PutUint32(key[off:], rand.Uint32())
	}
	b.StartTimer()

	for iter := 0; iter < b.N; iter++ {
		// Only the build time matters here, so the filter and error
		// are deliberately discarded.
		_, _ = gcs.BuildGCSFilter(P, key, contents)
	}
}
// BenchmarkGCSFilterMatch benchmarks querying a filter for a single value,
// once with a value that is in the set and once with one that is not.
// Building the filter is excluded from the timing.
func BenchmarkGCSFilterMatch(b *testing.B) {
	b.StopTimer()
	filter, err = gcs.BuildGCSFilter(P, key, contents)
	if err != nil {
		// Abort immediately: continuing without a usable filter would
		// only crash inside the timed loop.
		b.Fatalf("Failed to build filter: %v", err)
	}
	b.StartTimer()

	for i := 0; i < b.N; i++ {
		// Only the query time matters; results are discarded.
		filter.Match(key, []byte("Nate"))
		filter.Match(key, []byte("Nates"))
	}
}
// BenchmarkGCSFilterMatchAny benchmarks querying a filter for a list of values.
func BenchmarkGCSFilterMatchAny(b *testing.B) {
for i := 0; i < b.N; i++ {
filter.MatchAny(key, contents2)
}
}

26
gcs/uint64slice.go Normal file
View file

@ -0,0 +1,26 @@
// Copyright (c) 2016-2017 The btcsuite developers
// Copyright (c) 2016-2017 The Lightning Network Developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.
package gcs
// uint64Slice is a package-local helper type that implements sort.Interface
// so that a []uint64 can be sorted with Go's sort package.
type uint64Slice []uint64

// Len reports how many elements the slice holds.
func (s uint64Slice) Len() int {
	return len(s)
}

// Less reports whether the element at index i is strictly smaller than the
// element at index j.
func (s uint64Slice) Less(i, j int) bool {
	left, right := s[i], s[j]
	return left < right
}

// Swap exchanges the elements at indices i and j.
func (s uint64Slice) Swap(i, j int) {
	held := s[i]
	s[i] = s[j]
	s[j] = held
}