gcs: add package for building and using Golomb-coded set filters

This commit is contained in:
Alex 2017-01-12 13:25:31 -07:00 committed by Olaoluwa Osuntokun
parent 06f32abe07
commit 3069dcf175
7 changed files with 616 additions and 1 deletions

View file

@ -1,6 +1,7 @@
ISC License
Copyright (c) 2013-2016 The btcsuite developers
Copyright (c) 2013-2017 The btcsuite developers
Copyright (c) 2016-2017 The Lightning Network Developers
Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above

24
gcs/README.md Normal file
View file

@ -0,0 +1,24 @@
base58
==========
[![Build Status](http://img.shields.io/travis/btcsuite/btcutil.svg)]
(https://travis-ci.org/btcsuite/btcutil) [![ISC License]
(http://img.shields.io/badge/license-ISC-blue.svg)](http://copyfree.org)
[![GoDoc](https://godoc.org/github.com/btcsuite/btcutil/gcs?status.png)]
(http://godoc.org/github.com/btcsuite/btcutil/gcs)
Package gcs provides an API for building and using a Golomb-coded set filter
similar to that described [here].(http://giovanni.bajo.it/post/47119962313/golomb-coded-sets-smaller-than-bloom-filters)
A comprehensive suite of tests is provided to ensure proper functionality.
## Installation and Updating
```bash
$ go get -u github.com/btcsuite/btcutil/gcs
```
## License
Package gcs is licensed under the [copyfree](http://copyfree.org) ISC
License.

24
gcs/doc.go Normal file
View file

@ -0,0 +1,24 @@
// Copyright (c) 2016-2017 The btcsuite developers
// Copyright (c) 2016-2017 The Lightning Network Developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.
/*
Package gcs provides an API for building and using a Golomb-coded set filter.
Golomb-Coded Set
A Golomb-coded set is a probabilistic data structure used similarly to a Bloom
filter. A filter uses constant-size overhead plus on average n+2 bits per
item added to the filter, where 2^-1 is the desired false positive (collision)
probability.
GCS use in Bitcoin
GCS filters are a proposed mechanism for storing and transmitting per-block
filters in Bitcoin. The usage is intended to be the inverse of Bloom filters:
a full node would send an SPV node the GCS filter for a block, which the SPV
node would check against its list of relevant items. The suggested collision
probability for Bitcoin use is 2^-20.
*/
package gcs

280
gcs/gcs.go Normal file
View file

@ -0,0 +1,280 @@
// Copyright (c) 2016-2017 The btcsuite developers
// Copyright (c) 2016-2017 The Lightning Network Developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.
package gcs
import (
"fmt"
"io"
"sort"
"github.com/aead/siphash"
"github.com/kkdai/bstream"
)
// Inspired by https://github.com/rasky/gcs
var (
// ErrNTooBig signifies that the filter can't handle N items.
ErrNTooBig = fmt.Errorf("N is too big to fit in uint32")
// ErrPTooBig signifies that the filter can't handle `1/2**P`
// collision probability.
ErrPTooBig = fmt.Errorf("P is too big to fit in uint32")
// ErrNoData signifies that an empty slice was passed.
ErrNoData = fmt.Errorf("No data provided")
)
const (
//KeySize is the size of the byte array required for key material for
// the SipHash keyed hash function.
KeySize = 16
)
// gcsFilter describes an immutable filter that can be built from
// a set of data elements, serialized, deserialized, and queried
// in a thread-safe manner. The serialized form is compressed as
// a Golomb Coded Set (GCS), but does not include N or P to allow
// the user to encode the metadata separately if necessary. The
// hash function used is SipHash, a keyed function; the key used
// in building the filter is required in order to match filter
// values and is not included in the serialized form.
type gcsFilter struct {
n uint32
p uint8
modulusP uint64
modulusNP uint64
filterData []byte
}
// BuildGCSFilter builds a new GCS filter with the collision probability of
// `1/(2**P)`, key `key`, and including every `[]byte` in `data` as a member of
// the set.
func BuildGCSFilter(P uint8, key [KeySize]byte,
data [][]byte) (*gcsFilter, error) {
// Some initial parameter checks: make sure we have data from which to
// build the filter, and make sure our parameters will fit the hash
// function we're using.
if len(data) == 0 {
return nil, ErrNoData
}
if len(data) > ((1 << 32) - 1) {
return nil, ErrNTooBig
}
if P > 32 {
return nil, ErrPTooBig
}
// Create the filter object and insert metadata.
f := gcsFilter{
n: uint32(len(data)),
p: P,
}
f.modulusP = uint64(1 << f.p)
f.modulusNP = uint64(f.n) * f.modulusP
// Build the filter.
var values uint64Slice
b := bstream.NewBStreamWriter(0)
// Insert the hash (modulo N*P) of each data element into a slice and
// sort the slice.
for _, d := range data {
v := siphash.Sum64(d, &key) % f.modulusNP
values = append(values, v)
}
sort.Sort(values)
// Write the sorted list of values into the filter bitstream,
// compressing it using Golomb coding.
var value, lastValue, remainder uint64
for _, v := range values {
// Calculate the difference between this value and the last,
// modulo P.
remainder = (v - lastValue) % f.modulusP
// Calculate the difference between this value and the last,
// divided by P.
value = (v - lastValue - remainder) / f.modulusP
lastValue = v
// Write the P multiple into the bitstream in unary; the
// average should be around 1 (2 bits - 0b10).
for value > 0 {
b.WriteBit(true)
value--
}
b.WriteBit(false)
// Write the remainder as a big-endian integer with enough bits
// to represent the appropriate collision probability.
b.WriteBits(remainder, int(f.p))
}
// Copy the bitstream into the filter object and return the object.
f.filterData = b.Bytes()
return &f, nil
}
// FromBytes deserializes a GCS filter from a known N, P, and serialized
// filter as returned by Bytes().
func FromBytes(N uint32, P uint8, d []byte) (*gcsFilter, error) {
// Basic sanity check.
if P > 32 {
return nil, ErrPTooBig
}
// Create the filter object and insert metadata.
f := &gcsFilter{
n: N,
p: P,
}
f.modulusP = uint64(1 << f.p)
f.modulusNP = uint64(f.n) * f.modulusP
// Copy the filter.
f.filterData = make([]byte, len(d))
copy(f.filterData, d)
return f, nil
}
// Bytes returns the serialized format of the GCS filter, which does not
// include N or P (returned by separate methods) or the key used by SipHash.
func (f *gcsFilter) Bytes() []byte {
filterData := make([]byte, len(f.filterData))
copy(filterData, f.filterData)
return filterData
}
// P returns the filter's collision probability as a negative power of 2 (that
// is, a collision probability of `1/2**20` is represented as 20).
func (f *gcsFilter) P() uint8 {
return f.p
}
// N returns the size of the data set used to build the filter.
func (f *gcsFilter) N() uint32 {
return f.n
}
// Match checks whether a []byte value is likely (within collision
// probability) to be a member of the set represented by the filter.
func (f *gcsFilter) Match(key [KeySize]byte, data []byte) (bool, error) {
// Create a filter bitstream.
filterData := f.Bytes()
b := bstream.NewBStreamReader(filterData)
// Hash our search term with the same parameters as the filter.
term := siphash.Sum64(data, &key) % f.modulusNP
// Go through the search filter and look for the desired value.
var lastValue uint64
for lastValue < term {
// Read the difference between previous and new value
// from bitstream.
value, err := f.readFullUint64(b)
if err != nil {
if err == io.EOF {
return false, nil
}
return false, err
}
// Add the previous value to it.
value += lastValue
if value == term {
return true, nil
}
lastValue = value
}
return false, nil
}
// MatchAny returns checks whether any []byte value is likely (within
// collision probability) to be a member of the set represented by the
// filter faster than calling Match() for each value individually.
func (f *gcsFilter) MatchAny(key [KeySize]byte, data [][]byte) (bool, error) {
// Basic sanity check.
if len(data) == 0 {
return false, ErrNoData
}
// Create a filter bitstream.
filterData := f.Bytes()
b := bstream.NewBStreamReader(filterData)
// Create an uncompressed filter of the search values.
var values uint64Slice
for _, d := range data {
v := siphash.Sum64(d, &key) % f.modulusNP
values = append(values, v)
}
sort.Sort(values)
// Zip down the filters, comparing values until we either run out of
// values to compare in one of the filters or we reach a matching value.
var lastValue1, lastValue2 uint64
lastValue2 = values[0]
i := 1
for lastValue1 != lastValue2 {
// Check which filter to advance to make sure we're comparing
// the right values.
switch {
case lastValue1 > lastValue2:
// Advance filter created from search terms or return
// false if we're at the end because nothing matched.
if i < len(values) {
lastValue2 = values[i]
i++
} else {
return false, nil
}
case lastValue2 > lastValue1:
// Advance filter we're searching or return false if
// we're at the end because nothing matched.
value, err := f.readFullUint64(b)
if err != nil {
if err == io.EOF {
return false, nil
}
return false, err
}
lastValue1 += value
}
}
// If we've made it this far, an element matched between filters so
// we return true.
return true, nil
}
// readFullUint64 reads a value represented by the sum of a unary multiple
// of the filter's P modulus (`2**P`) and a big-endian P-bit remainder.
func (f *gcsFilter) readFullUint64(b *bstream.BStream) (uint64, error) {
var v uint64
// Count the 1s until we reach a 0.
c, err := b.ReadBit()
if err != nil {
return 0, err
}
for c == true {
v++
c, err = b.ReadBit()
if err != nil {
return 0, err
}
}
// Read P bits.
remainder, err := b.ReadBits(int(f.p))
if err != nil {
return 0, err
}
// Add the multiple and the remainder.
v = v*f.modulusP + remainder
return v, nil
}

211
gcs/gcs_test.go Normal file
View file

@ -0,0 +1,211 @@
// Copyright (c) 2016-2017 The btcsuite developers
// Copyright (c) 2016-2017 The Lightning Network Developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.
package gcs_test
import (
"encoding/binary"
"math/rand"
"testing"
"github.com/btcsuite/btcutil/gcs"
)
var (
// No need to allocate an err variable in every test
err error
// Collision probability for the tests (1/2**20)
P = uint8(20)
// Filters are conserved between tests but we must define with an
// interface which functions we're testing because the gcsFilter
// type isn't exported
filter, filter2 interface {
Match([gcs.KeySize]byte, []byte) (bool, error)
MatchAny([gcs.KeySize]byte, [][]byte) (bool, error)
N() uint32
P() uint8
Bytes() []byte
}
// We need to use the same key for building and querying the filters
key [gcs.KeySize]byte
// List of values for building a filter
contents = [][]byte{
[]byte("Alex"),
[]byte("Bob"),
[]byte("Charlie"),
[]byte("Dick"),
[]byte("Ed"),
[]byte("Frank"),
[]byte("George"),
[]byte("Harry"),
[]byte("Ilya"),
[]byte("John"),
[]byte("Kevin"),
[]byte("Larry"),
[]byte("Michael"),
[]byte("Nate"),
[]byte("Owen"),
[]byte("Paul"),
[]byte("Quentin"),
}
// List of values for querying a filter using MatchAny()
contents2 = [][]byte{
[]byte("Alice"),
[]byte("Betty"),
[]byte("Charmaine"),
[]byte("Donna"),
[]byte("Edith"),
[]byte("Faina"),
[]byte("Georgia"),
[]byte("Hannah"),
[]byte("Ilsbeth"),
[]byte("Jennifer"),
[]byte("Kayla"),
[]byte("Lena"),
[]byte("Michelle"),
[]byte("Natalie"),
[]byte("Ophelia"),
[]byte("Peggy"),
[]byte("Queenie"),
}
)
// TestGCSFilterBuild builds a test filter with a randomized key. For Bitcoin
// use, deterministic filter generation is desired. Therefore, a
// key that's derived deterministically would be required.
func TestGCSFilterBuild(t *testing.T) {
for i := 0; i < gcs.KeySize; i += 4 {
binary.BigEndian.PutUint32(key[i:], rand.Uint32())
}
filter, err = gcs.BuildGCSFilter(P, key, contents)
if err != nil {
t.Fatalf("Filter build failed: %s", err.Error())
}
}
// TestGCSFilterCopy deserializes and serializes a filter to create a copy.
func TestGCSFilterCopy(t *testing.T) {
filter2, err = gcs.FromBytes(filter.N(), P, filter.Bytes())
if err != nil {
t.Fatalf("Filter copy failed: %s", err.Error())
}
}
// TestGCSFilterMetadata checks that the filter metadata is built and
// copied correctly.
func TestGCSFilterMetadata(t *testing.T) {
if filter.P() != P {
t.Fatal("P not correctly stored in filter metadata")
}
if filter.N() != uint32(len(contents)) {
t.Fatal("N not correctly stored in filter metadata")
}
if filter.P() != filter2.P() {
t.Fatal("P doesn't match between copied filters")
}
if filter.N() != filter2.N() {
t.Fatal("N doesn't match between copied filters")
}
}
// TestGCSFilterMatch checks that both the built and copied filters match
// correctly, logging any false positives without failing on them.
func TestGCSFilterMatch(t *testing.T) {
match, err := filter.Match(key, []byte("Nate"))
if err != nil {
t.Fatalf("Filter match failed: %s", err.Error())
}
if !match {
t.Fatal("Filter didn't match when it should have!")
}
match, err = filter2.Match(key, []byte("Nate"))
if err != nil {
t.Fatalf("Filter match failed: %s", err.Error())
}
if !match {
t.Fatal("Filter didn't match when it should have!")
}
match, err = filter.Match(key, []byte("Quentin"))
if err != nil {
t.Fatalf("Filter match failed: %s", err.Error())
}
if !match {
t.Fatal("Filter didn't match when it should have!")
}
match, err = filter2.Match(key, []byte("Quentin"))
if err != nil {
t.Fatalf("Filter match failed: %s", err.Error())
}
if !match {
t.Fatal("Filter didn't match when it should have!")
}
match, err = filter.Match(key, []byte("Nates"))
if err != nil {
t.Fatalf("Filter match failed: %s", err.Error())
}
if match {
t.Logf("False positive match, should be 1 in 2**%d!", P)
}
match, err = filter2.Match(key, []byte("Nates"))
if err != nil {
t.Fatalf("Filter match failed: %s", err.Error())
}
if match {
t.Logf("False positive match, should be 1 in 2**%d!", P)
}
match, err = filter.Match(key, []byte("Quentins"))
if err != nil {
t.Fatalf("Filter match failed: %s", err.Error())
}
if match {
t.Logf("False positive match, should be 1 in 2**%d!", P)
}
match, err = filter2.Match(key, []byte("Quentins"))
if err != nil {
t.Fatalf("Filter match failed: %s", err.Error())
}
if match {
t.Logf("False positive match, should be 1 in 2**%d!", P)
}
}
// TestGCSFilterMatchAny checks that both the built and copied filters match
// a list correctly, logging any false positives without failing on them.
func TestGCSFilterMatchAny(t *testing.T) {
match, err := filter.MatchAny(key, contents2)
if err != nil {
t.Fatalf("Filter match any failed: %s", err.Error())
}
if match {
t.Logf("False positive match, should be 1 in 2**%d!", P)
}
match, err = filter2.MatchAny(key, contents2)
if err != nil {
t.Fatalf("Filter match any failed: %s", err.Error())
}
if match {
t.Logf("False positive match, should be 1 in 2**%d!", P)
}
contents2 = append(contents2, []byte("Nate"))
match, err = filter.MatchAny(key, contents2)
if err != nil {
t.Fatalf("Filter match any failed: %s", err.Error())
}
if !match {
t.Fatal("Filter didn't match any when it should have!")
}
match, err = filter2.MatchAny(key, contents2)
if err != nil {
t.Fatalf("Filter match any failed: %s", err.Error())
}
if !match {
t.Fatal("Filter didn't match any when it should have!")
}
}

49
gcs/gcsbench_test.go Normal file
View file

@ -0,0 +1,49 @@
// Copyright (c) 2016-2017 The btcsuite developers
// Copyright (c) 2016-2017 The Lightning Network Developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.
package gcs_test
import (
"encoding/binary"
"math/rand"
"testing"
"github.com/btcsuite/btcutil/gcs"
)
// BenchmarkGCSFilterBuild benchmarks building a filter.
func BenchmarkGCSFilterBuild(b *testing.B) {
b.StopTimer()
for i := 0; i < gcs.KeySize; i += 4 {
binary.BigEndian.PutUint32(key[i:], rand.Uint32())
}
b.StartTimer()
for i := 0; i < b.N; i++ {
gcs.BuildGCSFilter(P, key, contents)
}
}
// BenchmarkGCSFilterMatch benchmarks querying a filter for a single value.
func BenchmarkGCSFilterMatch(b *testing.B) {
b.StopTimer()
filter, err = gcs.BuildGCSFilter(P, key, contents)
if err != nil {
b.Errorf("Failed to build filter")
}
b.StartTimer()
for i := 0; i < b.N; i++ {
filter.Match(key, []byte("Nate"))
filter.Match(key, []byte("Nates"))
}
}
// BenchmarkGCSFilterMatchAny benchmarks querying a filter for a list of values.
func BenchmarkGCSFilterMatchAny(b *testing.B) {
for i := 0; i < b.N; i++ {
filter.MatchAny(key, contents2)
}
}

26
gcs/uint64slice.go Normal file
View file

@ -0,0 +1,26 @@
// Copyright (c) 2016-2017 The btcsuite developers
// Copyright (c) 2016-2017 The Lightning Network Developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.
package gcs
// uint64slice is a package-local utility class that allows us to use Go's
// sort package to sort a []uint64 by implementing sort.Interface.
type uint64Slice []uint64
// Len returns the length of the slice.
func (p uint64Slice) Len() int {
return len(p)
}
// Less returns true when the ith element is smaller than the jth element
// of the slice, and returns false otherwise.
func (p uint64Slice) Less(i, j int) bool {
return p[i] < p[j]
}
// Swap swaps two slice elements.
func (p uint64Slice) Swap(i, j int) {
p[i], p[j] = p[j], p[i]
}