From 3069dcf17578c35bd94871035f4f14c77f2594d6 Mon Sep 17 00:00:00 2001
From: Alex
Date: Thu, 12 Jan 2017 13:25:31 -0700
Subject: [PATCH] gcs: add package for building and using Golomb-coded set filters

---
 LICENSE              |   3 +-
 gcs/README.md        |  24 ++++
 gcs/doc.go           |  24 ++++
 gcs/gcs.go           | 280 +++++++++++++++++++++++++++++++++++++++++++
 gcs/gcs_test.go      | 211 ++++++++++++++++++++++++++++++++
 gcs/gcsbench_test.go |  49 ++++++++
 gcs/uint64slice.go   |  26 ++++
 7 files changed, 616 insertions(+), 1 deletion(-)
 create mode 100644 gcs/README.md
 create mode 100644 gcs/doc.go
 create mode 100644 gcs/gcs.go
 create mode 100644 gcs/gcs_test.go
 create mode 100644 gcs/gcsbench_test.go
 create mode 100644 gcs/uint64slice.go

diff --git a/LICENSE b/LICENSE
index 49de919..3e7b167 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,7 @@
 ISC License
 
-Copyright (c) 2013-2016 The btcsuite developers
+Copyright (c) 2013-2017 The btcsuite developers
+Copyright (c) 2016-2017 The Lightning Network Developers
 
 Permission to use, copy, modify, and distribute this software for any
 purpose with or without fee is hereby granted, provided that the above
diff --git a/gcs/README.md b/gcs/README.md
new file mode 100644
index 0000000..1e7582b
--- /dev/null
+++ b/gcs/README.md
@@ -0,0 +1,24 @@
+gcs
+==========
+
+[![Build Status](http://img.shields.io/travis/btcsuite/btcutil.svg)]
+(https://travis-ci.org/btcsuite/btcutil) [![ISC License]
+(http://img.shields.io/badge/license-ISC-blue.svg)](http://copyfree.org)
+[![GoDoc](https://godoc.org/github.com/btcsuite/btcutil/gcs?status.png)]
+(http://godoc.org/github.com/btcsuite/btcutil/gcs)
+
+Package gcs provides an API for building and using a Golomb-coded set filter
+similar to that described [here](http://giovanni.bajo.it/post/47119962313/golomb-coded-sets-smaller-than-bloom-filters).
+
+A comprehensive suite of tests is provided to ensure proper functionality.
+
+## Installation and Updating
+
+```bash
+$ go get -u github.com/btcsuite/btcutil/gcs
+```
+
+## License
+
+Package gcs is licensed under the [copyfree](http://copyfree.org) ISC
+License.
diff --git a/gcs/doc.go b/gcs/doc.go
new file mode 100644
index 0000000..2056b4d
--- /dev/null
+++ b/gcs/doc.go
@@ -0,0 +1,24 @@
+// Copyright (c) 2016-2017 The btcsuite developers
+// Copyright (c) 2016-2017 The Lightning Network Developers
+// Use of this source code is governed by an ISC
+// license that can be found in the LICENSE file.
+
+/*
+Package gcs provides an API for building and using a Golomb-coded set filter.
+
+Golomb-Coded Set
+
+A Golomb-coded set is a probabilistic data structure used similarly to a Bloom
+filter. A filter uses constant-size overhead plus on average P+2 bits per
+item added to the filter, where 2^-P is the desired false positive (collision)
+probability.
+
+GCS use in Bitcoin
+
+GCS filters are a proposed mechanism for storing and transmitting per-block
+filters in Bitcoin. The usage is intended to be the inverse of Bloom filters:
+a full node would send an SPV node the GCS filter for a block, which the SPV
+node would check against its list of relevant items. The suggested collision
+probability for Bitcoin use is 2^-20.
+*/
+package gcs
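As a rough illustration of the space claim in doc.go above (this sketch and its figures are not part of the patch, and the item count is made up): at the suggested Bitcoin parameter P=20, a filter costs about P+2 = 22 bits per item.

```go
package main

import "fmt"

func main() {
	// Illustrative only: approximate size of a GCS filter at the
	// Bitcoin-suggested false-positive rate of 2^-20, for a hypothetical
	// 10,000-item per-block filter.
	const P = 20
	const N = 10000
	bits := N * (P + 2) // ~P+2 bits per item on average
	fmt.Printf("~%d bits (~%.1f KiB) for %d items at P=%d\n",
		bits, float64(bits)/8/1024, N, P)
	// For comparison, a Bloom filter at the same false-positive rate needs
	// roughly 1.44*P bits per item, i.e. about 30% more space at P=20.
}
```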
diff --git a/gcs/gcs.go b/gcs/gcs.go
new file mode 100644
index 0000000..817af80
--- /dev/null
+++ b/gcs/gcs.go
@@ -0,0 +1,280 @@
+// Copyright (c) 2016-2017 The btcsuite developers
+// Copyright (c) 2016-2017 The Lightning Network Developers
+// Use of this source code is governed by an ISC
+// license that can be found in the LICENSE file.
+
+package gcs
+
+import (
+	"fmt"
+	"io"
+	"sort"
+
+	"github.com/aead/siphash"
+	"github.com/kkdai/bstream"
+)
+
+// Inspired by https://github.com/rasky/gcs
+
+var (
+	// ErrNTooBig signifies that the filter can't handle N items.
+	ErrNTooBig = fmt.Errorf("N is too big to fit in uint32")
+
+	// ErrPTooBig signifies that the filter can't handle `1/2**P`
+	// collision probability.
+	ErrPTooBig = fmt.Errorf("P is too big to fit in uint32")
+
+	// ErrNoData signifies that an empty slice was passed.
+	ErrNoData = fmt.Errorf("No data provided")
+)
+
+const (
+	// KeySize is the size of the byte array required for key material for
+	// the SipHash keyed hash function.
+	KeySize = 16
+)
+
+// gcsFilter describes an immutable filter that can be built from
+// a set of data elements, serialized, deserialized, and queried
+// in a thread-safe manner. The serialized form is compressed as
+// a Golomb Coded Set (GCS), but does not include N or P to allow
+// the user to encode the metadata separately if necessary. The
+// hash function used is SipHash, a keyed function; the key used
+// in building the filter is required in order to match filter
+// values and is not included in the serialized form.
+type gcsFilter struct {
+	n          uint32
+	p          uint8
+	modulusP   uint64
+	modulusNP  uint64
+	filterData []byte
+}
+
+// BuildGCSFilter builds a new GCS filter with the collision probability of
+// `1/(2**P)`, key `key`, and including every `[]byte` in `data` as a member of
+// the set.
+func BuildGCSFilter(P uint8, key [KeySize]byte,
+	data [][]byte) (*gcsFilter, error) {
+
+	// Some initial parameter checks: make sure we have data from which to
+	// build the filter, and make sure our parameters will fit the hash
+	// function we're using.
+	if len(data) == 0 {
+		return nil, ErrNoData
+	}
+	if len(data) > ((1 << 32) - 1) {
+		return nil, ErrNTooBig
+	}
+	if P > 32 {
+		return nil, ErrPTooBig
+	}
+
+	// Create the filter object and insert metadata.
+	f := gcsFilter{
+		n: uint32(len(data)),
+		p: P,
+	}
+	f.modulusP = uint64(1 << f.p)
+	f.modulusNP = uint64(f.n) * f.modulusP
+
+	// Build the filter.
+	var values uint64Slice
+	b := bstream.NewBStreamWriter(0)
+
+	// Insert the hash (modulo N*P) of each data element into a slice and
+	// sort the slice.
+	for _, d := range data {
+		v := siphash.Sum64(d, &key) % f.modulusNP
+		values = append(values, v)
+	}
+	sort.Sort(values)
+
+	// Write the sorted list of values into the filter bitstream,
+	// compressing it using Golomb coding.
+	var value, lastValue, remainder uint64
+	for _, v := range values {
+		// Calculate the difference between this value and the last,
+		// modulo P.
+		remainder = (v - lastValue) % f.modulusP
+		// Calculate the difference between this value and the last,
+		// divided by P.
+		value = (v - lastValue - remainder) / f.modulusP
+		lastValue = v
+		// Write the P multiple into the bitstream in unary; the
+		// average should be around 1 (2 bits - 0b10).
+		for value > 0 {
+			b.WriteBit(true)
+			value--
+		}
+		b.WriteBit(false)
+		// Write the remainder as a big-endian integer with enough bits
+		// to represent the appropriate collision probability.
+		b.WriteBits(remainder, int(f.p))
+	}
+
+	// Copy the bitstream into the filter object and return the object.
+	f.filterData = b.Bytes()
+	return &f, nil
+}
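The loop above is standard Golomb-Rice coding of the deltas between consecutive sorted hash values. A minimal standalone sketch of how one delta splits into a unary quotient and a P-bit remainder (illustrative only and not part of the patch; the filter itself packs these bits with the bstream writer used above):

```go
package main

import (
	"fmt"
	"strings"
)

// riceEncode renders the Golomb-Rice code of a single delta as a bit string:
// the quotient delta>>P in unary (that many 1s followed by a 0), then the low
// P bits of the delta in big-endian order.
func riceEncode(delta uint64, p uint8) string {
	q := delta >> p                 // unary part
	r := delta & (uint64(1)<<p - 1) // P-bit remainder
	return strings.Repeat("1", int(q)) + "0" + fmt.Sprintf("%0*b", int(p), r)
}

func main() {
	fmt.Println(riceEncode(5, 3))  // quotient 0, remainder 101 -> "0101"
	fmt.Println(riceEncode(17, 3)) // quotient 2, remainder 001 -> "110001"
}
```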
+
+// FromBytes deserializes a GCS filter from a known N, P, and serialized
+// filter as returned by Bytes().
+func FromBytes(N uint32, P uint8, d []byte) (*gcsFilter, error) {
+
+	// Basic sanity check.
+	if P > 32 {
+		return nil, ErrPTooBig
+	}
+
+	// Create the filter object and insert metadata.
+	f := &gcsFilter{
+		n: N,
+		p: P,
+	}
+	f.modulusP = uint64(1 << f.p)
+	f.modulusNP = uint64(f.n) * f.modulusP
+
+	// Copy the filter.
+	f.filterData = make([]byte, len(d))
+	copy(f.filterData, d)
+	return f, nil
+}
+
+// Bytes returns the serialized format of the GCS filter, which does not
+// include N or P (returned by separate methods) or the key used by SipHash.
+func (f *gcsFilter) Bytes() []byte {
+	filterData := make([]byte, len(f.filterData))
+	copy(filterData, f.filterData)
+	return filterData
+}
+
+// P returns the filter's collision probability as a negative power of 2 (that
+// is, a collision probability of `1/2**20` is represented as 20).
+func (f *gcsFilter) P() uint8 {
+	return f.p
+}
+
+// N returns the size of the data set used to build the filter.
+func (f *gcsFilter) N() uint32 {
+	return f.n
+}
+
+// Match checks whether a []byte value is likely (within collision
+// probability) to be a member of the set represented by the filter.
+func (f *gcsFilter) Match(key [KeySize]byte, data []byte) (bool, error) {
+
+	// Create a filter bitstream.
+	filterData := f.Bytes()
+	b := bstream.NewBStreamReader(filterData)
+
+	// Hash our search term with the same parameters as the filter.
+	term := siphash.Sum64(data, &key) % f.modulusNP
+
+	// Go through the search filter and look for the desired value.
+	var lastValue uint64
+	for lastValue < term {
+		// Read the difference between previous and new value
+		// from bitstream.
+		value, err := f.readFullUint64(b)
+		if err != nil {
+			if err == io.EOF {
+				return false, nil
+			}
+			return false, err
+		}
+		// Add the previous value to it.
+		value += lastValue
+		if value == term {
+			return true, nil
+		}
+		lastValue = value
+	}
+	return false, nil
+}
+
+// MatchAny checks whether any []byte value is likely (within collision
+// probability) to be a member of the set represented by the filter, faster
+// than calling Match() for each value individually.
+func (f *gcsFilter) MatchAny(key [KeySize]byte, data [][]byte) (bool, error) {
+
+	// Basic sanity check.
+	if len(data) == 0 {
+		return false, ErrNoData
+	}
+
+	// Create a filter bitstream.
+	filterData := f.Bytes()
+	b := bstream.NewBStreamReader(filterData)
+
+	// Create an uncompressed filter of the search values.
+	var values uint64Slice
+	for _, d := range data {
+		v := siphash.Sum64(d, &key) % f.modulusNP
+		values = append(values, v)
+	}
+	sort.Sort(values)
+
+	// Zip down the filters, comparing values until we either run out of
+	// values to compare in one of the filters or we reach a matching value.
+	var lastValue1, lastValue2 uint64
+	lastValue2 = values[0]
+	i := 1
+	for lastValue1 != lastValue2 {
+		// Check which filter to advance to make sure we're comparing
+		// the right values.
+		switch {
+		case lastValue1 > lastValue2:
+			// Advance filter created from search terms or return
+			// false if we're at the end because nothing matched.
+			if i < len(values) {
+				lastValue2 = values[i]
+				i++
+			} else {
+				return false, nil
+			}
+		case lastValue2 > lastValue1:
+			// Advance filter we're searching or return false if
+			// we're at the end because nothing matched.
+			value, err := f.readFullUint64(b)
+			if err != nil {
+				if err == io.EOF {
+					return false, nil
+				}
+				return false, err
+			}
+			lastValue1 += value
+		}
+	}
+	// If we've made it this far, an element matched between filters so
+	// we return true.
+	return true, nil
+}
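MatchAny decodes the filter only once and zips it against the sorted query hashes, so its cost is one pass over the filter plus sorting the queries, rather than one full decode per query as repeated Match() calls would require. The core of that comparison, shown on two plain sorted slices as a standalone illustration (not part of the patch):

```go
package main

import "fmt"

// anyCommon reports whether two ascending sorted slices share an element by
// always advancing whichever side is behind -- the same zip-merge MatchAny
// performs against the values decoded from the filter bitstream.
func anyCommon(a, b []uint64) bool {
	i, j := 0, 0
	for i < len(a) && j < len(b) {
		switch {
		case a[i] == b[j]:
			return true
		case a[i] < b[j]:
			i++
		default:
			j++
		}
	}
	return false
}

func main() {
	fmt.Println(anyCommon([]uint64{2, 5, 9}, []uint64{1, 4, 9})) // true
	fmt.Println(anyCommon([]uint64{2, 5, 9}, []uint64{1, 4, 8})) // false
}
```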
+
+// readFullUint64 reads a value represented by the sum of a unary multiple
+// of the filter's P modulus (`2**P`) and a big-endian P-bit remainder.
+func (f *gcsFilter) readFullUint64(b *bstream.BStream) (uint64, error) {
+	var v uint64
+
+	// Count the 1s until we reach a 0.
+	c, err := b.ReadBit()
+	if err != nil {
+		return 0, err
+	}
+	for c {
+		v++
+		c, err = b.ReadBit()
+		if err != nil {
+			return 0, err
+		}
+	}
+
+	// Read P bits.
+	remainder, err := b.ReadBits(int(f.p))
+	if err != nil {
+		return 0, err
+	}
+
+	// Add the multiple and the remainder.
+	v = v*f.modulusP + remainder
+	return v, nil
+}
diff --git a/gcs/gcs_test.go b/gcs/gcs_test.go
new file mode 100644
index 0000000..f4cbb29
--- /dev/null
+++ b/gcs/gcs_test.go
@@ -0,0 +1,211 @@
+// Copyright (c) 2016-2017 The btcsuite developers
+// Copyright (c) 2016-2017 The Lightning Network Developers
+// Use of this source code is governed by an ISC
+// license that can be found in the LICENSE file.
+
+package gcs_test
+
+import (
+	"encoding/binary"
+	"math/rand"
+	"testing"
+
+	"github.com/btcsuite/btcutil/gcs"
+)
+
+var (
+	// No need to allocate an err variable in every test
+	err error
+
+	// Collision probability for the tests (1/2**20)
+	P = uint8(20)
+
+	// Filters are conserved between tests but we must define with an
+	// interface which functions we're testing because the gcsFilter
+	// type isn't exported
+	filter, filter2 interface {
+		Match([gcs.KeySize]byte, []byte) (bool, error)
+		MatchAny([gcs.KeySize]byte, [][]byte) (bool, error)
+		N() uint32
+		P() uint8
+		Bytes() []byte
+	}
+
+	// We need to use the same key for building and querying the filters
+	key [gcs.KeySize]byte
+
+	// List of values for building a filter
+	contents = [][]byte{
+		[]byte("Alex"),
+		[]byte("Bob"),
+		[]byte("Charlie"),
+		[]byte("Dick"),
+		[]byte("Ed"),
+		[]byte("Frank"),
+		[]byte("George"),
+		[]byte("Harry"),
+		[]byte("Ilya"),
+		[]byte("John"),
+		[]byte("Kevin"),
+		[]byte("Larry"),
+		[]byte("Michael"),
+		[]byte("Nate"),
+		[]byte("Owen"),
+		[]byte("Paul"),
+		[]byte("Quentin"),
+	}
+
+	// List of values for querying a filter using MatchAny()
+	contents2 = [][]byte{
+		[]byte("Alice"),
+		[]byte("Betty"),
+		[]byte("Charmaine"),
+		[]byte("Donna"),
+		[]byte("Edith"),
+		[]byte("Faina"),
+		[]byte("Georgia"),
+		[]byte("Hannah"),
+		[]byte("Ilsbeth"),
+		[]byte("Jennifer"),
+		[]byte("Kayla"),
+		[]byte("Lena"),
+		[]byte("Michelle"),
+		[]byte("Natalie"),
+		[]byte("Ophelia"),
+		[]byte("Peggy"),
+		[]byte("Queenie"),
+	}
+)
+
+// TestGCSFilterBuild builds a test filter with a randomized key. For Bitcoin
+// use, deterministic filter generation is desired. Therefore, a
+// key that's derived deterministically would be required.
+func TestGCSFilterBuild(t *testing.T) {
+	for i := 0; i < gcs.KeySize; i += 4 {
+		binary.BigEndian.PutUint32(key[i:], rand.Uint32())
+	}
+	filter, err = gcs.BuildGCSFilter(P, key, contents)
+	if err != nil {
+		t.Fatalf("Filter build failed: %s", err.Error())
+	}
+}
+
+// TestGCSFilterCopy deserializes and serializes a filter to create a copy.
+func TestGCSFilterCopy(t *testing.T) {
+	filter2, err = gcs.FromBytes(filter.N(), P, filter.Bytes())
+	if err != nil {
+		t.Fatalf("Filter copy failed: %s", err.Error())
+	}
+}
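TestGCSFilterBuild above seeds the SipHash key from math/rand; as its comment notes, real Bitcoin usage would want the key derived deterministically so every node builds an identical filter for a given block. One possible sketch of such a derivation (the rule used here, taking the first 16 bytes of a SHA-256 of the block hash, is purely an assumption for illustration and is not specified by this package or this patch):

```go
package main

import (
	"crypto/sha256"
	"fmt"
)

// deriveKey sketches one way a 16-byte (gcs.KeySize) SipHash key could be
// derived deterministically from per-block data such as a block hash. The
// exact rule is a placeholder; any fixed, consensus-agreed derivation works.
func deriveKey(blockHash []byte) [16]byte {
	var key [16]byte
	sum := sha256.Sum256(blockHash)
	copy(key[:], sum[:16])
	return key
}

func main() {
	key := deriveKey([]byte("placeholder block hash"))
	fmt.Printf("derived key: %x\n", key)
}
```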
+
+// TestGCSFilterMetadata checks that the filter metadata is built and
+// copied correctly.
+func TestGCSFilterMetadata(t *testing.T) {
+	if filter.P() != P {
+		t.Fatal("P not correctly stored in filter metadata")
+	}
+	if filter.N() != uint32(len(contents)) {
+		t.Fatal("N not correctly stored in filter metadata")
+	}
+	if filter.P() != filter2.P() {
+		t.Fatal("P doesn't match between copied filters")
+	}
+	if filter.N() != filter2.N() {
+		t.Fatal("N doesn't match between copied filters")
+	}
+}
+
+// TestGCSFilterMatch checks that both the built and copied filters match
+// correctly, logging any false positives without failing on them.
+func TestGCSFilterMatch(t *testing.T) {
+	match, err := filter.Match(key, []byte("Nate"))
+	if err != nil {
+		t.Fatalf("Filter match failed: %s", err.Error())
+	}
+	if !match {
+		t.Fatal("Filter didn't match when it should have!")
+	}
+	match, err = filter2.Match(key, []byte("Nate"))
+	if err != nil {
+		t.Fatalf("Filter match failed: %s", err.Error())
+	}
+	if !match {
+		t.Fatal("Filter didn't match when it should have!")
+	}
+	match, err = filter.Match(key, []byte("Quentin"))
+	if err != nil {
+		t.Fatalf("Filter match failed: %s", err.Error())
+	}
+	if !match {
+		t.Fatal("Filter didn't match when it should have!")
+	}
+	match, err = filter2.Match(key, []byte("Quentin"))
+	if err != nil {
+		t.Fatalf("Filter match failed: %s", err.Error())
+	}
+	if !match {
+		t.Fatal("Filter didn't match when it should have!")
+	}
+	match, err = filter.Match(key, []byte("Nates"))
+	if err != nil {
+		t.Fatalf("Filter match failed: %s", err.Error())
+	}
+	if match {
+		t.Logf("False positive match, should be 1 in 2**%d!", P)
+	}
+	match, err = filter2.Match(key, []byte("Nates"))
+	if err != nil {
+		t.Fatalf("Filter match failed: %s", err.Error())
+	}
+	if match {
+		t.Logf("False positive match, should be 1 in 2**%d!", P)
+	}
+	match, err = filter.Match(key, []byte("Quentins"))
+	if err != nil {
+		t.Fatalf("Filter match failed: %s", err.Error())
+	}
+	if match {
+		t.Logf("False positive match, should be 1 in 2**%d!", P)
+	}
+	match, err = filter2.Match(key, []byte("Quentins"))
+	if err != nil {
+		t.Fatalf("Filter match failed: %s", err.Error())
+	}
+	if match {
+		t.Logf("False positive match, should be 1 in 2**%d!", P)
+	}
+}
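For context on the Logf branches above and in the MatchAny test below: with P = 20, a single negative probe has a false-positive chance of 2^-20, roughly 9.5e-7, so across the two non-member names probed here and the seventeen non-member names fed to MatchAny next, the expected number of logged false positives is on the order of (2 + 17) * 2^-20, about 2e-5. The log lines should essentially never appear in practice.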
+
+// TestGCSFilterMatchAny checks that both the built and copied filters match
+// a list correctly, logging any false positives without failing on them.
+func TestGCSFilterMatchAny(t *testing.T) {
+	match, err := filter.MatchAny(key, contents2)
+	if err != nil {
+		t.Fatalf("Filter match any failed: %s", err.Error())
+	}
+	if match {
+		t.Logf("False positive match, should be 1 in 2**%d!", P)
+	}
+	match, err = filter2.MatchAny(key, contents2)
+	if err != nil {
+		t.Fatalf("Filter match any failed: %s", err.Error())
+	}
+	if match {
+		t.Logf("False positive match, should be 1 in 2**%d!", P)
+	}
+	contents2 = append(contents2, []byte("Nate"))
+	match, err = filter.MatchAny(key, contents2)
+	if err != nil {
+		t.Fatalf("Filter match any failed: %s", err.Error())
+	}
+	if !match {
+		t.Fatal("Filter didn't match any when it should have!")
+	}
+	match, err = filter2.MatchAny(key, contents2)
+	if err != nil {
+		t.Fatalf("Filter match any failed: %s", err.Error())
+	}
+	if !match {
+		t.Fatal("Filter didn't match any when it should have!")
+	}
+}
diff --git a/gcs/gcsbench_test.go b/gcs/gcsbench_test.go
new file mode 100644
index 0000000..bd22246
--- /dev/null
+++ b/gcs/gcsbench_test.go
@@ -0,0 +1,49 @@
+// Copyright (c) 2016-2017 The btcsuite developers
+// Copyright (c) 2016-2017 The Lightning Network Developers
+// Use of this source code is governed by an ISC
+// license that can be found in the LICENSE file.
+
+package gcs_test
+
+import (
+	"encoding/binary"
+	"math/rand"
+	"testing"
+
+	"github.com/btcsuite/btcutil/gcs"
+)
+
+// BenchmarkGCSFilterBuild benchmarks building a filter.
+func BenchmarkGCSFilterBuild(b *testing.B) {
+	b.StopTimer()
+	for i := 0; i < gcs.KeySize; i += 4 {
+		binary.BigEndian.PutUint32(key[i:], rand.Uint32())
+	}
+	b.StartTimer()
+
+	for i := 0; i < b.N; i++ {
+		gcs.BuildGCSFilter(P, key, contents)
+	}
+}
+
+// BenchmarkGCSFilterMatch benchmarks querying a filter for a single value.
+func BenchmarkGCSFilterMatch(b *testing.B) {
+	b.StopTimer()
+	filter, err = gcs.BuildGCSFilter(P, key, contents)
+	if err != nil {
+		b.Errorf("Failed to build filter")
+	}
+	b.StartTimer()
+
+	for i := 0; i < b.N; i++ {
+		filter.Match(key, []byte("Nate"))
+		filter.Match(key, []byte("Nates"))
+	}
+}
+
+// BenchmarkGCSFilterMatchAny benchmarks querying a filter for a list of values.
+func BenchmarkGCSFilterMatchAny(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		filter.MatchAny(key, contents2)
+	}
+}
diff --git a/gcs/uint64slice.go b/gcs/uint64slice.go
new file mode 100644
index 0000000..9629701
--- /dev/null
+++ b/gcs/uint64slice.go
@@ -0,0 +1,26 @@
+// Copyright (c) 2016-2017 The btcsuite developers
+// Copyright (c) 2016-2017 The Lightning Network Developers
+// Use of this source code is governed by an ISC
+// license that can be found in the LICENSE file.
+
+package gcs
+
+// uint64Slice is a package-local utility type that allows us to use Go's
+// sort package to sort a []uint64 by implementing sort.Interface.
+type uint64Slice []uint64
+
+// Len returns the length of the slice.
+func (p uint64Slice) Len() int {
+	return len(p)
+}
+
+// Less returns true when the ith element is smaller than the jth element
+// of the slice, and returns false otherwise.
+func (p uint64Slice) Less(i, j int) bool {
+	return p[i] < p[j]
+}
+
+// Swap swaps two slice elements.
+func (p uint64Slice) Swap(i, j int) {
+	p[i], p[j] = p[j], p[i]
+}
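Finally, a sketch of how a caller outside the package might use the API added by this patch end to end (hypothetical caller code, not part of the patch; the shared all-zero key is for illustration only, and in practice builder and querier would agree on the key out of band):

```go
package main

import (
	"fmt"
	"log"

	"github.com/btcsuite/btcutil/gcs"
)

func main() {
	// A key shared by the party building the filter and the party
	// querying it; all zeroes here purely for illustration.
	var key [gcs.KeySize]byte

	contents := [][]byte{[]byte("alpha"), []byte("beta"), []byte("gamma")}

	// Build a filter with a 2^-20 collision probability.
	filter, err := gcs.BuildGCSFilter(20, key, contents)
	if err != nil {
		log.Fatal(err)
	}

	// Serialize it; N and P travel out of band alongside the raw bytes.
	n, p, raw := filter.N(), filter.P(), filter.Bytes()

	// Reconstruct on the receiving side and query it.
	remote, err := gcs.FromBytes(n, p, raw)
	if err != nil {
		log.Fatal(err)
	}
	hit, err := remote.Match(key, []byte("beta"))
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("beta in set:", hit) // true; members are never missed
}
```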