From d7f97ab7504ceca5dac3a1198f59ccebc7b0e920 Mon Sep 17 00:00:00 2001
From: Brannon King
Date: Wed, 7 Jul 2021 16:46:22 -0400
Subject: [PATCH] initial sketch and test of faster trie

use custom search

formatted
---
 claimtrie/change/change.go               |   2 +-
 claimtrie/merkletrie/prefix_trie.go      | 272 +++++++++++++++++++++++
 claimtrie/merkletrie/prefix_trie_test.go | 148 +++++++++++++
 claimtrie/node/claim.go                  |   2 +-
 4 files changed, 422 insertions(+), 2 deletions(-)
 create mode 100644 claimtrie/merkletrie/prefix_trie.go
 create mode 100644 claimtrie/merkletrie/prefix_trie_test.go

diff --git a/claimtrie/change/change.go b/claimtrie/change/change.go
index 33e1d052..9a029271 100644
--- a/claimtrie/change/change.go
+++ b/claimtrie/change/change.go
@@ -15,7 +15,7 @@ type Change struct {
 	Height int32
 
 	Name     []byte
-	ClaimID  string
+	ClaimID  string // TODO: can we store this and OutPoint as bytes?
 	OutPoint string
 	Amount   int64
 	Value    []byte
diff --git a/claimtrie/merkletrie/prefix_trie.go b/claimtrie/merkletrie/prefix_trie.go
new file mode 100644
index 00000000..ceabb180
--- /dev/null
+++ b/claimtrie/merkletrie/prefix_trie.go
@@ -0,0 +1,272 @@
+package merkletrie
+
+import (
+	"github.com/lbryio/chain/chaincfg/chainhash"
+)
+
+// KeyType is the byte-string key (or key fragment) stored in the trie.
+type KeyType []byte
+
+// PrefixTrieNode is one node of the compressed prefix trie. Its key holds only
+// the fragment contributed by this node, and its children are kept sorted by
+// the first byte of their keys.
+type PrefixTrieNode struct {
+	children  []*PrefixTrieNode
+	key       KeyType
+	hash      *chainhash.Hash
+	hasClaims bool
+}
+
+// insertAt inserts v into data at index i and returns the new slice.
+// https://stackoverflow.com/questions/42746972/golang-insert-to-a-sorted-slice
+func insertAt(data []*PrefixTrieNode, i int, v *PrefixTrieNode) []*PrefixTrieNode {
+	if i == len(data) {
+		// Insert at end is the easy case.
+		return append(data, v)
+	}
+
+	// Make space for the inserted element by shifting
+	// values at the insertion index up one index. The call
+	// to append does not allocate memory when cap(data) is
+	// greater than len(data).
+	data = append(data[:i+1], data[i:]...)
+	data[i] = v
+	return data
+}
+
+// Insert adds value to the children of ptn, keeping them sorted by the first
+// byte of their keys, and returns value. value.key must not be empty.
+func (ptn *PrefixTrieNode) Insert(value *PrefixTrieNode) *PrefixTrieNode {
+	// keep it sorted (and sort.Sort is too slow)
+	index := sortSearch(ptn.children, value.key[0])
+	ptn.children = insertAt(ptn.children, index, value)
+
+	return value
+}
+
+// sortSearch returns the smallest index whose node key starts with a byte >= b,
+// or len(nodes) if there is none. It is sort.Search from search.go, modified
+// for performance to not need a closure.
+func sortSearch(nodes []*PrefixTrieNode, b byte) int {
+	i, j := 0, len(nodes)
+	for i < j {
+		h := int(uint(i+j) >> 1) // avoid overflow when computing h
+		// i ≤ h < j
+		if nodes[h].key[0] < b {
+			i = h + 1 // preserves f(i-1) == false
+		} else {
+			j = h // preserves f(j) == true
+		}
+	}
+	// i == j, f(i-1) == false, and f(j) (= f(i)) == true => answer is i.
+	return i
+}
+
+// FindNearest returns the index and node of the first child whose key starts
+// with a byte >= start[0]; the caller must still compare the keys. It returns
+// (-1, nil) when start is empty or no such child exists.
+func (ptn *PrefixTrieNode) FindNearest(start KeyType) (int, *PrefixTrieNode) {
+	if len(start) == 0 {
+		return -1, nil
+	}
+	// none of the children overlap on the first char or we would have a parent node with that char
+	index := sortSearch(ptn.children, start[0])
+	if index < len(ptn.children) {
+		return index, ptn.children[index]
+	}
+	return -1, nil
+}
+
+// PrefixTrie is a compressed prefix trie keyed by byte strings.
+type PrefixTrie interface {
+	InsertOrFind(value KeyType) (bool, *PrefixTrieNode)
+	Find(value KeyType) *PrefixTrieNode
+	FindPath(value KeyType) ([]int, []*PrefixTrieNode)
+	IterateFrom(start KeyType, handler func(value *PrefixTrieNode) bool)
+	Erase(value KeyType) bool
+	NodeCount() int
+}
+
+type prefixTrie struct {
+	root  *PrefixTrieNode
+	Nodes int
+}
+
+// NewPrefixTrie returns an empty trie that contains only the root node.
+func NewPrefixTrie() PrefixTrie {
+	// we never delete the root node
+	return &prefixTrie{root: &PrefixTrieNode{key: make(KeyType, 0)}, Nodes: 1}
+}
+
+// NodeCount returns the number of nodes in the trie, including the root.
+func (pt *prefixTrie) NodeCount() int {
+	return pt.Nodes
+}
+
+// matchLength returns the length of the longest common prefix of a and b.
+func matchLength(a, b KeyType) int {
+	minLen := len(a)
+	if len(b) < minLen {
+		minLen = len(b)
+	}
+	for i := 0; i < minLen; i++ {
+		if a[i] != b[i] {
+			return i
+		}
+	}
+	return minLen
+}
+
+// insert adds value below node, splitting an existing child when value shares
+// only part of that child's key. It reports whether a node was created and
+// returns the node that represents value. Node keys alias the inserted slices,
+// so callers must not modify a key after inserting it.
+func (pt *prefixTrie) insert(value KeyType, node *PrefixTrieNode) (bool, *PrefixTrieNode) {
+	index, child := node.FindNearest(value)
+	match := 0
+	if index >= 0 { // if we found a child
+		match = matchLength(value, child.key)
+		if len(value) == match && len(child.key) == match {
+			return false, child
+		}
+	}
+	if match <= 0 {
+		pt.Nodes++
+		return true, node.Insert(&PrefixTrieNode{key: value})
+	}
+	if match < len(child.key) {
+		grandChild := PrefixTrieNode{key: child.key[match:], children: child.children,
+			hasClaims: child.hasClaims, hash: child.hash}
+		newChild := PrefixTrieNode{key: child.key[0:match], children: []*PrefixTrieNode{&grandChild}}
+		child = &newChild
+		node.children[index] = child
+		pt.Nodes++
+		if len(value) == match {
+			return true, child
+		}
+	}
+	return pt.insert(value[match:], child)
+}
+
+// InsertOrFind inserts value if it is missing. It reports whether a node was
+// created and returns the node for value; the empty key maps to the root.
+// A later insert or erase may replace the returned node, so look it up again
+// after modifying the trie.
+func (pt *prefixTrie) InsertOrFind(value KeyType) (bool, *PrefixTrieNode) {
+	if len(value) <= 0 {
+		return false, pt.root
+	}
+	return pt.insert(value, pt.root)
+}
+
+// find walks down from node looking for an exact match of value and returns
+// the matching node, or nil if there is none. When pathIndexes and path are
+// non-nil, each node on the way down is appended to path and its index within
+// its parent's children to pathIndexes; both are only meaningful on success.
+func find(value KeyType, node *PrefixTrieNode, pathIndexes *[]int, path *[]*PrefixTrieNode) *PrefixTrieNode {
+	index, child := node.FindNearest(value)
+	if index < 0 {
+		return nil
+	}
+	match := matchLength(value, child.key)
+	if match < len(child.key) {
+		// value diverges from, or ends inside, the child's key
+		return nil
+	}
+	if pathIndexes != nil {
+		*pathIndexes = append(*pathIndexes, index)
+	}
+	if path != nil {
+		*path = append(*path, child)
+	}
+	if match == len(value) {
+		return child
+	}
+	return find(value[match:], child, pathIndexes, path)
+}
+
+// Find returns the node for value, or nil if value is not in the trie. The
+// empty key maps to the root.
+func (pt *prefixTrie) Find(value KeyType) *PrefixTrieNode {
+	if len(value) <= 0 {
+		return pt.root
+	}
+	return find(value, pt.root, nil, nil)
+}
+
+// FindPath returns the nodes from the root down to value, together with the
+// index of each node within its parent's children (-1 for the root). It returns
+// (nil, nil) if value is not in the trie; the empty key yields just the root.
+func (pt *prefixTrie) FindPath(value KeyType) ([]int, []*PrefixTrieNode) {
+	pathIndexes := []int{-1}
+	path := []*PrefixTrieNode{pt.root}
+	if len(value) == 0 {
+		return pathIndexes, path
+	}
+	if find(value, pt.root, &pathIndexes, &path) == nil {
+		return nil, nil
+	}
+	return pathIndexes, path
+}
+
+// IterateFrom can be used to find a value and run a function on that value.
+// If the handler returns true it continues to iterate through the children of value.
+// An empty start iterates from the root; a start that is not in the trie is a no-op.
+func (pt *prefixTrie) IterateFrom(start KeyType, handler func(value *PrefixTrieNode) bool) {
+	node := pt.Find(start)
+	if node == nil {
+		return
+	}
+	iterateFrom(node, handler)
+}
+
+// iterateFrom calls handler on node and, if handler returns true, recurses into
+// each of node's children in order.
+func iterateFrom(node *PrefixTrieNode, handler func(value *PrefixTrieNode) bool) {
+	if !handler(node) {
+		return
+	}
+	for _, child := range node.children {
+		iterateFrom(child, handler)
+	}
+}
+
+// Erase prunes the trie, starting at the node for value and walking up toward
+// the root: a claimless node with no children is removed, and a claimless node
+// with exactly one child is merged with that child. It stops at the first node
+// that has claims or more than one child, and never removes the root. It
+// reports whether any node was removed.
+func (pt *prefixTrie) Erase(value KeyType) bool {
+	indexes, path := pt.FindPath(value)
+	if len(path) <= 1 {
+		return false
+	}
+	nodes := pt.Nodes
+	for i := len(path) - 1; i > 0; i-- {
+		node := path[i]
+		if node.hasClaims || len(node.children) > 1 {
+			break
+		}
+		if len(node.children) == 1 {
+			// Merge the only child into this node. The key is built in a
+			// fresh slice because appending in place could overwrite a
+			// caller's buffer that node.key still aliases.
+			only := node.children[0]
+			merged := make(KeyType, 0, len(node.key)+len(only.key))
+			merged = append(merged, node.key...)
+			node.key = append(merged, only.key...)
+			node.hash = nil
+			node.hasClaims = only.hasClaims
+			node.children = only.children
+		} else {
+			// Drop the empty leaf from its parent.
+			siblings := path[i-1].children
+			index := indexes[i]
+			copy(siblings[index:], siblings[index+1:])
+			siblings[len(siblings)-1] = nil // release the removed node
+			path[i-1].children = siblings[:len(siblings)-1]
+		}
+		pt.Nodes--
+	}
+	return nodes > pt.Nodes
+}
diff --git a/claimtrie/merkletrie/prefix_trie_test.go b/claimtrie/merkletrie/prefix_trie_test.go
new file mode 100644
index 00000000..ad277c36
--- /dev/null
+++ b/claimtrie/merkletrie/prefix_trie_test.go
@@ -0,0 +1,148 @@
+package merkletrie
+
+import (
+	"bytes"
+	"github.com/stretchr/testify/assert"
+	"math/rand"
+	"testing"
+	"time"
+)
+
+func b(value string) []byte { return []byte(value) }
+func eq(x []byte, y string) bool { return bytes.Equal(x, b(y)) }
+
+func TestInsertAndErase(t *testing.T) {
+	trie := NewPrefixTrie()
+	assert.True(t, trie.NodeCount() == 1)
+	inserted, node := trie.InsertOrFind(b("abc"))
+	assert.True(t, inserted)
+	assert.NotNil(t, node)
+	assert.Equal(t, 2, trie.NodeCount())
+	inserted, node = trie.InsertOrFind(b("abd"))
+	assert.True(t, inserted)
+	assert.Equal(t, 4, trie.NodeCount())
+	assert.NotNil(t, node)
+	hit := trie.Find(b("ab"))
+	assert.True(t, eq(hit.key, "ab"))
+	assert.Equal(t, 2, len(hit.children))
+	hit = trie.Find(b("abc"))
+	assert.True(t, eq(hit.key, "c"))
+	hit = trie.Find(b("abd"))
+	assert.True(t, eq(hit.key, "d"))
+	hit = trie.Find(b("a"))
+	assert.Nil(t, hit)
+	indexes, path := trie.FindPath(b("abd"))
+	assert.Equal(t, 3, len(indexes))
+	assert.True(t, eq(path[1].key, "ab"))
+	erased := trie.Erase(b("ab"))
+	assert.False(t, erased)
+	assert.Equal(t, 4, trie.NodeCount())
+	erased = trie.Erase(b("abc"))
+	assert.True(t, erased)
+	assert.Equal(t, 2, trie.NodeCount())
+	erased = trie.Erase(b("abd"))
+	assert.True(t, erased)
+	assert.Equal(t, 1, trie.NodeCount())
+}
+
+func TestIterateFrom(t *testing.T) {
+	trie := NewPrefixTrie()
+	trie.InsertOrFind(b("abc"))
+	trie.InsertOrFind(b("abd"))
+	trie.InsertOrFind(b("x"))
+
+	var keys []string
+	collect := func(node *PrefixTrieNode) bool {
+		keys = append(keys, string(node.key))
+		return true
+	}
+	trie.IterateFrom(b("ab"), collect)
+	assert.Equal(t, []string{"ab", "c", "d"}, keys)
+
+	keys = nil
+	trie.IterateFrom(b(""), collect)
+	assert.Equal(t, []string{"", "ab", "c", "d", "x"}, keys)
+
+	keys = nil
+	trie.IterateFrom(b("ab"), func(node *PrefixTrieNode) bool {
+		keys = append(keys, string(node.key))
+		return false
+	})
+	assert.Equal(t, []string{"ab"}, keys)
+
+	keys = nil
+	trie.IterateFrom(b("zzz"), collect)
+	assert.Equal(t, 0, len(keys))
+}
+
+func TestEmptyKey(t *testing.T) {
+	trie := NewPrefixTrie()
+	trie.InsertOrFind(b("abc"))
+
+	indexes, path := trie.FindPath(b(""))
+	assert.Equal(t, []int{-1}, indexes)
+	assert.Equal(t, 1, len(path))
+	assert.False(t, trie.Erase(b("")))
+	assert.Equal(t, 2, trie.NodeCount())
+}
+
+func TestEraseKeepsCallerKeys(t *testing.T) {
+	abcd := b("abcd")
+	trie := NewPrefixTrie()
+	trie.InsertOrFind(abcd)
+	trie.InsertOrFind(b("abc"))
+	trie.InsertOrFind(b("abce"))
+	trie.InsertOrFind(b("ab"))
+	assert.Equal(t, 5, trie.NodeCount())
+
+	assert.True(t, trie.Erase(b("abcd")))
+	assert.True(t, eq(abcd, "abcd"))
+	assert.NotNil(t, trie.Find(b("abce")))
+	assert.Equal(t, 2, trie.NodeCount())
+}
+
+func TestPrefixTrie(t *testing.T) {
+	inserts := 1000000
+	data := make([][]byte, inserts)
+	rand.Seed(42)
+	for i := 0; i < inserts; i++ {
+		size := rand.Intn(70) + 4
+		data[i] = make([]byte, size)
+		rand.Read(data[i])
+		for j := 0; j < size; j++ {
+			data[i][j] %= byte(62) // shrink the range to match the old test
+		}
+	}
+
+	trie := NewPrefixTrie()
+	// doing my own timing because I couldn't get the B.Run method to work:
+	start := time.Now()
+	for i := 0; i < inserts; i++ {
+		_, node := trie.InsertOrFind(data[i])
+		assert.NotNil(t, node, "Failure at %d of %d", i, inserts)
+	}
+	t.Logf("Insertion in %f sec.", time.Since(start).Seconds())
+
+	start = time.Now()
+	for i := 0; i < inserts; i++ {
+		node := trie.Find(data[i])
+		assert.True(t, bytes.HasSuffix(data[i], node.key), "Failure on %d of %d", i, inserts)
+	}
+	t.Logf("Lookup in %f sec. on %d nodes.", time.Since(start).Seconds(), trie.NodeCount())
+
+	start = time.Now()
+	for i := 0; i < inserts; i++ {
+		indexes, path := trie.FindPath(data[i])
+		assert.True(t, len(indexes) == len(path))
+		assert.True(t, len(path) > 1)
+		assert.True(t, bytes.HasSuffix(data[i], path[len(path)-1].key))
+	}
+	t.Logf("Parents in %f sec.", time.Since(start).Seconds())
+
+	start = time.Now()
+	for i := 0; i < inserts; i++ {
+		trie.Erase(data[i])
+	}
+	t.Logf("Deletion in %f sec.", time.Since(start).Seconds())
+	assert.Equal(t, 1, trie.NodeCount())
+}
diff --git a/claimtrie/node/claim.go b/claimtrie/node/claim.go
index 4070dc94..e0a42ab5 100644
--- a/claimtrie/node/claim.go
+++ b/claimtrie/node/claim.go
@@ -17,7 +17,7 @@ import (
 // ClaimID represents a Claim's ClaimID.
 type ClaimID [20]byte
 
-// NewClaimID returns a Claim ID caclculated from Ripemd160(Sha256(OUTPOINT).
+// NewClaimID returns a Claim ID calculated from Ripemd160(Sha256(OUTPOINT).
 func NewClaimID(op wire.OutPoint) ClaimID {
 	w := bytes.NewBuffer(op.Hash[:])