From d7f97ab7504ceca5dac3a1198f59ccebc7b0e920 Mon Sep 17 00:00:00 2001
From: Brannon King <countprimes@gmail.com>
Date: Wed, 7 Jul 2021 16:46:22 -0400
Subject: [PATCH] initial sketch and test of faster trie

use custom search

formatted
---
 claimtrie/change/change.go               |   2 +-
 claimtrie/merkletrie/prefix_trie.go      | 226 +++++++++++++++++++++++
 claimtrie/merkletrie/prefix_trie_test.go |  92 +++++++++
 claimtrie/node/claim.go                  |   2 +-
 4 files changed, 320 insertions(+), 2 deletions(-)
 create mode 100644 claimtrie/merkletrie/prefix_trie.go
 create mode 100644 claimtrie/merkletrie/prefix_trie_test.go

diff --git a/claimtrie/change/change.go b/claimtrie/change/change.go
index 33e1d052..9a029271 100644
--- a/claimtrie/change/change.go
+++ b/claimtrie/change/change.go
@@ -15,7 +15,7 @@ type Change struct {
 	Height int32
 
 	Name     []byte
-	ClaimID  string
+	ClaimID  string // TODO: can we store this and OutPoint as bytes?
 	OutPoint string
 	Amount   int64
 	Value    []byte
diff --git a/claimtrie/merkletrie/prefix_trie.go b/claimtrie/merkletrie/prefix_trie.go
new file mode 100644
index 00000000..ceabb180
--- /dev/null
+++ b/claimtrie/merkletrie/prefix_trie.go
@@ -0,0 +1,226 @@
+package merkletrie
+
+import (
+	"github.com/lbryio/chain/chaincfg/chainhash"
+)
+
+type KeyType []byte // raw key bytes; each trie node stores one fragment of an inserted key
+
+type PrefixTrieNode struct { // one trie node; it has no sort.Interface methods, Insert keeps children ordered instead
+	children  []*PrefixTrieNode // ordered by key[0]; FindNearest assumes siblings never share a first byte
+	key       KeyType           // the fragment of the full key that this node contributes
+	hash      *chainhash.Hash   // only copied on a split and reset to nil by Erase on a merge in this file
+	hasClaims bool              // only copied between nodes in this file; Erase prunes nodes where it is false
+}
+
+// insertAt places v at position i of data, moving data[i:] one slot to the
+// right, and returns the resulting slice. i must satisfy 0 <= i <= len(data).
+// The backing array of data may be reused, so callers must keep the result.
+// Adapted from:
+// https://stackoverflow.com/questions/42746972/golang-insert-to-a-sorted-slice
+func insertAt(data []*PrefixTrieNode, i int, v *PrefixTrieNode) []*PrefixTrieNode {
+	// Grow by one element first. append reallocates only when the capacity is
+	// exhausted.
+	data = append(data, nil)
+
+	// Slide the tail up by one slot (copy handles the overlap). When i was the
+	// old length nothing moves and v simply lands in the new last slot.
+	copy(data[i+1:], data[i:])
+	data[i] = v
+	return data
+}
+
+// Insert files value among ptn's children, keeping them ordered by first key
+// byte (a full sort.Sort is too slow), and returns value.
+func (ptn *PrefixTrieNode) Insert(value *PrefixTrieNode) *PrefixTrieNode {
+	slot := sortSearch(ptn.children, value.key[0])
+	ptn.children = insertAt(ptn.children, slot, value)
+	return value
+}
+
+// sortSearch is a closure-free binary search modelled on sort.Search.
+// nodes must be ordered by first key byte and contain no empty keys.
+// It returns the smallest index whose first key byte is >= b, or len(nodes)
+// when every first byte is smaller than b.
+func sortSearch(nodes []*PrefixTrieNode, b byte) int {
+	lo, hi := 0, len(nodes)
+	for lo < hi { // invariant: key[0] < b below lo, key[0] >= b from hi on
+		mid := int(uint(lo+hi) >> 1) // unsigned shift keeps lo+hi overflow-safe
+		if nodes[mid].key[0] >= b {
+			hi = mid
+			continue
+		}
+		lo = mid + 1
+	}
+	return lo
+}
+
+// FindNearest returns (index, node) of the first child with key[0] >= start[0]; (-1, nil) if none or start is empty.
+func (ptn *PrefixTrieNode) FindNearest(start KeyType) (int, *PrefixTrieNode) {
+	// The hit may begin with a larger byte than start[0]; callers compare keys themselves.
+	if len(start) > 0 { // start[0] on an empty key would panic
+		if i := sortSearch(ptn.children, start[0]); i < len(ptn.children) {
+			return i, ptn.children[i]
+		}
+	}
+	return -1, nil
+}
+
+type PrefixTrie interface { // operations offered by the trie that NewPrefixTrie builds
+	InsertOrFind(value KeyType) (bool, *PrefixTrieNode)                  // node for value; the bool is true when it had to be created
+	Find(value KeyType) *PrefixTrieNode                                  // node whose path spells value exactly, else nil; "" is the root
+	FindPath(value KeyType) ([]int, []*PrefixTrieNode)                   // child indexes and nodes from the root down to value; nil, nil on a miss
+	IterateFrom(start KeyType, handler func(value *PrefixTrieNode) bool) // runs handler on start's node and on the nodes below it
+	Erase(value KeyType) bool                                            // prunes value's node if it has no claims; true when nodes were removed
+	NodeCount() int                                                      // running node total, root included
+}
+
+type prefixTrie struct { // the PrefixTrie implementation returned by NewPrefixTrie
+	root  *PrefixTrieNode // empty-keyed top node; never deleted
+	Nodes int             // node total kept up to date by insert and Erase; starts at 1 for the root
+}
+
+func NewPrefixTrie() PrefixTrie { // returns an empty trie that holds only its root
+	root := &PrefixTrieNode{key: KeyType{}} // we never delete the root node, so it is counted from the start
+	return &prefixTrie{root: root, Nodes: 1}
+}
+
+func (pt *prefixTrie) NodeCount() int { // reports the running node total, root included
+	return pt.Nodes
+}
+
+// matchLength returns the length of the longest common prefix of a and b.
+func matchLength(a, b KeyType) int {
+	if len(b) < len(a) {
+		a, b = b, a // make a the shorter key so it bounds the scan
+	}
+	for i, c := range a {
+		if c != b[i] {
+			return i
+		}
+	}
+	return len(a)
+}
+
+// insert adds value under node, splitting a partly matching child; it returns whether a node was created, and that node.
+func (pt *prefixTrie) insert(value KeyType, node *PrefixTrieNode) (bool, *PrefixTrieNode) {
+	index, child := node.FindNearest(value)
+	match := 0
+	if index >= 0 {
+		match = matchLength(value, child.key)
+		if match == len(value) && match == len(child.key) {
+			return false, child // exact hit: nothing to create
+		}
+	}
+	if match == 0 {
+		pt.Nodes++ // new leaf; capping its key's capacity keeps later appends out of the caller's buffer
+		return true, node.Insert(&PrefixTrieNode{key: value[:len(value):len(value)]})
+	}
+	if match < len(child.key) { // partial overlap: split child where the keys diverge
+		tail := &PrefixTrieNode{key: child.key[match:], children: child.children, hasClaims: child.hasClaims, hash: child.hash}
+		// The head's capacity is capped as well; an append to it would otherwise overwrite tail.key.
+		child = &PrefixTrieNode{key: child.key[:match:match], children: []*PrefixTrieNode{tail}}
+		node.children[index] = child
+		pt.Nodes++
+		if match == len(value) {
+			return true, child
+		}
+	}
+	return pt.insert(value[match:], child)
+}
+
+func (pt *prefixTrie) InsertOrFind(value KeyType) (bool, *PrefixTrieNode) {
+	if len(value) > 0 {
+		return pt.insert(value, pt.root) // true only when a node had to be created
+	}
+	return false, pt.root // the empty key names the root, which always exists
+}
+
+// find walks down from node, consuming value one child key at a time, and returns the node
+// whose path spells value exactly, or nil if value is absent or ends part-way through a key.
+// Each node stepped onto is appended to *path, and its index within its parent to
+// *pathIndexes, when those are non-nil; entries added before a failed lookup are not undone.
+func find(value KeyType, node *PrefixTrieNode, pathIndexes *[]int, path *[]*PrefixTrieNode) *PrefixTrieNode {
+	for {
+		index, child := node.FindNearest(value)
+		if index < 0 {
+			return nil
+		}
+		match := matchLength(value, child.key)
+		if match < len(child.key) {
+			return nil // value diverges from, or stops inside, child's key
+		}
+		if pathIndexes != nil {
+			*pathIndexes = append(*pathIndexes, index)
+		}
+		if path != nil {
+			*path = append(*path, child)
+		}
+		if match == len(value) {
+			return child
+		}
+		value, node = value[match:], child
+	}
+}
+
+func (pt *prefixTrie) Find(value KeyType) *PrefixTrieNode {
+	if len(value) > 0 {
+		return find(value, pt.root, nil, nil) // nil when no node spells value exactly
+	}
+	return pt.root // the empty key names the root
+}
+
+// FindPath returns value's chain of nodes from the root plus each node's index in its parent, or nil, nil if absent.
+func (pt *prefixTrie) FindPath(value KeyType) ([]int, []*PrefixTrieNode) {
+	// Both slices start with the root (index -1); find appends one entry per node it descends into.
+	pathIndexes, path := []int{-1}, []*PrefixTrieNode{pt.root}
+	// The empty key is the root itself, as in Find; skipping find also avoids indexing value[0].
+	if len(value) > 0 && find(value, pt.root, &pathIndexes, &path) == nil {
+		return nil, nil // not found: discard whatever partial path find recorded
+	}
+	return pathIndexes, path
+}
+
+// IterateFrom can be used to find a value and run a function on that value.
+// If the handler returns true it continues to iterate through the children of value.
+// A start that is not in the trie visits nothing; an empty start begins at the root.
+func (pt *prefixTrie) IterateFrom(start KeyType, handler func(value *PrefixTrieNode) bool) {
+	// Find, unlike a direct find call, maps the empty key to the root instead of indexing start[0].
+	if node := pt.Find(start); node != nil {
+		iterateFrom(node, handler)
+	}
+}
+
+func iterateFrom(node *PrefixTrieNode, handler func(value *PrefixTrieNode) bool) {
+	if handler(node) { // `if`, not `for`: each node is visited once, its children only when handler returns true
+		for _, child := range node.children {
+			iterateFrom(child, handler) // depth-first, in children order
+		}
+	}
+}
+
+// Erase prunes value's node and then its ancestors, stopping at the first node that has claims
+// or several children. It reports whether any node was removed; it never clears hasClaims itself.
+func (pt *prefixTrie) Erase(value KeyType) bool {
+	indexes, path := pt.FindPath(value)
+	if len(path) <= 1 { // value is absent, or names the root, which is never removed
+		return false
+	}
+	before := pt.Nodes
+	for i := len(path) - 1; i > 0; i-- {
+		cur, parent := path[i], path[i-1]
+		if cur.hasClaims || len(cur.children) > 1 {
+			break
+		}
+		if len(cur.children) == 1 { // fold the only child into cur
+			only := cur.children[0]
+			// Build the joined key in fresh storage: appending to cur.key in place could write
+			// into a backing array shared with a caller's buffer or with other keys.
+			joined := make(KeyType, 0, len(cur.key)+len(only.key))
+			cur.key = append(append(joined, cur.key...), only.key...)
+			cur.hash, cur.hasClaims, cur.children = nil, only.hasClaims, only.children
+		} else { // no children left: unlink cur from its parent
+			parent.children = append(parent.children[:indexes[i]], parent.children[indexes[i]+1:]...)
+		}
+		pt.Nodes--
+	}
+	return before > pt.Nodes
+}
diff --git a/claimtrie/merkletrie/prefix_trie_test.go b/claimtrie/merkletrie/prefix_trie_test.go
new file mode 100644
index 00000000..ad277c36
--- /dev/null
+++ b/claimtrie/merkletrie/prefix_trie_test.go
@@ -0,0 +1,92 @@
+package merkletrie
+
+import (
+	"bytes"
+	"github.com/stretchr/testify/assert"
+	"math/rand"
+	"testing"
+	"time"
+)
+
+func b(value string) []byte      { return []byte(value) } // b turns a string literal into key bytes
+func eq(x []byte, y string) bool { return string(x) == y } // byte-wise equality; this comparison form does not copy x
+
+// TestInsertAndErase walks a two-key trie through split, lookup, path reporting and pruning.
+func TestInsertAndErase(t *testing.T) {
+	trie := NewPrefixTrie()
+	assert.Equal(t, 1, trie.NodeCount()) // a fresh trie holds only the root
+	inserted, node := trie.InsertOrFind(b("abc"))
+	assert.True(t, inserted)
+	assert.NotNil(t, node)
+	assert.Equal(t, 2, trie.NodeCount())
+	inserted, node = trie.InsertOrFind(b("abd"))
+	assert.True(t, inserted)
+	assert.Equal(t, 4, trie.NodeCount()) // root, "ab", "c", "d": the "abc" node was split
+	assert.NotNil(t, node)
+	// Each stored key must resolve to a node holding just the fragment below its parent.
+	for full, fragment := range map[string]string{"ab": "ab", "abc": "c", "abd": "d"} {
+		hit := trie.Find(b(full))
+		assert.True(t, hit != nil && eq(hit.key, fragment), "Find(%q)", full)
+	}
+	if hit := trie.Find(b("ab")); assert.NotNil(t, hit) {
+		assert.Equal(t, 2, len(hit.children))
+	}
+	assert.Nil(t, trie.Find(b("a"))) // "a" ends inside the "ab" node
+	indexes, path := trie.FindPath(b("abd"))
+	assert.Equal(t, 3, len(indexes))
+	if assert.Equal(t, 3, len(path)) {
+		assert.True(t, eq(path[1].key, "ab"))
+	}
+	assert.False(t, trie.Erase(b("ab"))) // "ab" still has two children
+	assert.Equal(t, 4, trie.NodeCount())
+	assert.True(t, trie.Erase(b("abc")))
+	assert.Equal(t, 2, trie.NodeCount()) // "c" removed and "ab" merged with "d"
+	assert.True(t, trie.Erase(b("abd")))
+	assert.Equal(t, 1, trie.NodeCount())
+}
+
+func TestPrefixTrie(t *testing.T) {
+	inserts := 1000000
+	data := make([][]byte, inserts)
+	rand.Seed(42)
+	for i := 0; i < inserts; i++ {
+		size := rand.Intn(70) + 4
+		data[i] = make([]byte, size)
+		rand.Read(data[i])
+		for j := 0; j < size; j++ {
+			data[i][j] %= byte(62) // shrink the range to match the old test
+		}
+	}
+
+	trie := NewPrefixTrie()
+	// doing my own timing because I couldn't get the B.Run method to work:
+	start := time.Now()
+	for i := 0; i < inserts; i++ {
+		_, node := trie.InsertOrFind(data[i])
+		assert.NotNil(t, node, "Failure at %d of %d", i, inserts)
+	}
+	t.Logf("Insertion in %f sec.", time.Since(start).Seconds())
+
+	start = time.Now()
+	for i := 0; i < inserts; i++ {
+		node := trie.Find(data[i]) // nil on a miss, so test it before reading node.key
+		assert.True(t, node != nil && bytes.HasSuffix(data[i], node.key), "Failure on %d of %d", i, inserts)
+	}
+	t.Logf("Lookup in %f sec. on %d nodes.", time.Since(start).Seconds(), trie.NodeCount())
+
+	start = time.Now()
+	for i := 0; i < inserts; i++ {
+		indexes, path := trie.FindPath(data[i])
+		assert.True(t, len(indexes) == len(path))
+		ok := len(path) > 1 && bytes.HasSuffix(data[i], path[len(path)-1].key) // length check guards the index
+		assert.True(t, ok, "bad path for %d of %d", i, inserts)
+	}
+	t.Logf("Parents in %f sec.", time.Since(start).Seconds())
+
+	start = time.Now()
+	for i := 0; i < inserts; i++ {
+		trie.Erase(data[i])
+	}
+	t.Logf("Deletion in %f sec.", time.Since(start).Seconds())
+	assert.Equal(t, 1, trie.NodeCount())
+}
diff --git a/claimtrie/node/claim.go b/claimtrie/node/claim.go
index 4070dc94..e0a42ab5 100644
--- a/claimtrie/node/claim.go
+++ b/claimtrie/node/claim.go
@@ -17,7 +17,7 @@ import (
 // ClaimID represents a Claim's ClaimID.
 type ClaimID [20]byte
 
-// NewClaimID returns a Claim ID caclculated from Ripemd160(Sha256(OUTPOINT).
+// NewClaimID returns a Claim ID calculated from Ripemd160(Sha256(OUTPOINT)).
 func NewClaimID(op wire.OutPoint) ClaimID {
 
 	w := bytes.NewBuffer(op.Hash[:])