[lbry] removed dependency on text/norm, fixed NFD normalization
This commit is contained in:
parent
50d678b007
commit
1f8ed174c0
10 changed files with 21519 additions and 1596 deletions
File diff suppressed because it is too large
Load diff
2431
claimtrie/normalization/NFC_v11.txt
Normal file
2431
claimtrie/normalization/NFC_v11.txt
Normal file
File diff suppressed because it is too large
Load diff
18847
claimtrie/normalization/NormalizationTest_v11.txt
Normal file
18847
claimtrie/normalization/NormalizationTest_v11.txt
Normal file
File diff suppressed because it is too large
Load diff
|
@ -38,7 +38,7 @@ func init() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func CaseFold(name []byte) []byte {
|
func caseFold(name []byte) []byte {
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
b.Grow(len(name))
|
b.Grow(len(name))
|
||||||
for i := 0; i < len(name); {
|
for i := 0; i < len(name); {
|
||||||
|
|
177
claimtrie/normalization/char_decomposer.go
Normal file
177
claimtrie/normalization/char_decomposer.go
Normal file
|
@ -0,0 +1,177 @@
|
||||||
|
package normalization
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
_ "embed"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"unicode/utf8"
|
||||||
|
)
|
||||||
|
|
||||||
|
//go:embed NFC_v11.txt
|
||||||
|
var decompositions string // the data file that came from ICU 63.2
|
||||||
|
|
||||||
|
var nfdMap map[rune][]rune
|
||||||
|
var nfdOrder map[rune]int32
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
nfdMap = map[rune][]rune{}
|
||||||
|
nfdOrder = map[rune]int32{}
|
||||||
|
scanner := bufio.NewScanner(strings.NewReader(decompositions))
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := scanner.Text()
|
||||||
|
if len(line) <= 0 || line[0] == '#' || line[0] == '*' {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.ContainsAny(line, ":") {
|
||||||
|
// it's a ordering def:
|
||||||
|
addOrdering(line)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
splits := strings.Split(line, "=")
|
||||||
|
if len(splits) <= 1 {
|
||||||
|
splits = strings.Split(line, ">")
|
||||||
|
if len(splits) <= 1 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
key, err := strconv.ParseUint(splits[0], 16, len(splits[0])*4)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
splits = strings.Split(splits[1], " ")
|
||||||
|
values := make([]rune, 0, len(splits))
|
||||||
|
for j := range splits {
|
||||||
|
value, err := strconv.ParseUint(splits[j], 16, len(splits[j])*4)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
existing := nfdMap[rune(value)]
|
||||||
|
if len(existing) > 0 {
|
||||||
|
values = append(values, existing...)
|
||||||
|
} else {
|
||||||
|
values = append(values, rune(value))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nfdMap[rune(key)] = values
|
||||||
|
}
|
||||||
|
|
||||||
|
// run one more expansion pass to catch stragglers
|
||||||
|
for key, values := range nfdMap {
|
||||||
|
for i, value := range values {
|
||||||
|
other := nfdMap[value]
|
||||||
|
if len(other) > 0 {
|
||||||
|
newValues := make([]rune, len(values)+len(other)-1)
|
||||||
|
copy(newValues, values[:i])
|
||||||
|
copy(newValues[i:i+len(other)], other)
|
||||||
|
copy(newValues[i+len(other):], values[i+1:])
|
||||||
|
nfdMap[key] = newValues
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// assert no more expansions are necessary:
|
||||||
|
for _, values := range nfdMap {
|
||||||
|
for _, value := range values {
|
||||||
|
other := nfdMap[value]
|
||||||
|
if len(other) > 0 {
|
||||||
|
panic("Failed in NFD expansion")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func addOrdering(line string) {
|
||||||
|
splits := strings.Split(line, ":")
|
||||||
|
ranges := strings.Split(splits[0], "..")
|
||||||
|
|
||||||
|
value, err := strconv.ParseUint(splits[1], 16, len(splits[1])*4)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
start, err := strconv.ParseUint(ranges[0], 16, len(ranges[0])*4)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
end := start
|
||||||
|
if len(ranges) > 1 {
|
||||||
|
end, err = strconv.ParseUint(ranges[1], 16, len(ranges[0])*4)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for i := start; i <= end; i++ {
|
||||||
|
nfdOrder[rune(i)] = int32(value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func decompose(name []byte) []byte {
|
||||||
|
// see https://unicode.org/reports/tr15/ section 1.3
|
||||||
|
runes := make([]rune, 0, len(name)) // we typically use ascii don't increase the length
|
||||||
|
for i := 0; i < len(name); {
|
||||||
|
r, w := utf8.DecodeRune(name[i:])
|
||||||
|
if r == utf8.RuneError && w < 2 {
|
||||||
|
// HACK: their RuneError is actually a valid character if coming from a width of 2 or more
|
||||||
|
return name
|
||||||
|
}
|
||||||
|
replacements := nfdMap[r]
|
||||||
|
if len(replacements) > 0 {
|
||||||
|
runes = append(runes, replacements...)
|
||||||
|
} else {
|
||||||
|
hanguls := decomposeHangul(r)
|
||||||
|
if len(hanguls) > 0 {
|
||||||
|
runes = append(runes, hanguls...)
|
||||||
|
} else {
|
||||||
|
runes = append(runes, r)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
i += w
|
||||||
|
}
|
||||||
|
repairOrdering(runes)
|
||||||
|
return []byte(string(runes))
|
||||||
|
}
|
||||||
|
|
||||||
|
// decomposeHangul arithmetically decomposes a precomposed Hangul syllable
// into its constituent jamo (leading consonant, vowel, optional trailing
// consonant). It returns nil when s is not a precomposed syllable.
func decomposeHangul(s rune) []rune {
	// see https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
	const (
		sBase  int32 = 0xAC00 // first precomposed syllable
		lBase  int32 = 0x1100 // leading consonant (choseong) base
		vBase  int32 = 0x1161 // vowel (jungseong) base
		tBase  int32 = 0x11A7 // trailing consonant (jongseong) base
		lCount int32 = 19
		vCount int32 = 21
		tCount int32 = 28
		nCount       = vCount * tCount // 588
		sCount       = lCount * nCount // 11172
	)

	index := s - sBase
	if index < 0 || index >= sCount {
		return nil // outside the precomposed Hangul block
	}

	lead := lBase + index/nCount
	vowel := vBase + (index%nCount)/tCount
	trail := tBase + index%tCount

	jamo := []rune{lead, vowel}
	if trail != tBase {
		// trail == tBase encodes "no trailing consonant"
		jamo = append(jamo, trail)
	}
	return jamo
}
|
||||||
|
|
||||||
|
func repairOrdering(runes []rune) {
|
||||||
|
for i := 1; i < len(runes); i++ {
|
||||||
|
a := runes[i-1]
|
||||||
|
b := runes[i]
|
||||||
|
oa := nfdOrder[a]
|
||||||
|
ob := nfdOrder[b]
|
||||||
|
if oa > ob && ob > 0 {
|
||||||
|
runes[i-1], runes[i] = b, a
|
||||||
|
if i >= 2 {
|
||||||
|
i -= 2
|
||||||
|
} else {
|
||||||
|
i = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -2,11 +2,10 @@ package normalization
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"github.com/lbryio/lbcd/claimtrie/param"
|
"github.com/lbryio/lbcd/claimtrie/param"
|
||||||
"golang.org/x/text/unicode/norm"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
var Normalize = normalizeGo
|
var Normalize = normalizeGo
|
||||||
var NormalizeTitle = "Normalizing strings via Go. Casefold table version = 11.0.0, NFD version = " + norm.Version
|
var NormalizeTitle = "Normalizing strings via Go. Casefold and NFD table versions: 11.0.0 (from ICU 63.2)"
|
||||||
|
|
||||||
func NormalizeIfNecessary(name []byte, height int32) []byte {
|
func NormalizeIfNecessary(name []byte, height int32) []byte {
|
||||||
if height < param.ActiveParams.NormalizedNameForkHeight {
|
if height < param.ActiveParams.NormalizedNameForkHeight {
|
||||||
|
@ -17,7 +16,7 @@ func NormalizeIfNecessary(name []byte, height int32) []byte {
|
||||||
|
|
||||||
func normalizeGo(value []byte) []byte {
|
func normalizeGo(value []byte) []byte {
|
||||||
|
|
||||||
normalized := norm.NFD.Bytes(value) // may need to hard-code the version on this
|
normalized := decompose(value) // may need to hard-code the version on this
|
||||||
// not using x/text/cases because it does too good of a job; it seems to use v14 tables even when it claims v13
|
// not using x/text/cases because it does too good of a job; it seems to use v14 tables even when it claims v13
|
||||||
return CaseFold(normalized)
|
return caseFold(normalized)
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,6 +31,8 @@ package normalization
|
||||||
// }
|
// }
|
||||||
import "C"
|
import "C"
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
|
"encoding/hex"
|
||||||
"fmt"
|
"fmt"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
)
|
)
|
||||||
|
@ -47,21 +49,29 @@ func IcuVersion() string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func normalizeICU(value []byte) []byte {
|
func normalizeICU(value []byte) []byte {
|
||||||
|
original := value
|
||||||
if len(value) <= 0 {
|
if len(value) <= 0 {
|
||||||
return value
|
return value
|
||||||
}
|
}
|
||||||
|
|
||||||
|
other := normalizeGo(value)
|
||||||
|
|
||||||
name := (*C.char)(unsafe.Pointer(&value[0]))
|
name := (*C.char)(unsafe.Pointer(&value[0]))
|
||||||
length := C.int(len(value))
|
length := C.int(len(value))
|
||||||
|
|
||||||
// hopefully this is a stack alloc (but it may be a bit large for that):
|
// hopefully this is a stack alloc (but it may be a bit large for that):
|
||||||
var resultName [512]byte // inputs are restricted to 255 chars; it shouldn't expand too much past that
|
var resultName [512]byte // inputs are restricted to 255 chars; it shouldn't expand too much past that
|
||||||
result := unsafe.Pointer(&resultName[0])
|
pointer := unsafe.Pointer(&resultName[0])
|
||||||
|
|
||||||
resultLength := C.normalize(name, length, (*C.char)(result))
|
resultLength := C.normalize(name, length, (*C.char)(pointer))
|
||||||
if resultLength == 0 {
|
if resultLength > 0 {
|
||||||
return value
|
value = C.GoBytes(pointer, resultLength)
|
||||||
}
|
}
|
||||||
|
|
||||||
// return resultName[0:resultLength] -- we want to shrink the result (not use a slice on 1024)
|
// return resultName[0:resultLength] -- we want to shrink the pointer (not use a slice on 1024)
|
||||||
return C.GoBytes(result, resultLength)
|
if !bytes.Equal(other, value) {
|
||||||
|
fmt.Printf("Failed with %s, %s != %s,\n\t%s, %s != %s,\n", original, value, other,
|
||||||
|
hex.EncodeToString(original), hex.EncodeToString(value), hex.EncodeToString(other))
|
||||||
|
}
|
||||||
|
return value
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,6 +4,7 @@
|
||||||
package normalization
|
package normalization
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"encoding/hex"
|
"encoding/hex"
|
||||||
"testing"
|
"testing"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
|
@ -63,3 +64,11 @@ func TestBlock760150_1020105(t *testing.T) {
|
||||||
// t.Logf("%s -> %s", s, string(b))
|
// t.Logf("%s -> %s", s, string(b))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBlock1085612(t *testing.T) {
|
||||||
|
s, err := hex.DecodeString("6eccb7cd9dcc92cd90cc86cc80cc80cd91cd9dcd8acd80cd92cc94cc85cc8fccbdcda0ccbdcd80cda0cd84cc94cc8ccc9acd84cc94cd9bcda0cca7cc99ccaccd99cca9cca7")
|
||||||
|
assert.NoError(t, err)
|
||||||
|
a := normalizeICU(s)
|
||||||
|
b := normalizeGo(s)
|
||||||
|
assert.Equal(t, a, b, "%s != %s, %v", string(a), string(b), bytes.Equal(b, s))
|
||||||
|
}
|
||||||
|
|
|
@ -1,7 +1,12 @@
|
||||||
package normalization
|
package normalization
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
|
"bytes"
|
||||||
|
_ "embed"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
|
@ -52,3 +57,33 @@ func benchmarkNormalize(b *testing.B, normalize func(value []byte) []byte) {
|
||||||
require.True(b, len(s) >= 8)
|
require.True(b, len(s) >= 8)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//go:embed NormalizationTest_v11.txt
|
||||||
|
var nfdTests string
|
||||||
|
|
||||||
|
func TestDecomposition(t *testing.T) {
|
||||||
|
r := require.New(t)
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(strings.NewReader(nfdTests))
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := scanner.Text()
|
||||||
|
if len(line) <= 0 || line[0] == '@' || line[0] == '#' {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
splits := strings.Split(line, ";")
|
||||||
|
source := convertToBytes(splits[0])
|
||||||
|
targetNFD := convertToBytes(splits[2])
|
||||||
|
fixed := decompose(source)
|
||||||
|
r.True(bytes.Equal(targetNFD, fixed), "Failed on %s -> %s. Got %U, not %U", splits[0], splits[2], fixed, targetNFD)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func convertToBytes(s string) []byte {
|
||||||
|
splits := strings.Split(s, " ")
|
||||||
|
var b bytes.Buffer
|
||||||
|
for i := range splits {
|
||||||
|
value, _ := strconv.ParseUint(splits[i], 16, len(splits[i])*4)
|
||||||
|
b.WriteRune(rune(value))
|
||||||
|
}
|
||||||
|
return b.Bytes()
|
||||||
|
}
|
||||||
|
|
1
go.mod
1
go.mod
|
@ -22,7 +22,6 @@ require (
|
||||||
github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7
|
github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7
|
||||||
github.com/vmihailenco/msgpack/v5 v5.3.2
|
github.com/vmihailenco/msgpack/v5 v5.3.2
|
||||||
golang.org/x/crypto v0.0.0-20211209193657-4570a0811e8b
|
golang.org/x/crypto v0.0.0-20211209193657-4570a0811e8b
|
||||||
golang.org/x/text v0.3.7
|
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
|
|
Loading…
Reference in a new issue