[lbry] removed dependency on text/norm, fixed NFD normalization
parent 50d678b007
commit 1f8ed174c0

10 changed files with 21519 additions and 1596 deletions
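In short (a hedged summary, not part of the diff): the Go-only path now builds its own NFD tables from embedded ICU 63.2 data instead of calling golang.org/x/text/unicode/norm, so normalization stays pinned to Unicode 11.0.0 regardless of the toolchain's x/text version. A minimal sketch of the resulting call path, drawn from the hunks below (not a verbatim excerpt):

package normalization

// Hedged sketch: NFD via the embedded tables, then Unicode 11 case folding.
// decompose and caseFold are the functions added/renamed in this commit.
func normalizeSketch(value []byte) []byte {
	return caseFold(decompose(value))
}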
claimtrie/normalization/NFC_v11.txt (new file, 2431 lines): diff suppressed because it is too large
claimtrie/normalization/NormalizationTest_v11.txt (new file, 18847 lines): diff suppressed because it is too large
@@ -38,7 +38,7 @@ func init() {
	}
}

-func CaseFold(name []byte) []byte {
+func caseFold(name []byte) []byte {
	var b bytes.Buffer
	b.Grow(len(name))
	for i := 0; i < len(name); {
claimtrie/normalization/char_decomposer.go (new file, 177 lines)
@@ -0,0 +1,177 @@
package normalization

import (
	"bufio"
	_ "embed"
	"strconv"
	"strings"
	"unicode/utf8"
)

//go:embed NFC_v11.txt
var decompositions string // the data file that came from ICU 63.2

var nfdMap map[rune][]rune
var nfdOrder map[rune]int32

func init() {
	nfdMap = map[rune][]rune{}
	nfdOrder = map[rune]int32{}
	scanner := bufio.NewScanner(strings.NewReader(decompositions))
	for scanner.Scan() {
		line := scanner.Text()
		if len(line) <= 0 || line[0] == '#' || line[0] == '*' {
			continue
		}
		if strings.ContainsAny(line, ":") {
			// it's an ordering def:
			addOrdering(line)
			continue
		}
		splits := strings.Split(line, "=")
		if len(splits) <= 1 {
			splits = strings.Split(line, ">")
			if len(splits) <= 1 {
				continue
			}
		}
		key, err := strconv.ParseUint(splits[0], 16, len(splits[0])*4)
		if err != nil {
			panic(err)
		}
		splits = strings.Split(splits[1], " ")
		values := make([]rune, 0, len(splits))
		for j := range splits {
			value, err := strconv.ParseUint(splits[j], 16, len(splits[j])*4)
			if err != nil {
				panic(err)
			}
			existing := nfdMap[rune(value)]
			if len(existing) > 0 {
				values = append(values, existing...)
			} else {
				values = append(values, rune(value))
			}
		}
		nfdMap[rune(key)] = values
	}

	// run one more expansion pass to catch stragglers
	for key, values := range nfdMap {
		for i, value := range values {
			other := nfdMap[value]
			if len(other) > 0 {
				newValues := make([]rune, len(values)+len(other)-1)
				copy(newValues, values[:i])
				copy(newValues[i:i+len(other)], other)
				copy(newValues[i+len(other):], values[i+1:])
				nfdMap[key] = newValues
			}
		}
	}

	// assert no more expansions are necessary:
	for _, values := range nfdMap {
		for _, value := range values {
			other := nfdMap[value]
			if len(other) > 0 {
				panic("Failed in NFD expansion")
			}
		}
	}
}

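For orientation, the loop above splits each data line on ':' (combining-class ranges), '=' (two-way mappings), or '>' (one-way mappings). A representative, hedged sample of what the embedded NFC_v11.txt lines look like; these example lines are illustrative, not quoted from the file:

package normalization

// Representative sample only (an assumption, not an excerpt of the embedded file):
// "start..end:ccc" lines set canonical combining classes, "key=values" lines are
// two-way (composition) mappings, and "key>values" lines are one-way decompositions.
const sampleNFCData = `# comments start with '#'; option lines start with '*'
0300..0314:230
00C0=0041 0300
0340>0300
`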
func addOrdering(line string) {
	splits := strings.Split(line, ":")
	ranges := strings.Split(splits[0], "..")

	value, err := strconv.ParseUint(splits[1], 16, len(splits[1])*4)
	if err != nil {
		panic(err)
	}

	start, err := strconv.ParseUint(ranges[0], 16, len(ranges[0])*4)
	if err != nil {
		panic(err)
	}
	end := start
	if len(ranges) > 1 {
		end, err = strconv.ParseUint(ranges[1], 16, len(ranges[0])*4)
		if err != nil {
			panic(err)
		}
	}
	for i := start; i <= end; i++ {
		nfdOrder[rune(i)] = int32(value)
	}
}

func decompose(name []byte) []byte {
	// see https://unicode.org/reports/tr15/ section 1.3
	runes := make([]rune, 0, len(name)) // we typically use ASCII; don't increase the length
	for i := 0; i < len(name); {
		r, w := utf8.DecodeRune(name[i:])
		if r == utf8.RuneError && w < 2 {
			// HACK: utf8.RuneError is actually a valid character if it comes from a width of 2 or more
			return name
		}
		replacements := nfdMap[r]
		if len(replacements) > 0 {
			runes = append(runes, replacements...)
		} else {
			hanguls := decomposeHangul(r)
			if len(hanguls) > 0 {
				runes = append(runes, hanguls...)
			} else {
				runes = append(runes, r)
			}
		}
		i += w
	}
	repairOrdering(runes)
	return []byte(string(runes))
}

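A worked example (not part of the commit) of what decompose should produce for a multi-level decomposition: U+1EC7 decomposes canonically to U+1EB9 + U+0302, and U+1EB9 in turn to U+0065 + U+0323; the expansion pass in init flattens this, so the single map lookup yields the full NFD sequence.

// Standalone, illustrative check of the expected NFD of "ệ" (U+1EC7):
// e (U+0065) + dot below (U+0323, ccc 220) + circumflex (U+0302, ccc 230),
// with the marks already in canonical order.
package main

import "fmt"

func main() {
	input := "\u1EC7"
	expectedNFD := "\u0065\u0323\u0302"
	fmt.Printf("%U -> %U\n", []rune(input), []rune(expectedNFD))
}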
func decomposeHangul(s rune) []rune {
	// see https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf

	const SBase int32 = 0xAC00
	const LBase int32 = 0x1100
	const VBase int32 = 0x1161
	const TBase int32 = 0x11A7
	const LCount int32 = 19
	const VCount int32 = 21
	const TCount int32 = 28
	const NCount = VCount * TCount // 588
	const SCount = LCount * NCount // 11172

	SIndex := s - SBase
	if SIndex < 0 || SIndex >= SCount {
		return nil
	}
	L := LBase + SIndex/NCount
	V := VBase + (SIndex%NCount)/TCount
	T := TBase + SIndex%TCount
	result := []rune{L, V}
	if T != TBase {
		result = append(result, T)
	}
	return result
}

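A worked example (not part of the commit) of the arithmetic above: for U+D4DB, SIndex = 0xD4DB - 0xAC00 = 10459, so L = 0x1100 + 10459/588 = U+1111, V = 0x1161 + (10459%588)/28 = U+1171, and T = 0x11A7 + 10459%28 = U+11B6, which is kept because T != TBase.

// Standalone, illustrative check of the Hangul arithmetic for U+D4DB.
package main

import "fmt"

func main() {
	const SBase, LBase, VBase, TBase = 0xAC00, 0x1100, 0x1161, 0x11A7
	const VCount, TCount = 21, 28
	const NCount = VCount * TCount // 588

	s := rune(0xD4DB)
	SIndex := s - SBase
	L := LBase + SIndex/NCount          // U+1111
	V := VBase + (SIndex%NCount)/TCount // U+1171
	T := TBase + SIndex%TCount          // U+11B6
	fmt.Printf("%U -> %U %U %U\n", s, L, V, T)
}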
func repairOrdering(runes []rune) {
	for i := 1; i < len(runes); i++ {
		a := runes[i-1]
		b := runes[i]
		oa := nfdOrder[a]
		ob := nfdOrder[b]
		if oa > ob && ob > 0 {
			runes[i-1], runes[i] = b, a
			if i >= 2 {
				i -= 2
			} else {
				i = 0
			}
		}
	}
}

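And a worked example (not part of the commit) of the reordering above: after decomposition, "q" + COMBINING DOT ABOVE (ccc 230) + COMBINING DOT BELOW (ccc 220) must be reordered so the lower combining class comes first, giving q + U+0323 + U+0307.

// Standalone sketch of the same swap-and-back-up loop over a two-entry order
// map (illustrative; nfdOrder in the real code is built from the embedded data).
package main

import "fmt"

func main() {
	order := map[rune]int32{0x0307: 230, 0x0323: 220}
	runes := []rune{'q', 0x0307, 0x0323}
	for i := 1; i < len(runes); i++ {
		a, b := runes[i-1], runes[i]
		if oa, ob := order[a], order[b]; oa > ob && ob > 0 {
			runes[i-1], runes[i] = b, a
			if i >= 2 {
				i -= 2
			} else {
				i = 0
			}
		}
	}
	fmt.Printf("%U\n", runes) // [U+0071 U+0323 U+0307]
}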
@@ -2,11 +2,10 @@ package normalization

import (
	"github.com/lbryio/lbcd/claimtrie/param"
-	"golang.org/x/text/unicode/norm"
)

var Normalize = normalizeGo
-var NormalizeTitle = "Normalizing strings via Go. Casefold table version = 11.0.0, NFD version = " + norm.Version
+var NormalizeTitle = "Normalizing strings via Go. Casefold and NFD table versions: 11.0.0 (from ICU 63.2)"

func NormalizeIfNecessary(name []byte, height int32) []byte {
	if height < param.ActiveParams.NormalizedNameForkHeight {
@@ -17,7 +16,7 @@ func NormalizeIfNecessary(name []byte, height int32) []byte {

func normalizeGo(value []byte) []byte {

-	normalized := norm.NFD.Bytes(value) // may need to hard-code the version on this
+	normalized := decompose(value) // may need to hard-code the version on this
	// not using x/text/cases because it does too good of a job; it seems to use v14 tables even when it claims v13
-	return CaseFold(normalized)
+	return caseFold(normalized)
}
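As a hedged illustration (not part of the diff) of the behavior the Go-only path should preserve: "Ä" (U+00C4) decomposes to "A" + U+0308 and then case-folds, so a hypothetical spot check in this package could look like the following.

// Hypothetical test, not part of the commit; assumes the package's exported
// Normalize variable and the testify dependency already used by the existing tests.
package normalization

import (
	"testing"

	"github.com/stretchr/testify/require"
)

func TestNormalizeSimpleNFD(t *testing.T) {
	got := Normalize([]byte("\u00C4"))       // "Ä"
	require.Equal(t, []byte("a\u0308"), got) // "a" + COMBINING DIAERESIS
}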
@@ -31,6 +31,8 @@ package normalization
// }
import "C"
import (
+	"bytes"
+	"encoding/hex"
	"fmt"
	"unsafe"
)
@@ -47,21 +49,29 @@ func IcuVersion() string {
}

func normalizeICU(value []byte) []byte {
+	original := value
	if len(value) <= 0 {
		return value
	}

+	other := normalizeGo(value)
+
	name := (*C.char)(unsafe.Pointer(&value[0]))
	length := C.int(len(value))

	// hopefully this is a stack alloc (but it may be a bit large for that):
	var resultName [512]byte // inputs are restricted to 255 chars; it shouldn't expand too much past that
-	result := unsafe.Pointer(&resultName[0])
+	pointer := unsafe.Pointer(&resultName[0])

-	resultLength := C.normalize(name, length, (*C.char)(result))
-	if resultLength == 0 {
+	resultLength := C.normalize(name, length, (*C.char)(pointer))
+	if resultLength > 0 {
+		value = C.GoBytes(pointer, resultLength)
	}

-	// return resultName[0:resultLength] -- we want to shrink the result (not use a slice on 1024)
-	return C.GoBytes(result, resultLength)
+	// return resultName[0:resultLength] -- we want to shrink the pointer (not use a slice on 1024)
+	if !bytes.Equal(other, value) {
+		fmt.Printf("Failed with %s, %s != %s,\n\t%s, %s != %s,\n", original, value, other,
+			hex.EncodeToString(original), hex.EncodeToString(value), hex.EncodeToString(other))
+	}
+	return value
}
@@ -4,6 +4,7 @@
package normalization

import (
	"bytes"
+	"encoding/hex"
	"testing"
	"unicode/utf8"
@@ -63,3 +64,11 @@ func TestBlock760150_1020105(t *testing.T) {
		// t.Logf("%s -> %s", s, string(b))
	}
}
+
+func TestBlock1085612(t *testing.T) {
+	s, err := hex.DecodeString("6eccb7cd9dcc92cd90cc86cc80cc80cd91cd9dcd8acd80cd92cc94cc85cc8fccbdcda0ccbdcd80cda0cd84cc94cc8ccc9acd84cc94cd9bcda0cca7cc99ccaccd99cca9cca7")
+	assert.NoError(t, err)
+	a := normalizeICU(s)
+	b := normalizeGo(s)
+	assert.Equal(t, a, b, "%s != %s, %v", string(a), string(b), bytes.Equal(b, s))
+}
@@ -1,7 +1,12 @@
package normalization

import (
+	"bufio"
	"bytes"
+	_ "embed"
	"math/rand"
+	"strconv"
+	"strings"
	"testing"

	"github.com/stretchr/testify/require"
@@ -52,3 +57,33 @@ func benchmarkNormalize(b *testing.B, normalize func(value []byte) []byte) {
		require.True(b, len(s) >= 8)
	}
}
+
+//go:embed NormalizationTest_v11.txt
+var nfdTests string
+
+func TestDecomposition(t *testing.T) {
+	r := require.New(t)
+
+	scanner := bufio.NewScanner(strings.NewReader(nfdTests))
+	for scanner.Scan() {
+		line := scanner.Text()
+		if len(line) <= 0 || line[0] == '@' || line[0] == '#' {
+			continue
+		}
+		splits := strings.Split(line, ";")
+		source := convertToBytes(splits[0])
+		targetNFD := convertToBytes(splits[2])
+		fixed := decompose(source)
+		r.True(bytes.Equal(targetNFD, fixed), "Failed on %s -> %s. Got %U, not %U", splits[0], splits[2], fixed, targetNFD)
+	}
+}
+
+func convertToBytes(s string) []byte {
+	splits := strings.Split(s, " ")
+	var b bytes.Buffer
+	for i := range splits {
+		value, _ := strconv.ParseUint(splits[i], 16, len(splits[i])*4)
+		b.WriteRune(rune(value))
+	}
+	return b.Bytes()
+}
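For context on the embedded test data (hedged; this follows the standard Unicode NormalizationTest convention): each non-comment line holds five semicolon-separated columns, source;NFC;NFD;NFKC;NFKD, given as space-separated hex code points, which is why the test reads splits[0] as the source and splits[2] as the NFD target.

// Illustrative parse of one NormalizationTest-style line (sample line, not
// quoted from the embedded file).
package main

import (
	"fmt"
	"strings"
)

func main() {
	line := "1E0A;1E0A;0044 0307;1E0A;0044 0307;"
	cols := strings.Split(line, ";")
	fmt.Println("source:", cols[0], "NFD:", cols[2]) // source: 1E0A NFD: 0044 0307
}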
go.mod (1 change)
@@ -22,7 +22,6 @@ require (
	github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7
	github.com/vmihailenco/msgpack/v5 v5.3.2
	golang.org/x/crypto v0.0.0-20211209193657-4570a0811e8b
-	golang.org/x/text v0.3.7
)

require (