[lbry] removed dependency on text/norm, fixed NFD normalization

This commit is contained in:
Brannon King 2021-12-29 14:24:19 -05:00
parent 50d678b007
commit 1f8ed174c0
10 changed files with 21519 additions and 1596 deletions

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -38,7 +38,7 @@ func init() {
}
}
func CaseFold(name []byte) []byte {
func caseFold(name []byte) []byte {
var b bytes.Buffer
b.Grow(len(name))
for i := 0; i < len(name); {

View file

@ -0,0 +1,177 @@
package normalization
import (
"bufio"
_ "embed"
"strconv"
"strings"
"unicode/utf8"
)
//go:embed NFC_v11.txt
var decompositions string // the data file that came from ICU 63.2
var nfdMap map[rune][]rune
var nfdOrder map[rune]int32
// init builds the NFD lookup tables (nfdMap and nfdOrder) from the embedded
// decompositions data.
//
// Each line of the data is handled as follows:
//   - empty lines and lines starting with '#' or '*' are skipped;
//   - lines containing ':' are ordering definitions and go to addOrdering;
//   - lines of the form KEY=V1 V2 ... or KEY>V1 V2 ... (base-16 numbers,
//     values separated by single spaces) define a decomposition mapping;
//   - every other line is ignored.
//
// After parsing, each mapping is expanded recursively so that no rune stored
// in an nfdMap value has a mapping of its own. The expansion does not depend
// on the order of lines in the data or on map iteration order.
//
// It panics if a number fails to parse, if the scanner reports an error, or
// if the mappings are cyclic and therefore cannot be fully expanded.
func init() {
	nfdMap = map[rune][]rune{}
	nfdOrder = map[rune]int32{}

	// direct holds the mappings exactly as written in the data file.
	direct := map[rune][]rune{}
	scanner := bufio.NewScanner(strings.NewReader(decompositions))
	for scanner.Scan() {
		line := scanner.Text()
		if len(line) == 0 || line[0] == '#' || line[0] == '*' {
			continue
		}
		if strings.Contains(line, ":") {
			// it's an ordering def:
			addOrdering(line)
			continue
		}
		splits := strings.Split(line, "=")
		if len(splits) <= 1 {
			splits = strings.Split(line, ">")
			if len(splits) <= 1 {
				continue
			}
		}
		key, err := strconv.ParseUint(splits[0], 16, len(splits[0])*4)
		if err != nil {
			panic(err)
		}
		fields := strings.Split(splits[1], " ")
		values := make([]rune, 0, len(fields))
		for _, field := range fields {
			value, err := strconv.ParseUint(field, 16, len(field)*4)
			if err != nil {
				panic(err)
			}
			values = append(values, rune(value))
		}
		direct[rune(key)] = values
	}
	// A scanner error would otherwise leave the tables silently truncated.
	if err := scanner.Err(); err != nil {
		panic(err)
	}

	// Expand every mapping completely. Expanding a single position per entry
	// is not enough: an entry may contain several decomposable runes, and a
	// rune's own mapping may need further expansion.
	active := map[rune]bool{}
	for key, values := range direct {
		active[key] = true
		expanded := make([]rune, 0, len(values))
		for _, value := range values {
			expanded = appendFullDecomposition(expanded, direct, value, active)
		}
		delete(active, key)
		nfdMap[key] = expanded
	}
}

// appendFullDecomposition appends the complete expansion of r to dst and
// returns the extended slice. A rune without an entry in direct expands to
// itself. active holds the runes currently being expanded; meeting one of
// them again means the mappings are cyclic, which panics.
func appendFullDecomposition(dst []rune, direct map[rune][]rune, r rune, active map[rune]bool) []rune {
	parts := direct[r]
	if len(parts) == 0 {
		return append(dst, r)
	}
	if active[r] {
		panic("Failed in NFD expansion")
	}
	active[r] = true
	for _, part := range parts {
		dst = appendFullDecomposition(dst, direct, part, active)
	}
	delete(active, r)
	return dst
}
// addOrdering records in nfdOrder the ordering value declared by one ordering
// line of the decompositions data.
//
// The line has the form START:VALUE or START..END:VALUE. START, END and VALUE
// are each parsed as base-16 numbers, and every code point from START through
// END inclusive (just START when no END is given) is assigned VALUE.
//
// The caller must pass a line that contains ':'. It panics if any of the
// numbers fails to parse.
func addOrdering(line string) {
	splits := strings.Split(line, ":")
	ranges := strings.Split(splits[0], "..")
	value, err := strconv.ParseUint(splits[1], 16, len(splits[1])*4)
	if err != nil {
		panic(err)
	}
	start, err := strconv.ParseUint(ranges[0], 16, len(ranges[0])*4)
	if err != nil {
		panic(err)
	}
	end := start
	if len(ranges) > 1 {
		// The bit size has to come from the END token itself: sizing it from
		// START rejects ranges whose end has more digits than its start.
		end, err = strconv.ParseUint(ranges[1], 16, len(ranges[1])*4)
		if err != nil {
			panic(err)
		}
	}
	for i := start; i <= end; i++ {
		nfdOrder[rune(i)] = int32(value)
	}
}
// decompose returns name with every rune replaced by its decomposition and
// the result passed through repairOrdering.
// See https://unicode.org/reports/tr15/ section 1.3.
//
// For each decoded rune the nfdMap entry is used when there is one; otherwise
// the result of decomposeHangul is used when it is non-empty; otherwise the
// rune is kept unchanged. If name contains bytes that do not decode as UTF-8,
// name itself is returned untouched.
func decompose(name []byte) []byte {
	// Names are typically ASCII, so one rune slot per input byte is a good
	// starting capacity.
	expanded := make([]rune, 0, len(name))
	for rest := name; len(rest) > 0; {
		r, size := utf8.DecodeRune(rest)
		if size < 2 && r == utf8.RuneError {
			// HACK: RuneError only signals a decoding failure at width 0 or 1;
			// with a width of 2 or more it is a genuinely encoded character.
			return name
		}
		rest = rest[size:]

		if mapped := nfdMap[r]; len(mapped) > 0 {
			expanded = append(expanded, mapped...)
		} else if jamo := decomposeHangul(r); len(jamo) > 0 {
			expanded = append(expanded, jamo...)
		} else {
			expanded = append(expanded, r)
		}
	}
	repairOrdering(expanded)
	return []byte(string(expanded))
}
// decomposeHangul splits a precomposed Hangul syllable into its parts using
// the arithmetic described in
// https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
//
// It returns nil when s is outside the 11172 code points starting at U+AC00.
// Otherwise it returns the leading and vowel parts, followed by the trailing
// part when the syllable has one.
func decomposeHangul(s rune) []rune {
	const (
		syllableBase rune = 0xAC00
		leadBase     rune = 0x1100
		vowelBase    rune = 0x1161
		trailBase    rune = 0x11A7
		leadCount    rune = 19
		vowelCount   rune = 21
		trailCount   rune = 28

		perLead       = vowelCount * trailCount // 588
		syllableCount = leadCount * perLead     // 11172
	)

	offset := s - syllableBase
	if offset < 0 || offset >= syllableCount {
		return nil
	}

	jamo := []rune{
		leadBase + offset/perLead,
		vowelBase + (offset%perLead)/trailCount,
	}
	// A zero trailing index means the syllable has no trailing part.
	if trail := offset % trailCount; trail != 0 {
		jamo = append(jamo, trailBase+trail)
	}
	return jamo
}
// repairOrdering reorders runes in place according to nfdOrder.
//
// An adjacent pair is swapped when the right-hand rune has a non-zero
// nfdOrder value that is smaller than the left-hand rune's value. Runes with
// no nfdOrder entry count as zero, so they never move left and nothing moves
// across them. After a swap the scan steps back one position, because the
// rune that moved left may also have to pass its new left-hand neighbour.
func repairOrdering(runes []rune) {
	i := 1
	for i < len(runes) {
		prev, cur := runes[i-1], runes[i]
		curOrder := nfdOrder[cur]
		if curOrder > 0 && nfdOrder[prev] > curOrder {
			runes[i-1], runes[i] = cur, prev
			// Re-examine the pair ending at the rune that just moved left;
			// at the start of the slice there is nothing further back.
			if i > 1 {
				i--
			}
			continue
		}
		i++
	}
}

View file

@ -2,11 +2,10 @@ package normalization
import (
"github.com/lbryio/lbcd/claimtrie/param"
"golang.org/x/text/unicode/norm"
)
var Normalize = normalizeGo
var NormalizeTitle = "Normalizing strings via Go. Casefold table version = 11.0.0, NFD version = " + norm.Version
var NormalizeTitle = "Normalizing strings via Go. Casefold and NFD table versions: 11.0.0 (from ICU 63.2)"
func NormalizeIfNecessary(name []byte, height int32) []byte {
if height < param.ActiveParams.NormalizedNameForkHeight {
@ -17,7 +16,7 @@ func NormalizeIfNecessary(name []byte, height int32) []byte {
func normalizeGo(value []byte) []byte {
normalized := norm.NFD.Bytes(value) // may need to hard-code the version on this
normalized := decompose(value) // may need to hard-code the version on this
// not using x/text/cases because it does too good of a job; it seems to use v14 tables even when it claims v13
return CaseFold(normalized)
return caseFold(normalized)
}

View file

@ -31,6 +31,8 @@ package normalization
// }
import "C"
import (
"bytes"
"encoding/hex"
"fmt"
"unsafe"
)
@ -47,21 +49,29 @@ func IcuVersion() string {
}
func normalizeICU(value []byte) []byte {
original := value
if len(value) <= 0 {
return value
}
other := normalizeGo(value)
name := (*C.char)(unsafe.Pointer(&value[0]))
length := C.int(len(value))
// hopefully this is a stack alloc (but it may be a bit large for that):
var resultName [512]byte // inputs are restricted to 255 chars; it shouldn't expand too much past that
result := unsafe.Pointer(&resultName[0])
pointer := unsafe.Pointer(&resultName[0])
resultLength := C.normalize(name, length, (*C.char)(result))
if resultLength == 0 {
return value
resultLength := C.normalize(name, length, (*C.char)(pointer))
if resultLength > 0 {
value = C.GoBytes(pointer, resultLength)
}
// return resultName[0:resultLength] -- we want to shrink the result (not use a slice on 1024)
return C.GoBytes(result, resultLength)
// return resultName[0:resultLength] -- we want to shrink the pointer (not use a slice on 1024)
if !bytes.Equal(other, value) {
fmt.Printf("Failed with %s, %s != %s,\n\t%s, %s != %s,\n", original, value, other,
hex.EncodeToString(original), hex.EncodeToString(value), hex.EncodeToString(other))
}
return value
}

View file

@ -4,6 +4,7 @@
package normalization
import (
"bytes"
"encoding/hex"
"testing"
"unicode/utf8"
@ -63,3 +64,11 @@ func TestBlock760150_1020105(t *testing.T) {
// t.Logf("%s -> %s", s, string(b))
}
}
// TestBlock1085612 checks that normalizeICU and normalizeGo return the same
// bytes for one fixed, hex-encoded input. On failure the message shows both
// results and whether the Go result equals the unnormalized input.
func TestBlock1085612(t *testing.T) {
	const encoded = "6eccb7cd9dcc92cd90cc86cc80cc80cd91cd9dcd8acd80cd92cc94cc85cc8fccbdcda0ccbdcd80cda0cd84cc94cc8ccc9acd84cc94cd9bcda0cca7cc99ccaccd99cca9cca7"

	input, err := hex.DecodeString(encoded)
	assert.NoError(t, err)

	viaICU := normalizeICU(input)
	viaGo := normalizeGo(input)
	assert.Equal(t, viaICU, viaGo, "%s != %s, %v", string(viaICU), string(viaGo), bytes.Equal(viaGo, input))
}

View file

@ -1,7 +1,12 @@
package normalization
import (
"bufio"
"bytes"
_ "embed"
"math/rand"
"strconv"
"strings"
"testing"
"github.com/stretchr/testify/require"
@ -52,3 +57,33 @@ func benchmarkNormalize(b *testing.B, normalize func(value []byte) []byte) {
require.True(b, len(s) >= 8)
}
}
//go:embed NormalizationTest_v11.txt
var nfdTests string
// TestDecomposition runs decompose over every data line of the embedded
// nfdTests text and compares the result with the expected value on that line.
//
// Empty lines and lines starting with '@' or '#' are skipped. Every other
// line is split on ';': field 0 is the source sequence and field 2 is the
// expected decomposition, both written as space-separated hexadecimal code
// points.
func TestDecomposition(t *testing.T) {
	r := require.New(t)

	scanner := bufio.NewScanner(strings.NewReader(nfdTests))
	for scanner.Scan() {
		line := scanner.Text()
		if len(line) == 0 || line[0] == '@' || line[0] == '#' {
			continue
		}
		splits := strings.Split(line, ";")
		// Fail with a readable message instead of an index panic.
		r.True(len(splits) >= 3, "Malformed test line: %q", line)

		source := convertToBytes(splits[0])
		targetNFD := convertToBytes(splits[2])
		fixed := decompose(source)
		// Convert to runes before formatting: %U applied to a byte slice would
		// print each UTF-8 byte as if it were a code point.
		r.True(bytes.Equal(targetNFD, fixed), "Failed on %s -> %s. Got %U, not %U",
			splits[0], splits[2], []rune(string(fixed)), []rune(string(targetNFD)))
	}
	// A scanner error would mean part of the test data was never checked.
	r.NoError(scanner.Err())
}
// convertToBytes turns a string of hexadecimal code points separated by
// single spaces (for example "0041 0300") into the UTF-8 encoding of those
// code points.
//
// Parse errors are discarded: a token that fails to parse contributes
// whatever value strconv.ParseUint returned alongside the error.
func convertToBytes(s string) []byte {
	var encoded bytes.Buffer
	for _, token := range strings.Split(s, " ") {
		codePoint, _ := strconv.ParseUint(token, 16, len(token)*4)
		encoded.WriteRune(rune(codePoint))
	}
	return encoded.Bytes()
}

1
go.mod
View file

@ -22,7 +22,6 @@ require (
github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7
github.com/vmihailenco/msgpack/v5 v5.3.2
golang.org/x/crypto v0.0.0-20211209193657-4570a0811e8b
golang.org/x/text v0.3.7
)
require (