[lbry] removed dependency on text/norm, fixed NFD normalization
This commit is contained in:
parent
50d678b007
commit
1f8ed174c0
10 changed files with 21519 additions and 1596 deletions
File diff suppressed because it is too large
Load diff
2431
claimtrie/normalization/NFC_v11.txt
Normal file
2431
claimtrie/normalization/NFC_v11.txt
Normal file
File diff suppressed because it is too large
Load diff
18847
claimtrie/normalization/NormalizationTest_v11.txt
Normal file
18847
claimtrie/normalization/NormalizationTest_v11.txt
Normal file
File diff suppressed because it is too large
Load diff
|
@ -38,7 +38,7 @@ func init() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func CaseFold(name []byte) []byte {
|
func caseFold(name []byte) []byte {
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
b.Grow(len(name))
|
b.Grow(len(name))
|
||||||
for i := 0; i < len(name); {
|
for i := 0; i < len(name); {
|
||||||
|
|
177
claimtrie/normalization/char_decomposer.go
Normal file
177
claimtrie/normalization/char_decomposer.go
Normal file
|
@ -0,0 +1,177 @@
|
||||||
|
package normalization
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
_ "embed"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"unicode/utf8"
|
||||||
|
)
|
||||||
|
|
||||||
|
//go:embed NFC_v11.txt
|
||||||
|
var decompositions string // the data file that came from ICU 63.2
|
||||||
|
|
||||||
|
var nfdMap map[rune][]rune
|
||||||
|
var nfdOrder map[rune]int32
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
nfdMap = map[rune][]rune{}
|
||||||
|
nfdOrder = map[rune]int32{}
|
||||||
|
scanner := bufio.NewScanner(strings.NewReader(decompositions))
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := scanner.Text()
|
||||||
|
if len(line) <= 0 || line[0] == '#' || line[0] == '*' {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.ContainsAny(line, ":") {
|
||||||
|
// it's a ordering def:
|
||||||
|
addOrdering(line)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
splits := strings.Split(line, "=")
|
||||||
|
if len(splits) <= 1 {
|
||||||
|
splits = strings.Split(line, ">")
|
||||||
|
if len(splits) <= 1 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
key, err := strconv.ParseUint(splits[0], 16, len(splits[0])*4)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
splits = strings.Split(splits[1], " ")
|
||||||
|
values := make([]rune, 0, len(splits))
|
||||||
|
for j := range splits {
|
||||||
|
value, err := strconv.ParseUint(splits[j], 16, len(splits[j])*4)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
existing := nfdMap[rune(value)]
|
||||||
|
if len(existing) > 0 {
|
||||||
|
values = append(values, existing...)
|
||||||
|
} else {
|
||||||
|
values = append(values, rune(value))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nfdMap[rune(key)] = values
|
||||||
|
}
|
||||||
|
|
||||||
|
// run one more expansion pass to catch stragglers
|
||||||
|
for key, values := range nfdMap {
|
||||||
|
for i, value := range values {
|
||||||
|
other := nfdMap[value]
|
||||||
|
if len(other) > 0 {
|
||||||
|
newValues := make([]rune, len(values)+len(other)-1)
|
||||||
|
copy(newValues, values[:i])
|
||||||
|
copy(newValues[i:i+len(other)], other)
|
||||||
|
copy(newValues[i+len(other):], values[i+1:])
|
||||||
|
nfdMap[key] = newValues
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// assert no more expansions are necessary:
|
||||||
|
for _, values := range nfdMap {
|
||||||
|
for _, value := range values {
|
||||||
|
other := nfdMap[value]
|
||||||
|
if len(other) > 0 {
|
||||||
|
panic("Failed in NFD expansion")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func addOrdering(line string) {
|
||||||
|
splits := strings.Split(line, ":")
|
||||||
|
ranges := strings.Split(splits[0], "..")
|
||||||
|
|
||||||
|
value, err := strconv.ParseUint(splits[1], 16, len(splits[1])*4)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
start, err := strconv.ParseUint(ranges[0], 16, len(ranges[0])*4)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
end := start
|
||||||
|
if len(ranges) > 1 {
|
||||||
|
end, err = strconv.ParseUint(ranges[1], 16, len(ranges[0])*4)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for i := start; i <= end; i++ {
|
||||||
|
nfdOrder[rune(i)] = int32(value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func decompose(name []byte) []byte {
|
||||||
|
// see https://unicode.org/reports/tr15/ section 1.3
|
||||||
|
runes := make([]rune, 0, len(name)) // we typically use ascii don't increase the length
|
||||||
|
for i := 0; i < len(name); {
|
||||||
|
r, w := utf8.DecodeRune(name[i:])
|
||||||
|
if r == utf8.RuneError && w < 2 {
|
||||||
|
// HACK: their RuneError is actually a valid character if coming from a width of 2 or more
|
||||||
|
return name
|
||||||
|
}
|
||||||
|
replacements := nfdMap[r]
|
||||||
|
if len(replacements) > 0 {
|
||||||
|
runes = append(runes, replacements...)
|
||||||
|
} else {
|
||||||
|
hanguls := decomposeHangul(r)
|
||||||
|
if len(hanguls) > 0 {
|
||||||
|
runes = append(runes, hanguls...)
|
||||||
|
} else {
|
||||||
|
runes = append(runes, r)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
i += w
|
||||||
|
}
|
||||||
|
repairOrdering(runes)
|
||||||
|
return []byte(string(runes))
|
||||||
|
}
|
||||||
|
|
||||||
|
// decomposeHangul arithmetically decomposes a precomposed Hangul syllable
// into its constituent jamo (leading consonant, vowel, optional trailing
// consonant). It returns nil when s is not a precomposed syllable.
func decomposeHangul(s rune) []rune {
	// see https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
	const (
		sBase  int32 = 0xAC00 // first precomposed syllable
		lBase  int32 = 0x1100 // leading consonant (choseong) base
		vBase  int32 = 0x1161 // vowel (jungseong) base
		tBase  int32 = 0x11A7 // trailing consonant (jongseong) base
		lCount int32 = 19
		vCount int32 = 21
		tCount int32 = 28
		nCount       = vCount * tCount // 588
		sCount       = lCount * nCount // 11172
	)

	index := s - sBase
	if index < 0 || index >= sCount {
		return nil // outside the precomposed Hangul block
	}

	lead := lBase + index/nCount
	vowel := vBase + (index%nCount)/tCount
	trail := tBase + index%tCount

	jamo := []rune{lead, vowel}
	if trail != tBase {
		// trail == tBase encodes "no trailing consonant"
		jamo = append(jamo, trail)
	}
	return jamo
}
|
||||||
|
|
||||||
|
func repairOrdering(runes []rune) {
|
||||||
|
for i := 1; i < len(runes); i++ {
|
||||||
|
a := runes[i-1]
|
||||||
|
b := runes[i]
|
||||||
|
oa := nfdOrder[a]
|
||||||
|
ob := nfdOrder[b]
|
||||||
|
if oa > ob && ob > 0 {
|
||||||
|
runes[i-1], runes[i] = b, a
|
||||||
|
if i >= 2 {
|
||||||
|
i -= 2
|
||||||
|
} else {
|
||||||
|
i = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -2,11 +2,10 @@ package normalization
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"github.com/lbryio/lbcd/claimtrie/param"
|
"github.com/lbryio/lbcd/claimtrie/param"
|
||||||
"golang.org/x/text/unicode/norm"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
var Normalize = normalizeGo
|
var Normalize = normalizeGo
|
||||||
var NormalizeTitle = "Normalizing strings via Go. Casefold table version = 11.0.0, NFD version = " + norm.Version
|
var NormalizeTitle = "Normalizing strings via Go. Casefold and NFD table versions: 11.0.0 (from ICU 63.2)"
|
||||||
|
|
||||||
func NormalizeIfNecessary(name []byte, height int32) []byte {
|
func NormalizeIfNecessary(name []byte, height int32) []byte {
|
||||||
if height < param.ActiveParams.NormalizedNameForkHeight {
|
if height < param.ActiveParams.NormalizedNameForkHeight {
|
||||||
|
@ -17,7 +16,7 @@ func NormalizeIfNecessary(name []byte, height int32) []byte {
|
||||||
|
|
||||||
func normalizeGo(value []byte) []byte {
|
func normalizeGo(value []byte) []byte {
|
||||||
|
|
||||||
normalized := norm.NFD.Bytes(value) // may need to hard-code the version on this
|
normalized := decompose(value) // may need to hard-code the version on this
|
||||||
// not using x/text/cases because it does too good of a job; it seems to use v14 tables even when it claims v13
|
// not using x/text/cases because it does too good of a job; it seems to use v14 tables even when it claims v13
|
||||||
return CaseFold(normalized)
|
return caseFold(normalized)
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,6 +31,8 @@ package normalization
|
||||||
// }
|
// }
|
||||||
import "C"
|
import "C"
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
|
"encoding/hex"
|
||||||
"fmt"
|
"fmt"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
)
|
)
|
||||||
|
@ -47,21 +49,29 @@ func IcuVersion() string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func normalizeICU(value []byte) []byte {
|
func normalizeICU(value []byte) []byte {
|
||||||
|
original := value
|
||||||
if len(value) <= 0 {
|
if len(value) <= 0 {
|
||||||
return value
|
return value
|
||||||
}
|
}
|
||||||
|
|
||||||
|
other := normalizeGo(value)
|
||||||
|
|
||||||
name := (*C.char)(unsafe.Pointer(&value[0]))
|
name := (*C.char)(unsafe.Pointer(&value[0]))
|
||||||
length := C.int(len(value))
|
length := C.int(len(value))
|
||||||
|
|
||||||
// hopefully this is a stack alloc (but it may be a bit large for that):
|
// hopefully this is a stack alloc (but it may be a bit large for that):
|
||||||
var resultName [512]byte // inputs are restricted to 255 chars; it shouldn't expand too much past that
|
var resultName [512]byte // inputs are restricted to 255 chars; it shouldn't expand too much past that
|
||||||
result := unsafe.Pointer(&resultName[0])
|
pointer := unsafe.Pointer(&resultName[0])
|
||||||
|
|
||||||
resultLength := C.normalize(name, length, (*C.char)(result))
|
resultLength := C.normalize(name, length, (*C.char)(pointer))
|
||||||
if resultLength == 0 {
|
if resultLength > 0 {
|
||||||
return value
|
value = C.GoBytes(pointer, resultLength)
|
||||||
}
|
}
|
||||||
|
|
||||||
// return resultName[0:resultLength] -- we want to shrink the result (not use a slice on 1024)
|
// return resultName[0:resultLength] -- we want to shrink the pointer (not use a slice on 1024)
|
||||||
return C.GoBytes(result, resultLength)
|
if !bytes.Equal(other, value) {
|
||||||
|
fmt.Printf("Failed with %s, %s != %s,\n\t%s, %s != %s,\n", original, value, other,
|
||||||
|
hex.EncodeToString(original), hex.EncodeToString(value), hex.EncodeToString(other))
|
||||||
|
}
|
||||||
|
return value
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,6 +4,7 @@
|
||||||
package normalization
|
package normalization
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"encoding/hex"
|
"encoding/hex"
|
||||||
"testing"
|
"testing"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
|
@ -63,3 +64,11 @@ func TestBlock760150_1020105(t *testing.T) {
|
||||||
// t.Logf("%s -> %s", s, string(b))
|
// t.Logf("%s -> %s", s, string(b))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBlock1085612(t *testing.T) {
|
||||||
|
s, err := hex.DecodeString("6eccb7cd9dcc92cd90cc86cc80cc80cd91cd9dcd8acd80cd92cc94cc85cc8fccbdcda0ccbdcd80cda0cd84cc94cc8ccc9acd84cc94cd9bcda0cca7cc99ccaccd99cca9cca7")
|
||||||
|
assert.NoError(t, err)
|
||||||
|
a := normalizeICU(s)
|
||||||
|
b := normalizeGo(s)
|
||||||
|
assert.Equal(t, a, b, "%s != %s, %v", string(a), string(b), bytes.Equal(b, s))
|
||||||
|
}
|
||||||
|
|
|
@ -1,7 +1,12 @@
|
||||||
package normalization
|
package normalization
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
|
"bytes"
|
||||||
|
_ "embed"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
|
@ -52,3 +57,33 @@ func benchmarkNormalize(b *testing.B, normalize func(value []byte) []byte) {
|
||||||
require.True(b, len(s) >= 8)
|
require.True(b, len(s) >= 8)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//go:embed NormalizationTest_v11.txt
|
||||||
|
var nfdTests string
|
||||||
|
|
||||||
|
func TestDecomposition(t *testing.T) {
|
||||||
|
r := require.New(t)
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(strings.NewReader(nfdTests))
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := scanner.Text()
|
||||||
|
if len(line) <= 0 || line[0] == '@' || line[0] == '#' {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
splits := strings.Split(line, ";")
|
||||||
|
source := convertToBytes(splits[0])
|
||||||
|
targetNFD := convertToBytes(splits[2])
|
||||||
|
fixed := decompose(source)
|
||||||
|
r.True(bytes.Equal(targetNFD, fixed), "Failed on %s -> %s. Got %U, not %U", splits[0], splits[2], fixed, targetNFD)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func convertToBytes(s string) []byte {
|
||||||
|
splits := strings.Split(s, " ")
|
||||||
|
var b bytes.Buffer
|
||||||
|
for i := range splits {
|
||||||
|
value, _ := strconv.ParseUint(splits[i], 16, len(splits[i])*4)
|
||||||
|
b.WriteRune(rune(value))
|
||||||
|
}
|
||||||
|
return b.Bytes()
|
||||||
|
}
|
||||||
|
|
1
go.mod
1
go.mod
|
@ -22,7 +22,6 @@ require (
|
||||||
github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7
|
github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7
|
||||||
github.com/vmihailenco/msgpack/v5 v5.3.2
|
github.com/vmihailenco/msgpack/v5 v5.3.2
|
||||||
golang.org/x/crypto v0.0.0-20211209193657-4570a0811e8b
|
golang.org/x/crypto v0.0.0-20211209193657-4570a0811e8b
|
||||||
golang.org/x/text v0.3.7
|
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
|
|
Loading…
Reference in a new issue