[lbry] removed dependency on text/norm, fixed NFD normalization
commit 1f8ed174c0 (parent 50d678b007)
10 changed files with 21519 additions and 1596 deletions
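As the commit message says, the golang.org/x/text/unicode/norm dependency is dropped and NFD decomposition is now driven by embedded Unicode 11.0.0 data. For readers unfamiliar with NFD, here is a minimal, hypothetical sketch of what canonical decomposition does; the tinyNFD table and tinyDecompose helper are invented for illustration and are not the embedded NFC_v11.txt data or the package's decompose function:

package main

import (
	"fmt"
	"unicode/utf8"
)

// Hypothetical two-entry decomposition table, made up for this example only;
// the commit itself embeds the full Unicode 11.0.0 data (NFC_v11.txt).
var tinyNFD = map[rune][]rune{
	'\u00E9': {'e', '\u0301'}, // é -> e + COMBINING ACUTE ACCENT
	'\u00C5': {'A', '\u030A'}, // Å -> A + COMBINING RING ABOVE
}

func tinyDecompose(name []byte) []byte {
	out := make([]rune, 0, len(name))
	for i := 0; i < len(name); {
		r, w := utf8.DecodeRune(name[i:])
		if repl, ok := tinyNFD[r]; ok {
			out = append(out, repl...) // replace a precomposed rune with its parts
		} else {
			out = append(out, r)
		}
		i += w
	}
	return []byte(string(out))
}

func main() {
	fmt.Printf("%U\n", []rune(string(tinyDecompose([]byte("éÅ"))))) // [U+0065 U+0301 U+0041 U+030A]
}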
claimtrie/normalization/NFC_v11.txt (new file, 2431 lines; diff suppressed because it is too large)
claimtrie/normalization/NormalizationTest_v11.txt (new file, 18847 lines; diff suppressed because it is too large)
@@ -38,7 +38,7 @@ func init() {
 	}
 }
 
-func CaseFold(name []byte) []byte {
+func caseFold(name []byte) []byte {
 	var b bytes.Buffer
 	b.Grow(len(name))
 	for i := 0; i < len(name); {
claimtrie/normalization/char_decomposer.go (new file, 177 lines)
@@ -0,0 +1,177 @@
+package normalization
+
+import (
+	"bufio"
+	_ "embed"
+	"strconv"
+	"strings"
+	"unicode/utf8"
+)
+
+//go:embed NFC_v11.txt
+var decompositions string // the data file that came from ICU 63.2
+
+var nfdMap map[rune][]rune
+var nfdOrder map[rune]int32
+
+func init() {
+	nfdMap = map[rune][]rune{}
+	nfdOrder = map[rune]int32{}
+	scanner := bufio.NewScanner(strings.NewReader(decompositions))
+	for scanner.Scan() {
+		line := scanner.Text()
+		if len(line) <= 0 || line[0] == '#' || line[0] == '*' {
+			continue
+		}
+		if strings.ContainsAny(line, ":") {
+			// it's a ordering def:
+			addOrdering(line)
+			continue
+		}
+		splits := strings.Split(line, "=")
+		if len(splits) <= 1 {
+			splits = strings.Split(line, ">")
+			if len(splits) <= 1 {
+				continue
+			}
+		}
+		key, err := strconv.ParseUint(splits[0], 16, len(splits[0])*4)
+		if err != nil {
+			panic(err)
+		}
+		splits = strings.Split(splits[1], " ")
+		values := make([]rune, 0, len(splits))
+		for j := range splits {
+			value, err := strconv.ParseUint(splits[j], 16, len(splits[j])*4)
+			if err != nil {
+				panic(err)
+			}
+			existing := nfdMap[rune(value)]
+			if len(existing) > 0 {
+				values = append(values, existing...)
+			} else {
+				values = append(values, rune(value))
+			}
+		}
+		nfdMap[rune(key)] = values
+	}
+
+	// run one more expansion pass to catch stragglers
+	for key, values := range nfdMap {
+		for i, value := range values {
+			other := nfdMap[value]
+			if len(other) > 0 {
+				newValues := make([]rune, len(values)+len(other)-1)
+				copy(newValues, values[:i])
+				copy(newValues[i:i+len(other)], other)
+				copy(newValues[i+len(other):], values[i+1:])
+				nfdMap[key] = newValues
+			}
+		}
+	}
+
+	// assert no more expansions are necessary:
+	for _, values := range nfdMap {
+		for _, value := range values {
+			other := nfdMap[value]
+			if len(other) > 0 {
+				panic("Failed in NFD expansion")
+			}
+		}
+	}
+}
+
+func addOrdering(line string) {
+	splits := strings.Split(line, ":")
+	ranges := strings.Split(splits[0], "..")
+
+	value, err := strconv.ParseUint(splits[1], 16, len(splits[1])*4)
+	if err != nil {
+		panic(err)
+	}
+
+	start, err := strconv.ParseUint(ranges[0], 16, len(ranges[0])*4)
+	if err != nil {
+		panic(err)
+	}
+	end := start
+	if len(ranges) > 1 {
+		end, err = strconv.ParseUint(ranges[1], 16, len(ranges[0])*4)
+		if err != nil {
+			panic(err)
+		}
+	}
+	for i := start; i <= end; i++ {
+		nfdOrder[rune(i)] = int32(value)
+	}
+}
+
+func decompose(name []byte) []byte {
+	// see https://unicode.org/reports/tr15/ section 1.3
+	runes := make([]rune, 0, len(name)) // we typically use ascii don't increase the length
+	for i := 0; i < len(name); {
+		r, w := utf8.DecodeRune(name[i:])
+		if r == utf8.RuneError && w < 2 {
+			// HACK: their RuneError is actually a valid character if coming from a width of 2 or more
+			return name
+		}
+		replacements := nfdMap[r]
+		if len(replacements) > 0 {
+			runes = append(runes, replacements...)
+		} else {
+			hanguls := decomposeHangul(r)
+			if len(hanguls) > 0 {
+				runes = append(runes, hanguls...)
+			} else {
+				runes = append(runes, r)
+			}
+		}
+		i += w
+	}
+	repairOrdering(runes)
+	return []byte(string(runes))
+}
+
+func decomposeHangul(s rune) []rune {
+	// see https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
+
+	const SBase int32 = 0xAC00
+	const LBase int32 = 0x1100
+	const VBase int32 = 0x1161
+	const TBase int32 = 0x11A7
+	const LCount int32 = 19
+	const VCount int32 = 21
+	const TCount int32 = 28
+	const NCount = VCount * TCount // 588
+	const SCount = LCount * NCount // 11172
+
+	SIndex := s - SBase
+	if SIndex < 0 || SIndex >= SCount {
+		return nil
+	}
+	L := LBase + SIndex/NCount
+	V := VBase + (SIndex%NCount)/TCount
+	T := TBase + SIndex%TCount
+	result := []rune{L, V}
+	if T != TBase {
+		result = append(result, T)
+	}
+	return result
+}
+
+func repairOrdering(runes []rune) {
+	for i := 1; i < len(runes); i++ {
+		a := runes[i-1]
+		b := runes[i]
+		oa := nfdOrder[a]
+		ob := nfdOrder[b]
+		if oa > ob && ob > 0 {
+			runes[i-1], runes[i] = b, a
+			if i >= 2 {
+				i -= 2
+			} else {
+				i = 0
+			}
+		}
+	}
+}
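The Hangul branch in decompose above never consults the data file; it is pure arithmetic from Unicode 11.0.0 chapter 3. Below is a standalone sketch of that arithmetic with a worked example; hangulNFD is a hypothetical name used only here, mirroring decomposeHangul:

package main

import "fmt"

// Standalone sketch of the Hangul decomposition arithmetic used by
// decomposeHangul above (constants from Unicode 11.0.0, chapter 3).
func hangulNFD(s rune) []rune {
	const SBase, LBase, VBase, TBase rune = 0xAC00, 0x1100, 0x1161, 0x11A7
	const LCount, VCount, TCount = 19, 21, 28
	const NCount = VCount * TCount // 588
	const SCount = LCount * NCount // 11172
	SIndex := s - SBase
	if SIndex < 0 || SIndex >= SCount {
		return nil // not a precomposed Hangul syllable
	}
	L := LBase + SIndex/NCount
	V := VBase + (SIndex%NCount)/TCount
	T := TBase + SIndex%TCount
	if T == TBase {
		return []rune{L, V} // LV syllable: no trailing consonant
	}
	return []rune{L, V, T} // LVT syllable
}

func main() {
	// U+AC01 (각): SIndex = 1, so L = U+1100, V = U+1161, T = U+11A8.
	fmt.Printf("%U\n", hangulNFD('\uAC01')) // [U+1100 U+1161 U+11A8]
}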
@@ -2,11 +2,10 @@ package normalization
 
 import (
 	"github.com/lbryio/lbcd/claimtrie/param"
-	"golang.org/x/text/unicode/norm"
 )
 
 var Normalize = normalizeGo
-var NormalizeTitle = "Normalizing strings via Go. Casefold table version = 11.0.0, NFD version = " + norm.Version
+var NormalizeTitle = "Normalizing strings via Go. Casefold and NFD table versions: 11.0.0 (from ICU 63.2)"
 
 func NormalizeIfNecessary(name []byte, height int32) []byte {
 	if height < param.ActiveParams.NormalizedNameForkHeight {

@@ -17,7 +16,7 @@ func NormalizeIfNecessary(name []byte, height int32) []byte {
 
 func normalizeGo(value []byte) []byte {
-	normalized := norm.NFD.Bytes(value) // may need to hard-code the version on this
+	normalized := decompose(value) // may need to hard-code the version on this
 	// not using x/text/cases because it does too good of a job; it seems to use v14 tables even when it claims v13
-	return CaseFold(normalized)
+	return caseFold(normalized)
 }
 
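A hypothetical test-style sketch (not part of this commit) of what the decompose-then-caseFold pipeline behind normalizeGo should produce for a simple precomposed input, assuming it is placed alongside the normalization package above: NFD turns "Ä" into "A" plus U+0308, and case folding then lowers the "A".

package normalization

import (
	"bytes"
	"testing"
)

// Hypothetical sketch only: exercises the decompose -> caseFold pipeline
// behind normalizeGo with one precomposed input.
func TestNormalizeGoSketch(t *testing.T) {
	in := []byte("\u00C4")    // "Ä", precomposed
	want := []byte("a\u0308") // "a" + COMBINING DIAERESIS after NFD and case folding
	if got := normalizeGo(in); !bytes.Equal(got, want) {
		t.Fatalf("got %U, want %U", []rune(string(got)), []rune(string(want)))
	}
}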
@@ -31,6 +31,8 @@ package normalization
 // }
 import "C"
 import (
+	"bytes"
+	"encoding/hex"
 	"fmt"
 	"unsafe"
 )
@@ -47,21 +49,29 @@ func IcuVersion() string {
 }
 
 func normalizeICU(value []byte) []byte {
+	original := value
 	if len(value) <= 0 {
 		return value
 	}
+
+	other := normalizeGo(value)
+
 	name := (*C.char)(unsafe.Pointer(&value[0]))
 	length := C.int(len(value))
 
 	// hopefully this is a stack alloc (but it may be a bit large for that):
 	var resultName [512]byte // inputs are restricted to 255 chars; it shouldn't expand too much past that
-	result := unsafe.Pointer(&resultName[0])
+	pointer := unsafe.Pointer(&resultName[0])
 
-	resultLength := C.normalize(name, length, (*C.char)(result))
-	if resultLength == 0 {
-		return value
+	resultLength := C.normalize(name, length, (*C.char)(pointer))
+	if resultLength > 0 {
+		value = C.GoBytes(pointer, resultLength)
 	}
 
-	// return resultName[0:resultLength] -- we want to shrink the result (not use a slice on 1024)
-	return C.GoBytes(result, resultLength)
+	// return resultName[0:resultLength] -- we want to shrink the pointer (not use a slice on 1024)
+	if !bytes.Equal(other, value) {
+		fmt.Printf("Failed with %s, %s != %s,\n\t%s, %s != %s,\n", original, value, other,
+			hex.EncodeToString(original), hex.EncodeToString(value), hex.EncodeToString(other))
+	}
+	return value
 }
@@ -4,6 +4,7 @@
 package normalization
 
 import (
+	"bytes"
 	"encoding/hex"
 	"testing"
 	"unicode/utf8"

@@ -63,3 +64,11 @@ func TestBlock760150_1020105(t *testing.T) {
 		// t.Logf("%s -> %s", s, string(b))
 	}
 }
+
+func TestBlock1085612(t *testing.T) {
+	s, err := hex.DecodeString("6eccb7cd9dcc92cd90cc86cc80cc80cd91cd9dcd8acd80cd92cc94cc85cc8fccbdcda0ccbdcd80cda0cd84cc94cc8ccc9acd84cc94cd9bcda0cca7cc99ccaccd99cca9cca7")
+	assert.NoError(t, err)
+	a := normalizeICU(s)
+	b := normalizeGo(s)
+	assert.Equal(t, a, b, "%s != %s, %v", string(a), string(b), bytes.Equal(b, s))
+}
@@ -1,7 +1,12 @@
 package normalization
 
 import (
+	"bufio"
+	"bytes"
+	_ "embed"
 	"math/rand"
+	"strconv"
+	"strings"
 	"testing"
 
 	"github.com/stretchr/testify/require"

@@ -52,3 +57,33 @@ func benchmarkNormalize(b *testing.B, normalize func(value []byte) []byte) {
 		require.True(b, len(s) >= 8)
 	}
 }
+
+//go:embed NormalizationTest_v11.txt
+var nfdTests string
+
+func TestDecomposition(t *testing.T) {
+	r := require.New(t)
+
+	scanner := bufio.NewScanner(strings.NewReader(nfdTests))
+	for scanner.Scan() {
+		line := scanner.Text()
+		if len(line) <= 0 || line[0] == '@' || line[0] == '#' {
+			continue
+		}
+		splits := strings.Split(line, ";")
+		source := convertToBytes(splits[0])
+		targetNFD := convertToBytes(splits[2])
+		fixed := decompose(source)
+		r.True(bytes.Equal(targetNFD, fixed), "Failed on %s -> %s. Got %U, not %U", splits[0], splits[2], fixed, targetNFD)
+	}
+}
+
+func convertToBytes(s string) []byte {
+	splits := strings.Split(s, " ")
+	var b bytes.Buffer
+	for i := range splits {
+		value, _ := strconv.ParseUint(splits[i], 16, len(splits[i])*4)
+		b.WriteRune(rune(value))
+	}
+	return b.Bytes()
+}
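TestDecomposition above relies on the column layout of the Unicode NormalizationTest data: each data line holds five semicolon-separated fields of hex code points (source; NFC; NFD; NFKC; NFKD), which is why the test takes splits[0] as the source and splits[2] as the NFD target. A standalone sketch with one illustrative line of that format:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Illustrative NormalizationTest-style line: source;NFC;NFD;NFKC;NFKD; # comment
	line := "1E0A;1E0A;0044 0307;1E0A;0044 0307; # LATIN CAPITAL LETTER D WITH DOT ABOVE"
	fields := strings.Split(line, ";")
	fmt.Println("source:", fields[0]) // 1E0A      (Ḋ)
	fmt.Println("NFD:   ", fields[2]) // 0044 0307 (D + combining dot above)
}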
go.mod (1 changed line)

@@ -22,7 +22,6 @@ require (
 	github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7
 	github.com/vmihailenco/msgpack/v5 v5.3.2
 	golang.org/x/crypto v0.0.0-20211209193657-4570a0811e8b
-	golang.org/x/text v0.3.7
 )
 
 require (