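Fix bug with case folding table: parse code points with strconv.ParseUint (base 16) instead of strconv.Unquote on "\u" escapes, and stop ignoring parse errors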
This commit is contained in:
parent
fa9239e0c5
commit
6009f9048c
5 changed files with 50 additions and 9 deletions
|
@ -14,6 +14,7 @@ type NormalizingManager struct { // implements Manager
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewNormalizingManager(baseManager Manager) Manager {
|
func NewNormalizingManager(baseManager Manager) Manager {
|
||||||
|
log.Info(normalization.NormalizeTitle)
|
||||||
return &NormalizingManager{
|
return &NormalizingManager{
|
||||||
Manager: baseManager,
|
Manager: baseManager,
|
||||||
normalizedAt: -1,
|
normalizedAt: -1,
|
||||||
|
|
|
@ -20,14 +20,20 @@ func init() {
|
||||||
matches := r.FindAllStringSubmatch(v11, 1000000000)
|
matches := r.FindAllStringSubmatch(v11, 1000000000)
|
||||||
for i := range matches {
|
for i := range matches {
|
||||||
if matches[i][2] == "C" || matches[i][2] == "F" {
|
if matches[i][2] == "C" || matches[i][2] == "F" {
|
||||||
key, _ := strconv.Unquote(`"\u` + matches[i][1] + `"`)
|
key, err := strconv.ParseUint(matches[i][1], 16, len(matches[i][1])*4)
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
splits := strings.Split(matches[i][3], " ")
|
splits := strings.Split(matches[i][3], " ")
|
||||||
var values []rune
|
var values []rune
|
||||||
for j := range splits {
|
for j := range splits {
|
||||||
value, _ := strconv.Unquote(`"\u` + splits[j] + `"`)
|
value, err := strconv.ParseUint(splits[j], 16, len(splits[j])*4)
|
||||||
values = append(values, []rune(value)[0])
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
}
|
}
|
||||||
foldMap[[]rune(key)[0]] = values
|
values = append(values, rune(value))
|
||||||
|
}
|
||||||
|
foldMap[rune(key)] = values
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,6 +6,7 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
var Normalize = normalizeGo
|
var Normalize = normalizeGo
|
||||||
|
var NormalizeTitle = "Normalizing strings via Go. Casefold table version = 11.0.0, NFD version = " + norm.Version
|
||||||
|
|
||||||
func NormalizeIfNecessary(name []byte, height int32) []byte {
|
func NormalizeIfNecessary(name []byte, height int32) []byte {
|
||||||
if height < param.ActiveParams.NormalizedNameForkHeight {
|
if height < param.ActiveParams.NormalizedNameForkHeight {
|
||||||
|
|
|
@ -37,6 +37,7 @@ import (
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
Normalize = normalizeICU
|
Normalize = normalizeICU
|
||||||
|
NormalizeTitle = "Normalizing strings via ICU. ICU version = " + IcuVersion()
|
||||||
}
|
}
|
||||||
|
|
||||||
func IcuVersion() string {
|
func IcuVersion() string {
|
||||||
|
|
|
@ -19,15 +19,47 @@ func BenchmarkNormalizeICU(b *testing.B) {
|
||||||
benchmarkNormalize(b, normalizeICU)
|
benchmarkNormalize(b, normalizeICU)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestBlock760150(t *testing.T) {
|
var testStrings = []string{
|
||||||
|
"Les-Masques-Blancs-Die-Dead-place-Sathonay-28-Août",
|
||||||
|
"Bez-komentu-výbuch-z-vnútra,-radšej-pozri-video...-",
|
||||||
|
"၂-နစ်အကြာမှာ",
|
||||||
|
"ငရဲပြည်မှ-6",
|
||||||
|
"@happyvision",
|
||||||
|
"ကမ္ဘာပျက်ကိန်း-9",
|
||||||
|
"ဝိညာဉ်နား၊-3",
|
||||||
|
"un-amore-nuovo-o-un-ritorno-cosa-mi-dona",
|
||||||
|
"è-innamorato-di-me-anche-se-non-lo-dice",
|
||||||
|
"ပြင်ဆင်ပါ-no.1",
|
||||||
|
"ပြင်ဆင်ပါ-no.4",
|
||||||
|
"ပြင်ဆင်ပါ-no.2",
|
||||||
|
"ပြင်ဆင်ပါ-no.3",
|
||||||
|
"ငရဲပြည်မှ-5",
|
||||||
|
"ပြင်ဆင်ပါ-no.6",
|
||||||
|
"ပြင်ဆင်ပါ-no.5",
|
||||||
|
"ပြင်ဆင်ပါ-no.7",
|
||||||
|
"ပြင်ဆင်ပါ-no.8",
|
||||||
|
"အချိန်-2",
|
||||||
|
"ဝိညာဉ်နား၊-4",
|
||||||
|
"ပြင်ဆင်ပါ-no.-13",
|
||||||
|
"ပြင်ဆင်ပါ-no.15",
|
||||||
|
"ပြင်ဆင်ပါ-9",
|
||||||
|
"schilddrüsenhormonsubstitution-nach",
|
||||||
|
"Linxextremismus-JPzuG_UBtEg",
|
||||||
|
"Ꮖ-Ꮩ-Ꭺ-N--------Ꭺ-N-Ꮹ-Ꭼ-Ꮮ-Ꭺ-on-Instagram_-“Our-next-destination-is-East-and-Southeast-Asia--selfie--asia”",
|
||||||
|
"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBlock760150_1020105(t *testing.T) {
|
||||||
test, _ := hex.DecodeString("43efbfbd")
|
test, _ := hex.DecodeString("43efbfbd")
|
||||||
assert.True(t, utf8.Valid(test))
|
assert.True(t, utf8.Valid(test))
|
||||||
a := normalizeGo(test)
|
a := normalizeGo(test)
|
||||||
b := normalizeICU(test)
|
b := normalizeICU(test)
|
||||||
assert.Equal(t, a, b)
|
assert.Equal(t, a, b)
|
||||||
|
|
||||||
test2 := "Ꮖ-Ꮩ-Ꭺ-N--------Ꭺ-N-Ꮹ-Ꭼ-Ꮮ-Ꭺ-on-Instagram_-“Our-next-destination-is-East-and-Southeast-Asia--selfie--asia”"
|
for i, s := range testStrings {
|
||||||
a = normalizeGo([]byte(test2))
|
a = normalizeGo([]byte(s))
|
||||||
b = normalizeICU([]byte(test2))
|
b = normalizeICU([]byte(s))
|
||||||
assert.Equal(t, a, b)
|
assert.Equal(t, a, b, "%d: %s != %s", i, string(a), string(b))
|
||||||
|
// t.Logf("%s -> %s", s, string(b))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue