fix bug with case folding table
This commit is contained in:
parent
fa9239e0c5
commit
6009f9048c
5 changed files with 50 additions and 9 deletions
|
@ -14,6 +14,7 @@ type NormalizingManager struct { // implements Manager
|
|||
}
|
||||
|
||||
func NewNormalizingManager(baseManager Manager) Manager {
|
||||
log.Info(normalization.NormalizeTitle)
|
||||
return &NormalizingManager{
|
||||
Manager: baseManager,
|
||||
normalizedAt: -1,
|
||||
|
|
|
@ -20,14 +20,20 @@ func init() {
|
|||
matches := r.FindAllStringSubmatch(v11, 1000000000)
|
||||
for i := range matches {
|
||||
if matches[i][2] == "C" || matches[i][2] == "F" {
|
||||
key, _ := strconv.Unquote(`"\u` + matches[i][1] + `"`)
|
||||
key, err := strconv.ParseUint(matches[i][1], 16, len(matches[i][1])*4)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
splits := strings.Split(matches[i][3], " ")
|
||||
var values []rune
|
||||
for j := range splits {
|
||||
value, _ := strconv.Unquote(`"\u` + splits[j] + `"`)
|
||||
values = append(values, []rune(value)[0])
|
||||
value, err := strconv.ParseUint(splits[j], 16, len(splits[j])*4)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
values = append(values, rune(value))
|
||||
}
|
||||
foldMap[[]rune(key)[0]] = values
|
||||
foldMap[rune(key)] = values
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -6,6 +6,7 @@ import (
|
|||
)
|
||||
|
||||
// Normalize is the normalization function used by this package. It is a
// package-level variable initialized to normalizeGo, so it can be reassigned
// to a different implementation.
var Normalize = normalizeGo
|
||||
// NormalizeTitle is a human-readable description of the normalization
// implementation in use. The default value names the Go implementation, the
// case-folding table version (11.0.0) and the NFD version reported by
// norm.Version.
var NormalizeTitle = "Normalizing strings via Go. Casefold table version = 11.0.0, NFD version = " + norm.Version
|
||||
|
||||
func NormalizeIfNecessary(name []byte, height int32) []byte {
|
||||
if height < param.ActiveParams.NormalizedNameForkHeight {
|
||||
|
|
|
@ -37,6 +37,7 @@ import (
|
|||
|
||||
func init() {
|
||||
Normalize = normalizeICU
|
||||
NormalizeTitle = "Normalizing strings via ICU. ICU version = " + IcuVersion()
|
||||
}
|
||||
|
||||
func IcuVersion() string {
|
||||
|
|
|
@ -19,15 +19,47 @@ func BenchmarkNormalizeICU(b *testing.B) {
|
|||
benchmarkNormalize(b, normalizeICU)
|
||||
}
|
||||
|
||||
func TestBlock760150(t *testing.T) {
|
||||
var testStrings = []string{
|
||||
"Les-Masques-Blancs-Die-Dead-place-Sathonay-28-Août",
|
||||
"Bez-komentu-výbuch-z-vnútra,-radšej-pozri-video...-",
|
||||
"၂-နစ်အကြာမှာ",
|
||||
"ငရဲပြည်မှ-6",
|
||||
"@happyvision",
|
||||
"ကမ္ဘာပျက်ကိန်း-9",
|
||||
"ဝိညာဉ်နား၊-3",
|
||||
"un-amore-nuovo-o-un-ritorno-cosa-mi-dona",
|
||||
"è-innamorato-di-me-anche-se-non-lo-dice",
|
||||
"ပြင်ဆင်ပါ-no.1",
|
||||
"ပြင်ဆင်ပါ-no.4",
|
||||
"ပြင်ဆင်ပါ-no.2",
|
||||
"ပြင်ဆင်ပါ-no.3",
|
||||
"ငရဲပြည်မှ-5",
|
||||
"ပြင်ဆင်ပါ-no.6",
|
||||
"ပြင်ဆင်ပါ-no.5",
|
||||
"ပြင်ဆင်ပါ-no.7",
|
||||
"ပြင်ဆင်ပါ-no.8",
|
||||
"အချိန်-2",
|
||||
"ဝိညာဉ်နား၊-4",
|
||||
"ပြင်ဆင်ပါ-no.-13",
|
||||
"ပြင်ဆင်ပါ-no.15",
|
||||
"ပြင်ဆင်ပါ-9",
|
||||
"schilddrüsenhormonsubstitution-nach",
|
||||
"Linxextremismus-JPzuG_UBtEg",
|
||||
"Ꮖ-Ꮩ-Ꭺ-N--------Ꭺ-N-Ꮹ-Ꭼ-Ꮮ-Ꭺ-on-Instagram_-“Our-next-destination-is-East-and-Southeast-Asia--selfie--asia”",
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
|
||||
}
|
||||
|
||||
func TestBlock760150_1020105(t *testing.T) {
|
||||
test, _ := hex.DecodeString("43efbfbd")
|
||||
assert.True(t, utf8.Valid(test))
|
||||
a := normalizeGo(test)
|
||||
b := normalizeICU(test)
|
||||
assert.Equal(t, a, b)
|
||||
|
||||
test2 := "Ꮖ-Ꮩ-Ꭺ-N--------Ꭺ-N-Ꮹ-Ꭼ-Ꮮ-Ꭺ-on-Instagram_-“Our-next-destination-is-East-and-Southeast-Asia--selfie--asia”"
|
||||
a = normalizeGo([]byte(test2))
|
||||
b = normalizeICU([]byte(test2))
|
||||
assert.Equal(t, a, b)
|
||||
for i, s := range testStrings {
|
||||
a = normalizeGo([]byte(s))
|
||||
b = normalizeICU([]byte(s))
|
||||
assert.Equal(t, a, b, "%d: %s != %s", i, string(a), string(b))
|
||||
// t.Logf("%s -> %s", s, string(b))
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue