fix bug with case folding table

This commit is contained in:
Brannon King 2021-09-14 13:46:47 -04:00
parent fa9239e0c5
commit 6009f9048c
5 changed files with 50 additions and 9 deletions

View file

@ -14,6 +14,7 @@ type NormalizingManager struct { // implements Manager
}
func NewNormalizingManager(baseManager Manager) Manager {
log.Info(normalization.NormalizeTitle)
return &NormalizingManager{
Manager: baseManager,
normalizedAt: -1,

View file

@ -20,14 +20,20 @@ func init() {
matches := r.FindAllStringSubmatch(v11, 1000000000)
for i := range matches {
if matches[i][2] == "C" || matches[i][2] == "F" {
key, _ := strconv.Unquote(`"\u` + matches[i][1] + `"`)
key, err := strconv.ParseUint(matches[i][1], 16, len(matches[i][1])*4)
if err != nil {
panic(err)
}
splits := strings.Split(matches[i][3], " ")
var values []rune
for j := range splits {
value, _ := strconv.Unquote(`"\u` + splits[j] + `"`)
values = append(values, []rune(value)[0])
value, err := strconv.ParseUint(splits[j], 16, len(splits[j])*4)
if err != nil {
panic(err)
}
values = append(values, rune(value))
}
foldMap[[]rune(key)[0]] = values
foldMap[rune(key)] = values
}
}
}

View file

@ -6,6 +6,7 @@ import (
)
// Normalize is the package-level normalization function. It is initialized
// here to the Go implementation, normalizeGo.
var Normalize = normalizeGo
// NormalizeTitle is a human-readable description of the normalization backend
// in use. This default names the Go backend, a casefold table version of
// 11.0.0, and the NFD version reported by norm.Version.
var NormalizeTitle = "Normalizing strings via Go. Casefold table version = 11.0.0, NFD version = " + norm.Version
func NormalizeIfNecessary(name []byte, height int32) []byte {
if height < param.ActiveParams.NormalizedNameForkHeight {

View file

@ -37,6 +37,7 @@ import (
// init points the package-level Normalize variable at normalizeICU and sets
// NormalizeTitle to a description that includes the value returned by
// IcuVersion.
func init() {
Normalize = normalizeICU
NormalizeTitle = "Normalizing strings via ICU. ICU version = " + IcuVersion()
}
func IcuVersion() string {

View file

@ -19,15 +19,47 @@ func BenchmarkNormalizeICU(b *testing.B) {
benchmarkNormalize(b, normalizeICU)
}
func TestBlock760150(t *testing.T) {
// testStrings holds the inputs that TestBlock760150_1020105 feeds to both
// normalizeGo and normalizeICU, asserting that the two produce equal output.
// The entries mix plain ASCII, accented Latin text, and non-Latin scripts; the
// final entry is the full uppercase ASCII alphabet.
// NOTE(review): these look like real-world names taken from the blocks named
// in the test function — confirm the provenance.
var testStrings = []string{
"Les-Masques-Blancs-Die-Dead-place-Sathonay-28-Août",
"Bez-komentu-výbuch-z-vnútra,-radšej-pozri-video...-",
"၂-နစ်အကြာမှာ",
"ငရဲပြည်မှ-6",
"@happyvision",
"ကမ္ဘာပျက်ကိန်း-9",
"ဝိညာဉ်နား၊-3",
"un-amore-nuovo-o-un-ritorno-cosa-mi-dona",
"è-innamorato-di-me-anche-se-non-lo-dice",
"ပြင်ဆင်ပါ-no.1",
"ပြင်ဆင်ပါ-no.4",
"ပြင်ဆင်ပါ-no.2",
"ပြင်ဆင်ပါ-no.3",
"ငရဲပြည်မှ-5",
"ပြင်ဆင်ပါ-no.6",
"ပြင်ဆင်ပါ-no.5",
"ပြင်ဆင်ပါ-no.7",
"ပြင်ဆင်ပါ-no.8",
"အချိန်-2",
"ဝိညာဉ်နား၊-4",
"ပြင်ဆင်ပါ-no.-13",
"ပြင်ဆင်ပါ-no.15",
"ပြင်ဆင်ပါ-9",
"schilddrüsenhormonsubstitution-nach",
"Linxextremismus-JPzuG_UBtEg",
"Ꮖ---N---------N-Ꮹ----on-Instagram_-“Our-next-destination-is-East-and-Southeast-Asia--selfie--asia”",
"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
}
// TestBlock760150_1020105 checks that normalizeGo and normalizeICU return
// equal results for a fixed byte sequence and for every entry in testStrings.
// NOTE(review): the numbers in the name presumably refer to block heights —
// confirm.
func TestBlock760150_1020105(t *testing.T) {
// Bytes 0x43 ('C') followed by EF BF BD, the UTF-8 encoding of U+FFFD.
// The decode error is discarded; the input is a fixed hex literal.
test, _ := hex.DecodeString("43efbfbd")
assert.True(t, utf8.Valid(test))
a := normalizeGo(test)
b := normalizeICU(test)
assert.Equal(t, a, b)
// Same comparison for a single literal string; this literal also appears
// as an entry in testStrings.
test2 := "Ꮖ---N---------N-Ꮹ----on-Instagram_-“Our-next-destination-is-East-and-Southeast-Asia--selfie--asia”"
a = normalizeGo([]byte(test2))
b = normalizeICU([]byte(test2))
assert.Equal(t, a, b)
// Compare both implementations on every table entry; the failure message
// carries the entry index and both outputs.
for i, s := range testStrings {
a = normalizeGo([]byte(s))
b = normalizeICU([]byte(s))
assert.Equal(t, a, b, "%d: %s != %s", i, string(a), string(b))
// t.Logf("%s -> %s", s, string(b))
}
}