diff --git a/claimtrie/node/normalizing_manager.go b/claimtrie/node/normalizing_manager.go index 92bb5d9f..d35403cd 100644 --- a/claimtrie/node/normalizing_manager.go +++ b/claimtrie/node/normalizing_manager.go @@ -14,6 +14,7 @@ type NormalizingManager struct { // implements Manager } func NewNormalizingManager(baseManager Manager) Manager { + log.Info(normalization.NormalizeTitle) return &NormalizingManager{ Manager: baseManager, normalizedAt: -1, diff --git a/claimtrie/normalization/case_folder.go b/claimtrie/normalization/case_folder.go index de63b186..0d7e5747 100644 --- a/claimtrie/normalization/case_folder.go +++ b/claimtrie/normalization/case_folder.go @@ -20,14 +20,20 @@ func init() { matches := r.FindAllStringSubmatch(v11, 1000000000) for i := range matches { if matches[i][2] == "C" || matches[i][2] == "F" { - key, _ := strconv.Unquote(`"\u` + matches[i][1] + `"`) + key, err := strconv.ParseUint(matches[i][1], 16, len(matches[i][1])*4) + if err != nil { + panic(err) + } splits := strings.Split(matches[i][3], " ") var values []rune for j := range splits { - value, _ := strconv.Unquote(`"\u` + splits[j] + `"`) - values = append(values, []rune(value)[0]) + value, err := strconv.ParseUint(splits[j], 16, len(splits[j])*4) + if err != nil { + panic(err) + } + values = append(values, rune(value)) } - foldMap[[]rune(key)[0]] = values + foldMap[rune(key)] = values } } } diff --git a/claimtrie/normalization/normalizer.go b/claimtrie/normalization/normalizer.go index ac426c58..55275105 100644 --- a/claimtrie/normalization/normalizer.go +++ b/claimtrie/normalization/normalizer.go @@ -6,6 +6,7 @@ import ( ) var Normalize = normalizeGo +var NormalizeTitle = "Normalizing strings via Go. Casefold table version = 11.0.0, NFD version = " + norm.Version func NormalizeIfNecessary(name []byte, height int32) []byte { if height < param.ActiveParams.NormalizedNameForkHeight { diff --git a/claimtrie/normalization/normalizer_icu.go b/claimtrie/normalization/normalizer_icu.go index 39aaa8ae..d5093ba2 100644 --- a/claimtrie/normalization/normalizer_icu.go +++ b/claimtrie/normalization/normalizer_icu.go @@ -37,6 +37,7 @@ import ( func init() { Normalize = normalizeICU + NormalizeTitle = "Normalizing strings via ICU. ICU version = " + IcuVersion() } func IcuVersion() string { diff --git a/claimtrie/normalization/normalizer_icu_test.go b/claimtrie/normalization/normalizer_icu_test.go index 510c9926..b02f315a 100644 --- a/claimtrie/normalization/normalizer_icu_test.go +++ b/claimtrie/normalization/normalizer_icu_test.go @@ -19,15 +19,47 @@ func BenchmarkNormalizeICU(b *testing.B) { benchmarkNormalize(b, normalizeICU) } -func TestBlock760150(t *testing.T) { +var testStrings = []string{ + "Les-Masques-Blancs-Die-Dead-place-Sathonay-28-Août", + "Bez-komentu-výbuch-z-vnútra,-radšej-pozri-video...-", + "၂-နစ်အကြာမှာ", + "ငရဲပြည်မှ-6", + "@happyvision", + "ကမ္ဘာပျက်ကိန်း-9", + "ဝိညာဉ်နား၊-3", + "un-amore-nuovo-o-un-ritorno-cosa-mi-dona", + "è-innamorato-di-me-anche-se-non-lo-dice", + "ပြင်ဆင်ပါ-no.1", + "ပြင်ဆင်ပါ-no.4", + "ပြင်ဆင်ပါ-no.2", + "ပြင်ဆင်ပါ-no.3", + "ငရဲပြည်မှ-5", + "ပြင်ဆင်ပါ-no.6", + "ပြင်ဆင်ပါ-no.5", + "ပြင်ဆင်ပါ-no.7", + "ပြင်ဆင်ပါ-no.8", + "အချိန်-2", + "ဝိညာဉ်နား၊-4", + "ပြင်ဆင်ပါ-no.-13", + "ပြင်ဆင်ပါ-no.15", + "ပြင်ဆင်ပါ-9", + "schilddrüsenhormonsubstitution-nach", + "Linxextremismus-JPzuG_UBtEg", + "Ꮖ-Ꮩ-Ꭺ-N--------Ꭺ-N-Ꮹ-Ꭼ-Ꮮ-Ꭺ-on-Instagram_-“Our-next-destination-is-East-and-Southeast-Asia--selfie--asia”", + "ABCDEFGHIJKLMNOPQRSTUVWXYZ", +} + +func TestBlock760150_1020105(t *testing.T) { test, _ := hex.DecodeString("43efbfbd") assert.True(t, utf8.Valid(test)) a := normalizeGo(test) b := normalizeICU(test) assert.Equal(t, a, b) - test2 := "Ꮖ-Ꮩ-Ꭺ-N--------Ꭺ-N-Ꮹ-Ꭼ-Ꮮ-Ꭺ-on-Instagram_-“Our-next-destination-is-East-and-Southeast-Asia--selfie--asia”" - a = normalizeGo([]byte(test2)) - b = normalizeICU([]byte(test2)) - assert.Equal(t, a, b) + for i, s := range testStrings { + a = normalizeGo([]byte(s)) + b = normalizeICU([]byte(s)) + assert.Equal(t, a, b, "%d: %s != %s", i, string(a), string(b)) + // t.Logf("%s -> %s", s, string(b)) + } }