lbcd/claimtrie/normalization/char_decomposer.go

177 lines
3.9 KiB
Go

package normalization
import (
"bufio"
_ "embed"
"strconv"
"strings"
"unicode/utf8"
)
// decompositions holds the text of NFC_v11.txt, embedded at build time. init
// parses it to fill nfdMap and nfdOrder.
//
//go:embed NFC_v11.txt
var decompositions string // the data file that came from ICU 63.2
// nfdMap maps a rune to the runes that decompose substitutes for it. It is
// allocated and filled by init.
var nfdMap map[rune][]rune
// nfdOrder maps a rune to the ordering value given by the ordering definitions
// (the lines containing ':') in the data file. repairOrdering never swaps a
// rune whose value is 0, which is also what a missing entry reads as.
// NOTE(review): presumably the canonical combining class - confirm against the
// data file.
var nfdOrder map[rune]int32
// init parses the embedded decomposition data into nfdMap and nfdOrder.
//
// Empty lines and lines starting with '#' or '*' are skipped. Every other line
// is one of:
//   - an ordering definition (any line containing ':'), handed to addOrdering;
//   - a mapping "KEY=V1 V2 ..." or "KEY>V1 V2 ...", all fields hexadecimal;
//   - anything else, which is ignored.
//
// After parsing, every mapping is expanded until none of its runes has a
// mapping of its own, so decompose needs only a single lookup per rune.
//
// init panics on a malformed hex field, on a scanner error (for example an
// over-long line) and on a mapping that refers back to itself.
func init() {
	nfdMap = map[rune][]rune{}
	nfdOrder = map[rune]int32{}

	scanner := bufio.NewScanner(strings.NewReader(decompositions))
	for scanner.Scan() {
		line := scanner.Text()
		if len(line) == 0 || line[0] == '#' || line[0] == '*' {
			continue
		}
		if strings.Contains(line, ":") {
			// it's an ordering def:
			addOrdering(line)
			continue
		}
		splits := strings.Split(line, "=")
		if len(splits) <= 1 {
			splits = strings.Split(line, ">")
			if len(splits) <= 1 {
				continue
			}
		}
		// bitSize allows four bits per hex digit of the field.
		key, err := strconv.ParseUint(splits[0], 16, len(splits[0])*4)
		if err != nil {
			panic(err)
		}
		splits = strings.Split(splits[1], " ")
		values := make([]rune, 0, len(splits))
		for j := range splits {
			value, err := strconv.ParseUint(splits[j], 16, len(splits[j])*4)
			if err != nil {
				panic(err)
			}
			// Splice in mappings that were already parsed; mappings that
			// appear later in the file are resolved by the pass below.
			existing := nfdMap[rune(value)]
			if len(existing) > 0 {
				values = append(values, existing...)
			} else {
				values = append(values, rune(value))
			}
		}
		nfdMap[rune(key)] = values
	}
	// Without this check a scanner failure would silently leave a truncated
	// table behind.
	if err := scanner.Err(); err != nil {
		panic(err)
	}

	// Expand the stragglers: entries whose values were defined later in the
	// file. Every entry is expanded completely and exactly once, so the result
	// does not depend on map iteration order or on how many values of one
	// entry still need expanding.
	const (
		unvisited = iota // zero value: what a missing state entry reads as
		expanding
		expanded
	)
	state := make(map[rune]uint8, len(nfdMap))
	var expand func(key rune) []rune
	expand = func(key rune) []rune {
		switch state[key] {
		case expanded:
			return nfdMap[key]
		case expanding:
			// The mapping refers back to itself and can never be completed.
			panic("Failed in NFD expansion")
		}
		state[key] = expanding
		values := nfdMap[key]
		full := make([]rune, 0, len(values))
		for _, value := range values {
			if len(nfdMap[value]) > 0 {
				full = append(full, expand(value)...)
			} else {
				full = append(full, value)
			}
		}
		// Only values of existing keys are replaced, which is safe while the
		// map is being ranged over.
		nfdMap[key] = full
		state[key] = expanded
		return full
	}
	for key := range nfdMap {
		expand(key)
	}

	// assert no more expansions are necessary:
	for _, values := range nfdMap {
		for _, value := range values {
			if len(nfdMap[value]) > 0 {
				panic("Failed in NFD expansion")
			}
		}
	}
}
// addOrdering parses one ordering definition and records it in nfdOrder.
//
// line has the form "START:VALUE" or "START..END:VALUE", every field in
// hexadecimal. VALUE is stored for each rune from START through END inclusive
// (END defaults to START). The caller must pass a line that contains ':'.
//
// It panics if any field is not valid hexadecimal.
func addOrdering(line string) {
	splits := strings.Split(line, ":")
	ranges := strings.Split(splits[0], "..")
	// bitSize allows four bits per hex digit of the field being parsed.
	value, err := strconv.ParseUint(splits[1], 16, len(splits[1])*4)
	if err != nil {
		panic(err)
	}
	start, err := strconv.ParseUint(ranges[0], 16, len(ranges[0])*4)
	if err != nil {
		panic(err)
	}
	end := start
	if len(ranges) > 1 {
		// Size the parse by the end field itself: the end of a range may have
		// more digits than its start (e.g. "FFE..1001").
		end, err = strconv.ParseUint(ranges[1], 16, len(ranges[1])*4)
		if err != nil {
			panic(err)
		}
	}
	for i := start; i <= end; i++ {
		nfdOrder[rune(i)] = int32(value)
	}
}
// decompose returns name with every rune replaced by its decomposition and the
// resulting runes put into order by repairOrdering.
// See https://unicode.org/reports/tr15/ section 1.3.
//
// A rune is replaced by its nfdMap entry when it has one, otherwise by the
// result of decomposeHangul when that is non-empty, and otherwise kept as is.
// If name contains invalid UTF-8 the original slice is returned unchanged.
func decompose(name []byte) []byte {
	// Names are typically ASCII, so one rune per input byte is usually enough.
	out := make([]rune, 0, len(name))
	for rest := name; len(rest) > 0; {
		r, size := utf8.DecodeRune(rest)
		// HACK: utf8.RuneError with a width of 2 or more is a real character
		// that was present in the input; only a narrower width signals a
		// decoding failure.
		if size < 2 && r == utf8.RuneError {
			return name
		}
		rest = rest[size:]

		if mapped := nfdMap[r]; len(mapped) > 0 {
			out = append(out, mapped...)
			continue
		}
		if jamo := decomposeHangul(r); len(jamo) > 0 {
			out = append(out, jamo...)
			continue
		}
		out = append(out, r)
	}
	repairOrdering(out)
	return []byte(string(out))
}
// decomposeHangul splits a precomposed Hangul syllable arithmetically into its
// leading consonant, vowel and, when there is one, trailing consonant.
// See https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf
//
// It returns nil for any rune outside the block of 11172 syllables that starts
// at U+AC00.
func decomposeHangul(s rune) []rune {
	const (
		syllableBase rune = 0xAC00
		leadBase     rune = 0x1100
		vowelBase    rune = 0x1161
		trailBase    rune = 0x11A7
		leadCount    rune = 19
		vowelCount   rune = 21
		trailCount   rune = 28
		// syllables sharing one leading consonant: 588
		perLead = vowelCount * trailCount
		// total number of precomposed syllables: 11172
		syllableCount = leadCount * perLead
	)
	if s < syllableBase || s >= syllableBase+syllableCount {
		return nil
	}
	index := s - syllableBase
	lead := leadBase + index/perLead
	vowel := vowelBase + (index%perLead)/trailCount
	trail := index % trailCount
	if trail == 0 {
		// No trailing consonant in this syllable.
		return []rune{lead, vowel}
	}
	return []rune{lead, vowel, trailBase + trail}
}
// repairOrdering sorts runes in place by their nfdOrder value using adjacent
// swaps. A pair is swapped only when the left value is greater than the right
// one and the right value is positive, so a rune whose value is 0 (including
// every rune missing from nfdOrder) is never moved.
func repairOrdering(runes []rune) {
	pos := 1
	for pos < len(runes) {
		prev := nfdOrder[runes[pos-1]]
		cur := nfdOrder[runes[pos]]
		if cur <= 0 || prev <= cur {
			// Pair is already in order: move on.
			pos++
			continue
		}
		runes[pos-1], runes[pos] = runes[pos], runes[pos-1]
		// The rune that moved left may also belong before its new
		// predecessor, so step back one pair (but never before the first).
		if pos > 1 {
			pos--
		}
	}
}