90 lines
2.3 KiB
Go
90 lines
2.3 KiB
Go
|
package normalization
|
||
|
|
||
|
import (
|
||
|
"bufio"
|
||
|
"bytes"
|
||
|
_ "embed"
|
||
|
"math/rand"
|
||
|
"strconv"
|
||
|
"strings"
|
||
|
"testing"
|
||
|
|
||
|
"github.com/stretchr/testify/require"
|
||
|
)
|
||
|
|
||
|
func TestNormalizationGo(t *testing.T) {
|
||
|
testNormalization(t, normalizeGo)
|
||
|
}
|
||
|
|
||
|
func testNormalization(t *testing.T, normalize func(value []byte) []byte) {
|
||
|
|
||
|
r := require.New(t)
|
||
|
|
||
|
r.Equal("test", string(normalize([]byte("TESt"))))
|
||
|
r.Equal("test 23", string(normalize([]byte("tesT 23"))))
|
||
|
r.Equal("\xFF", string(normalize([]byte("\xFF"))))
|
||
|
r.Equal("\xC3\x28", string(normalize([]byte("\xC3\x28"))))
|
||
|
r.Equal("\xCF\x89", string(normalize([]byte("\xE2\x84\xA6"))))
|
||
|
r.Equal("\xD1\x84", string(normalize([]byte("\xD0\xA4"))))
|
||
|
r.Equal("\xD5\xA2", string(normalize([]byte("\xD4\xB2"))))
|
||
|
r.Equal("\xE3\x81\xB5\xE3\x82\x99", string(normalize([]byte("\xE3\x81\xB6"))))
|
||
|
r.Equal("\xE1\x84\x81\xE1\x85\xAA\xE1\x86\xB0", string(normalize([]byte("\xEA\xBD\x91"))))
|
||
|
}
|
||
|
|
||
|
func randSeq(n int) []byte {
|
||
|
var alphabet = []rune("abcdefghijklmnopqrstuvwxyz̃ABCDEFGHIJKLMNOPQRSTUVWXYZ̃")
|
||
|
|
||
|
b := make([]rune, n)
|
||
|
for i := range b {
|
||
|
b[i] = alphabet[rand.Intn(len(alphabet))]
|
||
|
}
|
||
|
return []byte(string(b))
|
||
|
}
|
||
|
|
||
|
func BenchmarkNormalize(b *testing.B) {
|
||
|
benchmarkNormalize(b, normalizeGo)
|
||
|
}
|
||
|
|
||
|
func benchmarkNormalize(b *testing.B, normalize func(value []byte) []byte) {
|
||
|
rand.Seed(42)
|
||
|
strings := make([][]byte, b.N)
|
||
|
for i := 0; i < b.N; i++ {
|
||
|
strings[i] = randSeq(32)
|
||
|
}
|
||
|
b.ResetTimer()
|
||
|
for i := 0; i < b.N; i++ {
|
||
|
s := normalize(strings[i])
|
||
|
require.True(b, len(s) >= 8)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//go:embed NormalizationTest_v11.txt
|
||
|
var nfdTests string
|
||
|
|
||
|
func TestDecomposition(t *testing.T) {
|
||
|
r := require.New(t)
|
||
|
|
||
|
scanner := bufio.NewScanner(strings.NewReader(nfdTests))
|
||
|
for scanner.Scan() {
|
||
|
line := scanner.Text()
|
||
|
if len(line) <= 0 || line[0] == '@' || line[0] == '#' {
|
||
|
continue
|
||
|
}
|
||
|
splits := strings.Split(line, ";")
|
||
|
source := convertToBytes(splits[0])
|
||
|
targetNFD := convertToBytes(splits[2])
|
||
|
fixed := decompose(source)
|
||
|
r.True(bytes.Equal(targetNFD, fixed), "Failed on %s -> %s. Got %U, not %U", splits[0], splits[2], fixed, targetNFD)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func convertToBytes(s string) []byte {
|
||
|
splits := strings.Split(s, " ")
|
||
|
var b bytes.Buffer
|
||
|
for i := range splits {
|
||
|
value, _ := strconv.ParseUint(splits[i], 16, len(splits[i])*4)
|
||
|
b.WriteRune(rune(value))
|
||
|
}
|
||
|
return b.Bytes()
|
||
|
}
|