lbcd/claimtrie/normalization/normalizer_icu.go

//go:build use_icu_normalization
// +build use_icu_normalization

package normalization

// #cgo CFLAGS: -O2
// #cgo LDFLAGS: -licuio -licui18n -licuuc -licudata
// #include <unicode/unorm2.h>
// #include <unicode/ustring.h>
// #include <unicode/uversion.h>
//
// // icu_version packs the linked ICU version into one int:
// // first version component shifted left by 16, plus the second component.
// int icu_version() {
//    UVersionInfo info;
//    u_getVersion(info);
//    return ((int)(info[0]) << 16) + info[1];
// }
//
// // normalize reads `length` bytes of UTF-8 from `name`, converts them to UTF-16,
// // applies NFD normalization followed by default case folding, and writes the
// // result as UTF-8 into `result`, which must have room for 512 bytes.
// // Returns the number of bytes written, or 0 if any step failed, produced an
// // empty string, or did not fit in its buffer.
// int normalize(char* name, int length, char* result) {
//   UErrorCode ec = U_ZERO_ERROR;
//   // The pointer is cached after the first lookup; no locking is done here.
//   static const UNormalizer2* normalizer = NULL;
//   if (normalizer == NULL) normalizer = unorm2_getNFDInstance(&ec);
//   UChar dest[256]; // maximum claim name size is 255; we won't have more UTF16 chars than bytes
//   int dest_len;
//   u_strFromUTF8(dest, 256, &dest_len, name, length, &ec);
//   if (U_FAILURE(ec) || dest_len == 0) return 0;
//   UChar normalized[256];
//   dest_len = unorm2_normalize(normalizer, dest, dest_len, normalized, 256, &ec);
//   if (U_FAILURE(ec) || dest_len == 0) return 0;
//   dest_len = u_strFoldCase(dest, 256, normalized, dest_len, U_FOLD_CASE_DEFAULT, &ec);
//   if (U_FAILURE(ec) || dest_len == 0) return 0;
//   u_strToUTF8(result, 512, &dest_len, dest, dest_len, &ec);
//   // Up to 256 UTF-16 units can need more than 512 UTF-8 bytes. On overflow ICU
//   // still stores the required length in dest_len, so returning it unchecked
//   // would make the caller read past the end of `result`.
//   if (U_FAILURE(ec)) return 0;
//   return dest_len;
// }
import "C"
import (
	"bytes"
	"encoding/hex"
	"fmt"
	"unsafe"
)

func init() {
	Normalize = normalizeICU
	NormalizeTitle = "Normalizing strings via ICU. ICU version = " + IcuVersion()
}

// IcuVersion returns the version of the linked ICU library formatted as
// "<first>.<second>", using the first two components reported by ICU.
func IcuVersion() string {
	// TODO: we probably need to explode if it's not 63.2 as it affects consensus
	packed := C.icu_version()
	// The C helper stores the first component above bit 16 and the second below it.
	major, minor := packed>>16, packed&0xffff
	return fmt.Sprintf("%d.%d", major, minor)
}

// normalizeICU normalizes value with the C normalize helper from the cgo
// preamble and returns a copy of the result. If the helper reports no usable
// result (a non-positive length, or a length that does not fit the scratch
// buffer), the input slice is returned unchanged. Empty input is returned
// as-is without calling into C.
//
// The result is also compared against normalizeGo; any mismatch is printed to
// standard output as a diagnostic, and the ICU-path value is returned either way.
func normalizeICU(value []byte) []byte {
	original := value
	if len(value) == 0 {
		return value
	}

	other := normalizeGo(value)

	name := (*C.char)(unsafe.Pointer(&value[0]))
	length := C.int(len(value))

	// hopefully this is a stack alloc (but it may be a bit large for that):
	var resultName [512]byte // inputs are restricted to 255 chars; it shouldn't expand too much past that
	pointer := unsafe.Pointer(&resultName[0])

	resultLength := C.normalize(name, length, (*C.char)(pointer))
	// Only copy when the reported length fits in resultName. ICU's u_strToUTF8
	// reports the required length even when the output overflowed the buffer,
	// so a larger value means the buffer is incomplete and copying that many
	// bytes would read past the end of the array.
	if resultLength > 0 && int(resultLength) <= len(resultName) {
		// C.GoBytes copies exactly resultLength bytes, so the returned slice
		// is not a view onto the 512-byte scratch buffer.
		value = C.GoBytes(pointer, resultLength)
	}

	if !bytes.Equal(other, value) {
		fmt.Printf("Failed with %s, %s != %s,\n\t%s, %s != %s,\n", original, value, other,
			hex.EncodeToString(original), hex.EncodeToString(value), hex.EncodeToString(other))
	}
	return value
}
[lbry] claimtrie: import current snapshot Sync to tip Co-authored-by: Brannon King <countprimes@gmail.com> 2021-07-07 03:39:56 +02:00			`//go:build use_icu_normalization`
			`// +build use_icu_normalization`

			`package normalization`

			`// #cgo CFLAGS: -O2`
			`// #cgo LDFLAGS: -licuio -licui18n -licuuc -licudata`
			`// #include <unicode/unorm2.h>`
			`// #include <unicode/ustring.h>`
			`// #include <unicode/uversion.h>`
			`// int icu_version() {`
			`// UVersionInfo info;`
			`// u_getVersion(info);`
			`// return ((int)(info[0]) << 16) + info[1];`
			`// }`
			`// int normalize(char* name, int length, char* result) {`
			`// UErrorCode ec = U_ZERO_ERROR;`
			`// static const UNormalizer2* normalizer = NULL;`
			`// if (normalizer == NULL) normalizer = unorm2_getNFDInstance(&ec);`
			`// UChar dest[256]; // maximum claim name size is 255; we won't have more UTF16 chars than bytes`
			`// int dest_len;`
			`// u_strFromUTF8(dest, 256, &dest_len, name, length, &ec);`
			`// if (U_FAILURE(ec) \|\| dest_len == 0) return 0;`
			`// UChar normalized[256];`
			`// dest_len = unorm2_normalize(normalizer, dest, dest_len, normalized, 256, &ec);`
			`// if (U_FAILURE(ec) \|\| dest_len == 0) return 0;`
			`// dest_len = u_strFoldCase(dest, 256, normalized, dest_len, U_FOLD_CASE_DEFAULT, &ec);`
			`// if (U_FAILURE(ec) \|\| dest_len == 0) return 0;`
			`// u_strToUTF8(result, 512, &dest_len, dest, dest_len, &ec);`
			`// return dest_len;`
			`// }`
			`import "C"`
			`import (`
[lbry] removed dependency on text/norm, fixed NFD normalization 2021-12-29 20:24:19 +01:00			`"bytes"`
			`"encoding/hex"`
[lbry] claimtrie: import current snapshot Sync to tip Co-authored-by: Brannon King <countprimes@gmail.com> 2021-07-07 03:39:56 +02:00			`"fmt"`
			`"unsafe"`
			`)`

			`func init() {`
			`Normalize = normalizeICU`
			`NormalizeTitle = "Normalizing strings via ICU. ICU version = " + IcuVersion()`
			`}`

			`func IcuVersion() string {`
			`// TODO: we probably need to explode if it's not 63.2 as it affects consensus`
			`result := C.icu_version()`
			`return fmt.Sprintf("%d.%d", result>>16, result&0xffff)`
			`}`

			`func normalizeICU(value []byte) []byte {`
[lbry] removed dependency on text/norm, fixed NFD normalization 2021-12-29 20:24:19 +01:00			`original := value`
[lbry] claimtrie: import current snapshot Sync to tip Co-authored-by: Brannon King <countprimes@gmail.com> 2021-07-07 03:39:56 +02:00			`if len(value) <= 0 {`
			`return value`
			`}`
[lbry] removed dependency on text/norm, fixed NFD normalization 2021-12-29 20:24:19 +01:00
			`other := normalizeGo(value)`

[lbry] claimtrie: import current snapshot Sync to tip Co-authored-by: Brannon King <countprimes@gmail.com> 2021-07-07 03:39:56 +02:00			`name := (*C.char)(unsafe.Pointer(&value[0]))`
			`length := C.int(len(value))`

			`// hopefully this is a stack alloc (but it may be a bit large for that):`
			`var resultName [512]byte // inputs are restricted to 255 chars; it shouldn't expand too much past that`
[lbry] removed dependency on text/norm, fixed NFD normalization 2021-12-29 20:24:19 +01:00			`pointer := unsafe.Pointer(&resultName[0])`
[lbry] claimtrie: import current snapshot Sync to tip Co-authored-by: Brannon King <countprimes@gmail.com> 2021-07-07 03:39:56 +02:00
[lbry] removed dependency on text/norm, fixed NFD normalization 2021-12-29 20:24:19 +01:00			`resultLength := C.normalize(name, length, (*C.char)(pointer))`
			`if resultLength > 0 {`
			`value = C.GoBytes(pointer, resultLength)`
[lbry] claimtrie: import current snapshot Sync to tip Co-authored-by: Brannon King <countprimes@gmail.com> 2021-07-07 03:39:56 +02:00			`}`

[lbry] removed dependency on text/norm, fixed NFD normalization 2021-12-29 20:24:19 +01:00			`// return resultName[0:resultLength] -- we want to shrink the pointer (not use a slice on 1024)`
			`if !bytes.Equal(other, value) {`
			`fmt.Printf("Failed with %s, %s != %s,\n\t%s, %s != %s,\n", original, value, other,`
			`hex.EncodeToString(original), hex.EncodeToString(value), hex.EncodeToString(other))`
			`}`
			`return value`
[lbry] claimtrie: import current snapshot Sync to tip Co-authored-by: Brannon King <countprimes@gmail.com> 2021-07-07 03:39:56 +02:00			`}`