improve language detection

This commit is contained in:
Niko Storni 2021-03-25 18:47:34 +01:00
parent d53d0a1d52
commit 198473b62b
3 changed files with 31 additions and 2 deletions

View file

@ -505,7 +505,7 @@ func (s *Sync) updateRemoteDB(claims []jsonrpc.Claim, ownClaims []jsonrpc.Claim)
log.Debugf("%s: Published but is not in database (%s - %s)", videoID, chainInfo.ClaimName, chainInfo.ClaimID) log.Debugf("%s: Published but is not in database (%s - %s)", videoID, chainInfo.ClaimName, chainInfo.ClaimID)
} }
if transferStatusMismatch { if transferStatusMismatch {
log.Debugf("%s: is marked as transferred %t on it's actually %t", videoID, sv.Transferred, transferred) log.Debugf("%s: is marked as transferred %t but it's actually %t", videoID, sv.Transferred, transferred)
} }
if !claimInDatabase || metadataDiffers || claimIDDiffers || claimNameDiffers || claimMarkedUnpublished || transferStatusMismatch { if !claimInDatabase || metadataDiffers || claimIDDiffers || claimNameDiffers || claimMarkedUnpublished || transferStatusMismatch {

View file

@ -415,10 +415,14 @@ func (v *YoutubeVideo) publish(daemon *jsonrpc.Client, params SyncParams) (*Sync
FeeCurrency: jsonrpc.Currency(params.Fee.Currency), FeeCurrency: jsonrpc.Currency(params.Fee.Currency),
} }
} }
info := whatlanggo.Detect(v.getAbbrevDescription()) info := whatlanggo.Detect(v.description)
info2 := whatlanggo.Detect(v.title)
if info.IsReliable() && info.Lang.Iso6391() != "" { if info.IsReliable() && info.Lang.Iso6391() != "" {
language := info.Lang.Iso6391() language := info.Lang.Iso6391()
languages = []string{language} languages = []string{language}
} else if info2.IsReliable() && info2.Lang.Iso6391() != "" {
language := info2.Lang.Iso6391()
languages = []string{language}
} }
options := jsonrpc.StreamCreateOptions{ options := jsonrpc.StreamCreateOptions{
ClaimCreateOptions: jsonrpc.ClaimCreateOptions{ ClaimCreateOptions: jsonrpc.ClaimCreateOptions{

View file

@ -0,0 +1,25 @@
package sources
import (
"testing"
"github.com/abadojack/whatlanggo"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
)
// TestLanguageDetection feeds sample texts to whatlanggo.Detect and verifies
// that every detection is reported as reliable and maps to the expected,
// non-empty ISO 639-1 language code.
func TestLanguageDetection(t *testing.T) {
	// Each sample pairs an input text with the ISO 639-1 code it must resolve to.
	samples := []struct {
		text string
		want string
	}{
		{
			text: `Om lättkränkta muslimer, och den bristande logiken i vad som anses vara att vanära profeten. Från Moderata riksdagspolitikern Hanif Balis podcast "God Ton", avsnitt 108, från oktober 2020, efter terrordådet där en fransk lärare fick huvudet avskuret efter att undervisat sin mångkulturella klass om frihet.`,
			want: "sv",
		},
		{
			text: `🥳週四直播 | 晚上來開個賽車🔰歡迎各位一起來玩! - PonPonLin蹦蹦林`,
			want: "zh",
		},
	}

	for _, sample := range samples {
		detected := whatlanggo.Detect(sample.text)
		// Log the confidence so a failing run shows how close the detector was.
		logrus.Infof("confidence: %.2f", detected.Confidence)

		code := detected.Lang.Iso6391()
		assert.True(t, detected.IsReliable())
		assert.True(t, code != "")
		assert.Equal(t, sample.want, code)
	}
}