further improve language detection

strip URLs from description to get better results
This commit is contained in:
Niko Storni 2021-03-25 19:07:26 +01:00
parent 198473b62b
commit 7c7ceed333
2 changed files with 31 additions and 1 deletions

View file

@ -415,7 +415,9 @@ func (v *YoutubeVideo) publish(daemon *jsonrpc.Client, params SyncParams) (*Sync
FeeCurrency: jsonrpc.Currency(params.Fee.Currency),
}
}
info := whatlanggo.Detect(v.description)
urlsRegex := regexp.MustCompile(`(?m) ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)`)
descriptionSample := urlsRegex.ReplaceAllString(v.description, "")
info := whatlanggo.Detect(descriptionSample)
info2 := whatlanggo.Detect(v.title)
if info.IsReliable() && info.Lang.Iso6391() != "" {
language := info.Lang.Iso6391()

View file

@ -1,6 +1,7 @@
package sources
import (
"regexp"
"testing"
"github.com/abadojack/whatlanggo"
@ -22,4 +23,31 @@ func TestLanguageDetection(t *testing.T) {
assert.True(t, info.IsReliable())
assert.True(t, info.Lang.Iso6391() != "")
assert.Equal(t, "zh", info.Lang.Iso6391())
description = `成為這個頻道的會員並獲得獎勵
https://www.youtube.com/channel/UCOQFrooz-YGHjYb7s3-MrsQ/join
_____________________________________________
想聽我既音樂作品可以去下面LINK
streetvoice 街聲
https://streetvoice.com/CTLam331/
_____________________________________________
想學結他鋼琴
有關音樂制作工作
都可以搵我
大家快D訂閱喇
不定期出片
Website: http://ctlam331.wixsite.com/ctlamusic
FB PAGEhttps://www.facebook.com/ctlam331
IGctlamusic`
urlsRegex := regexp.MustCompile(`(?m) ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)`)
descriptionSample := urlsRegex.ReplaceAllString(description, "")
info = whatlanggo.Detect(descriptionSample)
logrus.Infof("confidence: %.2f", info.Confidence)
assert.True(t, info.IsReliable())
assert.True(t, info.Lang.Iso6391() != "")
assert.Equal(t, "zh", info.Lang.Iso6391())
}