From 7c7ceed333ca305f5a1bffc48876a4e4c39c8223 Mon Sep 17 00:00:00 2001 From: Niko Storni Date: Thu, 25 Mar 2021 19:07:26 +0100 Subject: [PATCH] further improve language detection strip URLs from description to get better results --- sources/youtubeVideo.go | 4 +++- sources/youtubeVideo_test.go | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/sources/youtubeVideo.go b/sources/youtubeVideo.go index 8307d3e..117eaab 100644 --- a/sources/youtubeVideo.go +++ b/sources/youtubeVideo.go @@ -415,7 +415,9 @@ func (v *YoutubeVideo) publish(daemon *jsonrpc.Client, params SyncParams) (*Sync FeeCurrency: jsonrpc.Currency(params.Fee.Currency), } } - info := whatlanggo.Detect(v.description) + urlsRegex := regexp.MustCompile(`(?m) ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)`) + descriptionSample := urlsRegex.ReplaceAllString(v.description, "") + info := whatlanggo.Detect(descriptionSample) info2 := whatlanggo.Detect(v.title) if info.IsReliable() && info.Lang.Iso6391() != "" { language := info.Lang.Iso6391() diff --git a/sources/youtubeVideo_test.go b/sources/youtubeVideo_test.go index 40f098f..b142594 100644 --- a/sources/youtubeVideo_test.go +++ b/sources/youtubeVideo_test.go @@ -1,6 +1,7 @@ package sources import ( + "regexp" "testing" "github.com/abadojack/whatlanggo" @@ -22,4 +23,31 @@ func TestLanguageDetection(t *testing.T) { assert.True(t, info.IsReliable()) assert.True(t, info.Lang.Iso6391() != "") assert.Equal(t, "zh", info.Lang.Iso6391()) + + description = `成為這個頻道的會員並獲得獎勵: +https://www.youtube.com/channel/UCOQFrooz-YGHjYb7s3-MrsQ/join +_____________________________________________ +想聽我既音樂作品可以去下面LINK +streetvoice 街聲: +https://streetvoice.com/CTLam331/ +_____________________________________________ +想學結他、鋼琴 +有關音樂制作工作 +都可以搵我~ +大家快D訂閱喇 +不定期出片 + + + + +Website: http://ctlam331.wixsite.com/ctlamusic +FB PAGE:https://www.facebook.com/ctlam331 +IG:ctlamusic` + urlsRegex := regexp.MustCompile(`(?m) ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)`) + descriptionSample := urlsRegex.ReplaceAllString(description, "") + info = whatlanggo.Detect(descriptionSample) + logrus.Infof("confidence: %.2f", info.Confidence) + assert.True(t, info.IsReliable()) + assert.True(t, info.Lang.Iso6391() != "") + assert.Equal(t, "zh", info.Lang.Iso6391()) }