Further improve language detection
Strip URLs from the description before running language detection to get more reliable results
This commit is contained in:
parent
198473b62b
commit
7c7ceed333
2 changed files with 31 additions and 1 deletion
|
@ -415,7 +415,9 @@ func (v *YoutubeVideo) publish(daemon *jsonrpc.Client, params SyncParams) (*Sync
|
||||||
FeeCurrency: jsonrpc.Currency(params.Fee.Currency),
|
FeeCurrency: jsonrpc.Currency(params.Fee.Currency),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
info := whatlanggo.Detect(v.description)
|
urlsRegex := regexp.MustCompile(`(?m) ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)`)
|
||||||
|
descriptionSample := urlsRegex.ReplaceAllString(v.description, "")
|
||||||
|
info := whatlanggo.Detect(descriptionSample)
|
||||||
info2 := whatlanggo.Detect(v.title)
|
info2 := whatlanggo.Detect(v.title)
|
||||||
if info.IsReliable() && info.Lang.Iso6391() != "" {
|
if info.IsReliable() && info.Lang.Iso6391() != "" {
|
||||||
language := info.Lang.Iso6391()
|
language := info.Lang.Iso6391()
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package sources
|
package sources
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"regexp"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/abadojack/whatlanggo"
|
"github.com/abadojack/whatlanggo"
|
||||||
|
@ -22,4 +23,31 @@ func TestLanguageDetection(t *testing.T) {
|
||||||
assert.True(t, info.IsReliable())
|
assert.True(t, info.IsReliable())
|
||||||
assert.True(t, info.Lang.Iso6391() != "")
|
assert.True(t, info.Lang.Iso6391() != "")
|
||||||
assert.Equal(t, "zh", info.Lang.Iso6391())
|
assert.Equal(t, "zh", info.Lang.Iso6391())
|
||||||
|
|
||||||
|
description = `成為這個頻道的會員並獲得獎勵:
|
||||||
|
https://www.youtube.com/channel/UCOQFrooz-YGHjYb7s3-MrsQ/join
|
||||||
|
_____________________________________________
|
||||||
|
想聽我既音樂作品可以去下面LINK
|
||||||
|
streetvoice 街聲:
|
||||||
|
https://streetvoice.com/CTLam331/
|
||||||
|
_____________________________________________
|
||||||
|
想學結他、鋼琴
|
||||||
|
有關音樂制作工作
|
||||||
|
都可以搵我~
|
||||||
|
大家快D訂閱喇
|
||||||
|
不定期出片
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Website: http://ctlam331.wixsite.com/ctlamusic
|
||||||
|
FB PAGE:https://www.facebook.com/ctlam331
|
||||||
|
IG:ctlamusic`
|
||||||
|
urlsRegex := regexp.MustCompile(`(?m) ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)`)
|
||||||
|
descriptionSample := urlsRegex.ReplaceAllString(description, "")
|
||||||
|
info = whatlanggo.Detect(descriptionSample)
|
||||||
|
logrus.Infof("confidence: %.2f", info.Confidence)
|
||||||
|
assert.True(t, info.IsReliable())
|
||||||
|
assert.True(t, info.Lang.Iso6391() != "")
|
||||||
|
assert.Equal(t, "zh", info.Lang.Iso6391())
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue