2020-07-27 21:42:45 +02:00
package downloader
import (
2020-07-27 23:14:06 +02:00
"encoding/json"
2020-07-28 18:47:28 +02:00
"fmt"
2020-07-27 23:14:06 +02:00
"io"
2020-07-27 21:42:45 +02:00
"io/ioutil"
2020-07-29 03:34:08 +02:00
"net"
2020-07-28 17:05:24 +02:00
"net/http"
2020-07-28 18:47:28 +02:00
"net/url"
2020-07-27 21:42:45 +02:00
"os/exec"
"strings"
2020-07-28 17:05:24 +02:00
"time"
2020-07-27 21:42:45 +02:00
2020-07-28 18:47:28 +02:00
"github.com/davecgh/go-spew/spew"
2020-07-27 23:14:06 +02:00
"github.com/lbryio/ytsync/v5/downloader/ytdl"
2020-07-30 18:48:05 +02:00
"github.com/lbryio/ytsync/v5/ip_manager"
"github.com/lbryio/ytsync/v5/sdk"
2020-07-27 23:14:06 +02:00
2020-07-27 21:42:45 +02:00
"github.com/lbryio/lbry.go/v2/extras/errors"
2020-07-29 03:34:08 +02:00
"github.com/lbryio/lbry.go/v2/extras/stop"
2020-07-28 17:05:24 +02:00
"github.com/lbryio/lbry.go/v2/extras/util"
2020-07-27 21:42:45 +02:00
"github.com/sirupsen/logrus"
)
2020-07-30 18:48:05 +02:00
func GetPlaylistVideoIDs ( channelName string , maxVideos int , stopChan stop . Chan , pool * ip_manager . IPPool ) ( [ ] string , error ) {
2020-07-27 21:42:45 +02:00
args := [ ] string { "--skip-download" , "https://www.youtube.com/channel/" + channelName , "--get-id" , "--flat-playlist" }
2020-08-03 07:05:03 +02:00
ids , err := run ( channelName , args , true , true , stopChan , pool )
2020-07-27 21:57:19 +02:00
if err != nil {
return nil , errors . Err ( err )
}
videoIDs := make ( [ ] string , maxVideos )
for i , v := range ids {
2020-08-03 07:05:03 +02:00
logrus . Debugf ( "%d - video id %s" , i , v )
2020-07-27 21:57:19 +02:00
if i >= maxVideos {
break
}
videoIDs [ i ] = v
}
return videoIDs , nil
2020-07-27 21:42:45 +02:00
}
2020-07-30 17:13:19 +02:00
const releaseTimeFormat = "2006-01-02, 15:04:05 (MST)"
2020-07-30 18:48:05 +02:00
func GetVideoInformation ( config * sdk . APIConfig , videoID string , stopChan stop . Chan , ip * net . TCPAddr , pool * ip_manager . IPPool ) ( * ytdl . YtdlVideo , error ) {
2020-07-29 03:34:08 +02:00
args := [ ] string { "--skip-download" , "--print-json" , "https://www.youtube.com/watch?v=" + videoID }
2020-08-03 07:05:03 +02:00
results , err := run ( videoID , args , true , true , stopChan , pool )
2020-07-29 03:34:08 +02:00
if err != nil {
return nil , errors . Err ( err )
}
2020-07-27 23:14:06 +02:00
var video * ytdl . YtdlVideo
2020-07-29 03:34:08 +02:00
err = json . Unmarshal ( [ ] byte ( results [ 0 ] ) , & video )
if err != nil {
return nil , errors . Err ( err )
}
2020-07-28 17:05:24 +02:00
// now get an accurate time
2020-07-28 18:47:28 +02:00
const maxTries = 5
2020-07-28 17:05:24 +02:00
tries := 0
GetTime :
tries ++
2020-07-30 21:37:14 +02:00
t , err := getUploadTime ( config , videoID , ip , video . UploadDate )
2020-07-28 17:05:24 +02:00
if err != nil {
2020-07-29 03:34:08 +02:00
//slack(":warning: Upload time error: %v", err)
2020-07-30 17:38:22 +02:00
if tries <= maxTries && ( errors . Is ( err , errNotScraped ) || errors . Is ( err , errUploadTimeEmpty ) || errors . Is ( err , errStatusParse ) || errors . Is ( err , errConnectionIssue ) ) {
2020-07-29 03:34:08 +02:00
err := triggerScrape ( videoID , ip )
if err == nil {
time . Sleep ( 2 * time . Second ) // let them scrape it
goto GetTime
} else {
//slack("triggering scrape returned error: %v", err)
}
2020-07-28 18:47:28 +02:00
} else if ! errors . Is ( err , errNotScraped ) && ! errors . Is ( err , errUploadTimeEmpty ) {
2020-07-29 03:34:08 +02:00
//slack(":warning: Error while trying to get accurate upload time for %s: %v", videoID, err)
2020-07-28 18:47:28 +02:00
return nil , errors . Err ( err )
2020-07-28 17:05:24 +02:00
}
2020-07-28 18:47:28 +02:00
// do fallback below
2020-07-28 17:05:24 +02:00
}
2020-07-29 03:34:08 +02:00
//slack("After all that, upload time for %s is %s", videoID, t)
2020-07-28 17:05:24 +02:00
if t != "" {
parsed , err := time . Parse ( "2006-01-02, 15:04:05 (MST)" , t ) // this will probably be UTC, but Go's timezone parsing is fucked up. it ignores the timezone in the date
if err != nil {
return nil , errors . Err ( err )
}
2020-07-29 03:34:08 +02:00
slack ( ":exclamation: Got an accurate time for %s" , videoID )
2020-07-28 17:05:24 +02:00
video . UploadDateForReal = parsed
} else {
2020-07-29 03:34:08 +02:00
//slack(":warning: Could not get accurate time for %s. Falling back to time from upload ytdl: %s.", videoID, video.UploadDate)
2020-07-28 17:05:24 +02:00
// fall back to UploadDate from youtube-dl
video . UploadDateForReal , err = time . Parse ( "20060102" , video . UploadDate )
if err != nil {
return nil , err
}
}
2020-07-27 23:14:06 +02:00
return video , nil
2020-07-28 17:05:24 +02:00
}
var errNotScraped = errors . Base ( "not yet scraped by caa.iti.gr" )
2020-07-28 18:47:28 +02:00
var errUploadTimeEmpty = errors . Base ( "upload time is empty" )
2020-07-30 17:38:22 +02:00
var errStatusParse = errors . Base ( "could not parse status, got number, need string" )
var errConnectionIssue = errors . Base ( "there was a connection issue with the api" )
2020-07-28 18:47:28 +02:00
// slack prints the formatted message, followed by a newline, to standard
// output and then hands the same format and arguments to util.SendToSlack.
func slack(format string, a ...interface{}) {
	withNewline := format + "\n"
	fmt.Printf(withNewline, a...)

	util.SendToSlack(format, a...)
}
2020-07-28 17:05:24 +02:00
2020-07-29 03:34:08 +02:00
func triggerScrape ( videoID string , ip * net . TCPAddr ) error {
//slack("Triggering scrape for %s", videoID)
2020-07-28 18:47:28 +02:00
u , err := url . Parse ( "https://caa.iti.gr/verify_videoV3" )
q := u . Query ( )
q . Set ( "twtimeline" , "0" )
q . Set ( "url" , "https://www.youtube.com/watch?v=" + videoID )
u . RawQuery = q . Encode ( )
2020-07-29 03:34:08 +02:00
//slack("GET %s", u.String())
client := getClient ( ip )
req , err := http . NewRequest ( http . MethodGet , u . String ( ) , nil )
if err != nil {
return errors . Err ( err )
}
req . Header . Set ( "User-Agent" , "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36" )
res , err := client . Do ( req )
2020-07-28 17:05:24 +02:00
if err != nil {
return errors . Err ( err )
}
defer res . Body . Close ( )
2020-07-29 03:34:08 +02:00
var response struct {
Message string ` json:"message" `
Status string ` json:"status" `
VideoURL string ` json:"video_url" `
}
err = json . NewDecoder ( res . Body ) . Decode ( & response )
if err != nil {
2020-07-30 17:38:22 +02:00
if strings . Contains ( err . Error ( ) , "cannot unmarshal number" ) {
return errors . Err ( errStatusParse )
}
if strings . Contains ( err . Error ( ) , "no route to host" ) {
return errors . Err ( errConnectionIssue )
}
2020-07-29 03:34:08 +02:00
return errors . Err ( err )
}
switch response . Status {
case "removed_video" :
return errors . Err ( "video previously removed from service" )
case "no_video" :
return errors . Err ( "they say 'video cannot be found'. wtf?" )
default :
spew . Dump ( response )
}
2020-07-28 18:47:28 +02:00
2020-07-28 17:05:24 +02:00
return nil
//https://caa.iti.gr/caa/api/v4/videos/reports/h-tuxHS5lSM
}
2020-07-30 21:37:14 +02:00
func getUploadTime ( config * sdk . APIConfig , videoID string , ip * net . TCPAddr , uploadDate string ) ( string , error ) {
2020-07-29 03:34:08 +02:00
//slack("Getting upload time for %s", videoID)
2020-07-30 17:13:19 +02:00
release , err := config . GetReleasedDate ( videoID )
if err != nil {
if release != nil {
const sqlTimeFormat = "2006-01-02 15:04:05"
sqlTime , err := time . ParseInLocation ( sqlTimeFormat , release . ReleaseTime , time . UTC )
if err != nil {
return sqlTime . Format ( releaseTimeFormat ) , nil
}
}
}
2020-07-30 21:37:14 +02:00
ytdlUploadDate , err := time . Parse ( "20060102" , uploadDate )
if err != nil {
logrus . Error ( err )
}
if time . Now ( ) . Add ( - 5 * 24 * time . Hour ) . After ( ytdlUploadDate ) {
return ytdlUploadDate . Format ( releaseTimeFormat ) , nil
}
2020-07-29 03:34:08 +02:00
client := getClient ( ip )
req , err := http . NewRequest ( http . MethodGet , "https://caa.iti.gr/get_verificationV3?url=https://www.youtube.com/watch?v=" + videoID , nil )
if err != nil {
return "" , errors . Err ( err )
}
req . Header . Set ( "User-Agent" , "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36" )
res , err := client . Do ( req )
2020-07-28 17:05:24 +02:00
if err != nil {
return "" , errors . Err ( err )
}
defer res . Body . Close ( )
var uploadTime struct {
Time string ` json:"video_upload_time" `
Message string ` json:"message" `
Status string ` json:"status" `
}
err = json . NewDecoder ( res . Body ) . Decode ( & uploadTime )
if err != nil {
return "" , errors . Err ( err )
}
if uploadTime . Status == "ERROR1" {
return "" , errNotScraped
}
2020-07-29 03:34:08 +02:00
if uploadTime . Status == "" && strings . HasPrefix ( uploadTime . Message , "CANNOT_RETRIEVE_REPORT_FOR_VIDEO_" ) {
return "" , errors . Err ( "cannot retrieve report for video" )
}
2020-07-28 18:47:28 +02:00
if uploadTime . Time == "" {
return "" , errUploadTimeEmpty
}
2020-07-28 17:05:24 +02:00
return uploadTime . Time , nil
2020-07-27 23:14:06 +02:00
}
2020-07-27 21:42:45 +02:00
2020-07-29 03:34:08 +02:00
// getClient picks the HTTP client used for outgoing API calls.
//
// Without an address the shared http.DefaultClient is returned. With an
// address, a new client is built whose dialer uses ip as its local address;
// the proxy, idle-connection and timeout settings are the fixed values below.
func getClient(ip *net.TCPAddr) *http.Client {
	if ip != nil {
		boundDialer := &net.Dialer{
			LocalAddr: ip,
			Timeout:   30 * time.Second,
			KeepAlive: 30 * time.Second,
		}
		transport := &http.Transport{
			Proxy:                 http.ProxyFromEnvironment,
			DialContext:           boundDialer.DialContext,
			MaxIdleConns:          100,
			IdleConnTimeout:       90 * time.Second,
			TLSHandshakeTimeout:   10 * time.Second,
			ExpectContinueTimeout: 1 * time.Second,
		}
		return &http.Client{Transport: transport}
	}
	return http.DefaultClient
}
2020-07-30 18:48:05 +02:00
func run ( use string , args [ ] string , withStdErr , withStdOut bool , stopChan stop . Chan , pool * ip_manager . IPPool ) ( [ ] string , error ) {
2020-07-30 18:14:06 +02:00
var maxtries = 10
var attemps int
2020-08-03 07:05:03 +02:00
var useragent [ ] string
2020-07-30 18:03:07 +02:00
for {
2020-08-03 07:05:03 +02:00
sourceAddress , err := getIPFromPool ( use , stopChan , pool )
if err != nil {
return nil , err
2020-07-30 18:03:07 +02:00
}
2020-07-30 18:48:05 +02:00
defer pool . ReleaseIP ( sourceAddress )
2020-07-30 19:05:12 +02:00
argsForCommand := append ( args , "--source-address" , sourceAddress )
2020-08-03 07:05:03 +02:00
argsForCommand = append ( argsForCommand , useragent ... )
2020-07-30 19:05:12 +02:00
cmd := exec . Command ( "youtube-dl" , argsForCommand ... )
2020-07-30 19:05:49 +02:00
logrus . Printf ( "Running command youtube-dl %s" , strings . Join ( argsForCommand , " " ) )
2020-07-30 18:14:06 +02:00
var stderr io . ReadCloser
var errorLog [ ] byte
if withStdErr {
var err error
stderr , err = cmd . StderrPipe ( )
if err != nil {
return nil , errors . Err ( err )
}
2020-07-27 23:14:06 +02:00
}
2020-07-27 21:42:45 +02:00
2020-07-30 18:14:06 +02:00
var stdout io . ReadCloser
var outLog [ ] byte
if withStdOut {
var err error
stdout , err = cmd . StdoutPipe ( )
if err != nil {
return nil , errors . Err ( err )
}
2020-07-27 23:14:06 +02:00
2020-07-30 18:14:06 +02:00
if err := cmd . Start ( ) ; err != nil {
return nil , errors . Err ( err )
}
outLog , err = ioutil . ReadAll ( stdout )
if err != nil {
return nil , errors . Err ( err )
}
2020-08-03 07:05:03 +02:00
if withStdErr {
errorLog , err = ioutil . ReadAll ( stderr )
if err != nil {
return nil , errors . Err ( err )
}
}
2020-07-27 23:14:06 +02:00
}
2020-07-29 03:34:08 +02:00
2020-07-30 18:14:06 +02:00
done := make ( chan error , 1 )
go func ( ) {
attemps ++
done <- cmd . Wait ( )
} ( )
select {
case <- stopChan :
if err := cmd . Process . Kill ( ) ; err != nil {
return nil , errors . Prefix ( "failed to kill command after stopper cancellation" , err )
}
return nil , errors . Err ( "canceled by stopper" )
case err := <- done :
if err != nil {
if strings . Contains ( err . Error ( ) , "exit status 1" ) {
if strings . Contains ( string ( errorLog ) , "HTTP Error 429" ) || strings . Contains ( string ( errorLog ) , "returned non-zero exit status 8" ) {
2020-07-30 18:48:05 +02:00
pool . SetThrottled ( sourceAddress )
2020-07-30 19:37:44 +02:00
logrus . Debugf ( "known throttling error...try again (%d)" , attemps )
continue
2020-07-30 18:14:06 +02:00
}
2020-08-03 07:05:03 +02:00
if strings . Contains ( string ( errorLog ) , "YouTube said: Unable to extract video data" ) {
useragent = [ ] string { "--user-agent" , "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36" }
if attemps == 1 {
useragent = [ ] string { "--user-agent" , "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" }
}
logrus . Debugf ( "known extraction issue, maybe user agent specification will work...try again (%d)" , attemps )
continue
}
2020-07-30 18:14:06 +02:00
if attemps > maxtries {
2020-07-30 18:48:05 +02:00
logrus . Debug ( "too many tries returning failure" )
2020-07-30 18:14:06 +02:00
break
}
}
2020-07-30 21:37:14 +02:00
logrus . Debugf ( "Unkown error, returning failure: %s" , err . Error ( ) )
2020-08-03 07:05:03 +02:00
return nil , errors . Prefix ( "youtube-dl " + strings . Join ( argsForCommand , " " ) + " [" + string ( errorLog ) + "] " , err )
2020-07-30 18:14:06 +02:00
}
return strings . Split ( strings . Replace ( string ( outLog ) , "\r\n" , "\n" , - 1 ) , "\n" ) , nil
2020-07-29 03:34:08 +02:00
}
2020-07-30 18:14:06 +02:00
if len ( errorLog ) > 0 {
return nil , errors . Err ( string ( errorLog ) )
}
2020-07-27 21:42:45 +02:00
}
}
2020-08-03 07:05:03 +02:00
// getIPFromPool obtains a source address from pool for the given use.
//
// While pool.GetIP reports ip_manager.ErrAllThrottled, it waits for
// ip_manager.IPCooldownPeriod and asks again. The wait ends early with an
// "interrupted by user" error as soon as stopChan fires. Any other error from
// the pool is returned unchanged.
func getIPFromPool(use string, stopChan stop.Chan, pool *ip_manager.IPPool) (sourceAddress string, err error) {
	for {
		sourceAddress, err = pool.GetIP(use)
		if err == nil {
			return sourceAddress, nil
		}
		if !errors.Is(err, ip_manager.ErrAllThrottled) {
			return "", err
		}

		// Wait out the cooldown, but stay responsive to cancellation.
		select {
		case <-stopChan:
			return "", errors.Err("interrupted by user")
		case <-time.After(ip_manager.IPCooldownPeriod):
		}
	}
}