ytsync/downloader/downloader.go
Mark Beamer Jr 5be3551abe
Mark video failed if we cannot get video info instead of failing the sync
Turn on error log after fixing lockup
Try different user agents if calls fail
Prevent double processing a channel in a row, just fail the sync instead
2020-08-05 15:47:53 -04:00

353 lines
10 KiB
Go

package downloader
import (
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net"
"net/http"
"net/url"
"os/exec"
"strings"
"time"
"github.com/davecgh/go-spew/spew"
"github.com/lbryio/ytsync/v5/downloader/ytdl"
"github.com/lbryio/ytsync/v5/ip_manager"
"github.com/lbryio/ytsync/v5/sdk"
"github.com/lbryio/lbry.go/v2/extras/errors"
"github.com/lbryio/lbry.go/v2/extras/stop"
"github.com/lbryio/lbry.go/v2/extras/util"
"github.com/sirupsen/logrus"
)
func GetPlaylistVideoIDs(channelName string, maxVideos int, stopChan stop.Chan, pool *ip_manager.IPPool) ([]string, error) {
args := []string{"--skip-download", "https://www.youtube.com/channel/" + channelName, "--get-id", "--flat-playlist"}
ids, err := run(channelName, args, true, true, stopChan, pool)
if err != nil {
return nil, errors.Err(err)
}
videoIDs := make([]string, maxVideos)
for i, v := range ids {
logrus.Debugf("%d - video id %s", i, v)
if i >= maxVideos {
break
}
videoIDs[i] = v
}
return videoIDs, nil
}
const releaseTimeFormat = "2006-01-02, 15:04:05 (MST)"
func GetVideoInformation(config *sdk.APIConfig, videoID string, stopChan stop.Chan, ip *net.TCPAddr, pool *ip_manager.IPPool) (*ytdl.YtdlVideo, error) {
args := []string{"--skip-download", "--print-json", "https://www.youtube.com/watch?v=" + videoID}
results, err := run(videoID, args, true, true, stopChan, pool)
if err != nil {
return nil, errors.Err(err)
}
var video *ytdl.YtdlVideo
err = json.Unmarshal([]byte(results[0]), &video)
if err != nil {
return nil, errors.Err(err)
}
// now get an accurate time
const maxTries = 5
tries := 0
GetTime:
tries++
t, err := getUploadTime(config, videoID, ip, video.UploadDate)
if err != nil {
//slack(":warning: Upload time error: %v", err)
if tries <= maxTries && (errors.Is(err, errNotScraped) || errors.Is(err, errUploadTimeEmpty) || errors.Is(err, errStatusParse) || errors.Is(err, errConnectionIssue)) {
err := triggerScrape(videoID, ip)
if err == nil {
time.Sleep(2 * time.Second) // let them scrape it
goto GetTime
} else {
//slack("triggering scrape returned error: %v", err)
}
} else if !errors.Is(err, errNotScraped) && !errors.Is(err, errUploadTimeEmpty) {
//slack(":warning: Error while trying to get accurate upload time for %s: %v", videoID, err)
return nil, errors.Err(err)
}
// do fallback below
}
//slack("After all that, upload time for %s is %s", videoID, t)
if t != "" {
parsed, err := time.Parse("2006-01-02, 15:04:05 (MST)", t) // this will probably be UTC, but Go's timezone parsing is fucked up. it ignores the timezone in the date
if err != nil {
return nil, errors.Err(err)
}
slack(":exclamation: Got an accurate time for %s", videoID)
video.UploadDateForReal = parsed
} else {
//slack(":warning: Could not get accurate time for %s. Falling back to time from upload ytdl: %s.", videoID, video.UploadDate)
// fall back to UploadDate from youtube-dl
video.UploadDateForReal, err = time.Parse("20060102", video.UploadDate)
if err != nil {
return nil, err
}
}
return video, nil
}
var errNotScraped = errors.Base("not yet scraped by caa.iti.gr")
var errUploadTimeEmpty = errors.Base("upload time is empty")
var errStatusParse = errors.Base("could not parse status, got number, need string")
var errConnectionIssue = errors.Base("there was a connection issue with the api")
func slack(format string, a ...interface{}) {
fmt.Printf(format+"\n", a...)
util.SendToSlack(format, a...)
}
func triggerScrape(videoID string, ip *net.TCPAddr) error {
//slack("Triggering scrape for %s", videoID)
u, err := url.Parse("https://caa.iti.gr/verify_videoV3")
q := u.Query()
q.Set("twtimeline", "0")
q.Set("url", "https://www.youtube.com/watch?v="+videoID)
u.RawQuery = q.Encode()
//slack("GET %s", u.String())
client := getClient(ip)
req, err := http.NewRequest(http.MethodGet, u.String(), nil)
if err != nil {
return errors.Err(err)
}
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36")
res, err := client.Do(req)
if err != nil {
return errors.Err(err)
}
defer res.Body.Close()
var response struct {
Message string `json:"message"`
Status string `json:"status"`
VideoURL string `json:"video_url"`
}
err = json.NewDecoder(res.Body).Decode(&response)
if err != nil {
if strings.Contains(err.Error(), "cannot unmarshal number") {
return errors.Err(errStatusParse)
}
if strings.Contains(err.Error(), "no route to host") {
return errors.Err(errConnectionIssue)
}
return errors.Err(err)
}
switch response.Status {
case "removed_video":
return errors.Err("video previously removed from service")
case "no_video":
return errors.Err("they say 'video cannot be found'. wtf?")
default:
spew.Dump(response)
}
return nil
//https://caa.iti.gr/caa/api/v4/videos/reports/h-tuxHS5lSM
}
func getUploadTime(config *sdk.APIConfig, videoID string, ip *net.TCPAddr, uploadDate string) (string, error) {
//slack("Getting upload time for %s", videoID)
release, err := config.GetReleasedDate(videoID)
if err != nil {
if release != nil {
const sqlTimeFormat = "2006-01-02 15:04:05"
sqlTime, err := time.ParseInLocation(sqlTimeFormat, release.ReleaseTime, time.UTC)
if err != nil {
return sqlTime.Format(releaseTimeFormat), nil
}
}
}
ytdlUploadDate, err := time.Parse("20060102", uploadDate)
if err != nil {
logrus.Error(err)
}
if time.Now().Add(-5 * 24 * time.Hour).After(ytdlUploadDate) {
return ytdlUploadDate.Format(releaseTimeFormat), nil
}
client := getClient(ip)
req, err := http.NewRequest(http.MethodGet, "https://caa.iti.gr/get_verificationV3?url=https://www.youtube.com/watch?v="+videoID, nil)
if err != nil {
return "", errors.Err(err)
}
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36")
res, err := client.Do(req)
if err != nil {
return "", errors.Err(err)
}
defer res.Body.Close()
var uploadTime struct {
Time string `json:"video_upload_time"`
Message string `json:"message"`
Status string `json:"status"`
}
err = json.NewDecoder(res.Body).Decode(&uploadTime)
if err != nil {
return "", errors.Err(err)
}
if uploadTime.Status == "ERROR1" {
return "", errNotScraped
}
if uploadTime.Status == "" && strings.HasPrefix(uploadTime.Message, "CANNOT_RETRIEVE_REPORT_FOR_VIDEO_") {
return "", errors.Err("cannot retrieve report for video")
}
if uploadTime.Time == "" {
return "", errUploadTimeEmpty
}
return uploadTime.Time, nil
}
func getClient(ip *net.TCPAddr) *http.Client {
if ip == nil {
return http.DefaultClient
}
return &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{
LocalAddr: ip,
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
}).DialContext,
MaxIdleConns: 100,
IdleConnTimeout: 90 * time.Second,
TLSHandshakeTimeout: 10 * time.Second,
ExpectContinueTimeout: 1 * time.Second,
},
}
}
func run(use string, args []string, withStdErr, withStdOut bool, stopChan stop.Chan, pool *ip_manager.IPPool) ([]string, error) {
var maxtries = 10
var attemps int
var useragent []string
for {
sourceAddress, err := getIPFromPool(use, stopChan, pool)
if err != nil {
return nil, err
}
defer pool.ReleaseIP(sourceAddress)
argsForCommand := append(args, "--source-address", sourceAddress)
argsForCommand = append(argsForCommand, useragent...)
cmd := exec.Command("youtube-dl", argsForCommand...)
logrus.Printf("Running command youtube-dl %s", strings.Join(argsForCommand, " "))
var stderr io.ReadCloser
var errorLog []byte
if withStdErr {
var err error
stderr, err = cmd.StderrPipe()
if err != nil {
return nil, errors.Err(err)
}
}
var stdout io.ReadCloser
var outLog []byte
if withStdOut {
var err error
stdout, err = cmd.StdoutPipe()
if err != nil {
return nil, errors.Err(err)
}
if err := cmd.Start(); err != nil {
return nil, errors.Err(err)
}
outLog, err = ioutil.ReadAll(stdout)
if err != nil {
return nil, errors.Err(err)
}
if withStdErr {
errorLog, err = ioutil.ReadAll(stderr)
if err != nil {
return nil, errors.Err(err)
}
}
}
done := make(chan error, 1)
go func() {
attemps++
done <- cmd.Wait()
}()
select {
case <-stopChan:
if err := cmd.Process.Kill(); err != nil {
return nil, errors.Prefix("failed to kill command after stopper cancellation", err)
}
return nil, errors.Err("canceled by stopper")
case err := <-done:
if err != nil {
if strings.Contains(err.Error(), "exit status 1") {
if strings.Contains(string(errorLog), "HTTP Error 429") || strings.Contains(string(errorLog), "returned non-zero exit status 8") {
pool.SetThrottled(sourceAddress)
logrus.Debugf("known throttling error...try again (%d)", attemps)
continue
}
if strings.Contains(string(errorLog), "YouTube said: Unable to extract video data") {
useragent = []string{"--user-agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"}
if attemps == 1 {
useragent = []string{"--user-agent", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}
}
logrus.Debugf("known extraction issue, maybe user agent specification will work...try again (%d)", attemps)
continue
}
if attemps > maxtries {
logrus.Debug("too many tries returning failure")
break
}
}
logrus.Debugf("Unkown error, returning failure: %s", err.Error())
return nil, errors.Prefix("youtube-dl "+strings.Join(argsForCommand, " ")+" ["+string(errorLog)+"] ", err)
}
return strings.Split(strings.Replace(string(outLog), "\r\n", "\n", -1), "\n"), nil
}
if len(errorLog) > 0 {
return nil, errors.Err(string(errorLog))
}
}
}
func getIPFromPool(use string, stopChan stop.Chan, pool *ip_manager.IPPool) (sourceAddress string, err error) {
for {
sourceAddress, err = pool.GetIP(use)
if err != nil {
if errors.Is(err, ip_manager.ErrAllThrottled) {
select {
case <-stopChan:
return "", errors.Err("interrupted by user")
default:
time.Sleep(ip_manager.IPCooldownPeriod)
continue
}
} else {
return "", err
}
}
break
}
return
}