ytsync/downloader/downloader.go

361 lines
11 KiB
Go
Raw Permalink Normal View History

package downloader
import (
2020-07-27 23:14:06 +02:00
"encoding/json"
"fmt"
2020-07-27 23:14:06 +02:00
"io"
"io/ioutil"
"net"
"net/http"
"net/url"
"os/exec"
"strings"
"time"
"github.com/davecgh/go-spew/spew"
2020-07-27 23:14:06 +02:00
"github.com/lbryio/ytsync/v5/downloader/ytdl"
2020-07-30 18:48:05 +02:00
"github.com/lbryio/ytsync/v5/ip_manager"
"github.com/lbryio/ytsync/v5/sdk"
2020-07-27 23:14:06 +02:00
"github.com/lbryio/lbry.go/v2/extras/errors"
"github.com/lbryio/lbry.go/v2/extras/stop"
"github.com/lbryio/lbry.go/v2/extras/util"
"github.com/sirupsen/logrus"
)
2020-07-30 18:48:05 +02:00
func GetPlaylistVideoIDs(channelName string, maxVideos int, stopChan stop.Chan, pool *ip_manager.IPPool) ([]string, error) {
args := []string{"--skip-download", "https://www.youtube.com/channel/" + channelName, "--get-id", "--flat-playlist"}
ids, err := run(channelName, args, true, true, stopChan, pool)
2020-07-27 21:57:19 +02:00
if err != nil {
return nil, errors.Err(err)
}
videoIDs := make([]string, maxVideos)
for i, v := range ids {
logrus.Debugf("%d - video id %s", i, v)
2020-07-27 21:57:19 +02:00
if i >= maxVideos {
break
}
videoIDs[i] = v
}
return videoIDs, nil
}
const releaseTimeFormat = "2006-01-02, 15:04:05 (MST)"
2020-07-30 18:48:05 +02:00
func GetVideoInformation(config *sdk.APIConfig, videoID string, stopChan stop.Chan, ip *net.TCPAddr, pool *ip_manager.IPPool) (*ytdl.YtdlVideo, error) {
args := []string{"--skip-download", "--print-json", "https://www.youtube.com/watch?v=" + videoID}
results, err := run(videoID, args, true, true, stopChan, pool)
if err != nil {
return nil, errors.Err(err)
}
2020-07-27 23:14:06 +02:00
var video *ytdl.YtdlVideo
err = json.Unmarshal([]byte(results[0]), &video)
if err != nil {
return nil, errors.Err(err)
}
// now get an accurate time
const maxTries = 5
tries := 0
GetTime:
tries++
t, err := getUploadTime(config, videoID, ip, video.UploadDate)
if err != nil {
//slack(":warning: Upload time error: %v", err)
if tries <= maxTries && (errors.Is(err, errNotScraped) || errors.Is(err, errUploadTimeEmpty) || errors.Is(err, errStatusParse) || errors.Is(err, errConnectionIssue)) {
err := triggerScrape(videoID, ip)
if err == nil {
time.Sleep(2 * time.Second) // let them scrape it
goto GetTime
} else {
//slack("triggering scrape returned error: %v", err)
}
} else if !errors.Is(err, errNotScraped) && !errors.Is(err, errUploadTimeEmpty) {
//slack(":warning: Error while trying to get accurate upload time for %s: %v", videoID, err)
2020-08-06 02:12:05 +02:00
if t == "" {
return nil, errors.Err(err)
} else {
t = "" //TODO: get rid of the other piece below?
}
}
// do fallback below
}
//slack("After all that, upload time for %s is %s", videoID, t)
if t != "" {
parsed, err := time.Parse("2006-01-02, 15:04:05 (MST)", t) // this will probably be UTC, but Go's timezone parsing is fucked up. it ignores the timezone in the date
if err != nil {
return nil, errors.Err(err)
}
//slack(":exclamation: Got an accurate time for %s", videoID)
video.UploadDateForReal = parsed
2020-08-06 02:12:05 +02:00
} else { //TODO: this is the piece that isn't needed!
slack(":warning: Could not get accurate time for %s. Falling back to time from upload ytdl: %s.", videoID, video.UploadDate)
// fall back to UploadDate from youtube-dl
video.UploadDateForReal, err = time.Parse("20060102", video.UploadDate)
if err != nil {
return nil, err
}
}
2020-07-27 23:14:06 +02:00
return video, nil
}
var errNotScraped = errors.Base("not yet scraped by caa.iti.gr")
var errUploadTimeEmpty = errors.Base("upload time is empty")
var errStatusParse = errors.Base("could not parse status, got number, need string")
var errConnectionIssue = errors.Base("there was a connection issue with the api")
func slack(format string, a ...interface{}) {
fmt.Printf(format+"\n", a...)
util.SendToSlack(format, a...)
}
func triggerScrape(videoID string, ip *net.TCPAddr) error {
//slack("Triggering scrape for %s", videoID)
u, err := url.Parse("https://caa.iti.gr/verify_videoV3")
q := u.Query()
q.Set("twtimeline", "0")
q.Set("url", "https://www.youtube.com/watch?v="+videoID)
u.RawQuery = q.Encode()
//slack("GET %s", u.String())
client := getClient(ip)
req, err := http.NewRequest(http.MethodGet, u.String(), nil)
if err != nil {
return errors.Err(err)
}
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36")
res, err := client.Do(req)
if err != nil {
return errors.Err(err)
}
defer res.Body.Close()
var response struct {
Message string `json:"message"`
Status string `json:"status"`
VideoURL string `json:"video_url"`
}
err = json.NewDecoder(res.Body).Decode(&response)
if err != nil {
if strings.Contains(err.Error(), "cannot unmarshal number") {
return errors.Err(errStatusParse)
}
if strings.Contains(err.Error(), "no route to host") {
return errors.Err(errConnectionIssue)
}
return errors.Err(err)
}
switch response.Status {
case "removed_video":
return errors.Err("video previously removed from service")
case "no_video":
return errors.Err("they say 'video cannot be found'. wtf?")
default:
spew.Dump(response)
}
return nil
//https://caa.iti.gr/caa/api/v4/videos/reports/h-tuxHS5lSM
}
func getUploadTime(config *sdk.APIConfig, videoID string, ip *net.TCPAddr, uploadDate string) (string, error) {
//slack("Getting upload time for %s", videoID)
release, err := config.GetReleasedDate(videoID)
if err != nil {
2020-08-06 00:25:54 +02:00
logrus.Error(err)
}
if release != nil {
2020-08-06 02:12:05 +02:00
//const sqlTimeFormat = "2006-01-02 15:04:05"
sqlTime, err := time.ParseInLocation(time.RFC3339, release.ReleaseTime, time.UTC)
2020-08-06 00:25:54 +02:00
if err == nil {
return sqlTime.Format(releaseTimeFormat), nil
2020-08-06 02:12:05 +02:00
} else {
logrus.Error(err)
}
}
ytdlUploadDate, err := time.Parse("20060102", uploadDate)
if err != nil {
logrus.Error(err)
}
2020-08-06 02:12:05 +02:00
if time.Now().AddDate(0, 0, -2).After(ytdlUploadDate) {
return ytdlUploadDate.Format(releaseTimeFormat), nil
}
client := getClient(ip)
req, err := http.NewRequest(http.MethodGet, "https://caa.iti.gr/get_verificationV3?url=https://www.youtube.com/watch?v="+videoID, nil)
if err != nil {
2020-08-06 02:12:05 +02:00
return ytdlUploadDate.Format(releaseTimeFormat), errors.Err(err)
}
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36")
res, err := client.Do(req)
if err != nil {
2020-08-06 02:12:05 +02:00
return ytdlUploadDate.Format(releaseTimeFormat), errors.Err(err)
}
defer res.Body.Close()
var uploadTime struct {
Time string `json:"video_upload_time"`
Message string `json:"message"`
Status string `json:"status"`
}
err = json.NewDecoder(res.Body).Decode(&uploadTime)
if err != nil {
2020-08-06 02:12:05 +02:00
return ytdlUploadDate.Format(releaseTimeFormat), errors.Err(err)
}
if uploadTime.Status == "ERROR1" {
2020-08-06 02:12:05 +02:00
return ytdlUploadDate.Format(releaseTimeFormat), errNotScraped
}
if uploadTime.Status == "" && strings.HasPrefix(uploadTime.Message, "CANNOT_RETRIEVE_REPORT_FOR_VIDEO_") {
2020-08-06 02:12:05 +02:00
return ytdlUploadDate.Format(releaseTimeFormat), errors.Err("cannot retrieve report for video")
}
if uploadTime.Time == "" {
2020-08-06 02:12:05 +02:00
return ytdlUploadDate.Format(releaseTimeFormat), errUploadTimeEmpty
}
return uploadTime.Time, nil
2020-07-27 23:14:06 +02:00
}
func getClient(ip *net.TCPAddr) *http.Client {
if ip == nil {
return http.DefaultClient
}
return &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{
LocalAddr: ip,
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
}).DialContext,
MaxIdleConns: 100,
IdleConnTimeout: 90 * time.Second,
TLSHandshakeTimeout: 10 * time.Second,
ExpectContinueTimeout: 1 * time.Second,
},
}
}
2020-07-30 18:48:05 +02:00
func run(use string, args []string, withStdErr, withStdOut bool, stopChan stop.Chan, pool *ip_manager.IPPool) ([]string, error) {
2020-07-30 18:14:06 +02:00
var maxtries = 10
var attemps int
var useragent []string
2020-07-30 18:03:07 +02:00
for {
sourceAddress, err := getIPFromPool(use, stopChan, pool)
if err != nil {
return nil, err
2020-07-30 18:03:07 +02:00
}
2020-07-30 18:48:05 +02:00
defer pool.ReleaseIP(sourceAddress)
2020-07-30 19:05:12 +02:00
argsForCommand := append(args, "--source-address", sourceAddress)
argsForCommand = append(argsForCommand, useragent...)
2020-07-30 19:05:12 +02:00
cmd := exec.Command("youtube-dl", argsForCommand...)
2020-07-30 19:05:49 +02:00
logrus.Printf("Running command youtube-dl %s", strings.Join(argsForCommand, " "))
2020-07-30 18:14:06 +02:00
var stderr io.ReadCloser
var errorLog []byte
if withStdErr {
var err error
stderr, err = cmd.StderrPipe()
if err != nil {
return nil, errors.Err(err)
}
2020-07-27 23:14:06 +02:00
}
2020-07-30 18:14:06 +02:00
var stdout io.ReadCloser
var outLog []byte
if withStdOut {
var err error
stdout, err = cmd.StdoutPipe()
if err != nil {
return nil, errors.Err(err)
}
2020-07-27 23:14:06 +02:00
2020-07-30 18:14:06 +02:00
if err := cmd.Start(); err != nil {
return nil, errors.Err(err)
}
outLog, err = ioutil.ReadAll(stdout)
if err != nil {
return nil, errors.Err(err)
}
if withStdErr {
errorLog, err = ioutil.ReadAll(stderr)
if err != nil {
return nil, errors.Err(err)
}
}
2020-07-27 23:14:06 +02:00
}
2020-07-30 18:14:06 +02:00
done := make(chan error, 1)
go func() {
attemps++
done <- cmd.Wait()
}()
select {
case <-stopChan:
if err := cmd.Process.Kill(); err != nil {
return nil, errors.Prefix("failed to kill command after stopper cancellation", err)
}
return nil, errors.Err("canceled by stopper")
case err := <-done:
if err != nil {
if strings.Contains(err.Error(), "exit status 1") {
if strings.Contains(string(errorLog), "HTTP Error 429") || strings.Contains(string(errorLog), "returned non-zero exit status 8") {
2020-07-30 18:48:05 +02:00
pool.SetThrottled(sourceAddress)
logrus.Debugf("known throttling error...try again (%d)", attemps)
continue
2020-07-30 18:14:06 +02:00
}
if strings.Contains(string(errorLog), "YouTube said: Unable to extract video data") {
useragent = []string{"--user-agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"}
if attemps == 1 {
useragent = []string{"--user-agent", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}
}
logrus.Debugf("known extraction issue, maybe user agent specification will work...try again (%d)", attemps)
continue
}
2020-07-30 18:14:06 +02:00
if attemps > maxtries {
2020-07-30 18:48:05 +02:00
logrus.Debug("too many tries returning failure")
2020-07-30 18:14:06 +02:00
break
}
}
logrus.Debugf("Unkown error, returning failure: %s", err.Error())
return nil, errors.Prefix("youtube-dl "+strings.Join(argsForCommand, " ")+" ["+string(errorLog)+"] ", err)
2020-07-30 18:14:06 +02:00
}
return strings.Split(strings.Replace(string(outLog), "\r\n", "\n", -1), "\n"), nil
}
2020-07-30 18:14:06 +02:00
if len(errorLog) > 0 {
return nil, errors.Err(string(errorLog))
}
}
}
func getIPFromPool(use string, stopChan stop.Chan, pool *ip_manager.IPPool) (sourceAddress string, err error) {
for {
sourceAddress, err = pool.GetIP(use)
if err != nil {
if errors.Is(err, ip_manager.ErrAllThrottled) {
select {
case <-stopChan:
return "", errors.Err("interrupted by user")
default:
time.Sleep(ip_manager.IPCooldownPeriod)
continue
}
} else {
return "", err
}
}
break
}
return
}