ytsync/downloader/downloader.go
Niko Storni e454cdb4c9 fix post live detection
prevent unlisted videos from ever publishing (even if they were public before and we know about them)
fix timestamp on videos
update user agent
2022-08-10 21:26:36 +02:00

315 lines
8.5 KiB
Go

package downloader
import (
"encoding/json"
"fmt"
"io/ioutil"
"math"
"net"
"net/http"
"net/url"
"os"
"os/exec"
"path"
"strings"
"time"
"github.com/davecgh/go-spew/spew"
"github.com/lbryio/ytsync/v5/downloader/ytdl"
"github.com/lbryio/ytsync/v5/ip_manager"
"github.com/lbryio/ytsync/v5/sdk"
"github.com/lbryio/ytsync/v5/shared"
util2 "github.com/lbryio/ytsync/v5/util"
"github.com/lbryio/lbry.go/v2/extras/errors"
"github.com/lbryio/lbry.go/v2/extras/stop"
"github.com/lbryio/lbry.go/v2/extras/util"
"github.com/sirupsen/logrus"
)
func GetPlaylistVideoIDs(channelName string, maxVideos int, stopChan stop.Chan, pool *ip_manager.IPPool) ([]string, error) {
args := []string{"--skip-download", "https://www.youtube.com/channel/" + channelName + "/videos", "--get-id", "--flat-playlist", "--cookies", "cookies.txt", "--playlist-end", fmt.Sprintf("%d", maxVideos)}
ids, err := run(channelName, args, stopChan, pool)
if err != nil {
return nil, errors.Err(err)
}
videoIDs := make([]string, 0, maxVideos)
for i, v := range ids {
if v == "" {
continue
}
logrus.Debugf("%d - video id %s", i, v)
if i >= maxVideos {
break
}
videoIDs = append(videoIDs, v)
}
return videoIDs, nil
}
const releaseTimeFormat = "2006-01-02, 15:04:05 (MST)"
func GetVideoInformation(videoID string, stopChan stop.Chan, pool *ip_manager.IPPool) (*ytdl.YtdlVideo, error) {
args := []string{
"--skip-download",
"--write-info-json",
fmt.Sprintf("https://www.youtube.com/watch?v=%s", videoID),
"--cookies",
"cookies.txt",
"-o",
path.Join(util2.GetVideoMetadataDir(), videoID),
}
_, err := run(videoID, args, stopChan, pool)
if err != nil {
return nil, errors.Err(err)
}
f, err := os.Open(path.Join(util2.GetVideoMetadataDir(), videoID+".info.json"))
if err != nil {
return nil, errors.Err(err)
}
// defer the closing of our jsonFile so that we can parse it later on
defer f.Close()
// read our opened jsonFile as a byte array.
byteValue, _ := ioutil.ReadAll(f)
var video *ytdl.YtdlVideo
err = json.Unmarshal(byteValue, &video)
if err != nil {
return nil, errors.Err(err)
}
return video, nil
}
var errNotScraped = errors.Base("not yet scraped by caa.iti.gr")
var errUploadTimeEmpty = errors.Base("upload time is empty")
var errStatusParse = errors.Base("could not parse status, got number, need string")
var errConnectionIssue = errors.Base("there was a connection issue with the api")
func slack(format string, a ...interface{}) {
fmt.Printf(format+"\n", a...)
util.SendToSlack(format, a...)
}
func triggerScrape(videoID string, ip *net.TCPAddr) error {
//slack("Triggering scrape for %s", videoID)
u, err := url.Parse("https://caa.iti.gr/verify_videoV3")
q := u.Query()
q.Set("twtimeline", "0")
q.Set("url", "https://www.youtube.com/watch?v="+videoID)
u.RawQuery = q.Encode()
//slack("GET %s", u.String())
client := getClient(ip)
req, err := http.NewRequest(http.MethodGet, u.String(), nil)
if err != nil {
return errors.Err(err)
}
req.Header.Set("User-Agent", ChromeUA)
res, err := client.Do(req)
if err != nil {
return errors.Err(err)
}
defer res.Body.Close()
var response struct {
Message string `json:"message"`
Status string `json:"status"`
VideoURL string `json:"video_url"`
}
err = json.NewDecoder(res.Body).Decode(&response)
if err != nil {
if strings.Contains(err.Error(), "cannot unmarshal number") {
return errors.Err(errStatusParse)
}
if strings.Contains(err.Error(), "no route to host") {
return errors.Err(errConnectionIssue)
}
return errors.Err(err)
}
switch response.Status {
case "removed_video":
return errors.Err("video previously removed from service")
case "no_video":
return errors.Err("they say 'video cannot be found'. wtf?")
default:
spew.Dump(response)
}
return nil
//https://caa.iti.gr/caa/api/v4/videos/reports/h-tuxHS5lSM
}
func getUploadTime(config *sdk.APIConfig, videoID string, ip *net.TCPAddr, uploadDate string) (string, error) {
//slack("Getting upload time for %s", videoID)
release, err := config.GetReleasedDate(videoID)
if err != nil {
logrus.Error(err)
}
ytdlUploadDate, err := time.Parse("20060102", uploadDate)
if err != nil {
logrus.Error(err)
}
if release != nil {
//const sqlTimeFormat = "2006-01-02 15:04:05"
sqlTime, err := time.ParseInLocation(time.RFC3339, release.ReleaseTime, time.UTC)
if err == nil {
hoursDiff := math.Abs(sqlTime.Sub(ytdlUploadDate).Hours())
if hoursDiff > 48 {
logrus.Infof("upload day from APIs differs from the ytdl one by more than 2 days.")
} else {
return sqlTime.Format(releaseTimeFormat), nil
}
} else {
logrus.Error(err)
}
}
return ytdlUploadDate.Format(releaseTimeFormat), nil
}
func getClient(ip *net.TCPAddr) *http.Client {
if ip == nil {
return http.DefaultClient
}
return &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyFromEnvironment,
DialContext: (&net.Dialer{
LocalAddr: ip,
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
}).DialContext,
MaxIdleConns: 100,
IdleConnTimeout: 90 * time.Second,
TLSHandshakeTimeout: 10 * time.Second,
ExpectContinueTimeout: 1 * time.Second,
},
}
}
const (
GoogleBotUA = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
ChromeUA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36"
maxAttempts = 3
extractionError = "YouTube said: Unable to extract video data"
throttledError = "HTTP Error 429"
AlternateThrottledError = "returned non-zero exit status 8"
youtubeDlError = "exit status 1"
videoPremiereError = "Premieres in"
liveEventError = "This live event will begin in"
)
func run(use string, args []string, stopChan stop.Chan, pool *ip_manager.IPPool) ([]string, error) {
var useragent []string
var lastError error
for attempts := 0; attempts < maxAttempts; attempts++ {
sourceAddress, err := getIPFromPool(use, stopChan, pool)
if err != nil {
return nil, err
}
argsForCommand := append(args, "--source-address", sourceAddress)
argsForCommand = append(argsForCommand, useragent...)
binary := "yt-dlp"
cmd := exec.Command(binary, argsForCommand...)
res, err := runCmd(cmd, stopChan)
pool.ReleaseIP(sourceAddress)
if err == nil {
return res, nil
}
lastError = err
if strings.Contains(err.Error(), youtubeDlError) {
if util.SubstringInSlice(err.Error(), shared.ErrorsNoRetry) {
break
}
if strings.Contains(err.Error(), extractionError) {
logrus.Warnf("known extraction error: %s", errors.FullTrace(err))
useragent = nextUA(useragent)
}
if strings.Contains(err.Error(), throttledError) || strings.Contains(err.Error(), AlternateThrottledError) {
pool.SetThrottled(sourceAddress)
//we don't want throttle errors to count toward the max retries
attempts--
}
}
}
return nil, lastError
}
func nextUA(current []string) []string {
if len(current) == 0 {
return []string{"--user-agent", GoogleBotUA}
}
return []string{"--user-agent", ChromeUA}
}
func runCmd(cmd *exec.Cmd, stopChan stop.Chan) ([]string, error) {
logrus.Infof("running yt-dlp cmd: %s", strings.Join(cmd.Args, " "))
var err error
stderr, err := cmd.StderrPipe()
if err != nil {
return nil, errors.Err(err)
}
stdout, err := cmd.StdoutPipe()
if err != nil {
return nil, errors.Err(err)
}
err = cmd.Start()
if err != nil {
return nil, errors.Err(err)
}
outLog, err := ioutil.ReadAll(stdout)
if err != nil {
return nil, errors.Err(err)
}
errorLog, err := ioutil.ReadAll(stderr)
if err != nil {
return nil, errors.Err(err)
}
done := make(chan error, 1)
go func() {
done <- cmd.Wait()
}()
select {
case <-stopChan:
err := cmd.Process.Kill()
if err != nil {
return nil, errors.Prefix("failed to kill command after stopper cancellation", err)
}
return nil, errors.Err("interrupted by user")
case err := <-done:
if err != nil {
return nil, errors.Prefix("yt-dlp "+strings.Join(cmd.Args, " ")+" ["+string(errorLog)+"]", err)
}
return strings.Split(strings.Replace(string(outLog), "\r\n", "\n", -1), "\n"), nil
}
}
func getIPFromPool(use string, stopChan stop.Chan, pool *ip_manager.IPPool) (sourceAddress string, err error) {
for {
sourceAddress, err = pool.GetIP(use)
if err != nil {
if errors.Is(err, ip_manager.ErrAllThrottled) {
select {
case <-stopChan:
return "", errors.Err("interrupted by user")
default:
time.Sleep(ip_manager.IPCooldownPeriod)
continue
}
} else {
return "", err
}
}
break
}
return
}