package downloader import ( "encoding/json" "fmt" "io/ioutil" "math" "net" "net/http" "net/url" "os" "os/exec" "path" "strings" "time" "github.com/davecgh/go-spew/spew" "github.com/lbryio/ytsync/v5/downloader/ytdl" "github.com/lbryio/ytsync/v5/ip_manager" "github.com/lbryio/ytsync/v5/sdk" "github.com/lbryio/ytsync/v5/shared" util2 "github.com/lbryio/ytsync/v5/util" "github.com/lbryio/lbry.go/v2/extras/errors" "github.com/lbryio/lbry.go/v2/extras/stop" "github.com/lbryio/lbry.go/v2/extras/util" "github.com/sirupsen/logrus" ) func GetPlaylistVideoIDs(channelName string, maxVideos int, stopChan stop.Chan, pool *ip_manager.IPPool) ([]string, error) { args := []string{"--skip-download", "https://www.youtube.com/channel/" + channelName + "/videos", "--get-id", "--flat-playlist", "--cookies", "cookies.txt", "--playlist-end", fmt.Sprintf("%d", maxVideos)} ids, err := run(channelName, args, stopChan, pool) if err != nil { return nil, errors.Err(err) } videoIDs := make([]string, 0, maxVideos) for i, v := range ids { if v == "" { continue } logrus.Debugf("%d - video id %s", i, v) if i >= maxVideos { break } videoIDs = append(videoIDs, v) } return videoIDs, nil } const releaseTimeFormat = "2006-01-02, 15:04:05 (MST)" func GetVideoInformation(videoID string, stopChan stop.Chan, pool *ip_manager.IPPool) (*ytdl.YtdlVideo, error) { args := []string{ "--skip-download", "--write-info-json", fmt.Sprintf("https://www.youtube.com/watch?v=%s", videoID), "--cookies", "cookies.txt", "-o", path.Join(util2.GetVideoMetadataDir(), videoID), } _, err := run(videoID, args, stopChan, pool) if err != nil { return nil, errors.Err(err) } f, err := os.Open(path.Join(util2.GetVideoMetadataDir(), videoID+".info.json")) if err != nil { return nil, errors.Err(err) } // defer the closing of our jsonFile so that we can parse it later on defer f.Close() // read our opened jsonFile as a byte array. byteValue, _ := ioutil.ReadAll(f) var video *ytdl.YtdlVideo err = json.Unmarshal(byteValue, &video) if err != nil { return nil, errors.Err(err) } return video, nil } var errNotScraped = errors.Base("not yet scraped by caa.iti.gr") var errUploadTimeEmpty = errors.Base("upload time is empty") var errStatusParse = errors.Base("could not parse status, got number, need string") var errConnectionIssue = errors.Base("there was a connection issue with the api") func slack(format string, a ...interface{}) { fmt.Printf(format+"\n", a...) util.SendToSlack(format, a...) } func triggerScrape(videoID string, ip *net.TCPAddr) error { //slack("Triggering scrape for %s", videoID) u, err := url.Parse("https://caa.iti.gr/verify_videoV3") q := u.Query() q.Set("twtimeline", "0") q.Set("url", "https://www.youtube.com/watch?v="+videoID) u.RawQuery = q.Encode() //slack("GET %s", u.String()) client := getClient(ip) req, err := http.NewRequest(http.MethodGet, u.String(), nil) if err != nil { return errors.Err(err) } req.Header.Set("User-Agent", ChromeUA) res, err := client.Do(req) if err != nil { return errors.Err(err) } defer res.Body.Close() var response struct { Message string `json:"message"` Status string `json:"status"` VideoURL string `json:"video_url"` } err = json.NewDecoder(res.Body).Decode(&response) if err != nil { if strings.Contains(err.Error(), "cannot unmarshal number") { return errors.Err(errStatusParse) } if strings.Contains(err.Error(), "no route to host") { return errors.Err(errConnectionIssue) } return errors.Err(err) } switch response.Status { case "removed_video": return errors.Err("video previously removed from service") case "no_video": return errors.Err("they say 'video cannot be found'. wtf?") default: spew.Dump(response) } return nil //https://caa.iti.gr/caa/api/v4/videos/reports/h-tuxHS5lSM } func getUploadTime(config *sdk.APIConfig, videoID string, ip *net.TCPAddr, uploadDate string) (string, error) { //slack("Getting upload time for %s", videoID) release, err := config.GetReleasedDate(videoID) if err != nil { logrus.Error(err) } ytdlUploadDate, err := time.Parse("20060102", uploadDate) if err != nil { logrus.Error(err) } if release != nil { //const sqlTimeFormat = "2006-01-02 15:04:05" sqlTime, err := time.ParseInLocation(time.RFC3339, release.ReleaseTime, time.UTC) if err == nil { hoursDiff := math.Abs(sqlTime.Sub(ytdlUploadDate).Hours()) if hoursDiff > 48 { logrus.Infof("upload day from APIs differs from the ytdl one by more than 2 days.") } else { return sqlTime.Format(releaseTimeFormat), nil } } else { logrus.Error(err) } } return ytdlUploadDate.Format(releaseTimeFormat), nil } func getClient(ip *net.TCPAddr) *http.Client { if ip == nil { return http.DefaultClient } return &http.Client{ Transport: &http.Transport{ Proxy: http.ProxyFromEnvironment, DialContext: (&net.Dialer{ LocalAddr: ip, Timeout: 30 * time.Second, KeepAlive: 30 * time.Second, }).DialContext, MaxIdleConns: 100, IdleConnTimeout: 90 * time.Second, TLSHandshakeTimeout: 10 * time.Second, ExpectContinueTimeout: 1 * time.Second, }, } } const ( GoogleBotUA = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" ChromeUA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36" maxAttempts = 3 extractionError = "YouTube said: Unable to extract video data" throttledError = "HTTP Error 429" AlternateThrottledError = "returned non-zero exit status 8" youtubeDlError = "exit status 1" videoPremiereError = "Premieres in" liveEventError = "This live event will begin in" ) func run(use string, args []string, stopChan stop.Chan, pool *ip_manager.IPPool) ([]string, error) { var useragent []string var lastError error for attempts := 0; attempts < maxAttempts; attempts++ { sourceAddress, err := getIPFromPool(use, stopChan, pool) if err != nil { return nil, err } argsForCommand := append(args, "--source-address", sourceAddress) argsForCommand = append(argsForCommand, useragent...) binary := "yt-dlp" cmd := exec.Command(binary, argsForCommand...) res, err := runCmd(cmd, stopChan) pool.ReleaseIP(sourceAddress) if err == nil { return res, nil } lastError = err if strings.Contains(err.Error(), youtubeDlError) { if util.SubstringInSlice(err.Error(), shared.ErrorsNoRetry) { break } if strings.Contains(err.Error(), extractionError) { logrus.Warnf("known extraction error: %s", errors.FullTrace(err)) useragent = nextUA(useragent) } if strings.Contains(err.Error(), throttledError) || strings.Contains(err.Error(), AlternateThrottledError) { pool.SetThrottled(sourceAddress) //we don't want throttle errors to count toward the max retries attempts-- } } } return nil, lastError } func nextUA(current []string) []string { if len(current) == 0 { return []string{"--user-agent", GoogleBotUA} } return []string{"--user-agent", ChromeUA} } func runCmd(cmd *exec.Cmd, stopChan stop.Chan) ([]string, error) { logrus.Infof("running yt-dlp cmd: %s", strings.Join(cmd.Args, " ")) var err error stderr, err := cmd.StderrPipe() if err != nil { return nil, errors.Err(err) } stdout, err := cmd.StdoutPipe() if err != nil { return nil, errors.Err(err) } err = cmd.Start() if err != nil { return nil, errors.Err(err) } outLog, err := ioutil.ReadAll(stdout) if err != nil { return nil, errors.Err(err) } errorLog, err := ioutil.ReadAll(stderr) if err != nil { return nil, errors.Err(err) } done := make(chan error, 1) go func() { done <- cmd.Wait() }() select { case <-stopChan: err := cmd.Process.Kill() if err != nil { return nil, errors.Prefix("failed to kill command after stopper cancellation", err) } return nil, errors.Err("interrupted by user") case err := <-done: if err != nil { return nil, errors.Prefix("yt-dlp "+strings.Join(cmd.Args, " ")+" ["+string(errorLog)+"]", err) } return strings.Split(strings.Replace(string(outLog), "\r\n", "\n", -1), "\n"), nil } } func getIPFromPool(use string, stopChan stop.Chan, pool *ip_manager.IPPool) (sourceAddress string, err error) { for { sourceAddress, err = pool.GetIP(use) if err != nil { if errors.Is(err, ip_manager.ErrAllThrottled) { select { case <-stopChan: return "", errors.Err("interrupted by user") default: time.Sleep(ip_manager.IPCooldownPeriod) continue } } else { return "", err } } break } return }