separate uploader into reusable component

This commit is contained in:
Alex Grintsvayg 2019-07-02 12:13:20 -04:00
parent 825e699114
commit 38401bf8d6
No known key found for this signature in database
GPG key ID: AEB3F089F86A22B5
4 changed files with 270 additions and 243 deletions

View file

@ -1,45 +1,20 @@
package cmd package cmd
import ( import (
"encoding/json"
"io/ioutil"
"os" "os"
"os/signal" "os/signal"
"path"
"sync"
"syscall" "syscall"
"time"
"github.com/lbryio/reflector.go/db" "github.com/lbryio/reflector.go/db"
"github.com/lbryio/reflector.go/peer" "github.com/lbryio/reflector.go/reflector"
"github.com/lbryio/reflector.go/store" "github.com/lbryio/reflector.go/store"
"github.com/lbryio/lbry.go/extras/stop"
log "github.com/sirupsen/logrus"
"github.com/spf13/cobra" "github.com/spf13/cobra"
) )
var uploadWorkers int var uploadWorkers int
var uploadSkipExistsCheck bool var uploadSkipExistsCheck bool
const (
sdInc = 1
blobInc = 2
errInc = 3
)
type uploaderParams struct {
workerWG *sync.WaitGroup
counterWG *sync.WaitGroup
stopper *stop.Group
pathChan chan string
countChan chan int
sdCount int
blobCount int
errCount int
}
func init() { func init() {
var cmd = &cobra.Command{ var cmd = &cobra.Command{
Use: "upload PATH", Use: "upload PATH",
@ -53,216 +28,23 @@ func init() {
} }
func uploadCmd(cmd *cobra.Command, args []string) { func uploadCmd(cmd *cobra.Command, args []string) {
startTime := time.Now()
db := new(db.SQL) db := new(db.SQL)
err := db.Connect(globalConfig.DBConn) err := db.Connect(globalConfig.DBConn)
checkErr(err) checkErr(err)
params := uploaderParams{ st := store.NewDBBackedS3Store(
workerWG: &sync.WaitGroup{}, store.NewS3BlobStore(globalConfig.AwsID, globalConfig.AwsSecret, globalConfig.BucketRegion, globalConfig.BucketName),
counterWG: &sync.WaitGroup{}, db)
pathChan: make(chan string),
countChan: make(chan int),
stopper: stop.New()}
setInterrupt(params.stopper) uploader := reflector.NewUploader(db, st, uploadWorkers, uploadSkipExistsCheck)
paths, err := getPaths(args[0])
checkErr(err)
totalCount := len(paths)
hashes := make([]string, len(paths))
for i, p := range paths {
hashes[i] = path.Base(p)
}
log.Println("checking for existing blobs")
exists := make(map[string]bool)
if !uploadSkipExistsCheck {
exists, err = db.HasBlobs(hashes)
checkErr(err)
}
existsCount := len(exists)
log.Printf("%d new blobs to upload", totalCount-existsCount)
startUploadWorkers(&params)
params.counterWG.Add(1)
go func() {
defer params.counterWG.Done()
runCountReceiver(&params, startTime, totalCount, existsCount)
}()
Upload:
for _, f := range paths {
if exists[path.Base(f)] {
continue
}
select {
case params.pathChan <- f:
case <-params.stopper.Ch():
log.Warnln("Caught interrupt, quitting at first opportunity...")
break Upload
}
}
close(params.pathChan)
params.workerWG.Wait()
close(params.countChan)
params.counterWG.Wait()
params.stopper.Stop()
log.Println("SUMMARY")
log.Printf("%d blobs total", totalCount)
log.Printf("%d SD blobs uploaded", params.sdCount)
log.Printf("%d content blobs uploaded", params.blobCount)
log.Printf("%d blobs already stored", existsCount)
log.Printf("%d errors encountered", params.errCount)
}
func isJSON(data []byte) bool {
var js json.RawMessage
return json.Unmarshal(data, &js) == nil
}
func newBlobStore() *store.DBBackedS3Store {
db := new(db.SQL)
err := db.Connect(globalConfig.DBConn)
checkErr(err)
s3 := store.NewS3BlobStore(globalConfig.AwsID, globalConfig.AwsSecret, globalConfig.BucketRegion, globalConfig.BucketName)
return store.NewDBBackedS3Store(s3, db)
}
func setInterrupt(stopper *stop.Group) {
interruptChan := make(chan os.Signal, 1) interruptChan := make(chan os.Signal, 1)
signal.Notify(interruptChan, os.Interrupt, syscall.SIGTERM) signal.Notify(interruptChan, os.Interrupt, syscall.SIGTERM)
go func() { go func() {
<-interruptChan <-interruptChan
stopper.Stop() uploader.Stop()
}() }()
}
func startUploadWorkers(params *uploaderParams) { err = uploader.Upload(args[0])
for i := 0; i < uploadWorkers; i++ {
params.workerWG.Add(1)
go func(i int) {
defer params.workerWG.Done()
defer func(i int) {
log.Printf("worker %d quitting", i)
}(i)
blobStore := newBlobStore()
launchFileUploader(params, blobStore, i)
}(i)
}
}
func launchFileUploader(params *uploaderParams, blobStore *store.DBBackedS3Store, worker int) {
for {
select {
case <-params.stopper.Ch():
return
case filepath, ok := <-params.pathChan:
if !ok {
return
}
blob, err := ioutil.ReadFile(filepath)
checkErr(err) checkErr(err)
hash := peer.GetBlobHash(blob)
if hash != path.Base(filepath) {
log.Errorf("worker %d: file name does not match hash (%s != %s), skipping", worker, filepath, hash)
select {
case params.countChan <- errInc:
case <-params.stopper.Ch():
}
continue
}
if isJSON(blob) {
log.Printf("worker %d: PUTTING SD BLOB %s", worker, hash)
err := blobStore.PutSD(hash, blob)
if err != nil {
log.Error("PutSD Error: ", err)
}
select {
case params.countChan <- sdInc:
case <-params.stopper.Ch():
}
} else {
log.Printf("worker %d: putting %s", worker, hash)
err = blobStore.Put(hash, blob)
if err != nil {
log.Error("put Blob Error: ", err)
}
select {
case params.countChan <- blobInc:
case <-params.stopper.Ch():
}
}
}
}
}
func runCountReceiver(params *uploaderParams, startTime time.Time, totalCount int, existsCount int) {
for {
select {
case <-params.stopper.Ch():
return
case countType, ok := <-params.countChan:
if !ok {
return
}
switch countType {
case sdInc:
params.sdCount++
case blobInc:
params.blobCount++
case errInc:
params.errCount++
}
}
if (params.sdCount+params.blobCount)%50 == 0 {
log.Printf("%d of %d done (%s elapsed, %.3fs per blob)", params.sdCount+params.blobCount, totalCount-existsCount, time.Since(startTime).String(), time.Since(startTime).Seconds()/float64(params.sdCount+params.blobCount))
}
}
}
func getPaths(path string) ([]string, error) {
info, err := os.Stat(path)
if err != nil {
return nil, err
}
if info.Mode().IsRegular() {
return []string{path}, nil
}
f, err := os.Open(path)
if err != nil {
return nil, err
}
files, err := f.Readdir(-1)
if err != nil {
return nil, err
}
err = f.Close()
if err != nil {
return nil, err
}
var filenames []string
for _, file := range files {
if !file.IsDir() {
filenames = append(filenames, path+"/"+file.Name())
}
}
return filenames, nil
} }

View file

@ -2,8 +2,6 @@ package peer
import ( import (
"bufio" "bufio"
"crypto/sha512"
"encoding/hex"
"encoding/json" "encoding/json"
"io" "io"
"net" "net"
@ -215,7 +213,7 @@ func (s *Server) handleBlobRequest(data []byte) ([]byte, error) {
} }
response, err := json.Marshal(blobResponse{IncomingBlob: incomingBlob{ response, err := json.Marshal(blobResponse{IncomingBlob: incomingBlob{
BlobHash: GetBlobHash(blob), BlobHash: reflector.BlobHash(blob),
Length: len(blob), Length: len(blob),
}}) }})
if err != nil { if err != nil {
@ -268,7 +266,7 @@ func (s *Server) handleCompositeRequest(data []byte) ([]byte, error) {
return []byte{}, err return []byte{}, err
} else { } else {
response.IncomingBlob = incomingBlob{ response.IncomingBlob = incomingBlob{
BlobHash: GetBlobHash(blob), BlobHash: reflector.BlobHash(blob),
Length: len(blob), Length: len(blob),
} }
s.stats.AddBlob() s.stats.AddBlob()
@ -336,7 +334,7 @@ func readNextRequest(conn net.Conn) ([]byte, error) {
} }
// yes, this is how the peer protocol knows when the request finishes // yes, this is how the peer protocol knows when the request finishes
if isValidJSON(request) { if reflector.IsValidJSON(request) {
break break
} }
} }
@ -358,17 +356,6 @@ func readNextRequest(conn net.Conn) ([]byte, error) {
return request, nil return request, nil
} }
func isValidJSON(b []byte) bool {
var r json.RawMessage
return json.Unmarshal(b, &r) == nil
}
// GetBlobHash returns the sha512 hash hex encoded string of the blob byte slice.
func GetBlobHash(blob []byte) string {
hashBytes := sha512.Sum384(blob)
return hex.EncodeToString(hashBytes[:])
}
const ( const (
maxRequestSize = 64 * (2 ^ 10) // 64kb maxRequestSize = 64 * (2 ^ 10) // 64kb
paymentRateAccepted = "RATE_ACCEPTED" paymentRateAccepted = "RATE_ACCEPTED"

View file

@ -403,11 +403,17 @@ func (s *Server) quitting() bool {
} }
} }
// BlobHash returns the sha512 hash hex encoded string of the blob byte slice.
func BlobHash(blob []byte) string { func BlobHash(blob []byte) string {
hashBytes := sha512.Sum384(blob) hashBytes := sha512.Sum384(blob)
return hex.EncodeToString(hashBytes[:]) return hex.EncodeToString(hashBytes[:])
} }
// IsValidJSON reports whether b is a complete, syntactically valid JSON document.
func IsValidJSON(b []byte) bool {
	var raw json.RawMessage
	err := json.Unmarshal(b, &raw)
	return err == nil
}
//type errorResponse struct { //type errorResponse struct {
// Error string `json:"error"` // Error string `json:"error"`
//} //}

252
reflector/uploader.go Normal file
View file

@ -0,0 +1,252 @@
package reflector
import (
"io/ioutil"
"os"
"path"
"sync"
"time"
"github.com/lbryio/reflector.go/db"
"github.com/lbryio/reflector.go/store"
"github.com/lbryio/lbry.go/extras/errors"
"github.com/lbryio/lbry.go/extras/stop"
log "github.com/sirupsen/logrus"
)
// message types sent on Uploader.countChan so the counter goroutine knows
// what kind of result a worker produced
const (
	sdInc   = 1 // an SD blob was uploaded
	blobInc = 2 // a content blob was uploaded
	errInc  = 3 // an upload was skipped or failed
)
// Uploader uploads blob files to a db-backed S3 store using a pool of
// concurrent workers.
type Uploader struct {
	db              *db.SQL
	store           *store.DBBackedS3Store // could just be store.BlobStore interface
	workers         int                    // number of concurrent upload workers
	skipExistsCheck bool                   // if set, skip the "already stored?" db lookup before uploading
	stopper         *stop.Group            // signals early shutdown to workers and the counter
	countChan       chan int               // workers report sdInc/blobInc/errInc here

	// running tallies, written only by counter() and read for the final summary
	count struct {
		total, alreadyStored, sd, blob, err int
	}
}
// NewUploader returns an Uploader that uploads blobs to store using the given
// number of concurrent workers, tracking blob existence in db. When
// skipExistsCheck is true, the pre-upload "already stored?" lookup is skipped.
func NewUploader(db *db.SQL, store *store.DBBackedS3Store, workers int, skipExistsCheck bool) *Uploader {
	u := &Uploader{
		db:              db,
		store:           store,
		workers:         workers,
		skipExistsCheck: skipExistsCheck,
		stopper:         stop.New(),
		countChan:       make(chan int),
	}
	return u
}
// Stop interrupts a running Upload and blocks until in-flight work has
// wound down.
func (u *Uploader) Stop() {
	log.Infoln("stopping uploader")
	u.stopper.StopAndWait()
}
// Upload takes a path to a blob file or to a directory of blob files, skips
// the blobs the db already has (unless skipExistsCheck is set), and uploads
// the rest using u.workers concurrent workers. It blocks until all uploads
// finish or the uploader is stopped, then logs a summary of the counts.
func (u *Uploader) Upload(dirOrFilePath string) error {
	paths, err := getPaths(dirOrFilePath)
	if err != nil {
		return err
	}

	u.count.total = len(paths)

	// blob files are named after their hash, so the base name is the hash
	hashes := make([]string, len(paths))
	for i, p := range paths {
		hashes[i] = path.Base(p)
	}

	log.Infoln("checking for existing blobs")

	var exists map[string]bool
	if !u.skipExistsCheck {
		exists, err = u.db.HasBlobs(hashes)
		if err != nil {
			return err
		}
		u.count.alreadyStored = len(exists)
	}

	log.Infof("%d new blobs to upload", u.count.total-u.count.alreadyStored)

	// start the upload workers; they drain pathChan until it is closed
	workerWG := sync.WaitGroup{}
	pathChan := make(chan string)

	for i := 0; i < u.workers; i++ {
		workerWG.Add(1)
		go func(i int) {
			defer workerWG.Done()
			defer func(i int) { log.Debugf("worker %d quitting", i) }(i)
			u.worker(pathChan)
		}(i)
	}

	// start the counter goroutine, which tallies the results workers send
	// on countChan
	countWG := sync.WaitGroup{}
	countWG.Add(1)
	go func() {
		defer countWG.Done()
		u.counter()
	}()

Upload:
	for _, f := range paths {
		if exists != nil && exists[path.Base(f)] {
			continue // already stored; nothing to do
		}

		select {
		case pathChan <- f:
		case <-u.stopper.Ch():
			break Upload // stop requested; abandon the remaining paths
		}
	}

	// shutdown order matters: closing pathChan lets the workers drain and
	// exit, and only after they are all done is countChan closed, so the
	// counter sees every increment before it exits.
	close(pathChan)
	workerWG.Wait()
	close(u.countChan)
	countWG.Wait()
	u.stopper.Stop()

	log.Infoln("SUMMARY")
	log.Infof("%d blobs total", u.count.total)
	log.Infof("%d blobs already stored", u.count.alreadyStored)
	log.Infof("%d SD blobs uploaded", u.count.sd)
	log.Infof("%d content blobs uploaded", u.count.blob)
	log.Infof("%d errors encountered", u.count.err)

	return nil
}
// worker drains pathChan, uploading each file it receives, until the channel
// is closed or the uploader is stopped. Upload errors are logged, not fatal.
func (u *Uploader) worker(pathChan chan string) {
	for {
		select {
		case <-u.stopper.Ch():
			return
		case p, more := <-pathChan:
			if !more {
				return
			}
			if err := u.uploadBlob(p); err != nil {
				log.Errorln(err)
			}
		}
	}
}
// uploadBlob reads the blob file at filepath and uploads it to the store.
// Blobs containing valid JSON are treated as SD blobs and stored via PutSD;
// everything else goes through Put. A file whose name does not match the
// hash of its contents is counted as an error and skipped.
func (u *Uploader) uploadBlob(filepath string) error {
	blob, err := ioutil.ReadFile(filepath)
	if err != nil {
		return err
	}

	hash := BlobHash(blob)
	if hash != path.Base(filepath) {
		// count the mismatch before returning. previously the count send sat
		// after the return statement and was unreachable, so hash mismatches
		// were never reflected in count.err.
		select {
		case u.countChan <- errInc:
		case <-u.stopper.Ch():
		}
		return errors.Err("file name does not match hash (%s != %s), skipping", filepath, hash)
	}

	if IsValidJSON(blob) {
		log.Debugf("Uploading SD blob %s", hash)
		err := u.store.PutSD(hash, blob)
		if err != nil {
			return errors.Prefix("Uploading SD blob "+hash, err)
		}
		select {
		case u.countChan <- sdInc:
		case <-u.stopper.Ch():
		}
	} else {
		log.Debugf("Uploading blob %s", hash)
		err = u.store.Put(hash, blob)
		if err != nil {
			return errors.Prefix("Uploading blob "+hash, err)
		}
		select {
		case u.countChan <- blobInc:
		case <-u.stopper.Ch():
		}
	}

	return nil
}
// counter updates the counts of how many sd blobs and content blobs were
// uploaded, and how many errors were encountered. It prints the upload
// progress after every 50 successful uploads. It exits when countChan is
// closed or the uploader is stopped.
func (u *Uploader) counter() {
	start := time.Now()

	for {
		select {
		case <-u.stopper.Ch():
			return
		case countType, ok := <-u.countChan:
			if !ok {
				return
			}

			switch countType {
			case sdInc:
				u.count.sd++
			case blobInc:
				u.count.blob++
			case errInc:
				u.count.err++
			}
		}
		// guard done > 0: previously, if only errors had been counted,
		// 0%50 == 0 triggered a progress line that divided by zero
		if done := u.count.sd + u.count.blob; done > 0 && done%50 == 0 {
			log.Infof("%d of %d done (%s elapsed, %.3fs per blob)", done, u.count.total-u.count.alreadyStored, time.Since(start).String(), time.Since(start).Seconds()/float64(done))
		}
	}
}
// getPaths returns the paths for files to upload. it takes a path to a file
// or a dir. for a file, it returns the full path to that file. for a dir, it
// returns the paths for all the files directly in the dir (subdirectories
// are skipped, not recursed into).
func getPaths(dirOrFilePath string) ([]string, error) {
	info, err := os.Stat(dirOrFilePath)
	if err != nil {
		return nil, errors.Err(err)
	}

	if info.Mode().IsRegular() {
		return []string{dirOrFilePath}, nil
	}

	f, err := os.Open(dirOrFilePath)
	if err != nil {
		return nil, errors.Err(err)
	}
	// previously f was only closed on the success path, leaking the
	// descriptor whenever Readdir failed. defer guarantees the close.
	defer f.Close()

	files, err := f.Readdir(-1)
	if err != nil {
		return nil, errors.Err(err)
	}

	var filenames []string
	for _, file := range files {
		if !file.IsDir() {
			filenames = append(filenames, dirOrFilePath+"/"+file.Name())
		}
	}

	return filenames, nil
}