optimize batch insertions

reduce touch time to every 6 hours
This commit is contained in:
Niko Storni 2021-01-07 01:26:30 +01:00
parent 6f95b3395f
commit 74925ebba2

View file

@ -3,18 +3,22 @@ package db
import ( import (
"context" "context"
"database/sql" "database/sql"
"fmt"
"runtime"
"strings" "strings"
"time" "time"
"github.com/lbryio/lbry.go/v2/dht/bits" "github.com/lbryio/lbry.go/v2/dht/bits"
"github.com/lbryio/lbry.go/v2/extras/errors" "github.com/lbryio/lbry.go/v2/extras/errors"
qt "github.com/lbryio/lbry.go/v2/extras/query" qt "github.com/lbryio/lbry.go/v2/extras/query"
"github.com/lbryio/lbry.go/v2/extras/stop"
"github.com/lbryio/lbry.go/v2/stream" "github.com/lbryio/lbry.go/v2/stream"
"github.com/go-sql-driver/mysql" "github.com/go-sql-driver/mysql"
_ "github.com/go-sql-driver/mysql" // blank import for db driver ensures its imported even if its not used _ "github.com/go-sql-driver/mysql" // blank import for db driver ensures its imported even if its not used
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"github.com/volatiletech/null" "github.com/volatiletech/null"
"go.uber.org/atomic"
) )
// SdBlob is a special blob that contains information on the rest of the blobs in the stream // SdBlob is a special blob that contains information on the rest of the blobs in the stream
@ -104,33 +108,62 @@ func (s *SQL) AddBlobs(hash []string) error {
} }
// Split the slice into batches of 20 items. // Split the slice into batches of 20 items.
batch := 10000 batch := 10000
totalBlobs := int64(len(hash))
work := make(chan []string, 1000)
stopper := stop.New()
var totalInserted atomic.Int64
start := time.Now()
go func() {
for i := 0; i < len(hash); i += batch { for i := 0; i < len(hash); i += batch {
j := i + batch j := i + batch
if j > len(hash) { if j > len(hash) {
j = len(hash) j = len(hash)
} }
err := s.insertBlobs(hash[i:j]) // Process the batch. work <- hash[i:j]
}
log.Infof("done loading %d hashes in the work queue", len(hash))
close(work)
}()
for i := 0; i < runtime.NumCPU(); i++ {
stopper.Add(1)
go func(worker int) {
log.Infof("starting worker %d", worker)
defer stopper.Done()
for hashes := range work {
inserted := totalInserted.Load()
remaining := totalBlobs - inserted
if inserted > 0 {
timePerBlob := time.Since(start).Microseconds() / inserted
remainingTime := time.Duration(remaining*timePerBlob) * time.Microsecond
log.Infof("[T%d] processing batch of %d items. ETA: %s", worker, len(hashes), remainingTime.String())
}
err := s.insertBlobs(hashes) // Process the batch.
if err != nil { if err != nil {
log.Errorf("error while inserting batch: %s", errors.FullTrace(err)) log.Errorf("error while inserting batch: %s", errors.FullTrace(err))
} }
totalInserted.Add(int64(len(hashes)))
} }
}(i)
}
stopper.Wait()
return nil return nil
} }
func (s *SQL) insertBlobs(hashes []string) error { func (s *SQL) insertBlobs(hashes []string) error {
var ( var (
q string q string
args []interface{} //args []interface{}
) )
dayAgo := time.Now().AddDate(0, 0, -1) dayAgo := time.Now().AddDate(0, 0, -1).Format("2006-01-02 15:04:05")
q = "insert into blob_ (hash, is_stored, length, last_accessed_at) values " q = "insert into blob_ (hash, is_stored, length, last_accessed_at) values "
for _, hash := range hashes { for _, hash := range hashes {
q += "(?,?,?,?)," // prepared statements slow everything down by a lot due to reflection
args = append(args, hash, true, stream.MaxBlobSize, dayAgo) // for this specific instance we'll go ahead and hardcode the query to make it go faster
q += fmt.Sprintf("('%s',1,%d,'%s'),", hash, stream.MaxBlobSize, dayAgo)
//args = append(args, hash, true, stream.MaxBlobSize, dayAgo)
} }
q = strings.TrimSuffix(q, ",") q = strings.TrimSuffix(q, ",")
_, err := s.exec(q, args...) _, err := s.exec(q)
if err != nil { if err != nil {
return err return err
} }
@ -292,7 +325,7 @@ func (s *SQL) hasBlobs(hashes []string) (map[string]bool, []uint64, error) {
var needsTouch []uint64 var needsTouch []uint64
exists := make(map[string]bool) exists := make(map[string]bool)
touchDeadline := time.Now().AddDate(0, 0, -1) // touch blob if last accessed before this time touchDeadline := time.Now().Add(-6 * time.Hour) // touch blob if last accessed before this time
maxBatchSize := 10000 maxBatchSize := 10000
doneIndex := 0 doneIndex := 0