reflector.go/internal/metrics/metrics.go

320 lines
11 KiB
Go
Raw Normal View History

2019-12-29 02:42:03 +01:00
package metrics
import (
"context"
"encoding/json"
"errors"
"io"
"net/http"
"strings"
"syscall"
"time"
ee "github.com/lbryio/lbry.go/v2/extras/errors"
"github.com/lbryio/lbry.go/v2/extras/stop"
2019-12-29 02:42:03 +01:00
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/client_golang/prometheus/promhttp"
log "github.com/sirupsen/logrus"
)
// Server serves the Prometheus metrics endpoint over HTTP.
type Server struct {
	// srv is the underlying HTTP server configured by NewServer.
	srv *http.Server
	// stop tracks the serving goroutine launched by Start so that Shutdown
	// can wait on it.
	stop *stop.Stopper
}
// NewServer builds a metrics Server bound to address that serves the default
// Prometheus handler at path. Nothing listens until Start is called.
func NewServer(address string, path string) *Server {
	mux := http.NewServeMux()
	mux.Handle(path, promhttp.Handler())

	// For background on the timeout values see:
	// https://blog.cloudflare.com/the-complete-guide-to-golang-net-http-timeouts/
	// https://blog.cloudflare.com/exposing-go-on-the-internet/
	httpServer := &http.Server{
		Addr:         address,
		Handler:      mux,
		ReadTimeout:  5 * time.Second,
		WriteTimeout: 10 * time.Second,
		IdleTimeout:  120 * time.Second,
	}

	return &Server{
		srv:  httpServer,
		stop: stop.New(),
	}
}
// Start runs the HTTP listener in a background goroutine and returns
// immediately. The goroutine is registered with the stopper before it is
// launched and marks itself done when ListenAndServe returns. Any error other
// than http.ErrServerClosed is logged.
func (s *Server) Start() {
	s.stop.Add(1)
	go func() {
		defer s.stop.Done()

		serveErr := s.srv.ListenAndServe()
		if serveErr == nil || errors.Is(serveErr, http.ErrServerClosed) {
			// http.ErrServerClosed is the normal result of Shutdown.
			return
		}
		log.Error(serveErr)
	}()
}
func (s *Server) Shutdown() {
2020-07-01 00:14:51 +02:00
_ = s.srv.Shutdown(context.Background())
2019-12-29 02:42:03 +01:00
s.stop.StopAndWait()
}
2019-12-29 17:57:43 +01:00
const (
	// Prometheus namespace and subsystems used by the collectors in this package.
	ns             = "reflector"
	subsystemCache = "cache"
	subsystemITTT  = "ittt"

	// Label names attached to ErrorCount.
	labelDirection = "direction"
	labelErrorType = "error_type"

	// DirectionUpload is the direction label value for traffic going to reflector.
	DirectionUpload = "upload"
	// DirectionDownload is the direction label value for traffic coming from reflector.
	DirectionDownload = "download"

	// LabelCacheType is the label name carrying the cache type.
	LabelCacheType = "cache_type"
	// LabelComponent is the label name carrying the component.
	LabelComponent = "component"
	// LabelSource is the label name carrying the source.
	LabelSource = "source"

	// Values of the error_type label, assigned by TrackError.
	errConnReset         = "conn_reset"
	errReadConnReset     = "read_conn_reset"
	errWriteConnReset    = "write_conn_reset"
	errReadConnTimedOut  = "read_conn_timed_out"
	errNoNetworkActivity = "no_network_activity"
	errWriteConnTimedOut = "write_conn_timed_out"
	errWriteBrokenPipe   = "write_broken_pipe"
	errEPipe             = "e_pipe"
	errETimedout         = "e_timedout"
	errIOTimeout         = "io_timeout"
	errUnexpectedEOF     = "unexpected_eof"
	errUnexpectedEOFStr  = "unexpected_eof_str"
	errJSONSyntax        = "json_syntax"
	errBlobTooBig        = "blob_too_big"
	errDeadlineExceeded  = "deadline_exceeded"
	errHashMismatch      = "hash_mismatch"
	errZeroByteBlob      = "zero_byte_blob"
	errInvalidCharacter  = "invalid_character"
	errBlobNotFound      = "blob_not_found"
	errNoErr             = "no_error"
	errQuicProto         = "quic_protocol_violation"
	errOther             = "other"
)
2019-12-29 02:42:03 +01:00
var (
ErrorCount = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: ns,
Name: "error_total",
Help: "Total number of errors",
}, []string{labelDirection, labelErrorType})
2019-12-29 02:42:03 +01:00
BlobDownloadCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
2019-12-29 17:57:43 +01:00
Name: "blob_download_total",
2019-12-29 02:42:03 +01:00
Help: "Total number of blobs downloaded from reflector",
})
2020-07-09 04:28:34 +02:00
PeerDownloadCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
Name: "peer_download_total",
Help: "Total number of blobs downloaded from reflector through tcp protocol",
})
Http3DownloadCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
Name: "http3_blob_download_total",
Help: "Total number of blobs downloaded from reflector through QUIC protocol",
})
2021-05-21 19:09:02 +02:00
HttpDownloadCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
Name: "http_blob_download_total",
Help: "Total number of blobs downloaded from reflector through HTTP protocol",
})
CacheHitCount = promauto.NewCounterVec(prometheus.CounterOpts{
2020-07-09 04:28:34 +02:00
Namespace: ns,
2020-10-22 19:49:02 +02:00
Subsystem: subsystemCache,
Name: "hit_total",
2020-07-09 04:28:34 +02:00
Help: "Total number of blobs retrieved from the cache storage",
}, []string{LabelCacheType, LabelComponent})
ThisHitCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
Subsystem: subsystemITTT,
Name: "this_hit_total",
Help: "Total number of blobs retrieved from the this storage",
})
ThatHitCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
Subsystem: subsystemITTT,
Name: "that_hit_total",
Help: "Total number of blobs retrieved from the that storage",
})
CacheMissCount = promauto.NewCounterVec(prometheus.CounterOpts{
2020-07-09 04:28:34 +02:00
Namespace: ns,
2020-10-22 19:49:02 +02:00
Subsystem: subsystemCache,
Name: "miss_total",
2020-07-09 04:28:34 +02:00
Help: "Total number of blobs retrieved from origin rather than cache storage",
}, []string{LabelCacheType, LabelComponent})
CacheOriginRequestsCount = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: ns,
2020-10-22 19:49:02 +02:00
Subsystem: subsystemCache,
Name: "origin_requests_total",
Help: "How many Get requests are in flight from the cache to the origin",
}, []string{LabelCacheType, LabelComponent})
// during thundering-herd situations, the metric below should be a lot smaller than the metric above
CacheWaitingRequestsCount = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: ns,
2020-10-22 19:49:02 +02:00
Subsystem: subsystemCache,
Name: "waiting_requests_total",
Help: "How many cache requests are waiting for an in-flight origin request",
}, []string{LabelCacheType, LabelComponent})
2020-10-22 19:49:02 +02:00
CacheLRUEvictCount = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: ns,
Subsystem: subsystemCache,
Name: "evict_total",
Help: "Count of blobs evicted from cache",
}, []string{LabelCacheType, LabelComponent})
CacheRetrievalSpeed = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: ns,
Name: "speed_mbps",
Help: "Speed of blob retrieval from cache or from origin",
}, []string{LabelCacheType, LabelComponent, LabelSource})
2019-12-29 02:42:03 +01:00
BlobUploadCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
Name: "blob_upload_total",
Help: "Total number of blobs uploaded to reflector",
})
SDBlobUploadCount = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
Name: "sdblob_upload_total",
Help: "Total number of SD blobs (and therefore streams) uploaded to reflector",
})
MtrInBytesTcp = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
Name: "tcp_in_bytes",
Help: "Total number of bytes downloaded through TCP",
})
MtrOutBytesTcp = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
Name: "tcp_out_bytes",
Help: "Total number of bytes streamed out through TCP",
})
MtrInBytesUdp = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
Name: "udp_in_bytes",
Help: "Total number of bytes downloaded through UDP",
})
2021-05-21 05:49:02 +02:00
MtrInBytesHttp = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
Name: "http_in_bytes",
Help: "Total number of bytes downloaded through HTTP",
})
MtrOutBytesUdp = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
Name: "udp_out_bytes",
Help: "Total number of bytes streamed out through UDP",
})
2021-05-21 19:09:02 +02:00
MtrOutBytesHttp = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
Name: "http_out_bytes",
Help: "Total number of bytes streamed out through UDP",
})
MtrInBytesReflector = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
Name: "reflector_in_bytes",
Help: "Total number of incoming bytes (from users)",
})
MtrOutBytesReflector = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
Name: "s3_out_bytes",
Help: "Total number of outgoing bytes (to S3)",
})
MtrInBytesS3 = promauto.NewCounter(prometheus.CounterOpts{
Namespace: ns,
Name: "s3_in_bytes",
Help: "Total number of incoming bytes (from S3-CF)",
})
2021-04-06 20:00:36 +02:00
Http3BlobReqQueue = promauto.NewGauge(prometheus.GaugeOpts{
Namespace: ns,
Name: "http3_blob_request_queue_size",
Help: "Blob requests of https queue size",
})
RoutinesQueue = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: ns,
Name: "routines",
Help: "routines running by type",
}, []string{"package", "kind"})
2019-12-29 02:42:03 +01:00
)
// CacheLabels returns the label set shared by the cache metrics: name is
// stored under LabelCacheType and component under LabelComponent.
func CacheLabels(name, component string) prometheus.Labels {
	labels := make(prometheus.Labels, 2)
	labels[LabelCacheType] = name
	labels[LabelComponent] = component
	return labels
}
2019-12-29 17:57:43 +01:00
func TrackError(direction string, e error) (shouldLog bool) { // shouldLog is a hack, but whatever
2019-12-29 02:42:03 +01:00
if e == nil {
return
}
err := ee.Wrap(e, 0)
2019-12-29 17:57:43 +01:00
errType := errOther
2020-01-07 14:38:40 +01:00
if strings.Contains(err.Error(), "i/o timeout") {
2019-12-29 17:57:43 +01:00
errType = errIOTimeout
2019-12-29 02:42:03 +01:00
} else if errors.Is(e, syscall.ECONNRESET) {
2020-01-07 14:38:40 +01:00
// Looks like we're getting this when direction == "download", but read_conn_reset and
// write_conn_reset when its "upload"
2019-12-29 17:57:43 +01:00
errType = errConnReset
2020-01-03 16:27:29 +01:00
} else if errors.Is(e, context.DeadlineExceeded) {
errType = errDeadlineExceeded
2019-12-29 02:42:03 +01:00
} else if strings.Contains(err.Error(), "read: connection reset by peer") { // the other side closed the connection using TCP reset
2019-12-29 17:57:43 +01:00
errType = errReadConnReset
} else if strings.Contains(err.Error(), "write: connection reset by peer") { // the other side closed the connection using TCP reset
errType = errWriteConnReset
} else if errors.Is(e, syscall.ETIMEDOUT) {
2020-01-03 20:02:00 +01:00
errType = errETimedout
2019-12-29 17:57:43 +01:00
} else if strings.Contains(err.Error(), "read: connection timed out") { // the other side closed the connection using TCP reset
2020-01-03 20:02:00 +01:00
//log.Warnln("read conn timed out is not the same as ETIMEDOUT")
2019-12-29 17:57:43 +01:00
errType = errReadConnTimedOut
} else if strings.Contains(err.Error(), "NO_ERROR: No recent network activity") { // the other side closed the QUIC connection
//log.Warnln("read conn timed out is not the same as ETIMEDOUT")
errType = errNoNetworkActivity
2020-01-13 22:16:46 +01:00
} else if strings.Contains(err.Error(), "write: connection timed out") {
errType = errWriteConnTimedOut
2019-12-29 02:42:03 +01:00
} else if errors.Is(e, io.ErrUnexpectedEOF) {
2019-12-29 17:57:43 +01:00
errType = errUnexpectedEOF
2019-12-29 02:42:03 +01:00
} else if strings.Contains(err.Error(), "unexpected EOF") { // tried to read from closed pipe or socket
2020-01-03 16:27:29 +01:00
errType = errUnexpectedEOFStr
2019-12-29 02:42:03 +01:00
} else if errors.Is(e, syscall.EPIPE) {
2020-01-03 16:27:29 +01:00
errType = errEPipe
2019-12-29 02:42:03 +01:00
} else if strings.Contains(err.Error(), "write: broken pipe") { // tried to write to a pipe or socket that was closed by the peer
2020-01-03 18:28:01 +01:00
// I believe this is the same as EPipe when direction == "download", but not for upload
2019-12-29 17:57:43 +01:00
errType = errWriteBrokenPipe
2020-01-02 19:22:51 +01:00
//} else if errors.Is(e, reflector.ErrBlobTooBig) { # this creates a circular import
// errType = errBlobTooBig
2019-12-29 17:57:43 +01:00
} else if strings.Contains(err.Error(), "blob must be at most") {
2020-01-02 19:22:51 +01:00
//log.Warnln("blob must be at most X bytes is not the same as ErrBlobTooBig")
2019-12-29 17:57:43 +01:00
errType = errBlobTooBig
2020-01-07 14:38:40 +01:00
} else if strings.Contains(err.Error(), "hash of received blob data does not match hash from send request") {
errType = errHashMismatch
2020-07-01 20:10:32 +02:00
} else if strings.Contains(err.Error(), "blob not found") {
errType = errBlobNotFound
2020-01-13 22:16:46 +01:00
} else if strings.Contains(err.Error(), "0-byte blob received") {
errType = errZeroByteBlob
2020-10-06 16:17:50 +02:00
} else if strings.Contains(err.Error(), "PROTOCOL_VIOLATION: tried to retire connection") {
2020-10-06 16:11:36 +02:00
errType = errQuicProto
2020-01-13 22:16:46 +01:00
} else if strings.Contains(err.Error(), "invalid character") {
errType = errInvalidCharacter
2019-12-29 17:57:43 +01:00
} else if _, ok := e.(*json.SyntaxError); ok {
errType = errJSONSyntax
} else if strings.Contains(err.Error(), "NO_ERROR") {
errType = errNoErr
2019-12-29 02:42:03 +01:00
} else {
2020-01-03 18:28:01 +01:00
log.Warnf("error '%s' for direction '%s' is not being tracked", err.TypeName(), direction)
2019-12-29 02:42:03 +01:00
shouldLog = true
}
2019-12-29 17:57:43 +01:00
ErrorCount.With(map[string]string{
labelDirection: direction,
labelErrorType: errType,
}).Inc()
2019-12-29 02:42:03 +01:00
return
}