From 5d8a2d697ce2bb13f655a7d445f0a6345943e7e8 Mon Sep 17 00:00:00 2001 From: Alex Grintsvayg Date: Sun, 29 Dec 2019 11:57:43 -0500 Subject: [PATCH] use labels for different error types --- internal/metrics/metrics.go | 99 ++++++++++++++++++++----------------- peer/server.go | 2 +- reflector/server.go | 7 +-- 3 files changed, 60 insertions(+), 48 deletions(-) diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 96cc6d9..89aed11 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -10,6 +10,8 @@ import ( "syscall" "time" + "github.com/lbryio/reflector.go/reflector" + ee "github.com/lbryio/lbry.go/v2/extras/errors" "github.com/lbryio/lbry.go/v2/extras/stop" @@ -57,12 +59,31 @@ func (s *Server) Shutdown() { s.stop.StopAndWait() } -const ns = "reflector" +const ( + ns = "reflector" + + labelDirection = "direction" + labelErrorType = "error_type" + + DirectionUpload = "upload" // to reflector + DirectionDownload = "download" // from reflector + + errConnReset = "conn_reset" + errReadConnReset = "read_conn_reset" + errWriteConnReset = "write_conn_reset" + errReadConnTimedOut = "read_conn_timed_out" + errWriteBrokenPipe = "write_broken_pipe" + errIOTimeout = "io_timeout" + errUnexpectedEOF = "unexpected_eof" + errJSONSyntax = "json_syntax" + errBlobTooBig = "blob_too_big" + errOther = "other" +) var ( BlobDownloadCount = promauto.NewCounter(prometheus.CounterOpts{ Namespace: ns, - Name: "download_total", + Name: "blob_download_total", Help: "Total number of blobs downloaded from reflector", }) BlobUploadCount = promauto.NewCounter(prometheus.CounterOpts{ @@ -75,74 +96,64 @@ var ( Name: "sdblob_upload_total", Help: "Total number of SD blobs (and therefore streams) uploaded to reflector", }) - ErrorCount = promauto.NewCounter(prometheus.CounterOpts{ + ErrorCount = promauto.NewCounterVec(prometheus.CounterOpts{ Namespace: ns, Name: "error_total", Help: "Total number of errors", - }) - IOTimeoutCount = 
promauto.NewCounter(prometheus.CounterOpts{ - Namespace: ns, - Name: "error_io_timeout_total", - Help: "Total number of 'i/o timeout' errors", - }) - ReadConnResetCount = promauto.NewCounter(prometheus.CounterOpts{ - Namespace: ns, - Name: "error_read_conn_reset_total", - Help: "Total number of 'read: connection reset by peer' errors", - }) - UnexpectedEOFCount = promauto.NewCounter(prometheus.CounterOpts{ - Namespace: ns, - Name: "error_unexpected_eof_total", - Help: "Total number of 'unexpected EOF' errors", - }) - BrokenPipeCount = promauto.NewCounter(prometheus.CounterOpts{ - Namespace: ns, - Name: "error_broken_pipe_total", - Help: "Total number of 'write: broken pipe' errors", - }) - JSONSyntaxErrorCount = promauto.NewCounter(prometheus.CounterOpts{ - Namespace: ns, - Name: "error_json_syntax_total", - Help: "Total number of JSON syntax errors", - }) + }, []string{labelDirection, labelErrorType}) ) -func TrackError(e error) (shouldLog bool) { // shouldLog is a hack, but whatever +func TrackError(direction string, e error) (shouldLog bool) { // shouldLog is a hack, but whatever if e == nil { return } - ErrorCount.Inc() - err := ee.Wrap(e, 0) + errType := errOther //name := err.TypeName() if errors.Is(e, context.DeadlineExceeded) { - IOTimeoutCount.Inc() + errType = errIOTimeout } else if strings.Contains(err.Error(), "i/o timeout") { // hit a read or write deadline log.Warnln("i/o timeout is not the same as context.DeadlineExceeded") - IOTimeoutCount.Inc() + errType = errIOTimeout } else if errors.Is(e, syscall.ECONNRESET) { - ReadConnResetCount.Inc() + errType = errConnReset } else if strings.Contains(err.Error(), "read: connection reset by peer") { // the other side closed the connection using TCP reset - log.Warnln("conn reset by peer is not the same as ECONNRESET") - ReadConnResetCount.Inc() + log.Warnln("read conn reset by peer is not the same as ECONNRESET") + errType = errReadConnReset + } else if strings.Contains(err.Error(), "write: connection reset 
by peer") { // the other side closed the connection using TCP reset + log.Warnln("write conn reset by peer is not the same as ECONNRESET") + errType = errWriteConnReset + } else if errors.Is(e, syscall.ETIMEDOUT) { + errType = errReadConnTimedOut + } else if strings.Contains(err.Error(), "read: connection timed out") { // the connection timed out while reading from the peer + log.Warnln("read conn timed out is not the same as ETIMEDOUT") + errType = errReadConnTimedOut } else if errors.Is(e, io.ErrUnexpectedEOF) { - UnexpectedEOFCount.Inc() + errType = errUnexpectedEOF } else if strings.Contains(err.Error(), "unexpected EOF") { // tried to read from closed pipe or socket log.Warnln("unexpected eof is not the same as io.ErrUnexpectedEOF") - UnexpectedEOFCount.Inc() + errType = errUnexpectedEOF } else if errors.Is(e, syscall.EPIPE) { - BrokenPipeCount.Inc() + errType = errWriteBrokenPipe } else if strings.Contains(err.Error(), "write: broken pipe") { // tried to write to a pipe or socket that was closed by the peer log.Warnln("broken pipe is not the same as EPIPE") - BrokenPipeCount.Inc() + errType = errWriteBrokenPipe + } else if errors.Is(e, reflector.ErrBlobTooBig) { + errType = errBlobTooBig + } else if strings.Contains(err.Error(), "blob must be at most") { + log.Warnln("blob must be at most X bytes is not the same as ErrBlobTooBig") + errType = errBlobTooBig + } else if _, ok := e.(*json.SyntaxError); ok { + errType = errJSONSyntax } else { shouldLog = true } - if _, ok := e.(*json.SyntaxError); ok { - JSONSyntaxErrorCount.Inc() - } + ErrorCount.With(map[string]string{ + labelDirection: direction, + labelErrorType: errType, + }).Inc() return } diff --git a/peer/server.go b/peer/server.go index 7d39f1c..3c6556a 100644 --- a/peer/server.go +++ b/peer/server.go @@ -283,7 +283,7 @@ func (s *Server) logError(e error) { if e == nil { return } - shouldLog := metrics.TrackError(e) + shouldLog := metrics.TrackError(metrics.DirectionDownload, e) if shouldLog { 
log.Errorln(errors.FullTrace(e)) } diff --git a/reflector/server.go b/reflector/server.go index 01f6395..d305c5c 100644 --- a/reflector/server.go +++ b/reflector/server.go @@ -8,7 +8,6 @@ import ( "io" "io/ioutil" "net" - "strconv" "time" "github.com/lbryio/reflector.go/internal/metrics" @@ -33,6 +32,8 @@ const ( maxBlobSize = stream.MaxBlobSize ) +var ErrBlobTooBig = errors.Base("blob must be at most %d bytes", maxBlobSize) + // Server is and instance of the reflector server. It houses the blob store and listener. type Server struct { Timeout time.Duration // timeout to read or write next message @@ -167,7 +168,7 @@ func (s *Server) doError(conn net.Conn, err error) error { if err == nil { return nil } - shouldLog := metrics.TrackError(err) + shouldLog := metrics.TrackError(metrics.DirectionUpload, err) if shouldLog { log.Errorln(errors.FullTrace(err)) } @@ -305,7 +306,7 @@ func (s *Server) readBlobRequest(conn net.Conn) (int, string, bool, error) { return blobSize, blobHash, isSdBlob, errors.Err("blob hash is empty") } if blobSize > maxBlobSize { - return blobSize, blobHash, isSdBlob, errors.Err("blob must be at most " + strconv.Itoa(maxBlobSize) + " bytes") + return blobSize, blobHash, isSdBlob, errors.Err(ErrBlobTooBig) } if blobSize == 0 { return blobSize, blobHash, isSdBlob, errors.Err("0-byte blob received")