move wallet server prometheus

Only run wallet server metrics for the wallet server
This commit is contained in:
Jack Robison 2020-04-23 20:21:27 -04:00
parent 79624febc0
commit 36c05fc4b9
No known key found for this signature in database
GPG key ID: DF25C68FE0239BB2
8 changed files with 181 additions and 111 deletions

32
lbry/prometheus.py Normal file
View file

@ -0,0 +1,32 @@
import logging
from aiohttp import web
from prometheus_client import generate_latest as prom_generate_latest
class PrometheusServer:
    """Expose prometheus_client metrics over HTTP (GET /metrics) via aiohttp."""

    def __init__(self, logger=None):
        # The aiohttp AppRunner is created lazily in start(); None until then.
        self.runner = None
        self.logger = logger or logging.getLogger(__name__)

    async def start(self, interface: str, port: int):
        """Bind an aiohttp site on interface:port serving the /metrics endpoint."""
        prom_app = web.Application()
        prom_app.router.add_get('/metrics', self.handle_metrics_get_request)
        self.runner = web.AppRunner(prom_app)
        await self.runner.setup()
        metrics_site = web.TCPSite(self.runner, interface, port, shutdown_timeout=.5)
        await metrics_site.start()
        # NOTE(review): reaches into the private _server attribute only to log
        # the actually-bound address (useful when port is 0).
        self.logger.info('metrics server listening on %s:%i', *metrics_site._server.sockets[0].getsockname()[:2])

    async def handle_metrics_get_request(self, request: web.Request):
        """Render current metrics in the Prometheus text exposition format (0.0.4)."""
        try:
            return web.Response(
                text=prom_generate_latest().decode(),
                content_type='text/plain; version=0.0.4'
            )
        except Exception:
            # Log with traceback, then re-raise so aiohttp returns a 500.
            self.logger.exception('could not generate prometheus data')
            raise

    async def stop(self):
        """Tear down the aiohttp runner.

        Fix: guard against self.runner being None so stop() is safe to call
        even when start() was never invoked (previously raised AttributeError).
        """
        if self.runner is not None:
            await self.runner.cleanup()

View file

@ -39,7 +39,7 @@ from lbry.wallet.tasks import TaskGroup
from .jsonrpc import Request, JSONRPCConnection, JSONRPCv2, JSONRPC, Batch, Notification
from .jsonrpc import RPCError, ProtocolError
from .framing import BadMagicError, BadChecksumError, OversizedPayloadError, BitcoinFramer, NewlineFramer
from lbry.wallet.server.prometheus import NOTIFICATION_COUNT, RESPONSE_TIMES, REQUEST_ERRORS_COUNT, RESET_CONNECTIONS
from lbry.wallet.server import prometheus
class Connector:
@ -388,7 +388,7 @@ class RPCSession(SessionBase):
except MemoryError:
self.logger.warning('received oversized message from %s:%s, dropping connection',
self._address[0], self._address[1])
RESET_CONNECTIONS.labels(version=self.client_version).inc()
prometheus.METRICS.RESET_CONNECTIONS.labels(version=self.client_version).inc()
self._close()
return
@ -422,7 +422,7 @@ class RPCSession(SessionBase):
'internal server error')
if isinstance(request, Request):
message = request.send_result(result)
RESPONSE_TIMES.labels(
prometheus.METRICS.RESPONSE_TIMES.labels(
method=request.method,
version=self.client_version
).observe(time.perf_counter() - start)
@ -430,7 +430,7 @@ class RPCSession(SessionBase):
await self._send_message(message)
if isinstance(result, Exception):
self._bump_errors()
REQUEST_ERRORS_COUNT.labels(
prometheus.METRICS.REQUEST_ERRORS_COUNT.labels(
method=request.method,
version=self.client_version
).inc()
@ -467,7 +467,7 @@ class RPCSession(SessionBase):
async def send_notification(self, method, args=()):
"""Send an RPC notification over the network."""
message = self.connection.send_notification(Notification(method, args))
NOTIFICATION_COUNT.labels(method=method, version=self.client_version).inc()
prometheus.METRICS.NOTIFICATION_COUNT.labels(method=method, version=self.client_version).inc()
await self._send_message(message)
def send_batch(self, raise_errors=False):

View file

@ -10,7 +10,7 @@ from lbry.wallet.server.daemon import DaemonError
from lbry.wallet.server.hash import hash_to_hex_str, HASHX_LEN
from lbry.wallet.server.util import chunks, class_logger
from lbry.wallet.server.leveldb import FlushData
from lbry.wallet.server.prometheus import BLOCK_COUNT, BLOCK_UPDATE_TIMES, REORG_COUNT
from lbry.wallet.server import prometheus
class Prefetcher:
@ -199,8 +199,8 @@ class BlockProcessor:
cache.clear()
await self._maybe_flush()
processed_time = time.perf_counter() - start
BLOCK_COUNT.set(self.height)
BLOCK_UPDATE_TIMES.observe(processed_time)
prometheus.METRICS.BLOCK_COUNT.set(self.height)
prometheus.METRICS.BLOCK_UPDATE_TIMES.observe(processed_time)
if not self.db.first_sync:
s = '' if len(blocks) == 1 else 's'
self.logger.info('processed {:,d} block{} in {:.1f}s'.format(len(blocks), s, processed_time))
@ -255,7 +255,7 @@ class BlockProcessor:
last -= len(raw_blocks)
await self.run_in_thread_with_lock(self.db.sql.delete_claims_above_height, self.height)
await self.prefetcher.reset_height(self.height)
REORG_COUNT.inc()
prometheus.METRICS.REORG_COUNT.inc()
async def reorg_hashes(self, count):
"""Return a pair (start, last, hashes) of blocks to back up during a

View file

@ -10,7 +10,8 @@ import aiohttp
from lbry.wallet.rpc.jsonrpc import RPCError
from lbry.wallet.server.util import hex_to_bytes, class_logger
from lbry.wallet.rpc import JSONRPC
from lbry.wallet.server.prometheus import LBRYCRD_REQUEST_TIMES, LBRYCRD_PENDING_COUNT
from lbry.wallet.server import prometheus
class DaemonError(Exception):
"""Raised when the daemon returns an error in its results."""
@ -129,7 +130,7 @@ class Daemon:
while True:
try:
for method in methods:
LBRYCRD_PENDING_COUNT.labels(method=method).inc()
prometheus.METRICS.LBRYCRD_PENDING_COUNT.labels(method=method).inc()
result = await self._send_data(data)
result = processor(result)
if on_good_message:
@ -154,7 +155,7 @@ class Daemon:
on_good_message = 'running normally'
finally:
for method in methods:
LBRYCRD_PENDING_COUNT.labels(method=method).dec()
prometheus.METRICS.LBRYCRD_PENDING_COUNT.labels(method=method).dec()
await asyncio.sleep(retry)
retry = max(min(self.max_retry, retry * 2), self.init_retry)
@ -175,7 +176,7 @@ class Daemon:
if params:
payload['params'] = params
result = await self._send(payload, processor)
LBRYCRD_REQUEST_TIMES.labels(method=method).observe(time.perf_counter() - start)
prometheus.METRICS.LBRYCRD_REQUEST_TIMES.labels(method=method).observe(time.perf_counter() - start)
return result
async def _send_vector(self, method, params_iterable, replace_errs=False):
@ -200,7 +201,7 @@ class Daemon:
result = []
if payload:
result = await self._send(payload, processor)
LBRYCRD_REQUEST_TIMES.labels(method=method).observe(time.perf_counter()-start)
prometheus.METRICS.LBRYCRD_REQUEST_TIMES.labels(method=method).observe(time.perf_counter()-start)
return result
async def _is_rpc_available(self, method):

View file

@ -1,89 +1,125 @@
import os
from aiohttp import web
from prometheus_client import Counter, Info, generate_latest as prom_generate_latest, Histogram, Gauge
from prometheus_client import Counter, Info, Histogram, Gauge
from lbry import __version__ as version
from lbry.build_info import BUILD, COMMIT_HASH, DOCKER_TAG
from lbry.wallet.server import util
import lbry.wallet.server.version as wallet_server_version
NAMESPACE = "wallet_server"
CPU_COUNT = f"{os.cpu_count()}"
VERSION_INFO = Info('build', 'Wallet server build info (e.g. version, commit hash)', namespace=NAMESPACE)
VERSION_INFO.info({
'build': BUILD,
"commit": COMMIT_HASH,
"docker_tag": DOCKER_TAG,
'version': version,
"min_version": util.version_string(wallet_server_version.PROTOCOL_MIN),
"cpu_count": CPU_COUNT
})
SESSIONS_COUNT = Gauge("session_count", "Number of connected client sessions", namespace=NAMESPACE,
labelnames=("version", ))
REQUESTS_COUNT = Counter("requests_count", "Number of requests received", namespace=NAMESPACE,
labelnames=("method", "version"))
RESPONSE_TIMES = Histogram("response_time", "Response times", namespace=NAMESPACE, labelnames=("method", "version"))
NOTIFICATION_COUNT = Counter("notification", "Number of notifications sent (for subscriptions)",
namespace=NAMESPACE, labelnames=("method", "version"))
REQUEST_ERRORS_COUNT = Counter("request_error", "Number of requests that returned errors", namespace=NAMESPACE,
labelnames=("method", "version"))
SQLITE_INTERRUPT_COUNT = Counter("interrupt", "Number of interrupted queries", namespace=NAMESPACE)
SQLITE_OPERATIONAL_ERROR_COUNT = Counter(
"operational_error", "Number of queries that raised operational errors", namespace=NAMESPACE
)
SQLITE_INTERNAL_ERROR_COUNT = Counter(
"internal_error", "Number of queries raising unexpected errors", namespace=NAMESPACE
)
SQLITE_EXECUTOR_TIMES = Histogram("executor_time", "SQLite executor times", namespace=NAMESPACE)
SQLITE_PENDING_COUNT = Gauge(
"pending_queries_count", "Number of pending and running sqlite queries", namespace=NAMESPACE
)
LBRYCRD_REQUEST_TIMES = Histogram(
"lbrycrd_request", "lbrycrd requests count", namespace=NAMESPACE, labelnames=("method",)
)
LBRYCRD_PENDING_COUNT = Gauge(
"lbrycrd_pending_count", "Number of lbrycrd rpcs that are in flight", namespace=NAMESPACE, labelnames=("method",)
)
CLIENT_VERSIONS = Counter(
"clients", "Number of connections received per client version",
namespace=NAMESPACE, labelnames=("version",)
)
BLOCK_COUNT = Gauge(
"block_count", "Number of processed blocks", namespace=NAMESPACE
)
BLOCK_UPDATE_TIMES = Histogram("block_time", "Block update times", namespace=NAMESPACE)
REORG_COUNT = Gauge(
"reorg_count", "Number of reorgs", namespace=NAMESPACE
)
RESET_CONNECTIONS = Counter(
"reset_clients", "Number of reset connections by client version",
namespace=NAMESPACE, labelnames=("version",)
)
class PrometheusMetrics:
VERSION_INFO: Info
SESSIONS_COUNT: Gauge
REQUESTS_COUNT: Counter
RESPONSE_TIMES: Histogram
NOTIFICATION_COUNT: Counter
REQUEST_ERRORS_COUNT: Counter
SQLITE_INTERRUPT_COUNT: Counter
SQLITE_OPERATIONAL_ERROR_COUNT: Counter
SQLITE_INTERNAL_ERROR_COUNT: Counter
SQLITE_EXECUTOR_TIMES: Histogram
SQLITE_PENDING_COUNT: Gauge
LBRYCRD_REQUEST_TIMES: Histogram
LBRYCRD_PENDING_COUNT: Gauge
CLIENT_VERSIONS: Counter
BLOCK_COUNT: Gauge
BLOCK_UPDATE_TIMES: Histogram
REORG_COUNT: Gauge
RESET_CONNECTIONS: Counter
__slots__ = [
'VERSION_INFO',
'SESSIONS_COUNT',
'REQUESTS_COUNT',
'RESPONSE_TIMES',
'NOTIFICATION_COUNT',
'REQUEST_ERRORS_COUNT',
'SQLITE_INTERRUPT_COUNT',
'SQLITE_OPERATIONAL_ERROR_COUNT',
'SQLITE_INTERNAL_ERROR_COUNT',
'SQLITE_EXECUTOR_TIMES',
'SQLITE_PENDING_COUNT',
'LBRYCRD_REQUEST_TIMES',
'LBRYCRD_PENDING_COUNT',
'CLIENT_VERSIONS',
'BLOCK_COUNT',
'BLOCK_UPDATE_TIMES',
'REORG_COUNT',
'RESET_CONNECTIONS',
'_installed',
'namespace',
'cpu_count'
]
class PrometheusServer:
def __init__(self):
self.logger = util.class_logger(__name__, self.__class__.__name__)
self.runner = None
self._installed = False
self.namespace = "wallet_server"
self.cpu_count = f"{os.cpu_count()}"
async def start(self, port: int):
prom_app = web.Application()
prom_app.router.add_get('/metrics', self.handle_metrics_get_request)
self.runner = web.AppRunner(prom_app)
await self.runner.setup()
def uninstall(self):
self._installed = False
for item in self.__slots__:
if not item.startswith('_') and item not in ('namespace', 'cpu_count'):
current = getattr(self, item, None)
if current:
setattr(self, item, None)
del current
metrics_site = web.TCPSite(self.runner, "0.0.0.0", port, shutdown_timeout=.5)
await metrics_site.start()
self.logger.info('metrics server listening on %s:%i', *metrics_site._server.sockets[0].getsockname()[:2])
def install(self):
if self._installed:
return
self._installed = True
self.VERSION_INFO = Info('build', 'Wallet server build info (e.g. version, commit hash)', namespace=self.namespace)
self.VERSION_INFO.info({
'build': BUILD,
"commit": COMMIT_HASH,
"docker_tag": DOCKER_TAG,
'version': version,
"min_version": util.version_string(wallet_server_version.PROTOCOL_MIN),
"cpu_count": self.cpu_count
})
self.SESSIONS_COUNT = Gauge("session_count", "Number of connected client sessions", namespace=self.namespace,
labelnames=("version",))
self.REQUESTS_COUNT = Counter("requests_count", "Number of requests received", namespace=self.namespace,
labelnames=("method", "version"))
self.RESPONSE_TIMES = Histogram("response_time", "Response times", namespace=self.namespace,
labelnames=("method", "version"))
self.NOTIFICATION_COUNT = Counter("notification", "Number of notifications sent (for subscriptions)",
namespace=self.namespace, labelnames=("method", "version"))
self.REQUEST_ERRORS_COUNT = Counter("request_error", "Number of requests that returned errors", namespace=self.namespace,
labelnames=("method", "version"))
self.SQLITE_INTERRUPT_COUNT = Counter("interrupt", "Number of interrupted queries", namespace=self.namespace)
self.SQLITE_OPERATIONAL_ERROR_COUNT = Counter(
"operational_error", "Number of queries that raised operational errors", namespace=self.namespace
)
self.SQLITE_INTERNAL_ERROR_COUNT = Counter(
"internal_error", "Number of queries raising unexpected errors", namespace=self.namespace
)
self.SQLITE_EXECUTOR_TIMES = Histogram("executor_time", "SQLite executor times", namespace=self.namespace)
self.SQLITE_PENDING_COUNT = Gauge(
"pending_queries_count", "Number of pending and running sqlite queries", namespace=self.namespace
)
self.LBRYCRD_REQUEST_TIMES = Histogram(
"lbrycrd_request", "lbrycrd requests count", namespace=self.namespace, labelnames=("method",)
)
self.LBRYCRD_PENDING_COUNT = Gauge(
"lbrycrd_pending_count", "Number of lbrycrd rpcs that are in flight", namespace=self.namespace,
labelnames=("method",)
)
self.CLIENT_VERSIONS = Counter(
"clients", "Number of connections received per client version",
namespace=self.namespace, labelnames=("version",)
)
self.BLOCK_COUNT = Gauge(
"block_count", "Number of processed blocks", namespace=self.namespace
)
self.BLOCK_UPDATE_TIMES = Histogram("block_time", "Block update times", namespace=self.namespace)
self.REORG_COUNT = Gauge(
"reorg_count", "Number of reorgs", namespace=self.namespace
)
self.RESET_CONNECTIONS = Counter(
"reset_clients", "Number of reset connections by client version",
namespace=self.namespace, labelnames=("version",)
)
async def handle_metrics_get_request(self, request: web.Request):
try:
return web.Response(
text=prom_generate_latest().decode(),
content_type='text/plain; version=0.0.4'
)
except Exception:
self.logger.exception('could not generate prometheus data')
raise
async def stop(self):
await self.runner.cleanup()
METRICS = PrometheusMetrics()

View file

@ -6,7 +6,8 @@ import typing
import lbry
from lbry.wallet.server.mempool import MemPool, MemPoolAPI
from lbry.wallet.server.prometheus import PrometheusServer
from lbry.prometheus import PrometheusServer
from lbry.wallet.server.prometheus import METRICS
class Notifications:
@ -92,6 +93,7 @@ class Server:
)
async def start(self):
METRICS.install()
env = self.env
min_str, max_str = env.coin.SESSIONCLS.protocol_min_max_strings()
self.log.info(f'software version: {lbry.__version__}')
@ -121,6 +123,7 @@ class Server:
self.prometheus_server = None
self.shutdown_event.set()
await self.daemon.close()
METRICS.uninstall()
def run(self):
loop = asyncio.get_event_loop()
@ -143,4 +146,4 @@ class Server:
async def start_prometheus(self):
if not self.prometheus_server and self.env.prometheus_port:
self.prometheus_server = PrometheusServer()
await self.prometheus_server.start(self.env.prometheus_port)
await self.prometheus_server.start("0.0.0.0", self.env.prometheus_port)

View file

@ -27,9 +27,7 @@ from lbry.wallet.server.db.writer import LBRYLevelDB
from lbry.wallet.server.db import reader
from lbry.wallet.server.websocket import AdminWebSocket
from lbry.wallet.server.metrics import ServerLoadData, APICallMetrics
from lbry.wallet.server.prometheus import REQUESTS_COUNT, SQLITE_INTERRUPT_COUNT, SQLITE_INTERNAL_ERROR_COUNT
from lbry.wallet.server.prometheus import SQLITE_OPERATIONAL_ERROR_COUNT, SQLITE_EXECUTOR_TIMES, SESSIONS_COUNT
from lbry.wallet.server.prometheus import SQLITE_PENDING_COUNT, CLIENT_VERSIONS
from lbry.wallet.server import prometheus
from lbry.wallet.rpc.framing import NewlineFramer
import lbry.wallet.server.version as VERSION
@ -677,7 +675,7 @@ class SessionBase(RPCSession):
context = {'conn_id': f'{self.session_id}'}
self.logger = util.ConnectionLogger(self.logger, context)
self.group = self.session_mgr.add_session(self)
SESSIONS_COUNT.labels(version=self.client_version).inc()
prometheus.METRICS.SESSIONS_COUNT.labels(version=self.client_version).inc()
peer_addr_str = self.peer_address_str()
self.logger.info(f'{self.kind} {peer_addr_str}, '
f'{self.session_mgr.session_count():,d} total')
@ -686,7 +684,7 @@ class SessionBase(RPCSession):
"""Handle client disconnection."""
super().connection_lost(exc)
self.session_mgr.remove_session(self)
SESSIONS_COUNT.labels(version=self.client_version).dec()
prometheus.METRICS.SESSIONS_COUNT.labels(version=self.client_version).dec()
msg = ''
if not self._can_send.is_set():
msg += ' whilst paused'
@ -710,7 +708,7 @@ class SessionBase(RPCSession):
"""Handle an incoming request. ElectrumX doesn't receive
notifications from client sessions.
"""
REQUESTS_COUNT.labels(method=request.method, version=self.client_version).inc()
prometheus.METRICS.REQUESTS_COUNT.labels(method=request.method, version=self.client_version).inc()
if isinstance(request, Request):
handler = self.request_handlers.get(request.method)
handler = partial(handler, self)
@ -946,7 +944,7 @@ class LBRYElectrumX(SessionBase):
async def run_in_executor(self, query_name, func, kwargs):
start = time.perf_counter()
try:
SQLITE_PENDING_COUNT.inc()
prometheus.METRICS.SQLITE_PENDING_COUNT.inc()
result = await asyncio.get_running_loop().run_in_executor(
self.session_mgr.query_executor, func, kwargs
)
@ -955,18 +953,18 @@ class LBRYElectrumX(SessionBase):
except reader.SQLiteInterruptedError as error:
metrics = self.get_metrics_or_placeholder_for_api(query_name)
metrics.query_interrupt(start, error.metrics)
SQLITE_INTERRUPT_COUNT.inc()
prometheus.METRICS.SQLITE_INTERRUPT_COUNT.inc()
raise RPCError(JSONRPC.QUERY_TIMEOUT, 'sqlite query timed out')
except reader.SQLiteOperationalError as error:
metrics = self.get_metrics_or_placeholder_for_api(query_name)
metrics.query_error(start, error.metrics)
SQLITE_OPERATIONAL_ERROR_COUNT.inc()
prometheus.METRICS.SQLITE_OPERATIONAL_ERROR_COUNT.inc()
raise RPCError(JSONRPC.INTERNAL_ERROR, 'query failed to execute')
except Exception:
log.exception("dear devs, please handle this exception better")
metrics = self.get_metrics_or_placeholder_for_api(query_name)
metrics.query_error(start, {})
SQLITE_INTERNAL_ERROR_COUNT.inc()
prometheus.METRICS.SQLITE_INTERNAL_ERROR_COUNT.inc()
raise RPCError(JSONRPC.INTERNAL_ERROR, 'unknown server error')
else:
if self.env.track_metrics:
@ -975,8 +973,8 @@ class LBRYElectrumX(SessionBase):
metrics.query_response(start, metrics_data)
return base64.b64encode(result).decode()
finally:
SQLITE_PENDING_COUNT.dec()
SQLITE_EXECUTOR_TIMES.observe(time.perf_counter() - start)
prometheus.METRICS.SQLITE_PENDING_COUNT.dec()
prometheus.METRICS.SQLITE_EXECUTOR_TIMES.observe(time.perf_counter() - start)
async def run_and_cache_query(self, query_name, function, kwargs):
metrics = self.get_metrics_or_placeholder_for_api(query_name)
@ -1443,10 +1441,10 @@ class LBRYElectrumX(SessionBase):
raise RPCError(BAD_REQUEST,
f'unsupported client: {client_name}')
if self.client_version != client_name[:17]:
SESSIONS_COUNT.labels(version=self.client_version).dec()
prometheus.METRICS.SESSIONS_COUNT.labels(version=self.client_version).dec()
self.client_version = client_name[:17]
SESSIONS_COUNT.labels(version=self.client_version).inc()
CLIENT_VERSIONS.labels(version=self.client_version).inc()
prometheus.METRICS.SESSIONS_COUNT.labels(version=self.client_version).inc()
prometheus.METRICS.CLIENT_VERSIONS.labels(version=self.client_version).inc()
# Find the highest common protocol version. Disconnect if
# that protocol version in unsupported.

View file

@ -2,7 +2,7 @@ import logging
import asyncio
from binascii import hexlify
from lbry.testcase import CommandTestCase
from lbry.wallet.server.prometheus import REORG_COUNT
from lbry.wallet.server import prometheus
class BlockchainReorganizationTests(CommandTestCase):
@ -16,7 +16,7 @@ class BlockchainReorganizationTests(CommandTestCase):
)
async def test_reorg(self):
REORG_COUNT.set(0)
prometheus.METRICS.REORG_COUNT.set(0)
# invalidate current block, move forward 2
self.assertEqual(self.ledger.headers.height, 206)
await self.assertBlockHash(206)
@ -26,7 +26,7 @@ class BlockchainReorganizationTests(CommandTestCase):
self.assertEqual(self.ledger.headers.height, 207)
await self.assertBlockHash(206)
await self.assertBlockHash(207)
self.assertEqual(1, REORG_COUNT._samples()[0][2])
self.assertEqual(1, prometheus.METRICS.REORG_COUNT._samples()[0][2])
# invalidate current block, move forward 3
await self.blockchain.invalidate_block((await self.ledger.headers.hash(206)).decode())
@ -36,7 +36,7 @@ class BlockchainReorganizationTests(CommandTestCase):
await self.assertBlockHash(206)
await self.assertBlockHash(207)
await self.assertBlockHash(208)
self.assertEqual(2, REORG_COUNT._samples()[0][2])
self.assertEqual(2, prometheus.METRICS.REORG_COUNT._samples()[0][2])
async def test_reorg_change_claim_height(self):
# sanity check