lbry-sdk/lbry/db/query_context.py

653 lines
22 KiB
Python
Raw Normal View History

2020-06-05 06:35:22 +02:00
import os
import time
2020-07-06 05:03:45 +02:00
import functools
2020-06-30 23:32:51 +02:00
from io import BytesIO
2020-06-05 06:35:22 +02:00
import multiprocessing as mp
from decimal import Decimal
2020-06-19 20:28:34 +02:00
from typing import Dict, List, Optional, Tuple
2020-06-30 23:32:51 +02:00
from dataclasses import dataclass, field
2020-06-05 06:35:22 +02:00
from contextvars import ContextVar
2020-07-06 05:03:45 +02:00
from sqlalchemy import create_engine, inspect, bindparam, func, exists, case
from sqlalchemy.future import select
2020-06-05 06:35:22 +02:00
from sqlalchemy.engine import Engine, Connection
2020-06-30 23:32:51 +02:00
from sqlalchemy.sql import Insert
try:
from pgcopy import CopyManager
except ImportError:
CopyManager = None
2020-06-05 06:35:22 +02:00
from lbry.event import EventQueuePublisher
from lbry.blockchain.ledger import Ledger
from lbry.blockchain.transaction import Transaction, Output, Input
from lbry.schema.tags import clean_tags
from lbry.schema.result import Censor
from lbry.schema.mime_types import guess_stream_type
from .utils import pg_insert, chunk
2020-06-26 16:39:16 +02:00
from .tables import Block, TX, TXO, TXI, Claim, Tag, Support
2020-06-05 06:35:22 +02:00
from .constants import TXO_TYPES, STREAM_TYPES
# Holds the active QueryContext; written by initialize()/uninitialize()
# and read by context().
_context: ContextVar['QueryContext'] = ContextVar('_context')
@dataclass
class QueryContext:
    """Database state shared by the query helpers in this module.

    Bundles the SQLAlchemy engine/connection, the ledger, the progress
    message queue and the block/filter lists. The instance can also be used
    as a context manager (see :meth:`with_timer`) to time a block of work.
    """

    engine: Engine
    connection: Connection
    ledger: Ledger
    message_queue: mp.Queue
    stop_event: mp.Event
    stack: List[List]
    metrics: Dict
    is_tracking_metrics: bool
    blocked_streams: Dict
    blocked_channels: Dict
    filtered_streams: Dict
    filtered_channels: Dict
    pid: int

    # QueryContext __enter__/__exit__ state
    print_timers: List
    current_timer_name: Optional[str] = None
    current_timer_time: float = 0
    current_progress: Optional['ProgressContext'] = None

    # pgcopy CopyManager instances, cached per table name by pg_copy().
    copy_managers: Dict[str, CopyManager] = field(default_factory=dict)

    @property
    def is_postgres(self):
        """True when the connection's dialect is PostgreSQL."""
        return self.connection.dialect.name == 'postgresql'

    @property
    def is_sqlite(self):
        """True when the connection's dialect is SQLite."""
        return self.connection.dialect.name == 'sqlite'

    def raise_unsupported_dialect(self):
        """Raise RuntimeError naming the connection's dialect."""
        raise RuntimeError(f'Unsupported database dialect: {self.connection.dialect.name}.')

    def get_resolve_censor(self) -> Censor:
        """Return a Censor built from the blocked streams and channels."""
        return Censor(self.blocked_streams, self.blocked_channels)

    def get_search_censor(self) -> Censor:
        """Return a Censor built from the filtered streams and channels."""
        return Censor(self.filtered_streams, self.filtered_channels)

    def pg_copy(self, table, rows):
        """Bulk-load ``rows`` (a list of dicts) into ``table`` using pgcopy.

        The column list is taken from the keys of the first row and the
        CopyManager is cached per table name, so every row for a given table
        is expected to carry the same keys in the same order. Commits on the
        underlying DBAPI connection when done.

        Raises:
            RuntimeError: if the optional ``pgcopy`` package is not installed.
        """
        if not rows:
            # Nothing to copy; also avoids indexing rows[0] below.
            return
        if CopyManager is None:
            raise RuntimeError("pgcopy is required for bulk COPY but is not installed.")
        connection = self.connection.connection
        copy_manager = self.copy_managers.get(table.name)
        if copy_manager is None:
            self.copy_managers[table.name] = copy_manager = CopyManager(
                connection, table.name, rows[0].keys()
            )
        copy_manager.copy(map(dict.values, rows), BytesIO)
        connection.commit()

    def execute(self, sql, *args):
        """Execute ``sql`` on the connection and return the result."""
        return self.connection.execute(sql, *args)

    def fetchone(self, sql, *args):
        """Return the first result row as a dict, or the falsy row if none."""
        row = self.connection.execute(sql, *args).fetchone()
        return dict(row._mapping) if row else row

    def fetchall(self, sql, *args):
        """Return all result rows as a list of dicts."""
        rows = self.connection.execute(sql, *args).fetchall()
        return [dict(row._mapping) for row in rows]

    def fetchtotal(self, condition):
        """Return ``count(*)`` for rows matching ``condition``."""
        sql = select(func.count('*').label('total')).where(condition)
        return self.fetchone(sql)['total']

    def fetchmax(self, column):
        """Return ``max(column)``."""
        sql = select(func.max(column).label('max_result'))
        return self.fetchone(sql)['max_result']

    def has_records(self, table):
        """Return the result of an EXISTS query against ``table``."""
        sql = select(exists([1], from_obj=table).label('result'))
        return self.fetchone(sql)['result']

    def insert_or_ignore(self, table):
        """Build an INSERT for ``table`` that skips conflicting rows.

        Raises RuntimeError for dialects other than SQLite and PostgreSQL.
        """
        if self.is_sqlite:
            return table.insert().prefix_with("OR IGNORE")
        elif self.is_postgres:
            return pg_insert(table).on_conflict_do_nothing()
        else:
            self.raise_unsupported_dialect()

    def insert_or_replace(self, table, replace):
        """Build an upsert for ``table``.

        On PostgreSQL the columns named in ``replace`` are overwritten with
        the incoming values on primary key conflict; on SQLite the statement
        is ``INSERT OR REPLACE`` and ``replace`` is not used.

        Raises RuntimeError for dialects other than SQLite and PostgreSQL.
        """
        if self.is_sqlite:
            return table.insert().prefix_with("OR REPLACE")
        elif self.is_postgres:
            insert = pg_insert(table)
            return insert.on_conflict_do_update(
                table.primary_key, set_={col: getattr(insert.excluded, col) for col in replace}
            )
        else:
            self.raise_unsupported_dialect()

    def has_table(self, table):
        """Return whether the engine's inspector reports ``table`` exists."""
        return inspect(self.engine).has_table(table)

    def get_bulk_loader(self) -> 'BulkLoader':
        """Return a new BulkLoader bound to this context."""
        return BulkLoader(self)

    def reset_metrics(self):
        """Replace ``stack`` and ``metrics`` with fresh empty containers."""
        self.stack = []
        self.metrics = {}

    def with_timer(self, timer_name: str) -> 'QueryContext':
        """Set the timer name used by the next ``with`` block; returns self."""
        self.current_timer_name = timer_name
        return self

    def __enter__(self) -> 'QueryContext':
        self.current_timer_time = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.current_timer_name and self.current_timer_name in self.print_timers:
            elapsed = time.perf_counter() - self.current_timer_time
            # Report the timer that just ran, not the whole list of
            # timers that are enabled for printing.
            print(f"{self.current_timer_name} in {elapsed:.6f}s", flush=True)
        self.current_timer_name = None
        self.current_timer_time = 0
        self.current_progress = None
def context(with_timer: str = None) -> 'QueryContext':
    """Return the QueryContext stored in ``_context``.

    When ``with_timer`` is a string, the context's timer name is set to it
    (via ``QueryContext.with_timer``) before the context is returned.
    """
    current = _context.get()
    return current.with_timer(with_timer) if isinstance(with_timer, str) else current
def initialize(
        ledger: Ledger, message_queue: mp.Queue, stop_event: mp.Event,
        track_metrics=False, block_and_filter=None, print_timers=None):
    """Create a QueryContext and store it in the ``_context`` variable.

    An engine and connection are opened for ``ledger.conf.db_url_or_default``.

    Args:
        ledger: ledger whose configuration supplies the database URL.
        message_queue: queue handed to the context for progress messages.
        stop_event: event handed to the context.
        track_metrics: stored as ``is_tracking_metrics``.
        block_and_filter: optional 4-tuple of
            ``(blocked_streams, blocked_channels, filtered_streams,
            filtered_channels)``; defaults to four empty dicts.
        print_timers: timer names whose elapsed time should be printed;
            defaults to an empty list.
    """
    url = ledger.conf.db_url_or_default
    engine = create_engine(url)
    connection = engine.connect()
    if block_and_filter is not None:
        blocked_streams, blocked_channels, filtered_streams, filtered_channels = block_and_filter
    else:
        # Four distinct dicts: a chained `a = b = c = d = {}` would make all
        # four names alias one object, so adding to one list would silently
        # add to the other three.
        blocked_streams, blocked_channels, filtered_streams, filtered_channels = {}, {}, {}, {}
    _context.set(
        QueryContext(
            pid=os.getpid(),
            engine=engine, connection=connection,
            ledger=ledger, message_queue=message_queue, stop_event=stop_event,
            stack=[], metrics={}, is_tracking_metrics=track_metrics,
            blocked_streams=blocked_streams, blocked_channels=blocked_channels,
            filtered_streams=filtered_streams, filtered_channels=filtered_channels,
            print_timers=print_timers or []
        )
    )
def uninitialize():
    """Close the current context's connection, dispose its engine and clear it.

    Does nothing when no context is set.
    """
    current = _context.get(None)
    if current is None:
        return
    if current.connection:
        current.connection.close()
    if current.engine:
        current.engine.dispose()
    _context.set(None)
2020-07-06 05:03:45 +02:00
class Event:
    """A named progress event kept in a class-level registry.

    Events are created through :meth:`add`, which assigns ``id`` as the
    event's index in the registry so that :meth:`get_by_id` is a plain list
    lookup.
    """

    # Registry of every event created through add(); an event's id is its
    # position in this list.
    _events: List['Event'] = []
    __slots__ = 'id', 'name', 'unit', 'step_size'

    def __init__(self, name: str, unit: str, step_size: int):
        self.name = name
        self.unit = unit
        self.step_size = step_size

    @classmethod
    def get_by_id(cls, event_id) -> 'Event':
        """Return the event registered at index ``event_id``."""
        return cls._events[event_id]

    @classmethod
    def get_by_name(cls, name) -> Optional['Event']:
        """Return the registered event called ``name``, or None if absent."""
        for event in cls._events:
            if event.name == name:
                return event
        return None

    @classmethod
    def add(cls, name: str, unit: str, step_size: int) -> 'Event':
        """Create, register and return a new event.

        Raises:
            AssertionError: if an event called ``name`` is already registered.
        """
        # Raised explicitly (rather than via `assert`) so the duplicate check
        # still runs when Python is started with -O.
        if cls.get_by_name(name) is not None:
            raise AssertionError(f"Event {name} already exists.")
        event = cls(name, unit, step_size)
        # The next free index is the id; taken before appending so no
        # list search is needed.
        event.id = len(cls._events)
        cls._events.append(event)
        return event
def event_emitter(name: str, unit: str, step_size=1):
    """Decorator factory: register an event and run the function under it.

    The event is registered immediately through ``Event.add``. Each call of
    the decorated function runs inside ``progress(event, ...)`` and receives
    the resulting progress object as the keyword argument ``p``.
    """
    registered = Event.add(name, unit, step_size)

    def decorate(target):
        @functools.wraps(target)
        def run_with_progress(*args, **kwargs):
            with progress(registered, step_size=step_size) as tracker:
                return target(*args, **kwargs, p=tracker)

        return run_with_progress

    return decorate
2020-06-05 06:35:22 +02:00
class ProgressPublisher(EventQueuePublisher):
    """Turns progress tuples from the message queue into event dicts."""

    def message_to_event(self, message):
        """Convert ``(event_id, pid, step, total[, extra])`` into a dict.

        When a fifth element is present and is a dict, its items are merged
        into the ``"data"`` payload.
        """
        event = Event.get_by_id(message[0])
        event_name = event.name
        payload = {
            "pid": message[1],
            "step": message[2],
            "total": message[3],
            "unit": event.unit,
        }
        has_extra = len(message) > 4 and isinstance(message[4], dict)
        if has_extra:
            payload.update(message[4])
        return {"event": event_name, "data": payload}
2020-06-19 20:28:34 +02:00
class BreakProgress(Exception):
    """Raised to leave a progress block early when the total is 0."""
2020-06-05 06:35:22 +02:00
class ProgressContext:
    """Publishes step/total progress tuples for one event to the message queue."""

    def __init__(self, ctx: QueryContext, event: Event, step_size=1):
        self.ctx = ctx
        self.event = event
        self.step_size = step_size
        self.extra = None
        self.total = 0
        self.last_step = -1

    def __enter__(self) -> 'ProgressContext':
        self.ctx.__enter__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # The finishing message (done == total) is always published here.
        final_message = self.get_event_args(self.total)
        self.ctx.message_queue.put(final_message)
        self.ctx.__exit__(exc_type, exc_val, exc_tb)
        # Swallow BreakProgress; anything else propagates.
        return True if exc_type == BreakProgress else None

    def start(self, total, extra=None):
        """Record ``total`` (and ``extra`` if given) and publish step 0.

        Raises BreakProgress when ``total`` is falsy.
        """
        if not total:
            raise BreakProgress
        self.total = total
        if extra is not None:
            self.extra = extra
        self.step(0)

    def step(self, done):
        """Publish ``done`` unless it is rate-limited or a duplicate."""
        # enforce step rate
        if self.step_size != 1 and done % self.step_size != 0:
            return
        # deduplicate finish event by not sending a step where done == total
        if not (done < self.total):
            return
        # deduplicate same step
        if done == self.last_step:
            return
        self.ctx.message_queue.put_nowait(self.get_event_args(done))
        self.last_step = done

    def get_event_args(self, done):
        """Return the message tuple for ``done``; ``extra`` is appended if set."""
        message = (self.event.id, self.ctx.pid, done, self.total)
        if self.extra is not None:
            message += (self.extra,)
        return message
2020-06-05 06:35:22 +02:00
def progress(e: Event, step_size=1) -> ProgressContext:
    """Create a ProgressContext for event ``e`` on the current context.

    The current context's timer name is set to ``e.name`` and the new
    progress object is stored as its ``current_progress``.
    """
    ctx = context(e.name)
    tracker = ProgressContext(ctx, e, step_size=step_size)
    ctx.current_progress = tracker
    return tracker
class BulkLoader:
    """Accumulates rows for the blockchain tables and writes them in bulk.

    Rows are collected in per-table lists through the ``add_*`` /
    ``update_claim`` methods and written by :meth:`save` (with progress
    reporting) or :meth:`flush` (which also clears the lists).
    """

    def __init__(self, ctx: QueryContext):
        self.ctx = ctx
        self.ledger = ctx.ledger
        self.blocks = []
        self.txs = []
        self.txos = []
        self.txis = []
        self.supports = []
        self.claims = []
        self.tags = []
        self.update_claims = []
        self.delete_tags = []

    @staticmethod
    def block_to_row(block: Block) -> dict:
        """Return the Block table row for ``block``."""
        return {
            'block_hash': block.block_hash,
            'previous_hash': block.prev_block_hash,
            'file_number': block.file_number,
            'height': 0 if block.is_first_block else block.height,
            'timestamp': block.timestamp,
        }

    @staticmethod
    def tx_to_row(block_hash: bytes, tx: Transaction) -> dict:
        """Return the TX table row for ``tx``.

        Side effect: when the second output carries decodable purchase data,
        it is attached to the first output as ``purchase``.
        """
        row = {
            'tx_hash': tx.hash,
            'block_hash': block_hash,
            'raw': tx.raw,
            'height': tx.height,
            'position': tx.position,
            'is_verified': tx.is_verified,
            'timestamp': tx.timestamp,
            'day': tx.day,
            'purchased_claim_hash': None,
        }
        txos = tx.outputs
        if len(txos) >= 2 and txos[1].can_decode_purchase_data:
            txos[0].purchase = txos[1]
            row['purchased_claim_hash'] = txos[1].purchase_data.claim_hash
        return row

    @staticmethod
    def txi_to_row(tx: Transaction, txi: Input) -> dict:
        """Return the TXI table row for input ``txi`` of ``tx``."""
        return {
            'tx_hash': tx.hash,
            'txo_hash': txi.txo_ref.hash,
            'position': txi.position,
            'height': tx.height,
        }

    def txo_to_row(self, tx: Transaction, txo: Output) -> dict:
        """Return the TXO table row for output ``txo`` of ``tx``."""
        row = {
            'tx_hash': tx.hash,
            'txo_hash': txo.hash,
            'address': txo.get_address(self.ledger) if txo.has_address else None,
            'position': txo.position,
            'amount': txo.amount,
            'height': tx.height,
            'script_offset': txo.script.offset,
            'script_length': txo.script.length,
            'txo_type': 0,
            'claim_id': None,
            'claim_hash': None,
            'claim_name': None,
            'channel_hash': None,
            'signature': None,
            'signature_digest': None,
            'public_key': None,
            'public_key_hash': None
        }
        if txo.is_claim:
            if txo.can_decode_claim:
                claim = txo.claim
                row['txo_type'] = TXO_TYPES.get(claim.claim_type, TXO_TYPES['stream'])
                if claim.is_signed:
                    row['channel_hash'] = claim.signing_channel_hash
                    row['signature'] = txo.get_encoded_signature()
                    row['signature_digest'] = txo.get_signature_digest(self.ledger)
                if claim.is_channel:
                    row['public_key'] = claim.channel.public_key_bytes
                    row['public_key_hash'] = self.ledger.address_to_hash160(
                        self.ledger.public_key_to_address(claim.channel.public_key_bytes)
                    )
            else:
                # Undecodable claims are recorded as streams.
                row['txo_type'] = TXO_TYPES['stream']
        elif txo.is_support:
            row['txo_type'] = TXO_TYPES['support']
            if txo.can_decode_support:
                support = txo.support
                if support.is_signed:
                    row['channel_hash'] = support.signing_channel_hash
        elif txo.purchase is not None:
            row['txo_type'] = TXO_TYPES['purchase']
            row['claim_id'] = txo.purchased_claim_id
            row['claim_hash'] = txo.purchased_claim_hash
        if txo.script.is_claim_involved:
            row['claim_id'] = txo.claim_id
            row['claim_hash'] = txo.claim_hash
            try:
                # NUL characters are stripped from the name before storing.
                row['claim_name'] = txo.claim_name.replace('\x00', '')
            except UnicodeDecodeError:
                # Name is not decodable text; claim_name stays None.
                pass
        return row

    def claim_to_rows(
            self, txo: Output, timestamp: int, staked_support_amount: int, staked_support_count: int,
            signature: bytes = None, signature_digest: bytes = None, channel_public_key: bytes = None,
    ) -> Tuple[dict, List]:
        """Return ``(claim_row, tag_rows)`` for the claim in ``txo``.

        When the claim cannot be decoded the row holds only the basic
        TXO/support fields and the tag list is empty.
        """
        d = {
            'claim_type': None,
            'address': txo.get_address(self.ledger),
            'txo_hash': txo.hash,
            'amount': txo.amount,
            'height': txo.tx_ref.height,
            'timestamp': timestamp,
            # support
            'staked_amount': txo.amount + staked_support_amount,
            'staked_support_amount': staked_support_amount,
            'staked_support_count': staked_support_count,
            # basic metadata
            'title': None,
            'description': None,
            'author': None,
            # streams
            'stream_type': None,
            'media_type': None,
            'duration': None,
            'release_time': None,
            'fee_amount': 0,
            'fee_currency': None,
            # reposts
            'reposted_claim_hash': None,
            # signed claims
            'channel_hash': None,
            'is_signature_valid': None,
        }

        claim = txo.can_decode_claim
        if not claim:
            return d, []

        if claim.is_stream:
            stream = claim.stream
            d['claim_type'] = TXO_TYPES['stream']
            # media_type must be filled in before stream_type is derived
            # from it; otherwise the guess is always made from None.
            d['media_type'] = stream.source.media_type
            d['stream_type'] = STREAM_TYPES[guess_stream_type(d['media_type'])]
            d['title'] = stream.title.replace('\x00', '')
            d['description'] = stream.description.replace('\x00', '')
            d['author'] = stream.author.replace('\x00', '')
            if stream.video and stream.video.duration:
                d['duration'] = stream.video.duration
            if stream.audio and stream.audio.duration:
                d['duration'] = stream.audio.duration
            if stream.release_time:
                d['release_time'] = stream.release_time
            if stream.has_fee:
                fee = stream.fee
                if isinstance(fee.amount, Decimal):
                    d['fee_amount'] = int(fee.amount*1000)
                if isinstance(fee.currency, str):
                    d['fee_currency'] = fee.currency.lower()
        elif claim.is_repost:
            d['claim_type'] = TXO_TYPES['repost']
            d['reposted_claim_hash'] = claim.repost.reference.claim_hash
        elif claim.is_channel:
            d['claim_type'] = TXO_TYPES['channel']

        if claim.is_signed:
            d['channel_hash'] = claim.signing_channel_hash
            d['is_signature_valid'] = Output.is_signature_valid(
                signature, signature_digest, channel_public_key
            )

        tags = []
        if claim.message.tags:
            claim_hash = txo.claim_hash
            tags = [
                {'claim_hash': claim_hash, 'tag': tag}
                for tag in clean_tags(claim.message.tags)
            ]

        return d, tags

    def support_to_row(self, txo):
        """Return the Support table row for support output ``txo``."""
        tx = txo.tx_ref.tx
        d = {
            'txo_hash': txo.ref.hash,
            'claim_hash': txo.claim_hash,
            'address': txo.get_address(self.ledger),
            'amount': txo.amount,
            'height': tx.height,
            'emoji': None,
            'channel_hash': None,
            'signature': None,
            'signature_digest': None,
        }
        support = txo.can_decode_support
        if support:
            d['emoji'] = support.emoji
            if support.is_signed:
                d['channel_hash'] = support.signing_channel_hash
                d['signature'] = txo.get_encoded_signature()
                # NOTE(review): txo_to_row passes self.ledger here while this
                # passes None - confirm which one is intended.
                d['signature_digest'] = txo.get_signature_digest(None)
        return d

    def add_block(self, block: Block):
        """Queue ``block`` and all of its transactions; returns self."""
        self.blocks.append(self.block_to_row(block))
        for tx in block.txs:
            self.add_transaction(block.block_hash, tx)
        return self

    def add_transaction(self, block_hash: bytes, tx: Transaction):
        """Queue ``tx`` with its outputs and non-coinbase inputs; returns self."""
        self.txs.append(self.tx_to_row(block_hash, tx))
        for txi in tx.inputs:
            if txi.coinbase is None:
                self.txis.append(self.txi_to_row(tx, txi))
        for txo in tx.outputs:
            self.txos.append(self.txo_to_row(tx, txo))
        return self

    def add_support(self, txo: Output):
        """Queue a Support row for ``txo``; returns self like the other add_* methods."""
        self.supports.append(self.support_to_row(txo))
        return self

    def add_claim(
            self, txo: Output, short_url: str,
            creation_height: int, activation_height: int, expiration_height: int,
            takeover_height: int = None, channel_url: str = None, **extra):
        """Queue a Claim row (and its tag rows) for ``txo``; returns self.

        ``extra`` is forwarded to :meth:`claim_to_rows`. Claims whose name
        raises UnicodeDecodeError are skipped.
        """
        try:
            claim_name = txo.claim_name.replace('\x00', '')
            normalized_name = txo.normalized_name
        except UnicodeDecodeError:
            return self
        d, tags = self.claim_to_rows(txo, **extra)
        d['claim_hash'] = txo.claim_hash
        d['claim_id'] = txo.claim_id
        d['claim_name'] = claim_name
        d['normalized'] = normalized_name
        d['short_url'] = short_url
        d['creation_height'] = creation_height
        d['activation_height'] = activation_height
        d['expiration_height'] = expiration_height
        d['takeover_height'] = takeover_height
        d['is_controlling'] = takeover_height is not None
        # Only claims with a valid channel signature get a canonical URL.
        if d['is_signature_valid']:
            d['canonical_url'] = channel_url + '/' + short_url
        else:
            d['canonical_url'] = None
        self.claims.append(d)
        self.tags.extend(tags)
        return self

    def update_claim(self, txo: Output, channel_url: Optional[str], **extra):
        """Queue an update of an existing claim and a rewrite of its tags; returns self."""
        d, tags = self.claim_to_rows(txo, **extra)
        d['pk'] = txo.claim_hash
        d['channel_url'] = channel_url
        d['set_canonical_url'] = d['is_signature_valid']
        self.update_claims.append(d)
        # Existing tags are deleted, then the fresh set is inserted.
        self.delete_tags.append({'pk': txo.claim_hash})
        self.tags.extend(tags)
        return self

    def get_queries(self):
        """Return ``(statement, rows)`` pairs in the order they must be executed."""
        return (
            (Block.insert(), self.blocks),
            (TX.insert(), self.txs),
            (TXO.insert(), self.txos),
            (TXI.insert(), self.txis),
            (Claim.insert(), self.claims),
            (Tag.delete().where(Tag.c.claim_hash == bindparam('pk')), self.delete_tags),
            (Claim.update().where(Claim.c.claim_hash == bindparam('pk')).values(
                canonical_url=case([
                    (bindparam('set_canonical_url'), bindparam('channel_url') + '/' + Claim.c.short_url)
                ], else_=None)
            ), self.update_claims),
            (Tag.insert(), self.tags),
            (Support.insert(), self.supports),
        )

    def save(self, unit_table, batch_size=10000):
        """Write all queued rows, reporting progress in units of ``unit_table`` rows.

        Progress is reported only when the context has a ``current_progress``.
        On PostgreSQL, inserts go through ``pg_copy``; everything else is
        executed in chunks of ``batch_size``. If a chunk fails, its rows are
        retried one by one; the first failing row is appended to the file
        ``badrow``, printed, and its exception re-raised.
        """
        queries = self.get_queries()

        p = self.ctx.current_progress
        done = row_scale = 0
        if p:
            progress_total, row_total = 0, sum(len(q[1]) for q in queries)
            for sql, rows in queries:
                if sql.table == unit_table:
                    progress_total += len(rows)
            if not progress_total:
                assert row_total == 0, "Rows used for progress are empty but other rows present."
                return
            # Rows of every table are scaled down so that `done` ends up
            # expressed in unit_table rows.
            row_scale = row_total / progress_total
            p.start(progress_total)

        execute = self.ctx.connection.execute
        for sql, rows in queries:
            if not rows:
                continue
            if self.ctx.is_postgres and isinstance(sql, Insert):
                self.ctx.pg_copy(sql.table, rows)
                if p:
                    done += int(len(rows) / row_scale)
                    p.step(done)
            else:
                for chunk_rows in chunk(rows, batch_size):
                    try:
                        execute(sql, chunk_rows)
                    except Exception:
                        # Retry row by row to find the offending row.
                        for row in chunk_rows:
                            try:
                                execute(sql, [row])
                            except Exception:
                                # No queue message is sent from here: the
                                # enclosing ProgressContext publishes the
                                # final event on exit, and there may be no
                                # progress object at all.
                                self._record_bad_row(sql, row)
                                raise
                    if p:
                        done += int(len(chunk_rows)/row_scale)
                        p.step(done)

    @staticmethod
    def _record_bad_row(sql, row):
        """Append the failing statement and row to ``badrow`` and print them."""
        with open('badrow', 'a') as badrow:
            badrow.write(repr(sql))
            badrow.write('\n')
            badrow.write(repr(row))
            badrow.write('\n')
        print(sql)
        print(row)

    def flush(self, done_counter_table) -> int:
        """Write all queued rows and clear the queues.

        Returns the number of rows written by statements whose table is
        ``done_counter_table``.
        """
        execute = self.ctx.connection.execute
        done = 0
        for sql, rows in self.get_queries():
            if not rows:
                continue
            if self.ctx.is_postgres and isinstance(sql, Insert):
                self.ctx.pg_copy(sql.table, rows)
            else:
                execute(sql, rows)
            if sql.table == done_counter_table:
                done += len(rows)
            rows.clear()
        return done