lbry-sdk/lbry/wallet/server/db/elasticsearch/sync.py

import argparse
import asyncio
import logging
import os
from collections import namedtuple
from multiprocessing import Process

import sqlite3
from elasticsearch import AsyncElasticsearch
from elasticsearch.helpers import async_bulk
from lbry.wallet.server.env import Env
from lbry.wallet.server.coin import LBC
from lbry.wallet.server.db.elasticsearch.search import extract_doc, SearchIndex, IndexVersionMismatch


async def get_all(db, shard_num, shards_total, limit=0, index_name='claims'):
    """Yield one document per claim row that belongs to this shard.

    A claim belongs to the shard when ``claim.height % shards_total`` equals
    ``shard_num``. Rows are read in descending ``height`` order, post-processed
    into a plain ``dict`` and passed through ``extract_doc`` before being
    yielded.

    Side effect: ``db.row_factory`` is replaced with a namedtuple factory and
    is left that way when the generator finishes.

    :param db: open sqlite3 connection holding the ``claim``, ``claimtrie``,
        ``tag`` and ``language`` tables.
    :param shard_num: remainder identifying this shard.
    :param shards_total: modulus applied to ``claim.height``.
    :param limit: when greater than zero, iteration stops after the row whose
        zero-based position equals ``limit`` has been yielded, so up to
        ``limit + 1`` documents are produced. ``0`` means no limit.
    :param index_name: forwarded unchanged to ``extract_doc``.
    """
    logging.info("shard %d starting", shard_num)

    # Building a namedtuple class is expensive, so build it once per distinct
    # column set instead of once per fetched row.
    row_types = {}

    def namedtuple_factory(cursor, row):
        columns = tuple(column[0] for column in cursor.description)
        try:
            row_type = row_types[columns]
        except KeyError:
            row_type = row_types[columns] = namedtuple('Row', columns)
        return row_type(*row)
    db.row_factory = namedtuple_factory
    # The total is only used for the progress log lines below.
    total = db.execute(f"select count(*) as total from claim where height % {shards_total} = {shard_num};").fetchone()[0]
    for num, claim in enumerate(db.execute(f"""
SELECT claimtrie.claim_hash as is_controlling,
       claimtrie.last_take_over_height,
       (select group_concat(tag, ',,') from tag where tag.claim_hash in (claim.claim_hash, claim.reposted_claim_hash)) as tags,
       (select group_concat(language, ' ') from language where language.claim_hash in (claim.claim_hash, claim.reposted_claim_hash)) as languages,
       (select cr.has_source from claim cr where cr.claim_hash = claim.reposted_claim_hash) as reposted_has_source,
       (select cr.claim_type from claim cr where cr.claim_hash = claim.reposted_claim_hash) as reposted_claim_type,
       claim.*
FROM claim LEFT JOIN claimtrie USING (claim_hash)
WHERE claim.height % {shards_total} = {shard_num}
ORDER BY claim.height desc
""")):
        claim = dict(claim._asdict())
        # A claim counts as having a source if either it or the claim it
        # reposts has one; the helper column is dropped from the document.
        claim['has_source'] = bool(claim.pop('reposted_has_source') or claim['has_source'])
        claim['censor_type'] = 0
        claim['censoring_channel_hash'] = None
        # Tags are joined with ',,' and languages with ' ' by the query above;
        # split them back into lists (NULL or empty becomes an empty list).
        claim['tags'] = claim['tags'].split(',,') if claim['tags'] else []
        claim['languages'] = claim['languages'].split(' ') if claim['languages'] else []
        if num % 10_000 == 0:
            logging.info("%d/%d", num, total)
        yield extract_doc(claim, index_name)
        # The check runs after the yield, so the row at position ``limit``
        # is still emitted before stopping.
        if 0 < limit <= num:
            break


async def consume(producer, index_name):
    """Bulk-send every document from ``producer`` and refresh ``index_name``.

    Connection settings are read from ``Env(LBC)``. The client is always
    closed, whether or not the bulk operation or the refresh succeeds.

    :param producer: iterable of bulk actions handed to ``async_bulk``.
    :param index_name: index refreshed once the bulk operation has finished.
    """
    settings = Env(LBC)
    host, port = settings.elastic_host, settings.elastic_port
    logging.info("ES sync host: %s:%i", host, port)
    client = AsyncElasticsearch([{'host': host, 'port': port}])
    try:
        await async_bulk(client, producer, request_timeout=120)
        await client.indices.refresh(index=index_name)
    finally:
        await client.close()


async def make_es_index(index=None):
    """Start the search index, recreating it if its version is out of date.

    If starting the index raises ``IndexVersionMismatch``, the existing index
    is deleted, the index object is stopped and then started again. The index
    object is stopped before returning in every case.

    :param index: index object exposing awaitable ``start``, ``stop`` and
        ``delete_index``. When omitted, a ``SearchIndex`` is built from the
        ``Env(LBC)`` elastic host and port.
    :return: whatever ``index.start()`` returns. The command-line entry point
        treats a falsy result as "ES is already initialized".
    """
    if index is None:
        # Environment settings are only needed to build the default index.
        env = Env(LBC)
        index = SearchIndex('', elastic_host=env.elastic_host, elastic_port=env.elastic_port)

    try:
        return await index.start()
    except IndexVersionMismatch as err:
        logging.info(
            "dropping ES search index (version %s) for upgrade to version %s", err.got_version, err.expected_version
        )
        await index.delete_index()
        await index.stop()
        return await index.start()
    finally:
        # stop() is awaitable (see the mismatch path above); without the await
        # the cleanup would never actually be awaited before returning.
        await index.stop()


async def run(db_path, clients, blocks, shard, index_name='claims'):
    """Sync one shard of the claim database into the search index.

    Opens the sqlite database, creates a single ``get_all`` generator for the
    shard and runs up to eight ``consume`` coroutines concurrently, all given
    that same generator. The database connection is closed when the consumers
    finish or fail.

    :param db_path: database path or URI (the connection is opened with
        ``uri=True``).
    :param clients: total number of shards; also caps the number of
        concurrent consumers at ``min(8, clients)``.
    :param blocks: passed to ``get_all`` as its ``limit``.
    :param shard: shard number handled by this call.
    :param index_name: index the documents are written to and refreshed.
    """
    db = sqlite3.connect(db_path, isolation_level=None, check_same_thread=False, uri=True)
    try:
        db.execute('pragma journal_mode=wal;')
        db.execute('pragma temp_store=memory;')
        producer = get_all(db, shard, clients, limit=blocks, index_name=index_name)
        await asyncio.gather(*(consume(producer, index_name=index_name) for _ in range(min(8, clients))))
    finally:
        # Release the connection even if a consumer raised.
        db.close()


def __run(args, shard):
    """Process target: run the sync of one shard in a fresh event loop.

    :param args: parsed command-line namespace providing ``db_path``,
        ``clients`` and ``blocks``.
    :param shard: shard number handled by this process.
    """
    shard_sync = run(args.db_path, args.clients, args.blocks, shard)
    asyncio.run(shard_sync)


def run_elastic_sync():
    """Command-line entry point for the Elasticsearch sync.

    Parses the arguments, then, unless ``--force`` is given, returns early
    when the database path does not exist or when ``make_es_index`` returns a
    falsy value. Otherwise it starts one process per client, each handling
    one shard, and waits for all of them to finish.
    """
    logging.basicConfig(level=logging.INFO)
    for noisy_logger in ('aiohttp', 'elasticsearch'):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)

    logging.info('lbry.server starting')
    parser = argparse.ArgumentParser(prog="lbry-hub-elastic-sync")
    parser.add_argument("db_path", type=str)
    parser.add_argument("-c", "--clients", type=int, default=16)
    parser.add_argument("-b", "--blocks", type=int, default=0)
    parser.add_argument("-f", "--force", default=False, action='store_true')
    args = parser.parse_args()

    if not args.force:
        # Both pre-flight checks are skipped entirely when --force is given.
        if not os.path.exists(args.db_path):
            logging.info("DB path doesnt exist")
            return
        if not asyncio.run(make_es_index()):
            logging.info("ES is already initialized")
            return

    workers = []
    for shard in range(args.clients):
        worker = Process(target=__run, args=(args, shard))
        worker.start()
        workers.append(worker)
    for worker in workers:
        worker.join()
        worker.close()
add sync script 2021-01-20 05:41:54 +01:00			`import argparse`
			`import asyncio`
check ES synced without a process and wait for ES 2021-02-12 18:41:03 +01:00			`import logging`
check if db file exists before sync 2021-02-16 16:52:32 +01:00			`import os`
add sync script 2021-01-20 05:41:54 +01:00			`from collections import namedtuple`
make sync parallel 2021-01-27 05:43:06 +01:00			`from multiprocessing import Process`
add sync script 2021-01-20 05:41:54 +01:00
drop apsw in wallet.server.db.elasticsearch.sync 2021-06-15 22:12:23 +02:00			`import sqlite3`
add sync script 2021-01-20 05:41:54 +01:00			`from elasticsearch import AsyncElasticsearch`
			`from elasticsearch.helpers import async_bulk`
add ELASTIC_HOST and ELASTIC_PORT settings to hub 2021-03-29 19:16:49 +02:00			`from lbry.wallet.server.env import Env`
			`from lbry.wallet.server.coin import LBC`
resync ES search index on version bumps -bump ES search index to version 1 2021-05-07 18:42:52 +02:00			`from lbry.wallet.server.db.elasticsearch.search import extract_doc, SearchIndex, IndexVersionMismatch`
add sync script 2021-01-20 05:41:54 +01:00

test sync helper 2021-05-12 02:38:05 +02:00			`async def get_all(db, shard_num, shards_total, limit=0, index_name='claims'):`
check ES synced without a process and wait for ES 2021-02-12 18:41:03 +01:00			`logging.info("shard %d starting", shard_num)`
add sync script 2021-01-20 05:41:54 +01:00
drop apsw in wallet.server.db.elasticsearch.sync 2021-06-15 22:12:23 +02:00			`def namedtuple_factory(cursor, row):`
			`Row = namedtuple('Row', (d[0] for d in cursor.description))`
			`return Row(*row)`
			`db.row_factory = namedtuple_factory`
backport fixes from server 2021-02-02 21:11:13 +01:00			`total = db.execute(f"select count(*) as total from claim where height % {shards_total} = {shard_num};").fetchone()[0]`
add sync script 2021-01-20 05:41:54 +01:00			`for num, claim in enumerate(db.execute(f"""`
			`SELECT claimtrie.claim_hash as is_controlling,`
			`claimtrie.last_take_over_height,`
tag can have empty space 2021-01-27 02:33:17 +01:00			`(select group_concat(tag, ',,') from tag where tag.claim_hash in (claim.claim_hash, claim.reposted_claim_hash)) as tags,`
add sync script 2021-01-20 05:41:54 +01:00			`(select group_concat(language, ' ') from language where language.claim_hash in (claim.claim_hash, claim.reposted_claim_hash)) as languages,`
fix no_source for reposts 2021-03-25 08:46:21 +01:00			`(select cr.has_source from claim cr where cr.claim_hash = claim.reposted_claim_hash) as reposted_has_source,`
apply reposted_claim_type on es sync 2021-05-03 23:40:21 +02:00			`(select cr.claim_type from claim cr where cr.claim_hash = claim.reposted_claim_hash) as reposted_claim_type,`
add sync script 2021-01-20 05:41:54 +01:00			`claim.*`
			`FROM claim LEFT JOIN claimtrie USING (claim_hash)`
backport fixes from server 2021-02-02 21:11:13 +01:00			`WHERE claim.height % {shards_total} = {shard_num}`
improve sync script for no-downtime maintenance 2021-02-17 05:09:12 +01:00			`ORDER BY claim.height desc`
add sync script 2021-01-20 05:41:54 +01:00			`""")):`
			`claim = dict(claim._asdict())`
fix no_source for reposts 2021-03-25 08:46:21 +01:00			`claim['has_source'] = bool(claim.pop('reposted_has_source') or claim['has_source'])`
add sync script 2021-01-20 05:41:54 +01:00			`claim['censor_type'] = 0`
			`claim['censoring_channel_hash'] = None`
tag can have empty space 2021-01-27 02:33:17 +01:00			`claim['tags'] = claim['tags'].split(',,') if claim['tags'] else []`
add sync script 2021-01-20 05:41:54 +01:00			`claim['languages'] = claim['languages'].split(' ') if claim['languages'] else []`
fix resolve partial id 2021-01-27 02:26:45 +01:00			`if num % 10_000 == 0:`
check ES synced without a process and wait for ES 2021-02-12 18:41:03 +01:00			`logging.info("%d/%d", num, total)`
test sync helper 2021-05-12 02:38:05 +02:00			`yield extract_doc(claim, index_name)`
improve sync script for no-downtime maintenance 2021-02-17 05:09:12 +01:00			`if 0 < limit <= num:`
			`break`
add sync script 2021-01-20 05:41:54 +01:00

test sync helper 2021-05-12 02:38:05 +02:00			`async def consume(producer, index_name):`
add ELASTIC_HOST and ELASTIC_PORT settings to hub 2021-03-29 19:16:49 +02:00			`env = Env(LBC)`
			`logging.info("ES sync host: %s:%i", env.elastic_host, env.elastic_port)`
			`es = AsyncElasticsearch([{'host': env.elastic_host, 'port': env.elastic_port}])`
torba-elastic-sync 2021-02-12 05:10:30 +01:00			`try:`
			`await async_bulk(es, producer, request_timeout=120)`
fix es migration bug, expand test case 2021-05-12 05:21:03 +02:00			`await es.indices.refresh(index=index_name)`
torba-elastic-sync 2021-02-12 05:10:30 +01:00			`finally:`
			`await es.close()`


test sync helper 2021-05-12 02:38:05 +02:00			`async def make_es_index(index=None):`
add ELASTIC_HOST and ELASTIC_PORT settings to hub 2021-03-29 19:16:49 +02:00			`env = Env(LBC)`
test sync helper 2021-05-12 02:38:05 +02:00			`if index is None:`
			`index = SearchIndex('', elastic_host=env.elastic_host, elastic_port=env.elastic_port)`
resync ES search index on version bumps -bump ES search index to version 1 2021-05-07 18:42:52 +02:00
torba-elastic-sync 2021-02-12 05:10:30 +01:00			`try:`
check ES synced without a process and wait for ES 2021-02-12 18:41:03 +01:00			`return await index.start()`
resync ES search index on version bumps -bump ES search index to version 1 2021-05-07 18:42:52 +02:00			`except IndexVersionMismatch as err:`
			`logging.info(`
			`"dropping ES search index (version %s) for upgrade to version %s", err.got_version, err.expected_version`
			`)`
			`await index.delete_index()`
fix es migration bug, expand test case 2021-05-12 05:21:03 +02:00			`await index.stop()`
resync ES search index on version bumps -bump ES search index to version 1 2021-05-07 18:42:52 +02:00			`return await index.start()`
torba-elastic-sync 2021-02-12 05:10:30 +01:00			`finally:`
check ES synced without a process and wait for ES 2021-02-12 18:41:03 +01:00			`index.stop()`
use multiple clients on sync script indexing 2021-01-25 03:19:28 +01:00

test sync helper 2021-05-12 02:38:05 +02:00			`async def run(db_path, clients, blocks, shard, index_name='claims'):`
fix trending to use built-in sqlite instead of apsw 2021-06-15 22:51:50 +02:00			`db = sqlite3.connect(db_path, isolation_level=None, check_same_thread=False, uri=True)`
drop apsw in wallet.server.db.elasticsearch.sync 2021-06-15 22:12:23 +02:00			`db.execute('pragma journal_mode=wal;')`
			`db.execute('pragma temp_store=memory;')`
			`producer = get_all(db, shard, clients, limit=blocks, index_name=index_name)`
test sync helper 2021-05-12 02:38:05 +02:00			`await asyncio.gather(*(consume(producer, index_name=index_name) for _ in range(min(8, clients))))`
make sync parallel 2021-01-27 05:43:06 +01:00
torba-elastic-sync 2021-02-12 05:10:30 +01:00
make sync parallel 2021-01-27 05:43:06 +01:00			`def __run(args, shard):`
test sync helper 2021-05-12 02:38:05 +02:00			`asyncio.run(run(args.db_path, args.clients, args.blocks, shard))`
make sync parallel 2021-01-27 05:43:06 +01:00

torba-elastic-sync 2021-02-12 05:10:30 +01:00			`def run_elastic_sync():`
check ES synced without a process and wait for ES 2021-02-12 18:41:03 +01:00			`logging.basicConfig(level=logging.INFO)`
add ELASTIC_HOST and ELASTIC_PORT settings to hub 2021-03-29 19:16:49 +02:00			`logging.getLogger('aiohttp').setLevel(logging.WARNING)`
			`logging.getLogger('elasticsearch').setLevel(logging.WARNING)`

check ES synced without a process and wait for ES 2021-02-12 18:41:03 +01:00			`logging.info('lbry.server starting')`
add prog name to sync arg parser 2021-03-24 21:07:17 +01:00			`parser = argparse.ArgumentParser(prog="lbry-hub-elastic-sync")`
make sync parallel 2021-01-27 05:43:06 +01:00			`parser.add_argument("db_path", type=str)`
			`parser.add_argument("-c", "--clients", type=int, default=16)`
improve sync script for no-downtime maintenance 2021-02-17 05:09:12 +01:00			`parser.add_argument("-b", "--blocks", type=int, default=0)`
			`parser.add_argument("-f", "--force", default=False, action='store_true')`
make sync parallel 2021-01-27 05:43:06 +01:00			`args = parser.parse_args()`
			`processes = []`
torba-elastic-sync 2021-02-12 05:10:30 +01:00
improve sync script for no-downtime maintenance 2021-02-17 05:09:12 +01:00			`if not args.force and not os.path.exists(args.db_path):`
check if db file exists before sync 2021-02-16 16:52:32 +01:00			`logging.info("DB path doesnt exist")`
			`return`

improve sync script for no-downtime maintenance 2021-02-17 05:09:12 +01:00			`if not args.force and not asyncio.run(make_es_index()):`
check ES synced without a process and wait for ES 2021-02-12 18:41:03 +01:00			`logging.info("ES is already initialized")`
torba-elastic-sync 2021-02-12 05:10:30 +01:00			`return`
make sync parallel 2021-01-27 05:43:06 +01:00			`for i in range(args.clients):`
			`processes.append(Process(target=__run, args=(args, i)))`
			`processes[-1].start()`
			`for process in processes:`
			`process.join()`
			`process.close()`