From 48a18ff7719dba6961f350acf5f435e8a79ab71e Mon Sep 17 00:00:00 2001 From: Lex Berezhny Date: Thu, 14 Nov 2019 14:31:49 -0500 Subject: [PATCH] full text search in wallet server --- lbry/lbry/extras/daemon/Daemon.py | 3 +- .../lbry/wallet/server/db/full_text_search.py | 53 +++++++++++++++++++ lbry/lbry/wallet/server/db/reader.py | 10 +++- lbry/lbry/wallet/server/db/writer.py | 27 ++++++++-- lbry/tests/integration/test_claim_commands.py | 32 +++++++++++ 5 files changed, 119 insertions(+), 6 deletions(-) create mode 100644 lbry/lbry/wallet/server/db/full_text_search.py diff --git a/lbry/lbry/extras/daemon/Daemon.py b/lbry/lbry/extras/daemon/Daemon.py index afb94ee59..2b433b4aa 100644 --- a/lbry/lbry/extras/daemon/Daemon.py +++ b/lbry/lbry/extras/daemon/Daemon.py @@ -2094,7 +2094,7 @@ class Daemon(metaclass=JSONRPCServerType): eg. --height=">400000" would limit results to only claims above 400k block height. Usage: - claim_search [ | --name=] [--txid=] [--nout=] + claim_search [ | --name=] [--text=] [--txid=] [--nout=] [--claim_id= | --claim_ids=...] [--channel= | [[--channel_ids=...] [--not_channel_ids=...]]] @@ -2119,6 +2119,7 @@ class Daemon(metaclass=JSONRPCServerType): Options: --name= : (str) claim name (normalized) + --text= : (str) full text search --claim_id= : (str) full or partial claim id --claim_ids= : (list) list of full claim ids --txid= : (str) transaction id diff --git a/lbry/lbry/wallet/server/db/full_text_search.py b/lbry/lbry/wallet/server/db/full_text_search.py new file mode 100644 index 000000000..eccf798dc --- /dev/null +++ b/lbry/lbry/wallet/server/db/full_text_search.py @@ -0,0 +1,53 @@ +import sqlite3 +from torba.client.basedatabase import constraints_to_sql + +CREATE_FULL_TEXT_SEARCH = """ +create virtual table if not exists search using fts5( + claim_name, channel_name, title, description, author, tags, + content=claim, tokenize=porter +); +""" + +FTS_ORDER_BY = "bm25(search, 4.0, 8.0, 1.0, 0.5, 1.0, 0.5)" + + +def fts_action_sql(claims=None, action='insert'): + select = { + 'rowid': "claim.rowid", + 'claim_name': "claim.normalized", + 'channel_name': "channel.normalized", + 'title': "claim.title", + 'description': "claim.description", + 'author': "claim.author", + 'tags': "(select group_concat(tag, ' ') from tag where tag.claim_hash=claim.claim_hash)" + } + if action == 'delete': + select['search'] = 'delete' + + where, values = "", {} + if claims: + where, values = constraints_to_sql({ + 'claim.claim_hash__in': [sqlite3.Binary(claim_hash) for claim_hash in claims] + }) + where = 'WHERE '+where + + return f""" + INSERT INTO search ({','.join(select.keys())}) + SELECT {','.join(select.values())} FROM claim + LEFT JOIN claim as channel ON (claim.channel_hash=channel.claim_hash) {where} + """, values + + +def update_full_text_search(action, outputs, db, height, final_height, is_first_sync): + if is_first_sync: + if height == final_height: + db.execute(*fts_action_sql()) + return + if not outputs: + return + if action in ("before-delete", "before-update"): + db.execute(*fts_action_sql(outputs, 'delete')) + elif action in ("after-insert", "after-update"): + db.execute(*fts_action_sql(outputs, 'insert')) + else: + raise ValueError(f"Invalid action for updating full text search: '{action}'") diff --git a/lbry/lbry/wallet/server/db/reader.py b/lbry/lbry/wallet/server/db/reader.py index 6b086b377..9aa55e08d 100644 --- a/lbry/lbry/wallet/server/db/reader.py +++ b/lbry/lbry/wallet/server/db/reader.py @@ -18,6 +18,7 @@ from lbry.schema.result import Outputs from lbry.wallet.ledger import BaseLedger, MainNetLedger, RegTestLedger from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS +from .full_text_search import FTS_ORDER_BY class SQLiteOperationalError(sqlite3.OperationalError): @@ -45,7 +46,7 @@ INTEGER_PARAMS = { } SEARCH_PARAMS = { - 'name', 'claim_id', 'claim_ids', 'txid', 'nout', 'channel', 'channel_ids', 'not_channel_ids', + 'name', 'text', 'claim_id', 'claim_ids', 'txid', 'nout', 'channel', 'channel_ids', 'not_channel_ids', 'public_key_id', 'claim_type', 'stream_types', 'media_types', 'fee_currency', 'has_channel_signature', 'signature_valid', 'any_tags', 'all_tags', 'not_tags', @@ -295,7 +296,12 @@ def _get_claims(cols, for_count=False, **constraints) -> Tuple[str, Dict]: _apply_constraints_for_array_attributes(constraints, 'language', lambda _: _, for_count) _apply_constraints_for_array_attributes(constraints, 'location', lambda _: _, for_count) - select = f"SELECT {cols} FROM claim" + if 'text' in constraints: + constraints["search"] = constraints.pop("text") + constraints["order_by"] = FTS_ORDER_BY + select = f"SELECT {cols} FROM search JOIN claim ON (search.rowid=claim.rowid)" + else: + select = f"SELECT {cols} FROM claim" sql, values = query( select if for_count else select+""" diff --git a/lbry/lbry/wallet/server/db/writer.py b/lbry/lbry/wallet/server/db/writer.py index 01ec33a57..ad9d1382c 100644 --- a/lbry/lbry/wallet/server/db/writer.py +++ b/lbry/lbry/wallet/server/db/writer.py @@ -13,6 +13,7 @@ from lbry.schema.mime_types import guess_stream_type from lbry.wallet.ledger import MainNetLedger, RegTestLedger from lbry.wallet.transaction import Transaction, Output from lbry.wallet.server.db.canonical import register_canonical_functions +from lbry.wallet.server.db.full_text_search import update_full_text_search, CREATE_FULL_TEXT_SEARCH from lbry.wallet.server.db.trending import ( CREATE_TREND_TABLE, calculate_trending, register_trending_functions ) @@ -49,6 +50,10 @@ class SQLDB: short_url text not null, -- normalized#shortest-unique-claim_id canonical_url text, -- channel's-short_url/normalized#shortest-unique-claim_id-within-channel + title text, + author text, + description text, + claim_type integer, -- streams @@ -151,6 +156,7 @@ class SQLDB: PRAGMAS + CREATE_CLAIM_TABLE + CREATE_TREND_TABLE + + CREATE_FULL_TEXT_SEARCH + CREATE_SUPPORT_TABLE + CREATE_CLAIMTRIE_TABLE + CREATE_TAG_TABLE @@ -233,12 +239,15 @@ class SQLDB: 'amount': txo.amount, 'timestamp': header['timestamp'], 'height': tx.height, + 'title': None, + 'description': None, + 'author': None, 'claim_type': None, 'stream_type': None, 'media_type': None, 'release_time': None, 'fee_currency': None, - 'fee_amount': 0 + 'fee_amount': 0, } claims.append(claim_record) @@ -252,6 +261,9 @@ class SQLDB: claim_record['claim_type'] = CLAIM_TYPES['stream'] claim_record['media_type'] = claim.stream.source.media_type claim_record['stream_type'] = STREAM_TYPES[guess_stream_type(claim_record['media_type'])] + claim_record['title'] = claim.stream.title + claim_record['description'] = claim.stream.description + claim_record['author'] = claim.stream.author if claim.stream.release_time: claim_record['release_time'] = claim.stream.release_time if claim.stream.has_fee: @@ -283,12 +295,12 @@ class SQLDB: INSERT OR IGNORE INTO claim ( claim_hash, claim_id, claim_name, normalized, txo_hash, tx_position, amount, claim_type, media_type, stream_type, timestamp, creation_timestamp, - fee_currency, fee_amount, height, + fee_currency, fee_amount, title, description, author, height, creation_height, release_time, activation_height, expiration_height, short_url) VALUES ( :claim_hash, :claim_id, :claim_name, :normalized, :txo_hash, :tx_position, :amount, :claim_type, :media_type, :stream_type, :timestamp, :timestamp, - :fee_currency, :fee_amount, :height, :height, + :fee_currency, :fee_amount, :title, :description, :author, :height, :height, CASE WHEN :release_time IS NOT NULL THEN :release_time ELSE :timestamp END, CASE WHEN :normalized NOT IN (SELECT normalized FROM claimtrie) THEN :height END, CASE WHEN :height >= 137181 THEN :height+2102400 ELSE :height+262974 END, @@ -306,6 +318,7 @@ class SQLDB: txo_hash=:txo_hash, tx_position=:tx_position, amount=:amount, height=:height, claim_type=:claim_type, media_type=:media_type, stream_type=:stream_type, timestamp=:timestamp, fee_amount=:fee_amount, fee_currency=:fee_currency, + title=:title, description=:description, author=:author, release_time=CASE WHEN :release_time IS NOT NULL THEN :release_time ELSE release_time END WHERE claim_hash=:claim_hash; """, claims) @@ -692,10 +705,18 @@ class SQLDB: expire_timer.stop() r = timer.run + r(update_full_text_search, 'before-delete', + delete_claim_hashes, self.db, height, daemon_height, self.main.first_sync) affected_channels = r(self.delete_claims, delete_claim_hashes) r(self.delete_supports, delete_support_txo_hashes) r(self.insert_claims, insert_claims, header) + r(update_full_text_search, 'after-insert', + [txo.claim_hash for txo in insert_claims], self.db, height, daemon_height, self.main.first_sync) + r(update_full_text_search, 'before-update', + [txo.claim_hash for txo in update_claims], self.db, height, daemon_height, self.main.first_sync) r(self.update_claims, update_claims, header) + r(update_full_text_search, 'after-update', + [txo.claim_hash for txo in update_claims], self.db, height, daemon_height, self.main.first_sync) r(self.validate_channel_signatures, height, insert_claims, update_claims, delete_claim_hashes, affected_channels, forward_timer=True) r(self.insert_supports, insert_supports) diff --git a/lbry/tests/integration/test_claim_commands.py b/lbry/tests/integration/test_claim_commands.py index fbb08fd19..b4d8867f7 100644 --- a/lbry/tests/integration/test_claim_commands.py +++ b/lbry/tests/integration/test_claim_commands.py @@ -328,6 +328,38 @@ class ClaimSearchCommand(ClaimTestCase): await self.assertFindsClaims([image], media_types=['image/png']) await self.assertFindsClaims([image, video], media_types=['video/mp4', 'image/png']) + async def test_search_by_text(self): + chan1_id = self.get_claim_id(await self.channel_create('@SatoshiNakamoto')) + chan2_id = self.get_claim_id(await self.channel_create('@Bitcoin')) + chan3_id = self.get_claim_id(await self.channel_create('@IAmSatoshi')) + + claim1 = await self.stream_create( + "the-real-satoshi", title="The Real Satoshi Nakamoto", + description="Documentary about the real Satoshi Nakamoto, creator of bitcoin.", + tags=['satoshi nakamoto', 'bitcoin', 'documentary'] + ) + claim2 = await self.stream_create( + "about-me", channel_id=chan1_id, title="Satoshi Nakamoto Autobiography", + description="I am Satoshi Nakamoto and this is my autobiography.", + tags=['satoshi nakamoto', 'bitcoin', 'documentary', 'autobiography'] + ) + claim3 = await self.stream_create( + "history-of-bitcoin", channel_id=chan2_id, title="History of Bitcoin", + description="History of bitcoin and its creator Satoshi Nakamoto.", + tags=['satoshi nakamoto', 'bitcoin', 'documentary', 'history'] + ) + claim4 = await self.stream_create( + "satoshi-conspiracies", channel_id=chan3_id, title="Satoshi Nakamoto Conspiracies", + description="Documentary detailing various conspiracies surrounding Satoshi Nakamoto.", + tags=['conspiracies', 'bitcoin', 'satoshi nakamoto'] + ) + + await self.assertFindsClaims([], text='cheese') + await self.assertFindsClaims([claim3], text='history') + await self.assertFindsClaims([claim4], text='conspiracy') + await self.assertFindsClaims([claim1, claim4, claim2, claim3], text='documentary') + await self.assertFindsClaims([claim4, claim1, claim2, claim3], text='satoshi') + class ChannelCommands(CommandTestCase):