full text search in wallet server

This commit is contained in:
Lex Berezhny 2019-11-14 14:31:49 -05:00
parent 37161057c6
commit 48a18ff771
5 changed files with 119 additions and 6 deletions

View file

@ -2094,7 +2094,7 @@ class Daemon(metaclass=JSONRPCServerType):
eg. --height=">400000" would limit results to only claims above 400k block height.
Usage:
claim_search [<name> | --name=<name>] [--txid=<txid>] [--nout=<nout>]
claim_search [<name> | --name=<name>] [--text=<text>] [--txid=<txid>] [--nout=<nout>]
[--claim_id=<claim_id> | --claim_ids=<claim_ids>...]
[--channel=<channel> |
[[--channel_ids=<channel_ids>...] [--not_channel_ids=<not_channel_ids>...]]]
@ -2119,6 +2119,7 @@ class Daemon(metaclass=JSONRPCServerType):
Options:
--name=<name> : (str) claim name (normalized)
--text=<text> : (str) full text search
--claim_id=<claim_id> : (str) full or partial claim id
--claim_ids=<claim_ids> : (list) list of full claim ids
--txid=<txid> : (str) transaction id

View file

@ -0,0 +1,53 @@
import sqlite3
from torba.client.basedatabase import constraints_to_sql
CREATE_FULL_TEXT_SEARCH = """
create virtual table if not exists search using fts5(
claim_name, channel_name, title, description, author, tags,
content=claim, tokenize=porter
);
"""
FTS_ORDER_BY = "bm25(search, 4.0, 8.0, 1.0, 0.5, 1.0, 0.5)"
def fts_action_sql(claims=None, action='insert'):
select = {
'rowid': "claim.rowid",
'claim_name': "claim.normalized",
'channel_name': "channel.normalized",
'title': "claim.title",
'description': "claim.description",
'author': "claim.author",
'tags': "(select group_concat(tag, ' ') from tag where tag.claim_hash=claim.claim_hash)"
}
if action == 'delete':
select['search'] = 'delete'
where, values = "", {}
if claims:
where, values = constraints_to_sql({
'claim.claim_hash__in': [sqlite3.Binary(claim_hash) for claim_hash in claims]
})
where = 'WHERE '+where
return f"""
INSERT INTO search ({','.join(select.keys())})
SELECT {','.join(select.values())} FROM claim
LEFT JOIN claim as channel ON (claim.channel_hash=channel.claim_hash) {where}
""", values
def update_full_text_search(action, outputs, db, height, final_height, is_first_sync):
if is_first_sync:
if height == final_height:
db.execute(*fts_action_sql())
return
if not outputs:
return
if action in ("before-delete", "before-update"):
db.execute(*fts_action_sql(outputs, 'delete'))
elif action in ("after-insert", "after-update"):
db.execute(*fts_action_sql(outputs, 'insert'))
else:
raise ValueError(f"Invalid action for updating full text search: '{action}'")

View file

@ -18,6 +18,7 @@ from lbry.schema.result import Outputs
from lbry.wallet.ledger import BaseLedger, MainNetLedger, RegTestLedger
from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS
from .full_text_search import FTS_ORDER_BY
class SQLiteOperationalError(sqlite3.OperationalError):
@ -45,7 +46,7 @@ INTEGER_PARAMS = {
}
SEARCH_PARAMS = {
'name', 'claim_id', 'claim_ids', 'txid', 'nout', 'channel', 'channel_ids', 'not_channel_ids',
'name', 'text', 'claim_id', 'claim_ids', 'txid', 'nout', 'channel', 'channel_ids', 'not_channel_ids',
'public_key_id', 'claim_type', 'stream_types', 'media_types', 'fee_currency',
'has_channel_signature', 'signature_valid',
'any_tags', 'all_tags', 'not_tags',
@ -295,6 +296,11 @@ def _get_claims(cols, for_count=False, **constraints) -> Tuple[str, Dict]:
_apply_constraints_for_array_attributes(constraints, 'language', lambda _: _, for_count)
_apply_constraints_for_array_attributes(constraints, 'location', lambda _: _, for_count)
if 'text' in constraints:
constraints["search"] = constraints.pop("text")
constraints["order_by"] = FTS_ORDER_BY
select = f"SELECT {cols} FROM search JOIN claim ON (search.rowid=claim.rowid)"
else:
select = f"SELECT {cols} FROM claim"
sql, values = query(

View file

@ -13,6 +13,7 @@ from lbry.schema.mime_types import guess_stream_type
from lbry.wallet.ledger import MainNetLedger, RegTestLedger
from lbry.wallet.transaction import Transaction, Output
from lbry.wallet.server.db.canonical import register_canonical_functions
from lbry.wallet.server.db.full_text_search import update_full_text_search, CREATE_FULL_TEXT_SEARCH
from lbry.wallet.server.db.trending import (
CREATE_TREND_TABLE, calculate_trending, register_trending_functions
)
@ -49,6 +50,10 @@ class SQLDB:
short_url text not null, -- normalized#shortest-unique-claim_id
canonical_url text, -- channel's-short_url/normalized#shortest-unique-claim_id-within-channel
title text,
author text,
description text,
claim_type integer,
-- streams
@ -151,6 +156,7 @@ class SQLDB:
PRAGMAS +
CREATE_CLAIM_TABLE +
CREATE_TREND_TABLE +
CREATE_FULL_TEXT_SEARCH +
CREATE_SUPPORT_TABLE +
CREATE_CLAIMTRIE_TABLE +
CREATE_TAG_TABLE
@ -233,12 +239,15 @@ class SQLDB:
'amount': txo.amount,
'timestamp': header['timestamp'],
'height': tx.height,
'title': None,
'description': None,
'author': None,
'claim_type': None,
'stream_type': None,
'media_type': None,
'release_time': None,
'fee_currency': None,
'fee_amount': 0
'fee_amount': 0,
}
claims.append(claim_record)
@ -252,6 +261,9 @@ class SQLDB:
claim_record['claim_type'] = CLAIM_TYPES['stream']
claim_record['media_type'] = claim.stream.source.media_type
claim_record['stream_type'] = STREAM_TYPES[guess_stream_type(claim_record['media_type'])]
claim_record['title'] = claim.stream.title
claim_record['description'] = claim.stream.description
claim_record['author'] = claim.stream.author
if claim.stream.release_time:
claim_record['release_time'] = claim.stream.release_time
if claim.stream.has_fee:
@ -283,12 +295,12 @@ class SQLDB:
INSERT OR IGNORE INTO claim (
claim_hash, claim_id, claim_name, normalized, txo_hash, tx_position, amount,
claim_type, media_type, stream_type, timestamp, creation_timestamp,
fee_currency, fee_amount, height,
fee_currency, fee_amount, title, description, author, height,
creation_height, release_time, activation_height, expiration_height, short_url)
VALUES (
:claim_hash, :claim_id, :claim_name, :normalized, :txo_hash, :tx_position, :amount,
:claim_type, :media_type, :stream_type, :timestamp, :timestamp,
:fee_currency, :fee_amount, :height, :height,
:fee_currency, :fee_amount, :title, :description, :author, :height, :height,
CASE WHEN :release_time IS NOT NULL THEN :release_time ELSE :timestamp END,
CASE WHEN :normalized NOT IN (SELECT normalized FROM claimtrie) THEN :height END,
CASE WHEN :height >= 137181 THEN :height+2102400 ELSE :height+262974 END,
@ -306,6 +318,7 @@ class SQLDB:
txo_hash=:txo_hash, tx_position=:tx_position, amount=:amount, height=:height,
claim_type=:claim_type, media_type=:media_type, stream_type=:stream_type,
timestamp=:timestamp, fee_amount=:fee_amount, fee_currency=:fee_currency,
title=:title, description=:description, author=:author,
release_time=CASE WHEN :release_time IS NOT NULL THEN :release_time ELSE release_time END
WHERE claim_hash=:claim_hash;
""", claims)
@ -692,10 +705,18 @@ class SQLDB:
expire_timer.stop()
r = timer.run
r(update_full_text_search, 'before-delete',
delete_claim_hashes, self.db, height, daemon_height, self.main.first_sync)
affected_channels = r(self.delete_claims, delete_claim_hashes)
r(self.delete_supports, delete_support_txo_hashes)
r(self.insert_claims, insert_claims, header)
r(update_full_text_search, 'after-insert',
[txo.claim_hash for txo in insert_claims], self.db, height, daemon_height, self.main.first_sync)
r(update_full_text_search, 'before-update',
[txo.claim_hash for txo in update_claims], self.db, height, daemon_height, self.main.first_sync)
r(self.update_claims, update_claims, header)
r(update_full_text_search, 'after-update',
[txo.claim_hash for txo in update_claims], self.db, height, daemon_height, self.main.first_sync)
r(self.validate_channel_signatures, height, insert_claims,
update_claims, delete_claim_hashes, affected_channels, forward_timer=True)
r(self.insert_supports, insert_supports)

View file

@ -328,6 +328,38 @@ class ClaimSearchCommand(ClaimTestCase):
await self.assertFindsClaims([image], media_types=['image/png'])
await self.assertFindsClaims([image, video], media_types=['video/mp4', 'image/png'])
async def test_search_by_text(self):
chan1_id = self.get_claim_id(await self.channel_create('@SatoshiNakamoto'))
chan2_id = self.get_claim_id(await self.channel_create('@Bitcoin'))
chan3_id = self.get_claim_id(await self.channel_create('@IAmSatoshi'))
claim1 = await self.stream_create(
"the-real-satoshi", title="The Real Satoshi Nakamoto",
description="Documentary about the real Satoshi Nakamoto, creator of bitcoin.",
tags=['satoshi nakamoto', 'bitcoin', 'documentary']
)
claim2 = await self.stream_create(
"about-me", channel_id=chan1_id, title="Satoshi Nakamoto Autobiography",
description="I am Satoshi Nakamoto and this is my autobiography.",
tags=['satoshi nakamoto', 'bitcoin', 'documentary', 'autobiography']
)
claim3 = await self.stream_create(
"history-of-bitcoin", channel_id=chan2_id, title="History of Bitcoin",
description="History of bitcoin and its creator Satoshi Nakamoto.",
tags=['satoshi nakamoto', 'bitcoin', 'documentary', 'history']
)
claim4 = await self.stream_create(
"satoshi-conspiracies", channel_id=chan3_id, title="Satoshi Nakamoto Conspiracies",
description="Documentary detailing various conspiracies surrounding Satoshi Nakamoto.",
tags=['conspiracies', 'bitcoin', 'satoshi nakamoto']
)
await self.assertFindsClaims([], text='cheese')
await self.assertFindsClaims([claim3], text='history')
await self.assertFindsClaims([claim4], text='conspiracy')
await self.assertFindsClaims([claim1, claim4, claim2, claim3], text='documentary')
await self.assertFindsClaims([claim4, claim1, claim2, claim3], text='satoshi')
class ChannelCommands(CommandTestCase):