Merge pull request #3050 from lbryio/language-indexes

add indexes for `any_languages` argument to `claim_search`
This commit is contained in:
Lex Berezhny 2020-09-28 15:52:41 -04:00 committed by GitHub
commit d0f21c0095
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 354 additions and 205 deletions

View file

@ -768,6 +768,7 @@ class LBRYBlockProcessor(BlockProcessor):
self.timer.run(self.sql.execute, self.sql.SEARCH_INDEXES, timer_name='executing SEARCH_INDEXES')
if self.env.individual_tag_indexes:
self.timer.run(self.sql.execute, self.sql.TAG_INDEXES, timer_name='executing TAG_INDEXES')
self.timer.run(self.sql.execute, self.sql.LANGUAGE_INDEXES, timer_name='executing LANGUAGE_INDEXES')
def advance_txs(self, height, txs, header):
timer = self.timer.sub_timers['advance_blocks']

View file

@ -13,209 +13,306 @@ STREAM_TYPES = {
'model': 6
}
# 9/21/2020 (presumably the date the MOST_USED_TAGS list below was compiled -- confirm)
MOST_USED_TAGS = {
"gaming",
"people & blogs",
"entertainment",
"music",
"pop culture",
"education",
"technology",
"blockchain",
"news",
"funny",
"science & technology",
"learning",
"gameplay",
"news & politics",
"comedy",
"bitcoin",
"beliefs",
"nature",
"art",
"economics",
"film & animation",
"lets play",
"games",
"sports",
"howto & style",
"game",
"cryptocurrency",
"playstation 4",
"automotive",
"crypto",
"mature",
"sony interactive entertainment",
"walkthrough",
"tutorial",
"video game",
"weapons",
"pc",
"playthrough",
"anime",
"how to",
"btc",
"fun",
"ethereum",
"food",
"travel & events",
"minecraft",
"science",
"autos & vehicles",
"play",
"politics",
"commentary",
"twitch",
"ps4live",
"love",
"ps4",
"nonprofits & activism",
"ps4share",
"fortnite",
"xbox",
"porn",
"video games",
"trump",
"español",
"money",
"music video",
"movie",
"coronavirus",
"nintendo",
"donald trump",
"steam",
"trailer",
"android",
"podcast",
"xbox one",
"survival",
"linux",
"travel",
"funny moments",
"audio",
"litecoin",
"animation",
"gamer",
"lets",
"playstation",
"bitcoin news",
"history",
"fox news",
"xxx",
"god",
"dance",
"adventure",
"liberal",
"horror",
"government",
"freedom",
"2020",
"reaction",
"meme",
"photography",
"truth"
}
# Tag values grouped under MATURE_TAGS (presumably used to flag mature
# content -- confirm against the callers, which are outside this view).
MATURE_TAGS = [
    'nsfw',
    'porn',
    'xxx',
    'mature',
    'adult',
    'sex',
]
def normalize_tag(tag):
    """Return *tag* with spaces and hyphens turned into underscores and '&' spelled out as 'and'."""
    # Replacements are applied in this fixed order: space, ampersand, hyphen.
    substitutions = ((" ", "_"), ("&", "and"), ("-", "_"))
    normalized = tag
    for needle, replacement in substitutions:
        normalized = normalized.replace(needle, replacement)
    return normalized
COMMON_TAGS = {
"gaming": "gaming",
"people & blogs": "people_and_blogs",
"pop culture": "pop_culture",
"entertainment": "entertainment",
"technology": "technology",
"music": "music",
"funny": "funny",
"education": "education",
"learning": "learning",
"news": "news",
"gameplay": "gameplay",
"science & technology": "science_and_technology",
"playstation 4": "playstation_4",
"beliefs": "beliefs",
"nature": "nature",
"news & politics": "news_and_politics",
"comedy": "comedy",
"games": "games",
"sony interactive entertainment": "sony_interactive_entertainment",
"film & animation": "film_and_animation",
"game": "game",
"howto & style": "howto_and_style",
"weapons": "weapons",
"blockchain": "blockchain",
"video game": "video_game",
"sports": "sports",
"walkthrough": "walkthrough",
"ps4live": "ps4live",
"art": "art",
"pc": "pc",
"economics": "economics",
"automotive": "automotive",
"minecraft": "minecraft",
"playthrough": "playthrough",
"ps4share": "ps4share",
"tutorial": "tutorial",
"play": "play",
"twitch": "twitch",
"how to": "how_to",
"ps4": "ps4",
"bitcoin": "bitcoin",
"fortnite": "fortnite",
"commentary": "commentary",
"lets play": "lets_play",
"fun": "fun",
"politics": "politics",
"xbox": "xbox",
"autos & vehicles": "autos_and_vehicles",
"travel & events": "travel_and_events",
"food": "food",
"science": "science",
"mature": "mature",
"xbox one": "xbox_one",
"liberal": "liberal",
"democrat": "democrat",
"progressive": "progressive",
"survival": "survival",
"nonprofits & activism": "nonprofits_and_activism",
"cryptocurrency": "cryptocurrency",
"playstation": "playstation",
"nintendo": "nintendo",
"government": "government",
"steam": "steam",
"podcast": "podcast",
"horror": "horror",
"conservative": "conservative",
"reaction": "reaction",
"trailer": "trailer",
"love": "love",
"cnn": "cnn",
"republican": "republican",
"gamer": "gamer",
"political": "political",
"hangoutsonair": "hangoutsonair",
"hoa": "hoa",
"msnbc": "msnbc",
"cbs": "cbs",
"donald trump": "donald_trump",
"fiction": "fiction",
"fox news": "fox_news",
"anime": "anime",
"crypto": "crypto",
"ethereum": "ethereum",
"call of duty": "call_of_duty",
"multiplayer": "multiplayer",
"android": "android",
"epic": "epic",
"rpg": "rpg",
"adventure": "adventure",
"secular talk": "secular_talk",
"btc": "btc",
"atheist": "atheist",
"atheism": "atheism",
"ps3": "ps3",
"video games": "video_games",
"cod": "cod",
"agnostic": "agnostic",
"movie": "movie",
"online": "online",
"fps": "fps",
"mod": "mod",
"reviews": "reviews",
"sharefactory": "sharefactory",
"world": "world",
"space": "space",
"hilarious": "hilarious",
"stream": "stream",
"lol": "lol",
"sony": "sony",
"god": "god",
"lets": "lets",
"dance": "dance",
"pvp": "pvp",
"tech": "tech",
"zombies": "zombies",
"pokemon": "pokemon",
"fail": "fail",
"xbox 360": "xbox_360",
"film": "film",
"unboxing": "unboxing",
"animation": "animation",
"travel": "travel",
"money": "money",
"wwe": "wwe",
"how": "how",
"mods": "mods",
"pubg": "pubg",
"indie": "indie",
"strategy": "strategy",
"history": "history",
"rap": "rap",
"ios": "ios",
"sony computer entertainment": "sony_computer_entertainment",
"mobile": "mobile",
"trump": "trump",
"flat earth": "flat_earth",
"hack": "hack",
"trap": "trap",
"fox": "fox",
"vlogging": "vlogging",
"news radio": "news_radio",
"humor": "humor",
"facebook": "facebook",
"edm": "edm",
"fitness": "fitness",
"vaping": "vaping",
"hip hop": "hip_hop",
"secular": "secular",
"jesus": "jesus",
"vape": "vape",
"song": "song",
"remix": "remix",
"guitar": "guitar",
"daily": "daily",
"mining": "mining",
"diy": "diy",
"videogame": "videogame",
"pets & animals": "pets_and_animals",
"funny moments": "funny_moments",
"religion": "religion",
"death": "death",
"media": "media",
"nbc": "nbc",
"war": "war",
"freedom": "freedom",
"viral": "viral",
"meme": "meme",
"family": "family",
"gold": "gold",
"photography": "photography",
"chill": "chill",
"zombie": "zombie",
"computer": "computer",
"sniper": "sniper",
"bible": "bible",
"linux": "linux",
"overwatch": "overwatch",
"pro": "pro",
"dragon": "dragon",
"litecoin": "litecoin",
"gta": "gta",
"iphone": "iphone",
"house": "house",
"bass": "bass",
"bitcoin news": "bitcoin_news",
"wii": "wii",
"crash": "crash",
"league of legends": "league_of_legends",
"grand theft auto v": "grand_theft_auto_v",
"mario": "mario",
"mmorpg": "mmorpg",
"satire": "satire",
"fire": "fire",
"racing": "racing",
"apple": "apple",
"health": "health",
"instrumental": "instrumental",
"destiny": "destiny",
"truth": "truth",
"race": "race"
tag: normalize_tag(tag) for tag in list(MOST_USED_TAGS)
}
INDEXED_LANGUAGES = [
'en',
'aa',
'ab',
'ae',
'af',
'ak',
'am',
'an',
'ar',
'as',
'av',
'ay',
'az',
'ba',
'be',
'bg',
'bh',
'bi',
'bm',
'bn',
'bo',
'br',
'bs',
'ca',
'ce',
'ch',
'co',
'cr',
'cs',
'cu',
'cv',
'cy',
'da',
'de',
'dv',
'dz',
'ee',
'el',
'eo',
'es',
'et',
'eu',
'fa',
'ff',
'fi',
'fj',
'fo',
'fr',
'fy',
'ga',
'gd',
'gl',
'gn',
'gu',
'gv',
'ha',
'he',
'hi',
'ho',
'hr',
'ht',
'hu',
'hy',
'hz',
'ia',
'id',
'ie',
'ig',
'ii',
'ik',
'io',
'is',
'it',
'iu',
'ja',
'jv',
'ka',
'kg',
'ki',
'kj',
'kk',
'kl',
'km',
'kn',
'ko',
'kr',
'ks',
'ku',
'kv',
'kw',
'ky',
'la',
'lb',
'lg',
'li',
'ln',
'lo',
'lt',
'lu',
'lv',
'mg',
'mh',
'mi',
'mk',
'ml',
'mn',
'mr',
'ms',
'mt',
'my',
'na',
'nb',
'nd',
'ne',
'ng',
'nl',
'nn',
'no',
'nr',
'nv',
'ny',
'oc',
'oj',
'om',
'or',
'os',
'pa',
'pi',
'pl',
'ps',
'pt',
'qu',
'rm',
'rn',
'ro',
'ru',
'rw',
'sa',
'sc',
'sd',
'se',
'sg',
'si',
'sk',
'sl',
'sm',
'sn',
'so',
'sq',
'sr',
'ss',
'st',
'su',
'sv',
'sw',
'ta',
'te',
'tg',
'th',
'ti',
'tk',
'tl',
'tn',
'to',
'tr',
'ts',
'tt',
'tw',
'ty',
'ug',
'uk',
'ur',
'uz',
've',
'vi',
'vo',
'wa',
'wo',
'xh',
'yi',
'yo',
'za',
'zh',
'zu'
]

View file

@ -18,7 +18,7 @@ from lbry.schema.tags import clean_tags
from lbry.schema.result import Outputs, Censor
from lbry.wallet import Ledger, RegTestLedger
from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS
from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS, INDEXED_LANGUAGES
from .full_text_search import FTS_ORDER_BY
@ -536,6 +536,18 @@ def _apply_constraints_for_array_attributes(constraints, attr, cleaner, for_coun
AND tag IN ({values})
)
"""
elif attr == 'language':
indexed_languages = any_items & set(INDEXED_LANGUAGES)
if indexed_languages:
any_items -= indexed_languages
for language in indexed_languages:
any_queries[f'#_any_common_languages_{language}'] = f"""
EXISTS(
SELECT 1 FROM language INDEXED BY language_{language}_idx
WHERE {CLAIM_HASH_OR_REPOST_HASH_SQL}=language.claim_hash
AND language = '{language}'
)
"""
if any_items:

View file

@ -6,7 +6,6 @@ from decimal import Decimal
from collections import namedtuple
from multiprocessing import Manager
from binascii import unhexlify
from lbry.wallet.server.leveldb import LevelDB
from lbry.wallet.server.util import class_logger
from lbry.wallet.database import query, constraints_to_sql
@ -19,7 +18,7 @@ from lbry.wallet.server.db.canonical import register_canonical_functions
from lbry.wallet.server.db.full_text_search import update_full_text_search, CREATE_FULL_TEXT_SEARCH, first_sync_finished
from lbry.wallet.server.db.trending import TRENDING_ALGORITHMS
from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS
from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS, INDEXED_LANGUAGES
ATTRIBUTE_ARRAY_MAX_LENGTH = 100
@ -117,6 +116,15 @@ class SQLDB:
create unique index if not exists tag_claim_hash_tag_idx on tag (claim_hash, tag);
"""
# SQL script creating the `language` table (language, claim_hash, height) and a
# unique index over (claim_hash, language), so each claim/language pair is stored
# at most once. Both statements use "if not exists", so re-running the script
# does not fail when the table or index is already present.
CREATE_LANGUAGE_TABLE = """
create table if not exists language (
language text not null,
claim_hash bytes not null,
height integer not null
);
create unique index if not exists language_claim_hash_language_idx on language (claim_hash, language);
"""
CREATE_CLAIMTRIE_TABLE = """
create table if not exists claimtrie (
normalized text primary key,
@ -174,12 +182,18 @@ class SQLDB:
for tag_value, tag_key in COMMON_TAGS.items()
)
# One "create unique index" statement per entry in INDEXED_LANGUAGES, joined with
# newlines into a single SQL script. Each index is named language_<code>_idx and is
# a partial index restricted to rows WHERE language='<code>'.
LANGUAGE_INDEXES = '\n'.join(
f"create unique index if not exists language_{language}_idx on language (language, claim_hash) WHERE language='{language}';"
for language in INDEXED_LANGUAGES
)
CREATE_TABLES_QUERY = (
CREATE_CLAIM_TABLE +
CREATE_FULL_TEXT_SEARCH +
CREATE_SUPPORT_TABLE +
CREATE_CLAIMTRIE_TABLE +
CREATE_TAG_TABLE
CREATE_TAG_TABLE +
CREATE_LANGUAGE_TABLE
)
def __init__(
@ -305,7 +319,7 @@ class SQLDB:
self.execute('commit;')
def _upsertable_claims(self, txos: List[Output], header, clear_first=False):
claim_hashes, claims, tags = set(), [], {}
claim_hashes, claims, tags, languages = set(), [], {}, {}
for txo in txos:
tx = txo.tx_ref.tx
@ -316,6 +330,13 @@ class SQLDB:
#self.logger.exception(f"Could not decode claim name for {tx.id}:{txo.position}.")
continue
# Best-effort lookup of the first language declared on a stream claim. Any
# failure while reading it leaves `language` as None.
language = None
try:
    if txo.claim.is_stream and txo.claim.stream.languages:
        language = txo.claim.stream.languages[0].language
except Exception:
    # Deliberately ignored: the language is optional. `Exception` is caught
    # instead of using a bare `except:` so that KeyboardInterrupt, SystemExit
    # and other BaseException subclasses still propagate.
    pass
claim_hash = txo.claim_hash
claim_hashes.add(claim_hash)
claim_record = {
@ -373,6 +394,9 @@ class SQLDB:
elif claim.is_channel:
claim_record['claim_type'] = CLAIM_TYPES['channel']
if language:
languages[(language, claim_hash)] = (language, claim_hash, tx.height)
for tag in clean_tags(claim.message.tags):
tags[(tag, claim_hash)] = (tag, claim_hash, tx.height)
@ -383,6 +407,10 @@ class SQLDB:
self.executemany(
"INSERT OR IGNORE INTO tag (tag, claim_hash, height) VALUES (?, ?, ?)", tags.values()
)
if languages:
self.executemany(
"INSERT OR IGNORE INTO language (language, claim_hash, height) VALUES (?, ?, ?)", languages.values()
)
return claims

View file

@ -262,6 +262,17 @@ class ClaimSearchCommand(ClaimTestCase):
await self.assertFindsClaims([claim4, claim3, claim2], fee_amount='<1.0', fee_currency='lbc')
await self.assertFindsClaims([claim3], fee_amount='0.5', fee_currency='lbc')
await self.assertFindsClaims([claim5], fee_currency='usd')
# Checks the `any_languages` search argument: five streams are created (two with
# no language, two with 'en', one with 'es') and each filter is expected to return
# only the streams carrying one of the requested languages.
async def test_search_by_language(self):
# claim1 and claim2 are created without `languages`; they appear in none of the
# expected result lists below.
claim1 = await self.stream_create('claim1', fee_amount='1.0', fee_currency='lbc')
claim2 = await self.stream_create('claim2', fee_amount='0.9', fee_currency='lbc')
claim3 = await self.stream_create('claim3', fee_amount='0.5', fee_currency='lbc', languages='en')
claim4 = await self.stream_create('claim4', fee_amount='0.1', fee_currency='lbc', languages='en')
claim5 = await self.stream_create('claim5', fee_amount='1.0', fee_currency='usd', languages='es')
# Expected lists are written with the later-created claims first.
await self.assertFindsClaims([claim4, claim3], any_languages=['en'])
await self.assertFindsClaims([claim5], any_languages=['es'])
await self.assertFindsClaims([claim5, claim4, claim3], any_languages=['en', 'es'])
# NOTE(review): this assertion filters on fee_currency rather than on a language;
# it looks carried over from the fee search test -- confirm it belongs here.
await self.assertFindsClaims([], fee_currency='foo')
async def test_search_by_channel(self):