Merge pull request #3050 from lbryio/language-indexes

add indexes for `any_languages` argument to `claim_search`
2020-09-28 15:52:41 -04:00 · 2020-09-28 15:52:41 -04:00 · d0f21c0095
commit d0f21c0095
parent 43c2e8d8e9 46dc15dd29
5 changed files with 354 additions and 205 deletions
--- a/lbry/wallet/server/block_processor.py
+++ b/lbry/wallet/server/block_processor.py
@ -768,6 +768,7 @@ class LBRYBlockProcessor(BlockProcessor):
            self.timer.run(self.sql.execute, self.sql.SEARCH_INDEXES, timer_name='executing SEARCH_INDEXES')
            if self.env.individual_tag_indexes:
                self.timer.run(self.sql.execute, self.sql.TAG_INDEXES, timer_name='executing TAG_INDEXES')
            self.timer.run(self.sql.execute, self.sql.LANGUAGE_INDEXES, timer_name='executing LANGUAGE_INDEXES')
    def advance_txs(self, height, txs, header):
        timer = self.timer.sub_timers['advance_blocks']
--- a/lbry/wallet/server/db/common.py
+++ b/lbry/wallet/server/db/common.py
@ -13,209 +13,306 @@ STREAM_TYPES = {
    'model': 6
 }
 # Tag list dated 9/21/2020 (presumably the most frequently used tags at
 # that time -- confirm how it was generated). COMMON_TAGS is built from this
 # set by passing every entry through normalize_tag().
 MOST_USED_TAGS = {
    "gaming",
    "people & blogs",
    "entertainment",
    "music",
    "pop culture",
    "education",
    "technology",
    "blockchain",
    "news",
    "funny",
    "science & technology",
    "learning",
    "gameplay",
    "news & politics",
    "comedy",
    "bitcoin",
    "beliefs",
    "nature",
    "art",
    "economics",
    "film & animation",
    "lets play",
    "games",
    "sports",
    "howto & style",
    "game",
    "cryptocurrency",
    "playstation 4",
    "automotive",
    "crypto",
    "mature",
    "sony interactive entertainment",
    "walkthrough",
    "tutorial",
    "video game",
    "weapons",
    "pc",
    "playthrough",
    "anime",
    "how to",
    "btc",
    "fun",
    "ethereum",
    "food",
    "travel & events",
    "minecraft",
    "science",
    "autos & vehicles",
    "play",
    "politics",
    "commentary",
    "twitch",
    "ps4live",
    "love",
    "ps4",
    "nonprofits & activism",
    "ps4share",
    "fortnite",
    "xbox",
    "porn",
    "video games",
    "trump",
    "español",
    "money",
    "music video",
    "movie",
    "coronavirus",
    "nintendo",
    "donald trump",
    "steam",
    "trailer",
    "android",
    "podcast",
    "xbox one",
    "survival",
    "linux",
    "travel",
    "funny moments",
    "audio",
    "litecoin",
    "animation",
    "gamer",
    "lets",
    "playstation",
    "bitcoin news",
    "history",
    "fox news",
    "xxx",
    "god",
    "dance",
    "adventure",
    "liberal",
    "horror",
    "government",
    "freedom",
    "2020",
    "reaction",
    "meme",
    "photography",
    "truth"
 }
 # NOTE(review): presumably the tags that flag a claim as mature content;
 # the consumer of this list is not visible here -- confirm.
 MATURE_TAGS = [
    'nsfw',
    'porn',
    'xxx',
    'mature',
    'adult',
    'sex',
 ]
 def normalize_tag(tag):
    """Return *tag* with separators flattened.

    Every space and hyphen becomes an underscore and every ``&`` is
    spelled out as ``and``; all other characters are kept as they are.

    >>> normalize_tag("people & blogs")
    'people_and_blogs'
    """
    # One translation pass; none of the replacement strings contain a
    # character that is itself replaced, so this matches chained replace().
    substitutions = str.maketrans({" ": "_", "&": "and", "-": "_"})
    return tag.translate(substitutions)
 COMMON_TAGS = {
-    "gaming": "gaming",
+    tag: normalize_tag(tag) for tag in list(MOST_USED_TAGS)
    "people & blogs": "people_and_blogs",
    "pop culture": "pop_culture",
    "entertainment": "entertainment",
    "technology": "technology",
    "music": "music",
    "funny": "funny",
    "education": "education",
    "learning": "learning",
    "news": "news",
    "gameplay": "gameplay",
    "science & technology": "science_and_technology",
    "playstation 4": "playstation_4",
    "beliefs": "beliefs",
    "nature": "nature",
    "news & politics": "news_and_politics",
    "comedy": "comedy",
    "games": "games",
    "sony interactive entertainment": "sony_interactive_entertainment",
    "film & animation": "film_and_animation",
    "game": "game",
    "howto & style": "howto_and_style",
    "weapons": "weapons",
    "blockchain": "blockchain",
    "video game": "video_game",
    "sports": "sports",
    "walkthrough": "walkthrough",
    "ps4live": "ps4live",
    "art": "art",
    "pc": "pc",
    "economics": "economics",
    "automotive": "automotive",
    "minecraft": "minecraft",
    "playthrough": "playthrough",
    "ps4share": "ps4share",
    "tutorial": "tutorial",
    "play": "play",
    "twitch": "twitch",
    "how to": "how_to",
    "ps4": "ps4",
    "bitcoin": "bitcoin",
    "fortnite": "fortnite",
    "commentary": "commentary",
    "lets play": "lets_play",
    "fun": "fun",
    "politics": "politics",
    "xbox": "xbox",
    "autos & vehicles": "autos_and_vehicles",
    "travel & events": "travel_and_events",
    "food": "food",
    "science": "science",
    "mature": "mature",
    "xbox one": "xbox_one",
    "liberal": "liberal",
    "democrat": "democrat",
    "progressive": "progressive",
    "survival": "survival",
    "nonprofits & activism": "nonprofits_and_activism",
    "cryptocurrency": "cryptocurrency",
    "playstation": "playstation",
    "nintendo": "nintendo",
    "government": "government",
    "steam": "steam",
    "podcast": "podcast",
    "horror": "horror",
    "conservative": "conservative",
    "reaction": "reaction",
    "trailer": "trailer",
    "love": "love",
    "cnn": "cnn",
    "republican": "republican",
    "gamer": "gamer",
    "political": "political",
    "hangoutsonair": "hangoutsonair",
    "hoa": "hoa",
    "msnbc": "msnbc",
    "cbs": "cbs",
    "donald trump": "donald_trump",
    "fiction": "fiction",
    "fox news": "fox_news",
    "anime": "anime",
    "crypto": "crypto",
    "ethereum": "ethereum",
    "call of duty": "call_of_duty",
    "multiplayer": "multiplayer",
    "android": "android",
    "epic": "epic",
    "rpg": "rpg",
    "adventure": "adventure",
    "secular talk": "secular_talk",
    "btc": "btc",
    "atheist": "atheist",
    "atheism": "atheism",
    "ps3": "ps3",
    "video games": "video_games",
    "cod": "cod",
    "agnostic": "agnostic",
    "movie": "movie",
    "online": "online",
    "fps": "fps",
    "mod": "mod",
    "reviews": "reviews",
    "sharefactory": "sharefactory",
    "world": "world",
    "space": "space",
    "hilarious": "hilarious",
    "stream": "stream",
    "lol": "lol",
    "sony": "sony",
    "god": "god",
    "lets": "lets",
    "dance": "dance",
    "pvp": "pvp",
    "tech": "tech",
    "zombies": "zombies",
    "pokemon": "pokemon",
    "fail": "fail",
    "xbox 360": "xbox_360",
    "film": "film",
    "unboxing": "unboxing",
    "animation": "animation",
    "travel": "travel",
    "money": "money",
    "wwe": "wwe",
    "how": "how",
    "mods": "mods",
    "pubg": "pubg",
    "indie": "indie",
    "strategy": "strategy",
    "history": "history",
    "rap": "rap",
    "ios": "ios",
    "sony computer entertainment": "sony_computer_entertainment",
    "mobile": "mobile",
    "trump": "trump",
    "flat earth": "flat_earth",
    "hack": "hack",
    "trap": "trap",
    "fox": "fox",
    "vlogging": "vlogging",
    "news radio": "news_radio",
    "humor": "humor",
    "facebook": "facebook",
    "edm": "edm",
    "fitness": "fitness",
    "vaping": "vaping",
    "hip hop": "hip_hop",
    "secular": "secular",
    "jesus": "jesus",
    "vape": "vape",
    "song": "song",
    "remix": "remix",
    "guitar": "guitar",
    "daily": "daily",
    "mining": "mining",
    "diy": "diy",
    "videogame": "videogame",
    "pets & animals": "pets_and_animals",
    "funny moments": "funny_moments",
    "religion": "religion",
    "death": "death",
    "media": "media",
    "nbc": "nbc",
    "war": "war",
    "freedom": "freedom",
    "viral": "viral",
    "meme": "meme",
    "family": "family",
    "gold": "gold",
    "photography": "photography",
    "chill": "chill",
    "zombie": "zombie",
    "computer": "computer",
    "sniper": "sniper",
    "bible": "bible",
    "linux": "linux",
    "overwatch": "overwatch",
    "pro": "pro",
    "dragon": "dragon",
    "litecoin": "litecoin",
    "gta": "gta",
    "iphone": "iphone",
    "house": "house",
    "bass": "bass",
    "bitcoin news": "bitcoin_news",
    "wii": "wii",
    "crash": "crash",
    "league of legends": "league_of_legends",
    "grand theft auto v": "grand_theft_auto_v",
    "mario": "mario",
    "mmorpg": "mmorpg",
    "satire": "satire",
    "fire": "fire",
    "racing": "racing",
    "apple": "apple",
    "health": "health",
    "instrumental": "instrumental",
    "destiny": "destiny",
    "truth": "truth",
    "race": "race"
 }
 # Two-letter language codes that each get their own partial index on the
 # `language` table (one "language_<code>_idx" per entry); claim_search's
 # `any_languages` filter uses those indexes. 'en' comes first, the remaining
 # codes follow in alphabetical order, one initial letter per row.
 INDEXED_LANGUAGES = [
    'en',
    'aa', 'ab', 'ae', 'af', 'ak', 'am', 'an', 'ar', 'as', 'av', 'ay', 'az',
    'ba', 'be', 'bg', 'bh', 'bi', 'bm', 'bn', 'bo', 'br', 'bs',
    'ca', 'ce', 'ch', 'co', 'cr', 'cs', 'cu', 'cv', 'cy',
    'da', 'de', 'dv', 'dz',
    'ee', 'el', 'eo', 'es', 'et', 'eu',
    'fa', 'ff', 'fi', 'fj', 'fo', 'fr', 'fy',
    'ga', 'gd', 'gl', 'gn', 'gu', 'gv',
    'ha', 'he', 'hi', 'ho', 'hr', 'ht', 'hu', 'hy', 'hz',
    'ia', 'id', 'ie', 'ig', 'ii', 'ik', 'io', 'is', 'it', 'iu',
    'ja', 'jv',
    'ka', 'kg', 'ki', 'kj', 'kk', 'kl', 'km', 'kn', 'ko', 'kr', 'ks', 'ku', 'kv', 'kw', 'ky',
    'la', 'lb', 'lg', 'li', 'ln', 'lo', 'lt', 'lu', 'lv',
    'mg', 'mh', 'mi', 'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'my',
    'na', 'nb', 'nd', 'ne', 'ng', 'nl', 'nn', 'no', 'nr', 'nv', 'ny',
    'oc', 'oj', 'om', 'or', 'os',
    'pa', 'pi', 'pl', 'ps', 'pt',
    'qu',
    'rm', 'rn', 'ro', 'ru', 'rw',
    'sa', 'sc', 'sd', 'se', 'sg', 'si', 'sk', 'sl', 'sm', 'sn', 'so', 'sq', 'sr', 'ss', 'st', 'su', 'sv', 'sw',
    'ta', 'te', 'tg', 'th', 'ti', 'tk', 'tl', 'tn', 'to', 'tr', 'ts', 'tt', 'tw', 'ty',
    'ug', 'uk', 'ur', 'uz',
    've', 'vi', 'vo',
    'wa', 'wo',
    'xh',
    'yi', 'yo',
    'za', 'zh', 'zu',
 ]
--- a/lbry/wallet/server/db/reader.py
+++ b/lbry/wallet/server/db/reader.py
@ -18,7 +18,7 @@ from lbry.schema.tags import clean_tags
 from lbry.schema.result import Outputs, Censor
 from lbry.wallet import Ledger, RegTestLedger
-from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS
+from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS, INDEXED_LANGUAGES
 from .full_text_search import FTS_ORDER_BY
@ -536,6 +536,18 @@ def _apply_constraints_for_array_attributes(constraints, attr, cleaner, for_coun
                AND tag IN ({values})
            )
            """
    elif attr == 'language':
        indexed_languages = any_items & set(INDEXED_LANGUAGES)
        if indexed_languages:
            any_items -= indexed_languages
        for language in indexed_languages:
            any_queries[f'#_any_common_languages_{language}'] = f"""
            EXISTS(
                SELECT 1 FROM language INDEXED BY language_{language}_idx
                WHERE {CLAIM_HASH_OR_REPOST_HASH_SQL}=language.claim_hash
                AND language = '{language}'
            )
            """
    if any_items:
--- a/lbry/wallet/server/db/writer.py
+++ b/lbry/wallet/server/db/writer.py
@ -6,7 +6,6 @@ from decimal import Decimal
 from collections import namedtuple
 from multiprocessing import Manager
 from binascii import unhexlify
 from lbry.wallet.server.leveldb import LevelDB
 from lbry.wallet.server.util import class_logger
 from lbry.wallet.database import query, constraints_to_sql
@ -19,7 +18,7 @@ from lbry.wallet.server.db.canonical import register_canonical_functions
 from lbry.wallet.server.db.full_text_search import update_full_text_search, CREATE_FULL_TEXT_SEARCH, first_sync_finished
 from lbry.wallet.server.db.trending import TRENDING_ALGORITHMS
-from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS
+from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS, INDEXED_LANGUAGES
 ATTRIBUTE_ARRAY_MAX_LENGTH = 100
@ -117,6 +116,15 @@ class SQLDB:
        create unique index if not exists tag_claim_hash_tag_idx on tag (claim_hash, tag);
    """
    # DDL for the `language` table: one row per (claim_hash, language) pair,
    # with the pair kept unique by language_claim_hash_language_idx. `height`
    # is stored with each row. Both statements use "if not exists", so the
    # script can be executed against a database that already has the table.
    CREATE_LANGUAGE_TABLE = """
        create table if not exists language (
            language text not null,
            claim_hash bytes not null,
            height integer not null
        );
        create unique index if not exists language_claim_hash_language_idx on language (claim_hash, language);
    """
    CREATE_CLAIMTRIE_TABLE = """
        create table if not exists claimtrie (
            normalized text primary key,
@ -174,12 +182,18 @@ class SQLDB:
        for tag_value, tag_key in COMMON_TAGS.items()
    )
    # One "create unique index" statement per entry of INDEXED_LANGUAGES,
    # newline-separated. Each index is partial (restricted by its WHERE clause
    # to rows of a single language) and is named language_<code>_idx.
    LANGUAGE_INDEXES = '\n'.join([
        f"create unique index if not exists language_{code}_idx "
        f"on language (language, claim_hash) WHERE language='{code}';"
        for code in INDEXED_LANGUAGES
    ])
    CREATE_TABLES_QUERY = (
        CREATE_CLAIM_TABLE +
        CREATE_FULL_TEXT_SEARCH +
        CREATE_SUPPORT_TABLE +
        CREATE_CLAIMTRIE_TABLE +
-        CREATE_TAG_TABLE
+        CREATE_TAG_TABLE +
        CREATE_LANGUAGE_TABLE
    )
    def __init__(
@ -305,7 +319,7 @@ class SQLDB:
        self.execute('commit;')
    def _upsertable_claims(self, txos: List[Output], header, clear_first=False):
-        claim_hashes, claims, tags = set(), [], {}
+        claim_hashes, claims, tags, languages = set(), [], {}, {}
        for txo in txos:
            tx = txo.tx_ref.tx
@ -316,6 +330,13 @@ class SQLDB:
                #self.logger.exception(f"Could not decode claim name for {tx.id}:{txo.position}.")
                continue
            language = None
            try:
                if txo.claim.is_stream and txo.claim.stream.languages:
                    language = txo.claim.stream.languages[0].language
            except:
                pass
            claim_hash = txo.claim_hash
            claim_hashes.add(claim_hash)
            claim_record = {
@ -373,6 +394,9 @@ class SQLDB:
            elif claim.is_channel:
                claim_record['claim_type'] = CLAIM_TYPES['channel']
            if language:
                languages[(language, claim_hash)] = (language, claim_hash, tx.height)
            for tag in clean_tags(claim.message.tags):
                tags[(tag, claim_hash)] = (tag, claim_hash, tx.height)
@ -383,6 +407,10 @@ class SQLDB:
            self.executemany(
                "INSERT OR IGNORE INTO tag (tag, claim_hash, height) VALUES (?, ?, ?)", tags.values()
            )
        if languages:
            self.executemany(
                "INSERT OR IGNORE INTO language (language, claim_hash, height) VALUES (?, ?, ?)", languages.values()
            )
        return claims
--- a/tests/integration/blockchain/test_claim_commands.py
+++ b/tests/integration/blockchain/test_claim_commands.py
@ -262,6 +262,17 @@ class ClaimSearchCommand(ClaimTestCase):
        await self.assertFindsClaims([claim4, claim3, claim2], fee_amount='<1.0', fee_currency='lbc')
        await self.assertFindsClaims([claim3], fee_amount='0.5', fee_currency='lbc')
        await self.assertFindsClaims([claim5], fee_currency='usd')
    async def test_search_by_language(self):
        # claim1 and claim2 are created without a language, so none of the
        # language filters below are expected to return them.
        claim1 = await self.stream_create('claim1', fee_amount='1.0', fee_currency='lbc')
        claim2 = await self.stream_create('claim2', fee_amount='0.9', fee_currency='lbc')
        claim3 = await self.stream_create('claim3', fee_amount='0.5', fee_currency='lbc', languages='en')
        claim4 = await self.stream_create('claim4', fee_amount='0.1', fee_currency='lbc', languages='en')
        claim5 = await self.stream_create('claim5', fee_amount='1.0', fee_currency='usd', languages='es')
        # Expected lists are written most recently created claim first
        # (presumably the default ordering of claim_search -- confirm).
        await self.assertFindsClaims([claim4, claim3], any_languages=['en'])
        await self.assertFindsClaims([claim5], any_languages=['es'])
        # Several languages: a claim matching any one of them is returned.
        await self.assertFindsClaims([claim5, claim4, claim3], any_languages=['en', 'es'])
        # NOTE(review): this filters on fee_currency, not on language; it looks
        # carried over from the fee test above -- confirm whether an
        # any_languages value with no matching claims was intended here.
        await self.assertFindsClaims([], fee_currency='foo')
    async def test_search_by_channel(self):