Merge pull request #3050 from lbryio/language-indexes

add indexes for `any_languages` argument to `claim_search`
2020-09-28 15:52:41 -04:00 · 2020-09-28 15:52:41 -04:00 · d0f21c0095
commit d0f21c0095
parent 43c2e8d8e9 46dc15dd29
5 changed files with 354 additions and 205 deletions
--- a/lbry/wallet/server/block_processor.py
+++ b/lbry/wallet/server/block_processor.py
@ -768,6 +768,7 @@ class LBRYBlockProcessor(BlockProcessor):
            self.timer.run(self.sql.execute, self.sql.SEARCH_INDEXES, timer_name='executing SEARCH_INDEXES')
            if self.env.individual_tag_indexes:
                self.timer.run(self.sql.execute, self.sql.TAG_INDEXES, timer_name='executing TAG_INDEXES')
+            self.timer.run(self.sql.execute, self.sql.LANGUAGE_INDEXES, timer_name='executing LANGUAGE_INDEXES')

    def advance_txs(self, height, txs, header):
        timer = self.timer.sub_timers['advance_blocks']
--- a/lbry/wallet/server/db/common.py
+++ b/lbry/wallet/server/db/common.py
@ -13,209 +13,306 @@ STREAM_TYPES = {
    'model': 6
 }

+# most-used tags; list dated 2020-09-21 (9/21/2020)
+MOST_USED_TAGS = {
+    "gaming",
+    "people & blogs",
+    "entertainment",
+    "music",
+    "pop culture",
+    "education",
+    "technology",
+    "blockchain",
+    "news",
+    "funny",
+    "science & technology",
+    "learning",
+    "gameplay",
+    "news & politics",
+    "comedy",
+    "bitcoin",
+    "beliefs",
+    "nature",
+    "art",
+    "economics",
+    "film & animation",
+    "lets play",
+    "games",
+    "sports",
+    "howto & style",
+    "game",
+    "cryptocurrency",
+    "playstation 4",
+    "automotive",
+    "crypto",
+    "mature",
+    "sony interactive entertainment",
+    "walkthrough",
+    "tutorial",
+    "video game",
+    "weapons",
+    "pc",
+    "playthrough",
+    "anime",
+    "how to",
+    "btc",
+    "fun",
+    "ethereum",
+    "food",
+    "travel & events",
+    "minecraft",
+    "science",
+    "autos & vehicles",
+    "play",
+    "politics",
+    "commentary",
+    "twitch",
+    "ps4live",
+    "love",
+    "ps4",
+    "nonprofits & activism",
+    "ps4share",
+    "fortnite",
+    "xbox",
+    "porn",
+    "video games",
+    "trump",
+    "español",
+    "money",
+    "music video",
+    "movie",
+    "coronavirus",
+    "nintendo",
+    "donald trump",
+    "steam",
+    "trailer",
+    "android",
+    "podcast",
+    "xbox one",
+    "survival",
+    "linux",
+    "travel",
+    "funny moments",
+    "audio",
+    "litecoin",
+    "animation",
+    "gamer",
+    "lets",
+    "playstation",
+    "bitcoin news",
+    "history",
+    "fox news",
+    "xxx",
+    "god",
+    "dance",
+    "adventure",
+    "liberal",
+    "horror",
+    "government",
+    "freedom",
+    "2020",
+    "reaction",
+    "meme",
+    "photography",
+    "truth"
+}
+
 MATURE_TAGS = [
    'nsfw', 'porn', 'xxx', 'mature', 'adult', 'sex'
 ]

+
+def normalize_tag(tag):  # " " and "-" become "_", "&" becomes "and" (e.g. "people & blogs" -> "people_and_blogs")
+    return tag.translate(str.maketrans({" ": "_", "-": "_", "&": "and"}))
+
+
 COMMON_TAGS = {
-    "gaming": "gaming",
-    "people & blogs": "people_and_blogs",
-    "pop culture": "pop_culture",
-    "entertainment": "entertainment",
-    "technology": "technology",
-    "music": "music",
-    "funny": "funny",
-    "education": "education",
-    "learning": "learning",
-    "news": "news",
-    "gameplay": "gameplay",
-    "science & technology": "science_and_technology",
-    "playstation 4": "playstation_4",
-    "beliefs": "beliefs",
-    "nature": "nature",
-    "news & politics": "news_and_politics",
-    "comedy": "comedy",
-    "games": "games",
-    "sony interactive entertainment": "sony_interactive_entertainment",
-    "film & animation": "film_and_animation",
-    "game": "game",
-    "howto & style": "howto_and_style",
-    "weapons": "weapons",
-    "blockchain": "blockchain",
-    "video game": "video_game",
-    "sports": "sports",
-    "walkthrough": "walkthrough",
-    "ps4live": "ps4live",
-    "art": "art",
-    "pc": "pc",
-    "economics": "economics",
-    "automotive": "automotive",
-    "minecraft": "minecraft",
-    "playthrough": "playthrough",
-    "ps4share": "ps4share",
-    "tutorial": "tutorial",
-    "play": "play",
-    "twitch": "twitch",
-    "how to": "how_to",
-    "ps4": "ps4",
-    "bitcoin": "bitcoin",
-    "fortnite": "fortnite",
-    "commentary": "commentary",
-    "lets play": "lets_play",
-    "fun": "fun",
-    "politics": "politics",
-    "xbox": "xbox",
-    "autos & vehicles": "autos_and_vehicles",
-    "travel & events": "travel_and_events",
-    "food": "food",
-    "science": "science",
-    "mature": "mature",
-    "xbox one": "xbox_one",
-    "liberal": "liberal",
-    "democrat": "democrat",
-    "progressive": "progressive",
-    "survival": "survival",
-    "nonprofits & activism": "nonprofits_and_activism",
-    "cryptocurrency": "cryptocurrency",
-    "playstation": "playstation",
-    "nintendo": "nintendo",
-    "government": "government",
-    "steam": "steam",
-    "podcast": "podcast",
-    "horror": "horror",
-    "conservative": "conservative",
-    "reaction": "reaction",
-    "trailer": "trailer",
-    "love": "love",
-    "cnn": "cnn",
-    "republican": "republican",
-    "gamer": "gamer",
-    "political": "political",
-    "hangoutsonair": "hangoutsonair",
-    "hoa": "hoa",
-    "msnbc": "msnbc",
-    "cbs": "cbs",
-    "donald trump": "donald_trump",
-    "fiction": "fiction",
-    "fox news": "fox_news",
-    "anime": "anime",
-    "crypto": "crypto",
-    "ethereum": "ethereum",
-    "call of duty": "call_of_duty",
-    "multiplayer": "multiplayer",
-    "android": "android",
-    "epic": "epic",
-    "rpg": "rpg",
-    "adventure": "adventure",
-    "secular talk": "secular_talk",
-    "btc": "btc",
-    "atheist": "atheist",
-    "atheism": "atheism",
-    "ps3": "ps3",
-    "video games": "video_games",
-    "cod": "cod",
-    "agnostic": "agnostic",
-    "movie": "movie",
-    "online": "online",
-    "fps": "fps",
-    "mod": "mod",
-    "reviews": "reviews",
-    "sharefactory": "sharefactory",
-    "world": "world",
-    "space": "space",
-    "hilarious": "hilarious",
-    "stream": "stream",
-    "lol": "lol",
-    "sony": "sony",
-    "god": "god",
-    "lets": "lets",
-    "dance": "dance",
-    "pvp": "pvp",
-    "tech": "tech",
-    "zombies": "zombies",
-    "pokemon": "pokemon",
-    "fail": "fail",
-    "xbox 360": "xbox_360",
-    "film": "film",
-    "unboxing": "unboxing",
-    "animation": "animation",
-    "travel": "travel",
-    "money": "money",
-    "wwe": "wwe",
-    "how": "how",
-    "mods": "mods",
-    "pubg": "pubg",
-    "indie": "indie",
-    "strategy": "strategy",
-    "history": "history",
-    "rap": "rap",
-    "ios": "ios",
-    "sony computer entertainment": "sony_computer_entertainment",
-    "mobile": "mobile",
-    "trump": "trump",
-    "flat earth": "flat_earth",
-    "hack": "hack",
-    "trap": "trap",
-    "fox": "fox",
-    "vlogging": "vlogging",
-    "news radio": "news_radio",
-    "humor": "humor",
-    "facebook": "facebook",
-    "edm": "edm",
-    "fitness": "fitness",
-    "vaping": "vaping",
-    "hip hop": "hip_hop",
-    "secular": "secular",
-    "jesus": "jesus",
-    "vape": "vape",
-    "song": "song",
-    "remix": "remix",
-    "guitar": "guitar",
-    "daily": "daily",
-    "mining": "mining",
-    "diy": "diy",
-    "videogame": "videogame",
-    "pets & animals": "pets_and_animals",
-    "funny moments": "funny_moments",
-    "religion": "religion",
-    "death": "death",
-    "media": "media",
-    "nbc": "nbc",
-    "war": "war",
-    "freedom": "freedom",
-    "viral": "viral",
-    "meme": "meme",
-    "family": "family",
-    "gold": "gold",
-    "photography": "photography",
-    "chill": "chill",
-    "zombie": "zombie",
-    "computer": "computer",
-    "sniper": "sniper",
-    "bible": "bible",
-    "linux": "linux",
-    "overwatch": "overwatch",
-    "pro": "pro",
-    "dragon": "dragon",
-    "litecoin": "litecoin",
-    "gta": "gta",
-    "iphone": "iphone",
-    "house": "house",
-    "bass": "bass",
-    "bitcoin news": "bitcoin_news",
-    "wii": "wii",
-    "crash": "crash",
-    "league of legends": "league_of_legends",
-    "grand theft auto v": "grand_theft_auto_v",
-    "mario": "mario",
-    "mmorpg": "mmorpg",
-    "satire": "satire",
-    "fire": "fire",
-    "racing": "racing",
-    "apple": "apple",
-    "health": "health",
-    "instrumental": "instrumental",
-    "destiny": "destiny",
-    "truth": "truth",
-    "race": "race"
+    tag: normalize_tag(tag) for tag in MOST_USED_TAGS  # maps each display tag to its normalize_tag() form
 }
+
+INDEXED_LANGUAGES = [
+  'en',
+  'aa',
+  'ab',
+  'ae',
+  'af',
+  'ak',
+  'am',
+  'an',
+  'ar',
+  'as',
+  'av',
+  'ay',
+  'az',
+  'ba',
+  'be',
+  'bg',
+  'bh',
+  'bi',
+  'bm',
+  'bn',
+  'bo',
+  'br',
+  'bs',
+  'ca',
+  'ce',
+  'ch',
+  'co',
+  'cr',
+  'cs',
+  'cu',
+  'cv',
+  'cy',
+  'da',
+  'de',
+  'dv',
+  'dz',
+  'ee',
+  'el',
+  'eo',
+  'es',
+  'et',
+  'eu',
+  'fa',
+  'ff',
+  'fi',
+  'fj',
+  'fo',
+  'fr',
+  'fy',
+  'ga',
+  'gd',
+  'gl',
+  'gn',
+  'gu',
+  'gv',
+  'ha',
+  'he',
+  'hi',
+  'ho',
+  'hr',
+  'ht',
+  'hu',
+  'hy',
+  'hz',
+  'ia',
+  'id',
+  'ie',
+  'ig',
+  'ii',
+  'ik',
+  'io',
+  'is',
+  'it',
+  'iu',
+  'ja',
+  'jv',
+  'ka',
+  'kg',
+  'ki',
+  'kj',
+  'kk',
+  'kl',
+  'km',
+  'kn',
+  'ko',
+  'kr',
+  'ks',
+  'ku',
+  'kv',
+  'kw',
+  'ky',
+  'la',
+  'lb',
+  'lg',
+  'li',
+  'ln',
+  'lo',
+  'lt',
+  'lu',
+  'lv',
+  'mg',
+  'mh',
+  'mi',
+  'mk',
+  'ml',
+  'mn',
+  'mr',
+  'ms',
+  'mt',
+  'my',
+  'na',
+  'nb',
+  'nd',
+  'ne',
+  'ng',
+  'nl',
+  'nn',
+  'no',
+  'nr',
+  'nv',
+  'ny',
+  'oc',
+  'oj',
+  'om',
+  'or',
+  'os',
+  'pa',
+  'pi',
+  'pl',
+  'ps',
+  'pt',
+  'qu',
+  'rm',
+  'rn',
+  'ro',
+  'ru',
+  'rw',
+  'sa',
+  'sc',
+  'sd',
+  'se',
+  'sg',
+  'si',
+  'sk',
+  'sl',
+  'sm',
+  'sn',
+  'so',
+  'sq',
+  'sr',
+  'ss',
+  'st',
+  'su',
+  'sv',
+  'sw',
+  'ta',
+  'te',
+  'tg',
+  'th',
+  'ti',
+  'tk',
+  'tl',
+  'tn',
+  'to',
+  'tr',
+  'ts',
+  'tt',
+  'tw',
+  'ty',
+  'ug',
+  'uk',
+  'ur',
+  'uz',
+  've',
+  'vi',
+  'vo',
+  'wa',
+  'wo',
+  'xh',
+  'yi',
+  'yo',
+  'za',
+  'zh',
+  'zu'
+]
--- a/lbry/wallet/server/db/reader.py
+++ b/lbry/wallet/server/db/reader.py
@ -18,7 +18,7 @@ from lbry.schema.tags import clean_tags
 from lbry.schema.result import Outputs, Censor
 from lbry.wallet import Ledger, RegTestLedger

-from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS
+from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS, INDEXED_LANGUAGES
 from .full_text_search import FTS_ORDER_BY


@ -536,6 +536,18 @@ def _apply_constraints_for_array_attributes(constraints, attr, cleaner, for_coun
                AND tag IN ({values})
            )
            """
+    elif attr == 'language':
+        indexed_languages = any_items & set(INDEXED_LANGUAGES)
+        if indexed_languages:
+            any_items -= indexed_languages
+        for language in indexed_languages:
+            any_queries[f'#_any_common_languages_{language}'] = f"""
+            EXISTS(
+                SELECT 1 FROM language INDEXED BY language_{language}_idx
+                WHERE {CLAIM_HASH_OR_REPOST_HASH_SQL}=language.claim_hash
+                AND language = '{language}'
+            )
+            """

    if any_items:

--- a/lbry/wallet/server/db/writer.py
+++ b/lbry/wallet/server/db/writer.py
@ -6,7 +6,6 @@ from decimal import Decimal
 from collections import namedtuple
 from multiprocessing import Manager
 from binascii import unhexlify
-
 from lbry.wallet.server.leveldb import LevelDB
 from lbry.wallet.server.util import class_logger
 from lbry.wallet.database import query, constraints_to_sql
@ -19,7 +18,7 @@ from lbry.wallet.server.db.canonical import register_canonical_functions
 from lbry.wallet.server.db.full_text_search import update_full_text_search, CREATE_FULL_TEXT_SEARCH, first_sync_finished
 from lbry.wallet.server.db.trending import TRENDING_ALGORITHMS

-from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS
+from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS, INDEXED_LANGUAGES


 ATTRIBUTE_ARRAY_MAX_LENGTH = 100
@ -117,6 +116,15 @@ class SQLDB:
        create unique index if not exists tag_claim_hash_tag_idx on tag (claim_hash, tag);
    """

+    CREATE_LANGUAGE_TABLE = """
+        create table if not exists language (
+            language text not null,
+            claim_hash bytes not null,
+            height integer not null
+        );
+        create unique index if not exists language_claim_hash_language_idx on language (claim_hash, language);
+    """
+
    CREATE_CLAIMTRIE_TABLE = """
        create table if not exists claimtrie (
            normalized text primary key,
@ -174,12 +182,18 @@ class SQLDB:
        for tag_value, tag_key in COMMON_TAGS.items()
    )

+    LANGUAGE_INDEXES = '\n'.join(  # one partial unique index per INDEXED_LANGUAGES entry; names must match the reader's `INDEXED BY language_<code>_idx` hint
+        f"create unique index if not exists language_{language}_idx on language (language, claim_hash) WHERE language='{language}';"
+        for language in INDEXED_LANGUAGES
+    )
+
    CREATE_TABLES_QUERY = (
        CREATE_CLAIM_TABLE +
        CREATE_FULL_TEXT_SEARCH +
        CREATE_SUPPORT_TABLE +
        CREATE_CLAIMTRIE_TABLE +
-        CREATE_TAG_TABLE
+        CREATE_TAG_TABLE +
+        CREATE_LANGUAGE_TABLE
    )

    def __init__(
@ -305,7 +319,7 @@ class SQLDB:
        self.execute('commit;')

    def _upsertable_claims(self, txos: List[Output], header, clear_first=False):
-        claim_hashes, claims, tags = set(), [], {}
+        claim_hashes, claims, tags, languages = set(), [], {}, {}
        for txo in txos:
            tx = txo.tx_ref.tx

@ -316,6 +330,13 @@ class SQLDB:
                #self.logger.exception(f"Could not decode claim name for {tx.id}:{txo.position}.")
                continue

+            language = None  # stays None unless the claim is a stream that declares at least one language
+            try:
+                if txo.claim.is_stream and txo.claim.stream.languages:
+                    language = txo.claim.stream.languages[0].language  # only the first declared language is recorded
+            except Exception:  # best effort: any failure reading the language leaves it unset; interpreter-exit signals still propagate
+                pass
+
            claim_hash = txo.claim_hash
            claim_hashes.add(claim_hash)
            claim_record = {
@ -373,6 +394,9 @@ class SQLDB:
            elif claim.is_channel:
                claim_record['claim_type'] = CLAIM_TYPES['channel']

+            if language:
+                languages[(language, claim_hash)] = (language, claim_hash, tx.height)
+
            for tag in clean_tags(claim.message.tags):
                tags[(tag, claim_hash)] = (tag, claim_hash, tx.height)

@ -383,6 +407,10 @@ class SQLDB:
            self.executemany(
                "INSERT OR IGNORE INTO tag (tag, claim_hash, height) VALUES (?, ?, ?)", tags.values()
            )
+        if languages:
+            self.executemany(
+                "INSERT OR IGNORE INTO language (language, claim_hash, height) VALUES (?, ?, ?)", languages.values()
+            )

        return claims

--- a/tests/integration/blockchain/test_claim_commands.py
+++ b/tests/integration/blockchain/test_claim_commands.py
@ -262,6 +262,17 @@ class ClaimSearchCommand(ClaimTestCase):
        await self.assertFindsClaims([claim4, claim3, claim2], fee_amount='<1.0', fee_currency='lbc')
        await self.assertFindsClaims([claim3], fee_amount='0.5', fee_currency='lbc')
        await self.assertFindsClaims([claim5], fee_currency='usd')
        await self.assertFindsClaims([], fee_currency='foo')
+
+    async def test_search_by_language(self):  # `any_languages` should return only streams created with one of the given languages
+        claim1 = await self.stream_create('claim1', fee_amount='1.0', fee_currency='lbc')  # no language set: must never be returned below
+        claim2 = await self.stream_create('claim2', fee_amount='0.9', fee_currency='lbc')  # no language set: must never be returned below
+        claim3 = await self.stream_create('claim3', fee_amount='0.5', fee_currency='lbc', languages='en')
+        claim4 = await self.stream_create('claim4', fee_amount='0.1', fee_currency='lbc', languages='en')
+        claim5 = await self.stream_create('claim5', fee_amount='1.0', fee_currency='usd', languages='es')
+
+        await self.assertFindsClaims([claim4, claim3], any_languages=['en'])
+        await self.assertFindsClaims([claim5], any_languages=['es'])
+        await self.assertFindsClaims([claim5, claim4, claim3], any_languages=['en', 'es'])

    async def test_search_by_channel(self):