From ad2df4871ee1d44d2ed6eef09d692b269bdfb36c Mon Sep 17 00:00:00 2001
From: Lex Berezhny <lex@damoti.com>
Date: Sat, 20 Jul 2019 01:14:06 -0400
Subject: [PATCH] popular tags are indexed and use different search strategy
 than less popular tags

---
 lbry/lbry/schema/tags.py             |   2 +-
 lbry/lbry/wallet/server/db/common.py | 207 +++++++++++++++++++++++++++
 lbry/lbry/wallet/server/db/reader.py |  38 +++--
 lbry/lbry/wallet/server/db/writer.py |  62 ++++----
 lbry/tests/unit/schema/test_tags.py  |   2 +-
 5 files changed, 272 insertions(+), 39 deletions(-)

diff --git a/lbry/lbry/schema/tags.py b/lbry/lbry/schema/tags.py
index 7283ea602..ce6ee1875 100644
--- a/lbry/lbry/schema/tags.py
+++ b/lbry/lbry/schema/tags.py
@@ -6,7 +6,7 @@ WEIRD_CHARS_RE = re.compile(r"[#!~]")
 
 
 def normalize_tag(tag: str):
-    return MULTI_SPACE_RE.sub(' ', WEIRD_CHARS_RE.sub(' ', tag.lower())).strip()
+    return MULTI_SPACE_RE.sub(' ', WEIRD_CHARS_RE.sub(' ', tag.lower().replace("'", ""))).strip()  # apostrophes are deleted outright, not turned into spaces
 
 
 def clean_tags(tags: List[str]):
diff --git a/lbry/lbry/wallet/server/db/common.py b/lbry/lbry/wallet/server/db/common.py
index dffe752a8..827e4b783 100644
--- a/lbry/lbry/wallet/server/db/common.py
+++ b/lbry/lbry/wallet/server/db/common.py
@@ -11,3 +11,210 @@ STREAM_TYPES = {
     'binary': 5,
     'model': 6
 }
+
+MATURE_TAGS = [  # NOTE(review): presumably the tags that flag adult content; no use is visible in this patch
+    'nsfw', 'porn', 'xxx', 'mature', 'adult', 'sex'
+]
+
+COMMON_TAGS = {  # tag text -> identifier-safe suffix of that tag's dedicated SQL index (tag_<suffix>_idx)
+    "gaming": "gaming",
+    "people & blogs": "people_and_blogs",
+    "pop culture": "pop_culture",
+    "entertainment": "entertainment",
+    "technology": "technology",
+    "music": "music",
+    "funny": "funny",
+    "education": "education",
+    "learning": "learning",
+    "news": "news",
+    "gameplay": "gameplay",
+    "science & technology": "science_and_technology",
+    "playstation 4": "playstation_4",
+    "beliefs": "beliefs",
+    "nature": "nature",
+    "news & politics": "news_and_politics",
+    "comedy": "comedy",
+    "games": "games",
+    "sony interactive entertainment": "sony_interactive_entertainment",
+    "film & animation": "film_and_animation",
+    "game": "game",
+    "howto & style": "howto_and_style",
+    "weapons": "weapons",
+    "blockchain": "blockchain",
+    "video game": "video_game",
+    "sports": "sports",
+    "walkthrough": "walkthrough",
+    "ps4live": "ps4live",
+    "art": "art",
+    "pc": "pc",
+    "economics": "economics",
+    "automotive": "automotive",
+    "minecraft": "minecraft",
+    "playthrough": "playthrough",
+    "ps4share": "ps4share",
+    "tutorial": "tutorial",
+    "play": "play",
+    "twitch": "twitch",
+    "how to": "how_to",
+    "ps4": "ps4",
+    "bitcoin": "bitcoin",
+    "fortnite": "fortnite",
+    "commentary": "commentary",
+    "lets play": "lets_play",
+    "fun": "fun",
+    "politics": "politics",
+    "xbox": "xbox",
+    "autos & vehicles": "autos_and_vehicles",
+    "travel & events": "travel_and_events",
+    "food": "food",
+    "science": "science",
+    "mature": "mature",
+    "xbox one": "xbox_one",
+    "liberal": "liberal",
+    "democrat": "democrat",
+    "progressive": "progressive",
+    "survival": "survival",
+    "nonprofits & activism": "nonprofits_and_activism",
+    "cryptocurrency": "cryptocurrency",
+    "playstation": "playstation",
+    "nintendo": "nintendo",
+    "government": "government",
+    "steam": "steam",
+    "podcast": "podcast",
+    "horror": "horror",
+    "conservative": "conservative",
+    "reaction": "reaction",
+    "trailer": "trailer",
+    "love": "love",
+    "cnn": "cnn",
+    "republican": "republican",
+    "gamer": "gamer",
+    "political": "political",
+    "hangoutsonair": "hangoutsonair",
+    "hoa": "hoa",
+    "msnbc": "msnbc",
+    "cbs": "cbs",
+    "donald trump": "donald_trump",
+    "fiction": "fiction",
+    "fox news": "fox_news",
+    "anime": "anime",
+    "crypto": "crypto",
+    "ethereum": "ethereum",
+    "call of duty": "call_of_duty",
+    "multiplayer": "multiplayer",
+    "android": "android",
+    "epic": "epic",
+    "rpg": "rpg",
+    "adventure": "adventure",
+    "secular talk": "secular_talk",
+    "btc": "btc",
+    "atheist": "atheist",
+    "atheism": "atheism",
+    "ps3": "ps3",
+    "video games": "video_games",
+    "cod": "cod",
+    "agnostic": "agnostic",
+    "movie": "movie",
+    "online": "online",
+    "fps": "fps",
+    "mod": "mod",
+    "reviews": "reviews",
+    "sharefactory": "sharefactory",
+    "world": "world",
+    "space": "space",
+    "hilarious": "hilarious",
+    "stream": "stream",
+    "lol": "lol",
+    "sony": "sony",
+    "god": "god",
+    "lets": "lets",
+    "dance": "dance",
+    "pvp": "pvp",
+    "tech": "tech",
+    "zombies": "zombies",
+    "pokemon": "pokemon",
+    "fail": "fail",
+    "xbox 360": "xbox_360",
+    "film": "film",
+    "unboxing": "unboxing",
+    "animation": "animation",
+    "travel": "travel",
+    "money": "money",
+    "wwe": "wwe",
+    "how": "how",
+    "mods": "mods",
+    "pubg": "pubg",
+    "indie": "indie",
+    "strategy": "strategy",
+    "history": "history",
+    "rap": "rap",
+    "ios": "ios",
+    "sony computer entertainment": "sony_computer_entertainment",
+    "mobile": "mobile",
+    "trump": "trump",
+    "flat earth": "flat_earth",
+    "hack": "hack",
+    "trap": "trap",
+    "fox": "fox",
+    "vlogging": "vlogging",
+    "news radio": "news_radio",
+    "humor": "humor",
+    "facebook": "facebook",
+    "edm": "edm",
+    "fitness": "fitness",
+    "vaping": "vaping",
+    "hip hop": "hip_hop",
+    "secular": "secular",
+    "jesus": "jesus",
+    "vape": "vape",
+    "song": "song",
+    "remix": "remix",
+    "guitar": "guitar",
+    "daily": "daily",
+    "mining": "mining",
+    "diy": "diy",
+    "videogame": "videogame",
+    "pets & animals": "pets_and_animals",
+    "funny moments": "funny_moments",
+    "religion": "religion",
+    "death": "death",
+    "media": "media",
+    "nbc": "nbc",
+    "war": "war",
+    "freedom": "freedom",
+    "viral": "viral",
+    "meme": "meme",
+    "family": "family",
+    "gold": "gold",
+    "photography": "photography",
+    "chill": "chill",
+    "zombie": "zombie",
+    "computer": "computer",
+    "sniper": "sniper",
+    "bible": "bible",
+    "linux": "linux",
+    "overwatch": "overwatch",
+    "pro": "pro",
+    "dragon": "dragon",
+    "litecoin": "litecoin",
+    "gta": "gta",
+    "iphone": "iphone",
+    "house": "house",
+    "bass": "bass",
+    "bitcoin news": "bitcoin_news",
+    "wii": "wii",
+    "crash": "crash",
+    "league of legends": "league_of_legends",
+    "grand theft auto v": "grand_theft_auto_v",
+    "mario": "mario",
+    "mmorpg": "mmorpg",
+    "satire": "satire",
+    "fire": "fire",
+    "racing": "racing",
+    "apple": "apple",
+    "health": "health",
+    "instrumental": "instrumental",
+    "destiny": "destiny",
+    "truth": "truth",
+    "race": "race"
+}
diff --git a/lbry/lbry/wallet/server/db/reader.py b/lbry/lbry/wallet/server/db/reader.py
index 66073c6c4..3291764f2 100644
--- a/lbry/lbry/wallet/server/db/reader.py
+++ b/lbry/lbry/wallet/server/db/reader.py
@@ -16,7 +16,7 @@ from lbry.schema.tags import clean_tags
 from lbry.schema.result import Outputs
 from lbry.wallet.ledger import BaseLedger, MainNetLedger, RegTestLedger
 
-from .common import CLAIM_TYPES, STREAM_TYPES
+from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS
 
 
 class SQLiteOperationalError(sqlite3.OperationalError):
@@ -433,24 +433,36 @@ def _apply_constraints_for_array_attributes(constraints, attr, cleaner, for_coun
     any_items = {item for item in any_items if item not in not_items}
 
     if any_items:
+
+        any_queries = {}
+
+        common_items = any_items & COMMON_TAGS.keys()
+        if common_items:
+            any_items -= common_items
+            for item in common_items:
+                index_name = COMMON_TAGS[item]
+                any_queries[f'$any_{attr}_{index_name}'] = item
+                any_queries[f'#_any_{attr}_{index_name}'] = f"""
+                    EXISTS(
+                        SELECT 1 FROM {attr} INDEXED BY tag_{index_name}_idx WHERE
+                            claim.claim_hash={attr}.claim_hash
+                        AND {attr} = '{item}'
+                    )
+                """
+
         constraints.update({
             f'$any_{attr}{i}': item for i, item in enumerate(any_items)
         })
         values = ', '.join(
             f':$any_{attr}{i}' for i in range(len(any_items))
         )
-        if for_count:
-            constraints[f'claim.claim_hash__in#_any_{attr}'] = f"""
-                SELECT claim_hash FROM {attr} WHERE {attr} IN ({values})
-            """
-        else:
-            constraints[f'#_any_{attr}'] = f"""
-                EXISTS(
-                    SELECT 1 FROM {attr} WHERE
-                        claim.claim_hash={attr}.claim_hash
-                    AND {attr} IN ({values})
-                )
-            """
+        any_queries[f'claim.claim_hash__in#_any_{attr}'] = f"""
+            SELECT claim_hash FROM {attr} WHERE {attr} IN ({values})
+        """
+        if len(any_queries) == 1:
+            constraints.update(any_queries)
+        elif len(any_queries) > 1:
+            constraints[f'ORed_{attr}_queries__any'] = any_queries
 
     if all_items:
         constraints[f'$all_{attr}_count'] = len(all_items)
diff --git a/lbry/lbry/wallet/server/db/writer.py b/lbry/lbry/wallet/server/db/writer.py
index 9e7988e51..1396ef0e2 100644
--- a/lbry/lbry/wallet/server/db/writer.py
+++ b/lbry/lbry/wallet/server/db/writer.py
@@ -16,7 +16,7 @@ from lbry.wallet.server.db.trending import (
     CREATE_TREND_TABLE, calculate_trending, register_trending_functions
 )
 
-from .common import CLAIM_TYPES, STREAM_TYPES
+from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS
 
 
 ATTRIBUTE_ARRAY_MAX_LENGTH = 100
@@ -76,30 +76,12 @@ class SQLDB:
             trending_global integer not null default 0
         );
 
-        create index if not exists claim_resolve_idx on claim (normalized, claim_id);
+        create index if not exists claim_normalized_idx on claim (normalized, activation_height);
+        create index if not exists claim_channel_hash_idx on claim (channel_hash, signature, claim_hash);
         create index if not exists claim_claims_in_channel_idx on claim (signature_valid, channel_hash, normalized);
-
-        create index if not exists claim_id_idx on claim (claim_id);
-        create index if not exists claim_normalized_idx on claim (normalized);
         create index if not exists claim_txo_hash_idx on claim (txo_hash);
-        create index if not exists claim_channel_hash_idx on claim (channel_hash);
-        create index if not exists claim_timestamp_idx on claim (timestamp);
-        create index if not exists claim_height_idx on claim (height);
-        create index if not exists claim_activation_height_idx on claim (activation_height);
+        create index if not exists claim_activation_height_idx on claim (activation_height, claim_hash);
         create index if not exists claim_expiration_height_idx on claim (expiration_height);
-        create index if not exists claim_public_key_hash_idx on claim (public_key_hash);
-
-        create index if not exists claim_claim_type_idx on claim (claim_type);
-        create index if not exists claim_stream_type_idx on claim (stream_type);
-        create index if not exists claim_media_type_idx on claim (media_type);
-        create index if not exists claim_fee_amount_idx on claim (fee_amount);
-        create index if not exists claim_fee_currency_idx on claim (fee_currency);
-
-        create index if not exists claim_signature_valid_idx on claim (signature_valid);
-
-        create unique index if not exists claim_effective_amount_idx on claim (effective_amount, claim_hash, release_time);
-        create unique index if not exists claim_release_time_idx on claim (release_time, claim_hash);
-        create unique index if not exists claim_trending_global_mixed_idx on claim (trending_global, trending_mixed, claim_hash);
     """
 
     CREATE_SUPPORT_TABLE = """
@@ -110,7 +92,6 @@ class SQLDB:
             claim_hash bytes not null,
             amount integer not null
         );
-        create index if not exists support_txo_hash_idx on support (txo_hash);
         create index if not exists support_claim_hash_idx on support (claim_hash, height);
     """
 
@@ -120,7 +101,6 @@ class SQLDB:
             claim_hash bytes not null,
             height integer not null
         );
-        create index if not exists tag_tag_idx on tag (tag);
         create unique index if not exists tag_claim_hash_tag_idx on tag (claim_hash, tag);
     """
 
@@ -133,6 +113,37 @@ class SQLDB:
         create index if not exists claimtrie_claim_hash_idx on claimtrie (claim_hash);
     """
 
+    SEARCH_INDEXES = """
+        -- used by any tag clouds
+        create index if not exists tag_tag_idx on tag (tag, claim_hash);
+        {custom_tags_indexes}
+
+        -- common ORDER BY
+        create unique index if not exists claim_effective_amount_idx on claim (effective_amount, claim_hash, release_time);
+        create unique index if not exists claim_release_time_idx on claim (release_time, claim_hash);
+        create unique index if not exists claim_trending_global_mixed_idx on claim (trending_global, trending_mixed, claim_hash);
+
+        -- TODO: verify that all indexes below are used
+        create index if not exists claim_height_normalized_idx on claim (height, normalized asc);
+
+        create index if not exists claim_resolve_idx on claim (normalized, claim_id);
+
+        create index if not exists claim_id_idx on claim (claim_id, claim_hash);
+        create index if not exists claim_timestamp_idx on claim (timestamp);
+        create index if not exists claim_public_key_hash_idx on claim (public_key_hash);
+
+        create index if not exists claim_claim_type_idx on claim (claim_type);
+        create index if not exists claim_stream_type_idx on claim (stream_type);
+        create index if not exists claim_media_type_idx on claim (media_type);
+        create index if not exists claim_fee_amount_idx on claim (fee_amount);
+        create index if not exists claim_fee_currency_idx on claim (fee_currency);
+
+        create index if not exists claim_signature_valid_idx on claim (signature_valid);
+    """.format(custom_tags_indexes='\n'.join(  # expands to one partial unique index per COMMON_TAGS entry
+        f"create unique index if not exists tag_{tag_key}_idx on tag (tag, claim_hash) WHERE tag='{tag_value}';"  # single quotes: SQL string literal, not an identifier
+        for tag_value, tag_key in COMMON_TAGS.items()  # key is the tag text, value is the index-name suffix
+    ))  # script is executed with executescript() once first sync reaches the daemon height
+
     CREATE_TABLES_QUERY = (
         PRAGMAS +
         CREATE_CLAIM_TABLE +
@@ -688,6 +699,9 @@ class SQLDB:
         r(self.update_claimtrie, height, recalculate_claim_hashes, deleted_claim_names, forward_timer=True)
         r(calculate_trending, self.db, height, self.main.first_sync, daemon_height)
 
+        if self.main.first_sync and height == daemon_height:
+            self.db.executescript(self.SEARCH_INDEXES)
+
 
 class LBRYDB(DB):
 
diff --git a/lbry/tests/unit/schema/test_tags.py b/lbry/tests/unit/schema/test_tags.py
index 110506832..4dd637a29 100644
--- a/lbry/tests/unit/schema/test_tags.py
+++ b/lbry/tests/unit/schema/test_tags.py
@@ -11,7 +11,7 @@ class TestTagNormalization(unittest.TestCase):
     def test_normalize_tag(self):
         tag = self.assertNormalizedTag
         tag('', ' \t #!~')
-        tag('tag', 'Tag')
+        tag('tag', 'T\'ag')
         tag('t ag', '\tT  \nAG   ')
         tag('tag hash', '#tag~#hash!')