From ad2df4871ee1d44d2ed6eef09d692b269bdfb36c Mon Sep 17 00:00:00 2001 From: Lex Berezhny Date: Sat, 20 Jul 2019 01:14:06 -0400 Subject: [PATCH] popular tags are indexed and use different search strategy than less popular tags --- lbry/lbry/schema/tags.py | 2 +- lbry/lbry/wallet/server/db/common.py | 207 +++++++++++++++++++++++++++ lbry/lbry/wallet/server/db/reader.py | 38 +++-- lbry/lbry/wallet/server/db/writer.py | 62 ++++---- lbry/tests/unit/schema/test_tags.py | 2 +- 5 files changed, 272 insertions(+), 39 deletions(-) diff --git a/lbry/lbry/schema/tags.py b/lbry/lbry/schema/tags.py index 7283ea602..ce6ee1875 100644 --- a/lbry/lbry/schema/tags.py +++ b/lbry/lbry/schema/tags.py @@ -6,7 +6,7 @@ WEIRD_CHARS_RE = re.compile(r"[#!~]") def normalize_tag(tag: str): - return MULTI_SPACE_RE.sub(' ', WEIRD_CHARS_RE.sub(' ', tag.lower())).strip() + return MULTI_SPACE_RE.sub(' ', WEIRD_CHARS_RE.sub(' ', tag.lower().replace("'", ""))).strip() def clean_tags(tags: List[str]): diff --git a/lbry/lbry/wallet/server/db/common.py b/lbry/lbry/wallet/server/db/common.py index dffe752a8..827e4b783 100644 --- a/lbry/lbry/wallet/server/db/common.py +++ b/lbry/lbry/wallet/server/db/common.py @@ -11,3 +11,210 @@ STREAM_TYPES = { 'binary': 5, 'model': 6 } + +MATURE_TAGS = [ + 'nsfw', 'porn', 'xxx', 'mature', 'adult', 'sex' +] + +COMMON_TAGS = { + "gaming": "gaming", + "people & blogs": "people_and_blogs", + "pop culture": "pop_culture", + "entertainment": "entertainment", + "technology": "technology", + "music": "music", + "funny": "funny", + "education": "education", + "learning": "learning", + "news": "news", + "gameplay": "gameplay", + "science & technology": "science_and_technology", + "playstation 4": "playstation_4", + "beliefs": "beliefs", + "nature": "nature", + "news & politics": "news_and_politics", + "comedy": "comedy", + "games": "games", + "sony interactive entertainment": "sony_interactive_entertainment", + "film & animation": "film_and_animation", + "game": "game", + "howto & style": "howto_and_style", + "weapons": "weapons", + "blockchain": "blockchain", + "video game": "video_game", + "sports": "sports", + "walkthrough": "walkthrough", + "ps4live": "ps4live", + "art": "art", + "pc": "pc", + "economics": "economics", + "automotive": "automotive", + "minecraft": "minecraft", + "playthrough": "playthrough", + "ps4share": "ps4share", + "tutorial": "tutorial", + "play": "play", + "twitch": "twitch", + "how to": "how_to", + "ps4": "ps4", + "bitcoin": "bitcoin", + "fortnite": "fortnite", + "commentary": "commentary", + "lets play": "lets_play", + "fun": "fun", + "politics": "politics", + "xbox": "xbox", + "autos & vehicles": "autos_and_vehicles", + "travel & events": "travel_and_events", + "food": "food", + "science": "science", + "mature": "mature", + "xbox one": "xbox_one", + "liberal": "liberal", + "democrat": "democrat", + "progressive": "progressive", + "survival": "survival", + "nonprofits & activism": "nonprofits_and_activism", + "cryptocurrency": "cryptocurrency", + "playstation": "playstation", + "nintendo": "nintendo", + "government": "government", + "steam": "steam", + "podcast": "podcast", + "horror": "horror", + "conservative": "conservative", + "reaction": "reaction", + "trailer": "trailer", + "love": "love", + "cnn": "cnn", + "republican": "republican", + "gamer": "gamer", + "political": "political", + "hangoutsonair": "hangoutsonair", + "hoa": "hoa", + "msnbc": "msnbc", + "cbs": "cbs", + "donald trump": "donald_trump", + "fiction": "fiction", + "fox news": "fox_news", + "anime": "anime", + "crypto": "crypto", + "ethereum": "ethereum", + "call of duty": "call_of_duty", + "multiplayer": "multiplayer", + "android": "android", + "epic": "epic", + "rpg": "rpg", + "adventure": "adventure", + "secular talk": "secular_talk", + "btc": "btc", + "atheist": "atheist", + "atheism": "atheism", + "ps3": "ps3", + "video games": "video_games", + "cod": "cod", + "agnostic": "agnostic", + "movie": "movie", + "online": "online", + "fps": "fps", + "mod": "mod", + "reviews": "reviews", + "sharefactory": "sharefactory", + "world": "world", + "space": "space", + "hilarious": "hilarious", + "stream": "stream", + "lol": "lol", + "sony": "sony", + "god": "god", + "lets": "lets", + "dance": "dance", + "pvp": "pvp", + "tech": "tech", + "zombies": "zombies", + "pokemon": "pokemon", + "fail": "fail", + "xbox 360": "xbox_360", + "film": "film", + "unboxing": "unboxing", + "animation": "animation", + "travel": "travel", + "money": "money", + "wwe": "wwe", + "how": "how", + "mods": "mods", + "pubg": "pubg", + "indie": "indie", + "strategy": "strategy", + "history": "history", + "rap": "rap", + "ios": "ios", + "sony computer entertainment": "sony_computer_entertainment", + "mobile": "mobile", + "trump": "trump", + "flat earth": "flat_earth", + "hack": "hack", + "trap": "trap", + "fox": "fox", + "vlogging": "vlogging", + "news radio": "news_radio", + "humor": "humor", + "facebook": "facebook", + "edm": "edm", + "fitness": "fitness", + "vaping": "vaping", + "hip hop": "hip_hop", + "secular": "secular", + "jesus": "jesus", + "vape": "vape", + "song": "song", + "remix": "remix", + "guitar": "guitar", + "daily": "daily", + "mining": "mining", + "diy": "diy", + "videogame": "videogame", + "pets & animals": "pets_and_animals", + "funny moments": "funny_moments", + "religion": "religion", + "death": "death", + "media": "media", + "nbc": "nbc", + "war": "war", + "freedom": "freedom", + "viral": "viral", + "meme": "meme", + "family": "family", + "gold": "gold", + "photography": "photography", + "chill": "chill", + "zombie": "zombie", + "computer": "computer", + "sniper": "sniper", + "bible": "bible", + "linux": "linux", + "overwatch": "overwatch", + "pro": "pro", + "dragon": "dragon", + "litecoin": "litecoin", + "gta": "gta", + "iphone": "iphone", + "house": "house", + "bass": "bass", + "bitcoin news": "bitcoin_news", + "wii": "wii", + "crash": "crash", + "league of legends": "league_of_legends", + "grand theft auto v": "grand_theft_auto_v", + "mario": "mario", + "mmorpg": "mmorpg", + "satire": "satire", + "fire": "fire", + "racing": "racing", + "apple": "apple", + "health": "health", + "instrumental": "instrumental", + "destiny": "destiny", + "truth": "truth", + "race": "race" +} diff --git a/lbry/lbry/wallet/server/db/reader.py b/lbry/lbry/wallet/server/db/reader.py index 66073c6c4..3291764f2 100644 --- a/lbry/lbry/wallet/server/db/reader.py +++ b/lbry/lbry/wallet/server/db/reader.py @@ -16,7 +16,7 @@ from lbry.schema.tags import clean_tags from lbry.schema.result import Outputs from lbry.wallet.ledger import BaseLedger, MainNetLedger, RegTestLedger -from .common import CLAIM_TYPES, STREAM_TYPES +from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS class SQLiteOperationalError(sqlite3.OperationalError): @@ -433,24 +433,36 @@ def _apply_constraints_for_array_attributes(constraints, attr, cleaner, for_coun any_items = {item for item in any_items if item not in not_items} if any_items: + + any_queries = {} + + common_items = any_items & COMMON_TAGS.keys() + if common_items: + any_items -= common_items + for item in common_items: + index_name = COMMON_TAGS[item] + any_queries[f'$any_{attr}_{index_name}'] = item + any_queries[f'#_any_{attr}_{index_name}'] = f""" + EXISTS( + SELECT 1 FROM {attr} INDEXED BY tag_{index_name}_idx WHERE + claim.claim_hash={attr}.claim_hash + AND {attr} = '{item}' + ) + """ + constraints.update({ f'$any_{attr}{i}': item for i, item in enumerate(any_items) }) values = ', '.join( f':$any_{attr}{i}' for i in range(len(any_items)) ) - if for_count: - constraints[f'claim.claim_hash__in#_any_{attr}'] = f""" - SELECT claim_hash FROM {attr} WHERE {attr} IN ({values}) - """ - else: - constraints[f'#_any_{attr}'] = f""" - EXISTS( - SELECT 1 FROM {attr} WHERE - claim.claim_hash={attr}.claim_hash - AND {attr} IN ({values}) - ) - """ + any_queries[f'claim.claim_hash__in#_any_{attr}'] = f""" + SELECT claim_hash FROM {attr} WHERE {attr} IN ({values}) + """ + if len(any_queries) == 1: + constraints.update(any_queries) + elif len(any_queries) > 1: + constraints[f'ORed_{attr}_queries__any'] = any_queries if all_items: constraints[f'$all_{attr}_count'] = len(all_items) diff --git a/lbry/lbry/wallet/server/db/writer.py b/lbry/lbry/wallet/server/db/writer.py index 9e7988e51..1396ef0e2 100644 --- a/lbry/lbry/wallet/server/db/writer.py +++ b/lbry/lbry/wallet/server/db/writer.py @@ -16,7 +16,7 @@ from lbry.wallet.server.db.trending import ( CREATE_TREND_TABLE, calculate_trending, register_trending_functions ) -from .common import CLAIM_TYPES, STREAM_TYPES +from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS ATTRIBUTE_ARRAY_MAX_LENGTH = 100 @@ -76,30 +76,12 @@ class SQLDB: trending_global integer not null default 0 ); - create index if not exists claim_resolve_idx on claim (normalized, claim_id); + create index if not exists claim_normalized_idx on claim (normalized, activation_height); + create index if not exists claim_channel_hash_idx on claim (channel_hash, signature, claim_hash); create index if not exists claim_claims_in_channel_idx on claim (signature_valid, channel_hash, normalized); - - create index if not exists claim_id_idx on claim (claim_id); - create index if not exists claim_normalized_idx on claim (normalized); create index if not exists claim_txo_hash_idx on claim (txo_hash); - create index if not exists claim_channel_hash_idx on claim (channel_hash); - create index if not exists claim_timestamp_idx on claim (timestamp); - create index if not exists claim_height_idx on claim (height); - create index if not exists claim_activation_height_idx on claim (activation_height); + create index if not exists claim_activation_height_idx on claim (activation_height, claim_hash); create index if not exists claim_expiration_height_idx on claim (expiration_height); - create index if not exists claim_public_key_hash_idx on claim (public_key_hash); - - create index if not exists claim_claim_type_idx on claim (claim_type); - create index if not exists claim_stream_type_idx on claim (stream_type); - create index if not exists claim_media_type_idx on claim (media_type); - create index if not exists claim_fee_amount_idx on claim (fee_amount); - create index if not exists claim_fee_currency_idx on claim (fee_currency); - - create index if not exists claim_signature_valid_idx on claim (signature_valid); - - create unique index if not exists claim_effective_amount_idx on claim (effective_amount, claim_hash, release_time); - create unique index if not exists claim_release_time_idx on claim (release_time, claim_hash); - create unique index if not exists claim_trending_global_mixed_idx on claim (trending_global, trending_mixed, claim_hash); """ CREATE_SUPPORT_TABLE = """ @@ -110,7 +92,6 @@ class SQLDB: claim_hash bytes not null, amount integer not null ); - create index if not exists support_txo_hash_idx on support (txo_hash); create index if not exists support_claim_hash_idx on support (claim_hash, height); """ @@ -120,7 +101,6 @@ class SQLDB: claim_hash bytes not null, height integer not null ); - create index if not exists tag_tag_idx on tag (tag); create unique index if not exists tag_claim_hash_tag_idx on tag (claim_hash, tag); """ @@ -133,6 +113,37 @@ class SQLDB: create index if not exists claimtrie_claim_hash_idx on claimtrie (claim_hash); """ + SEARCH_INDEXES = """ + -- used by any tag clouds + create index if not exists tag_tag_idx on tag (tag, claim_hash); + {custom_tags_indexes} + + -- common ORDER BY + create unique index if not exists claim_effective_amount_idx on claim (effective_amount, claim_hash, release_time); + create unique index if not exists claim_release_time_idx on claim (release_time, claim_hash); + create unique index if not exists claim_trending_global_mixed_idx on claim (trending_global, trending_mixed, claim_hash); + + -- TODO: verify that all indexes below are used + create index if not exists claim_height_normalized_idx on claim (height, normalized asc); + + create index if not exists claim_resolve_idx on claim (normalized, claim_id); + + create index if not exists claim_id_idx on claim (claim_id, claim_hash); + create index if not exists claim_timestamp_idx on claim (timestamp); + create index if not exists claim_public_key_hash_idx on claim (public_key_hash); + + create index if not exists claim_claim_type_idx on claim (claim_type); + create index if not exists claim_stream_type_idx on claim (stream_type); + create index if not exists claim_media_type_idx on claim (media_type); + create index if not exists claim_fee_amount_idx on claim (fee_amount); + create index if not exists claim_fee_currency_idx on claim (fee_currency); + + create index if not exists claim_signature_valid_idx on claim (signature_valid); + """.format(custom_tags_indexes='\n'.join( + f'create unique index if not exists tag_{tag_key}_idx on tag (tag, claim_hash) WHERE tag="{tag_value}";' + for tag_value, tag_key in COMMON_TAGS.items() + )) + CREATE_TABLES_QUERY = ( PRAGMAS + CREATE_CLAIM_TABLE + @@ -688,6 +699,9 @@ class SQLDB: r(self.update_claimtrie, height, recalculate_claim_hashes, deleted_claim_names, forward_timer=True) r(calculate_trending, self.db, height, self.main.first_sync, daemon_height) + if self.main.first_sync and height == daemon_height: + self.db.executescript(self.SEARCH_INDEXES) + class LBRYDB(DB): diff --git a/lbry/tests/unit/schema/test_tags.py b/lbry/tests/unit/schema/test_tags.py index 110506832..4dd637a29 100644 --- a/lbry/tests/unit/schema/test_tags.py +++ b/lbry/tests/unit/schema/test_tags.py @@ -11,7 +11,7 @@ class TestTagNormalization(unittest.TestCase): def test_normalize_tag(self): tag = self.assertNormalizedTag tag('', ' \t #!~') - tag('tag', 'Tag') + tag('tag', 'T\'ag') tag('t ag', '\tT \nAG ') tag('tag hash', '#tag~#hash!')