Add a `language` table with one partial index per indexed language to the
wallet server, use those indexes from the reader's `language` array
constraint, and cover language search with an integration test.

Review changes relative to the submitted patch:
  * common.py: repaired the mis-encoded "español" tag; iterate
    MOST_USED_TAGS directly instead of copying it into a list first.
  * writer.py: narrowed the bare `except:` around the language lookup to
    `except Exception:` so SystemExit/KeyboardInterrupt are not swallowed.
  * test_claim_commands.py: insert test_search_by_language after the
    preceding test's last assertion instead of in front of it, so that
    assertion is not pulled into the new test.

NOTE(review): indentation and blank context lines were reconstructed from a
whitespace-collapsed copy of this patch; confirm against the target tree
before applying. The `index` blob ids are those of the submitted patch.

diff --git a/lbry/wallet/server/block_processor.py b/lbry/wallet/server/block_processor.py
index eceb62e08..7fe0ae059 100644
--- a/lbry/wallet/server/block_processor.py
+++ b/lbry/wallet/server/block_processor.py
@@ -768,6 +768,7 @@ class LBRYBlockProcessor(BlockProcessor):
             self.timer.run(self.sql.execute, self.sql.SEARCH_INDEXES, timer_name='executing SEARCH_INDEXES')
             if self.env.individual_tag_indexes:
                 self.timer.run(self.sql.execute, self.sql.TAG_INDEXES, timer_name='executing TAG_INDEXES')
+            self.timer.run(self.sql.execute, self.sql.LANGUAGE_INDEXES, timer_name='executing LANGUAGE_INDEXES')
 
     def advance_txs(self, height, txs, header):
         timer = self.timer.sub_timers['advance_blocks']
diff --git a/lbry/wallet/server/db/common.py b/lbry/wallet/server/db/common.py
index 8f75737e7..d1b88d49d 100644
--- a/lbry/wallet/server/db/common.py
+++ b/lbry/wallet/server/db/common.py
@@ -13,209 +13,306 @@ STREAM_TYPES = {
     'model': 6
 }
 
+# Most used tags (snapshot dated 9/21/2020); COMMON_TAGS below is derived from this set.
+MOST_USED_TAGS = {
+    "gaming",
+    "people & blogs",
+    "entertainment",
+    "music",
+    "pop culture",
+    "education",
+    "technology",
+    "blockchain",
+    "news",
+    "funny",
+    "science & technology",
+    "learning",
+    "gameplay",
+    "news & politics",
+    "comedy",
+    "bitcoin",
+    "beliefs",
+    "nature",
+    "art",
+    "economics",
+    "film & animation",
+    "lets play",
+    "games",
+    "sports",
+    "howto & style",
+    "game",
+    "cryptocurrency",
+    "playstation 4",
+    "automotive",
+    "crypto",
+    "mature",
+    "sony interactive entertainment",
+    "walkthrough",
+    "tutorial",
+    "video game",
+    "weapons",
+    "pc",
+    "playthrough",
+    "anime",
+    "how to",
+    "btc",
+    "fun",
+    "ethereum",
+    "food",
+    "travel & events",
+    "minecraft",
+    "science",
+    "autos & vehicles",
+    "play",
+    "politics",
+    "commentary",
+    "twitch",
+    "ps4live",
+    "love",
+    "ps4",
+    "nonprofits & activism",
+    "ps4share",
+    "fortnite",
+    "xbox",
+    "porn",
+    "video games",
+    "trump",
+    "español",
+    "money",
+    "music video",
+    "movie",
+    "coronavirus",
+    "nintendo",
+    "donald trump",
+    "steam",
+    "trailer",
+    "android",
+    "podcast",
+    "xbox one",
+    "survival",
+    "linux",
+    "travel",
+    "funny moments",
+    "audio",
+    "litecoin",
+    "animation",
+    "gamer",
+    "lets",
+    "playstation",
+    "bitcoin news",
+    "history",
+    "fox news",
+    "xxx",
+    "god",
+    "dance",
+    "adventure",
+    "liberal",
+    "horror",
+    "government",
+    "freedom",
+    "2020",
+    "reaction",
+    "meme",
+    "photography",
+    "truth"
+}
+
 MATURE_TAGS = [
     'nsfw', 'porn', 'xxx', 'mature', 'adult', 'sex'
 ]
 
+
+def normalize_tag(tag):
+    return tag.replace(" ", "_").replace("&", "and").replace("-", "_")
+
+
 COMMON_TAGS = {
-    "gaming": "gaming",
-    "people & blogs": "people_and_blogs",
-    "pop culture": "pop_culture",
-    "entertainment": "entertainment",
-    "technology": "technology",
-    "music": "music",
-    "funny": "funny",
-    "education": "education",
-    "learning": "learning",
-    "news": "news",
-    "gameplay": "gameplay",
-    "science & technology": "science_and_technology",
-    "playstation 4": "playstation_4",
-    "beliefs": "beliefs",
-    "nature": "nature",
-    "news & politics": "news_and_politics",
-    "comedy": "comedy",
-    "games": "games",
-    "sony interactive entertainment": "sony_interactive_entertainment",
-    "film & animation": "film_and_animation",
-    "game": "game",
-    "howto & style": "howto_and_style",
-    "weapons": "weapons",
-    "blockchain": "blockchain",
-    "video game": "video_game",
-    "sports": "sports",
-    "walkthrough": "walkthrough",
-    "ps4live": "ps4live",
-    "art": "art",
-    "pc": "pc",
-    "economics": "economics",
-    "automotive": "automotive",
-    "minecraft": "minecraft",
-    "playthrough": "playthrough",
-    "ps4share": "ps4share",
-    "tutorial": "tutorial",
-    "play": "play",
-    "twitch": "twitch",
-    "how to": "how_to",
-    "ps4": "ps4",
-    "bitcoin": "bitcoin",
-    "fortnite": "fortnite",
-    "commentary": "commentary",
-    "lets play": "lets_play",
-    "fun": "fun",
-    "politics": "politics",
-    "xbox": "xbox",
-    "autos & vehicles": "autos_and_vehicles",
-    "travel & events": "travel_and_events",
-    "food": "food",
-    "science": "science",
-    "mature": "mature",
-    "xbox one": "xbox_one",
-    "liberal": "liberal",
-    "democrat": "democrat",
-    "progressive": "progressive",
-    "survival": "survival",
-    "nonprofits & activism": "nonprofits_and_activism",
-    "cryptocurrency": "cryptocurrency",
-    "playstation": "playstation",
-    "nintendo": "nintendo",
-    "government": "government",
-    "steam": "steam",
-    "podcast": "podcast",
-    "horror": "horror",
-    "conservative": "conservative",
-    "reaction": "reaction",
-    "trailer": "trailer",
-    "love": "love",
-    "cnn": "cnn",
-    "republican": "republican",
-    "gamer": "gamer",
-    "political": "political",
-    "hangoutsonair": "hangoutsonair",
-    "hoa": "hoa",
-    "msnbc": "msnbc",
-    "cbs": "cbs",
-    "donald trump": "donald_trump",
-    "fiction": "fiction",
-    "fox news": "fox_news",
-    "anime": "anime",
-    "crypto": "crypto",
-    "ethereum": "ethereum",
-    "call of duty": "call_of_duty",
-    "multiplayer": "multiplayer",
-    "android": "android",
-    "epic": "epic",
-    "rpg": "rpg",
-    "adventure": "adventure",
-    "secular talk": "secular_talk",
-    "btc": "btc",
-    "atheist": "atheist",
-    "atheism": "atheism",
-    "ps3": "ps3",
-    "video games": "video_games",
-    "cod": "cod",
-    "agnostic": "agnostic",
-    "movie": "movie",
-    "online": "online",
-    "fps": "fps",
-    "mod": "mod",
-    "reviews": "reviews",
-    "sharefactory": "sharefactory",
-    "world": "world",
-    "space": "space",
-    "hilarious": "hilarious",
-    "stream": "stream",
-    "lol": "lol",
-    "sony": "sony",
-    "god": "god",
-    "lets": "lets",
-    "dance": "dance",
-    "pvp": "pvp",
-    "tech": "tech",
-    "zombies": "zombies",
-    "pokemon": "pokemon",
-    "fail": "fail",
-    "xbox 360": "xbox_360",
-    "film": "film",
-    "unboxing": "unboxing",
-    "animation": "animation",
-    "travel": "travel",
-    "money": "money",
-    "wwe": "wwe",
-    "how": "how",
-    "mods": "mods",
-    "pubg": "pubg",
-    "indie": "indie",
-    "strategy": "strategy",
-    "history": "history",
-    "rap": "rap",
-    "ios": "ios",
-    "sony computer entertainment": "sony_computer_entertainment",
-    "mobile": "mobile",
-    "trump": "trump",
-    "flat earth": "flat_earth",
-    "hack": "hack",
-    "trap": "trap",
-    "fox": "fox",
-    "vlogging": "vlogging",
-    "news radio": "news_radio",
-    "humor": "humor",
-    "facebook": "facebook",
-    "edm": "edm",
-    "fitness": "fitness",
-    "vaping": "vaping",
-    "hip hop": "hip_hop",
-    "secular": "secular",
-    "jesus": "jesus",
-    "vape": "vape",
-    "song": "song",
-    "remix": "remix",
-    "guitar": "guitar",
-    "daily": "daily",
-    "mining": "mining",
-    "diy": "diy",
-    "videogame": "videogame",
-    "pets & animals": "pets_and_animals",
-    "funny moments": "funny_moments",
-    "religion": "religion",
-    "death": "death",
-    "media": "media",
-    "nbc": "nbc",
-    "war": "war",
-    "freedom": "freedom",
-    "viral": "viral",
-    "meme": "meme",
-    "family": "family",
-    "gold": "gold",
-    "photography": "photography",
-    "chill": "chill",
-    "zombie": "zombie",
-    "computer": "computer",
-    "sniper": "sniper",
-    "bible": "bible",
-    "linux": "linux",
-    "overwatch": "overwatch",
-    "pro": "pro",
-    "dragon": "dragon",
-    "litecoin": "litecoin",
-    "gta": "gta",
-    "iphone": "iphone",
-    "house": "house",
-    "bass": "bass",
-    "bitcoin news": "bitcoin_news",
-    "wii": "wii",
-    "crash": "crash",
-    "league of legends": "league_of_legends",
-    "grand theft auto v": "grand_theft_auto_v",
-    "mario": "mario",
-    "mmorpg": "mmorpg",
-    "satire": "satire",
-    "fire": "fire",
-    "racing": "racing",
-    "apple": "apple",
-    "health": "health",
-    "instrumental": "instrumental",
-    "destiny": "destiny",
-    "truth": "truth",
-    "race": "race"
+    tag: normalize_tag(tag) for tag in MOST_USED_TAGS
 }
+
+INDEXED_LANGUAGES = [
+    'en',
+    'aa',
+    'ab',
+    'ae',
+    'af',
+    'ak',
+    'am',
+    'an',
+    'ar',
+    'as',
+    'av',
+    'ay',
+    'az',
+    'ba',
+    'be',
+    'bg',
+    'bh',
+    'bi',
+    'bm',
+    'bn',
+    'bo',
+    'br',
+    'bs',
+    'ca',
+    'ce',
+    'ch',
+    'co',
+    'cr',
+    'cs',
+    'cu',
+    'cv',
+    'cy',
+    'da',
+    'de',
+    'dv',
+    'dz',
+    'ee',
+    'el',
+    'eo',
+    'es',
+    'et',
+    'eu',
+    'fa',
+    'ff',
+    'fi',
+    'fj',
+    'fo',
+    'fr',
+    'fy',
+    'ga',
+    'gd',
+    'gl',
+    'gn',
+    'gu',
+    'gv',
+    'ha',
+    'he',
+    'hi',
+    'ho',
+    'hr',
+    'ht',
+    'hu',
+    'hy',
+    'hz',
+    'ia',
+    'id',
+    'ie',
+    'ig',
+    'ii',
+    'ik',
+    'io',
+    'is',
+    'it',
+    'iu',
+    'ja',
+    'jv',
+    'ka',
+    'kg',
+    'ki',
+    'kj',
+    'kk',
+    'kl',
+    'km',
+    'kn',
+    'ko',
+    'kr',
+    'ks',
+    'ku',
+    'kv',
+    'kw',
+    'ky',
+    'la',
+    'lb',
+    'lg',
+    'li',
+    'ln',
+    'lo',
+    'lt',
+    'lu',
+    'lv',
+    'mg',
+    'mh',
+    'mi',
+    'mk',
+    'ml',
+    'mn',
+    'mr',
+    'ms',
+    'mt',
+    'my',
+    'na',
+    'nb',
+    'nd',
+    'ne',
+    'ng',
+    'nl',
+    'nn',
+    'no',
+    'nr',
+    'nv',
+    'ny',
+    'oc',
+    'oj',
+    'om',
+    'or',
+    'os',
+    'pa',
+    'pi',
+    'pl',
+    'ps',
+    'pt',
+    'qu',
+    'rm',
+    'rn',
+    'ro',
+    'ru',
+    'rw',
+    'sa',
+    'sc',
+    'sd',
+    'se',
+    'sg',
+    'si',
+    'sk',
+    'sl',
+    'sm',
+    'sn',
+    'so',
+    'sq',
+    'sr',
+    'ss',
+    'st',
+    'su',
+    'sv',
+    'sw',
+    'ta',
+    'te',
+    'tg',
+    'th',
+    'ti',
+    'tk',
+    'tl',
+    'tn',
+    'to',
+    'tr',
+    'ts',
+    'tt',
+    'tw',
+    'ty',
+    'ug',
+    'uk',
+    'ur',
+    'uz',
+    've',
+    'vi',
+    'vo',
+    'wa',
+    'wo',
+    'xh',
+    'yi',
+    'yo',
+    'za',
+    'zh',
+    'zu'
+]
diff --git a/lbry/wallet/server/db/reader.py b/lbry/wallet/server/db/reader.py
index d05a3ae92..a3736750c 100644
--- a/lbry/wallet/server/db/reader.py
+++ b/lbry/wallet/server/db/reader.py
@@ -18,7 +18,7 @@ from lbry.schema.tags import clean_tags
 from lbry.schema.result import Outputs, Censor
 from lbry.wallet import Ledger, RegTestLedger
 
-from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS
+from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS, INDEXED_LANGUAGES
 from .full_text_search import FTS_ORDER_BY
 
 
@@ -536,6 +536,18 @@ def _apply_constraints_for_array_attributes(constraints, attr, cleaner, for_coun
                             AND tag IN ({values})
                         )
                     """
+            elif attr == 'language':
+                indexed_languages = any_items & set(INDEXED_LANGUAGES)
+                if indexed_languages:
+                    any_items -= indexed_languages
+                for language in indexed_languages:
+                    any_queries[f'#_any_common_languages_{language}'] = f"""
+                    EXISTS(
+                        SELECT 1 FROM language INDEXED BY language_{language}_idx
+                        WHERE {CLAIM_HASH_OR_REPOST_HASH_SQL}=language.claim_hash
+                        AND language = '{language}'
+                    )
+                    """
 
             if any_items:
 
diff --git a/lbry/wallet/server/db/writer.py b/lbry/wallet/server/db/writer.py
index 988b7b266..e83f9f7ac 100644
--- a/lbry/wallet/server/db/writer.py
+++ b/lbry/wallet/server/db/writer.py
@@ -6,7 +6,6 @@ from decimal import Decimal
 from collections import namedtuple
 from multiprocessing import Manager
 from binascii import unhexlify
-
 from lbry.wallet.server.leveldb import LevelDB
 from lbry.wallet.server.util import class_logger
 from lbry.wallet.database import query, constraints_to_sql
@@ -19,7 +18,7 @@ from lbry.wallet.server.db.canonical import register_canonical_functions
 from lbry.wallet.server.db.full_text_search import update_full_text_search, CREATE_FULL_TEXT_SEARCH, first_sync_finished
 from lbry.wallet.server.db.trending import TRENDING_ALGORITHMS
 
-from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS
+from .common import CLAIM_TYPES, STREAM_TYPES, COMMON_TAGS, INDEXED_LANGUAGES
 
 ATTRIBUTE_ARRAY_MAX_LENGTH = 100
 
@@ -117,6 +116,15 @@ class SQLDB:
         create unique index if not exists tag_claim_hash_tag_idx on tag (claim_hash, tag);
     """
 
+    CREATE_LANGUAGE_TABLE = """
+        create table if not exists language (
+            language text not null,
+            claim_hash bytes not null,
+            height integer not null
+        );
+        create unique index if not exists language_claim_hash_language_idx on language (claim_hash, language);
+    """
+
     CREATE_CLAIMTRIE_TABLE = """
         create table if not exists claimtrie (
             normalized text primary key,
@@ -174,12 +182,18 @@ class SQLDB:
         for tag_value, tag_key in COMMON_TAGS.items()
     )
 
+    LANGUAGE_INDEXES = '\n'.join(
+        f"create unique index if not exists language_{language}_idx on language (language, claim_hash) WHERE language='{language}';"
+        for language in INDEXED_LANGUAGES
+    )
+
     CREATE_TABLES_QUERY = (
         CREATE_CLAIM_TABLE +
         CREATE_FULL_TEXT_SEARCH +
         CREATE_SUPPORT_TABLE +
         CREATE_CLAIMTRIE_TABLE +
-        CREATE_TAG_TABLE
+        CREATE_TAG_TABLE +
+        CREATE_LANGUAGE_TABLE
     )
 
     def __init__(
@@ -305,7 +319,7 @@ class SQLDB:
         self.execute('commit;')
 
     def _upsertable_claims(self, txos: List[Output], header, clear_first=False):
-        claim_hashes, claims, tags = set(), [], {}
+        claim_hashes, claims, tags, languages = set(), [], {}, {}
         for txo in txos:
             tx = txo.tx_ref.tx
 
@@ -316,6 +330,13 @@ class SQLDB:
                 #self.logger.exception(f"Could not decode claim name for {tx.id}:{txo.position}.")
                 continue
 
+            language = None
+            try:
+                if txo.claim.is_stream and txo.claim.stream.languages:
+                    language = txo.claim.stream.languages[0].language
+            except Exception:
+                pass
+
             claim_hash = txo.claim_hash
             claim_hashes.add(claim_hash)
             claim_record = {
@@ -373,6 +394,9 @@ class SQLDB:
             elif claim.is_channel:
                 claim_record['claim_type'] = CLAIM_TYPES['channel']
 
+            if language:
+                languages[(language, claim_hash)] = (language, claim_hash, tx.height)
+
             for tag in clean_tags(claim.message.tags):
                 tags[(tag, claim_hash)] = (tag, claim_hash, tx.height)
 
@@ -383,6 +407,10 @@ class SQLDB:
             self.executemany(
                 "INSERT OR IGNORE INTO tag (tag, claim_hash, height) VALUES (?, ?, ?)", tags.values()
             )
+        if languages:
+            self.executemany(
+                "INSERT OR IGNORE INTO language (language, claim_hash, height) VALUES (?, ?, ?)", languages.values()
+            )
 
         return claims
 
diff --git a/tests/integration/blockchain/test_claim_commands.py b/tests/integration/blockchain/test_claim_commands.py
index eb239978a..c80624c06 100644
--- a/tests/integration/blockchain/test_claim_commands.py
+++ b/tests/integration/blockchain/test_claim_commands.py
@@ -262,6 +262,17 @@ class ClaimSearchCommand(ClaimTestCase):
         await self.assertFindsClaims([claim4, claim3, claim2], fee_amount='<1.0', fee_currency='lbc')
         await self.assertFindsClaims([claim3], fee_amount='0.5', fee_currency='lbc')
         await self.assertFindsClaims([claim5], fee_currency='usd')
         await self.assertFindsClaims([], fee_currency='foo')
+
+    async def test_search_by_language(self):
+        claim1 = await self.stream_create('claim1', fee_amount='1.0', fee_currency='lbc')
+        claim2 = await self.stream_create('claim2', fee_amount='0.9', fee_currency='lbc')
+        claim3 = await self.stream_create('claim3', fee_amount='0.5', fee_currency='lbc', languages='en')
+        claim4 = await self.stream_create('claim4', fee_amount='0.1', fee_currency='lbc', languages='en')
+        claim5 = await self.stream_create('claim5', fee_amount='1.0', fee_currency='usd', languages='es')
+
+        await self.assertFindsClaims([claim4, claim3], any_languages=['en'])
+        await self.assertFindsClaims([claim5], any_languages=['es'])
+        await self.assertFindsClaims([claim5, claim4, claim3], any_languages=['en', 'es'])
 
     async def test_search_by_channel(self):