add tag normalization on storage and on search

Lex Berezhny 2019-06-22 20:11:33 -04:00
parent 8ca8591ed6
commit a8b590ecbb
4 changed files with 47 additions and 14 deletions

lbry/lbry/schema/tags.py (new file)

@@ -0,0 +1,13 @@
+from typing import List
+import re
+
+MULTI_SPACE_RE = re.compile(r"\s{2,}")
+WEIRD_CHARS_RE = re.compile(r"[#!~]")
+
+
+def normalize_tag(tag: str):
+    return MULTI_SPACE_RE.sub(' ', WEIRD_CHARS_RE.sub(' ', tag.lower())).strip()
+
+
+def clean_tags(tags: List[str]):
+    return [tag for tag in (normalize_tag(tag) for tag in tags) if tag]
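
Note (not part of the diff): a quick REPL check of the two helpers above, assuming the lbry package is importable; the inputs mirror the tag values used in the updated integration test further down.

>>> from lbry.schema.tags import normalize_tag, clean_tags
>>> normalize_tag('#Mature~Content!')   # '#', '!' and '~' become spaces, runs of whitespace collapse, result is lower-cased and stripped
'mature content'
>>> clean_tags(['aBc', '#abc', 'abc\t', ' \t #!~'])   # each tag is normalized; tags that normalize to '' are dropped
['abc', 'abc', 'abc']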


@@ -10,6 +10,7 @@ from torba.server.util import class_logger
 from torba.client.basedatabase import query, constraints_to_sql

 from lbry.schema.url import URL, normalize_name
+from lbry.schema.tags import clean_tags
 from lbry.schema.mime_types import guess_stream_type
 from lbry.wallet.ledger import MainNetLedger, RegTestLedger
 from lbry.wallet.transaction import Transaction, Output
@@ -34,8 +35,8 @@ STREAM_TYPES = {
 }


-def _apply_constraints_for_array_attributes(constraints, attr):
-    any_items = constraints.pop(f'any_{attr}s', [])[:ATTRIBUTE_ARRAY_MAX_LENGTH]
+def _apply_constraints_for_array_attributes(constraints, attr, cleaner):
+    any_items = cleaner(constraints.pop(f'any_{attr}s', []))[:ATTRIBUTE_ARRAY_MAX_LENGTH]
     if any_items:
         constraints.update({
             f'$any_{attr}{i}': item for i, item in enumerate(any_items)
@@ -47,7 +48,7 @@ def _apply_constraints_for_array_attributes(constraints, attr):
         SELECT DISTINCT claim_hash FROM {attr} WHERE {attr} IN ({values})
         """

-    all_items = constraints.pop(f'all_{attr}s', [])[:ATTRIBUTE_ARRAY_MAX_LENGTH]
+    all_items = cleaner(constraints.pop(f'all_{attr}s', []))[:ATTRIBUTE_ARRAY_MAX_LENGTH]
     if all_items:
         constraints[f'$all_{attr}_count'] = len(all_items)
         constraints.update({
@@ -61,7 +62,7 @@ def _apply_constraints_for_array_attributes(constraints, attr):
         GROUP BY claim_hash HAVING COUNT({attr}) = :$all_{attr}_count
         """

-    not_items = constraints.pop(f'not_{attr}s', [])[:ATTRIBUTE_ARRAY_MAX_LENGTH]
+    not_items = cleaner(constraints.pop(f'not_{attr}s', []))[:ATTRIBUTE_ARRAY_MAX_LENGTH]
     if not_items:
         constraints.update({
             f'$not_{attr}{i}': item for i, item in enumerate(not_items)
@@ -293,7 +294,7 @@ class SQLDB:
             elif claim.is_channel:
                 claim_record['claim_type'] = CLAIM_TYPES['channel']

-            for tag in claim.message.tags:
+            for tag in clean_tags(claim.message.tags):
                 tags.append((tag, claim_hash, tx.height))

         if clear_first:
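
Note (not part of the diff): this is the storage half of the commit — whatever tags arrive in the claim metadata are normalized before the (tag, claim_hash, height) rows are written to the tag table. A minimal sketch with hypothetical values, assuming the lbry package is importable:

from lbry.schema.tags import clean_tags  # added by this commit

claim_tags = ['#Funny!', 'funny', '\tFUNNY ']   # hypothetical on-chain metadata
claim_hash, height = b'\x00' * 20, 400_000      # hypothetical values
rows = [(tag, claim_hash, height) for tag in clean_tags(claim_tags)]
print([r[0] for r in rows])  # ['funny', 'funny', 'funny'] -- all three variants collapse to the same tag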
@@ -820,9 +821,9 @@ class SQLDB:
         if media_types:
             constraints['claim.media_type__in'] = media_types

-        _apply_constraints_for_array_attributes(constraints, 'tag')
-        _apply_constraints_for_array_attributes(constraints, 'language')
-        _apply_constraints_for_array_attributes(constraints, 'location')
+        _apply_constraints_for_array_attributes(constraints, 'tag', clean_tags)
+        _apply_constraints_for_array_attributes(constraints, 'language', lambda _: _)
+        _apply_constraints_for_array_attributes(constraints, 'location', lambda _: _)

         select = f"SELECT {cols} FROM claim"


@@ -180,14 +180,14 @@ class ClaimSearchCommand(ClaimTestCase):
         self.assertEqual(out_of_bounds, [])

     async def test_tag_search(self):
-        claim1 = await self.stream_create('claim1', tags=['abc'])
-        claim2 = await self.stream_create('claim2', tags=['abc', 'def'])
+        claim1 = await self.stream_create('claim1', tags=['aBc'])
+        claim2 = await self.stream_create('claim2', tags=['#abc', 'def'])
         claim3 = await self.stream_create('claim3', tags=['abc', 'ghi', 'jkl'])
-        claim4 = await self.stream_create('claim4', tags=['abc', 'ghi', 'mno'])
+        claim4 = await self.stream_create('claim4', tags=['abc\t', 'ghi', 'mno'])
         claim5 = await self.stream_create('claim5', tags=['pqr'])

         # any_tags
-        await self.assertFindsClaims([claim5, claim4, claim3, claim2, claim1], any_tags=['abc', 'pqr'])
+        await self.assertFindsClaims([claim5, claim4, claim3, claim2, claim1], any_tags=['\tabc', 'pqr'])
         await self.assertFindsClaims([claim4, claim3, claim2, claim1], any_tags=['abc'])
         await self.assertFindsClaims([claim4, claim3, claim2, claim1], any_tags=['abc', 'ghi'])
         await self.assertFindsClaims([claim4, claim3], any_tags=['ghi'])
@@ -196,7 +196,7 @@ class ClaimSearchCommand(ClaimTestCase):

         # all_tags
         await self.assertFindsClaims([], all_tags=['abc', 'pqr'])
-        await self.assertFindsClaims([claim4, claim3, claim2, claim1], all_tags=['abc'])
+        await self.assertFindsClaims([claim4, claim3, claim2, claim1], all_tags=['ABC'])
         await self.assertFindsClaims([claim4, claim3], all_tags=['abc', 'ghi'])
         await self.assertFindsClaims([claim4, claim3], all_tags=['ghi'])
         await self.assertFindsClaims([], all_tags=['ghi', 'xyz'])
@@ -204,7 +204,7 @@ class ClaimSearchCommand(ClaimTestCase):

         # not_tags
         await self.assertFindsClaims([], not_tags=['abc', 'pqr'])
-        await self.assertFindsClaims([claim5], not_tags=['abc'])
+        await self.assertFindsClaims([claim5], not_tags=['abC'])
         await self.assertFindsClaims([claim5], not_tags=['abc', 'ghi'])
         await self.assertFindsClaims([claim5, claim2, claim1], not_tags=['ghi'])
         await self.assertFindsClaims([claim5, claim2, claim1], not_tags=['ghi', 'xyz'])


@@ -0,0 +1,19 @@
+import unittest
+
+from lbry.schema.tags import normalize_tag, clean_tags
+
+
+class TestTagNormalization(unittest.TestCase):
+
+    def assertNormalizedTag(self, clean, dirty):
+        self.assertEqual(clean, normalize_tag(dirty))
+
+    def test_normalize_tag(self):
+        tag = self.assertNormalizedTag
+        tag('', ' \t #!~')
+        tag('tag', 'Tag')
+        tag('t ag', '\tT \nAG ')
+        tag('tag hash', '#tag~#hash!')
+
+    def test_clean_tags(self):
+        self.assertEqual(['tag'], clean_tags([' \t #!~', '!taG', '\t']))