add tag normalization on storage and on search
This commit is contained in:
parent
8ca8591ed6
commit
a8b590ecbb
4 changed files with 47 additions and 14 deletions
13
lbry/lbry/schema/tags.py
Normal file
13
lbry/lbry/schema/tags.py
Normal file
|
@@ -0,0 +1,13 @@
|
|||
from typing import List
|
||||
import re
|
||||
|
||||
# Collapses any run of two or more whitespace characters into one space.
MULTI_SPACE_RE = re.compile(r"\s{2,}")
# Characters stripped out of tags ('#', '!', '~') by replacing with a space.
WEIRD_CHARS_RE = re.compile(r"[#!~]")


def normalize_tag(tag: str) -> str:
    """Return the canonical form of *tag*.

    Lowercases the tag, replaces '#', '!' and '~' with spaces, collapses
    runs of two or more whitespace characters into a single space, and
    strips leading/trailing whitespace.  May return '' for tags made up
    entirely of whitespace/weird characters.
    """
    # Substitute weird chars first so the doubled spaces they leave behind
    # are collapsed by the multi-space pass that follows.
    return MULTI_SPACE_RE.sub(' ', WEIRD_CHARS_RE.sub(' ', tag.lower())).strip()
|
||||
|
||||
|
||||
def clean_tags(tags: List[str]) -> List[str]:
    """Normalize every tag in *tags*, dropping any that normalize to ''."""
    normalized = (normalize_tag(tag) for tag in tags)
    return [tag for tag in normalized if tag]
|
|
@@ -10,6 +10,7 @@ from torba.server.util import class_logger
|
|||
from torba.client.basedatabase import query, constraints_to_sql
|
||||
|
||||
from lbry.schema.url import URL, normalize_name
|
||||
from lbry.schema.tags import clean_tags
|
||||
from lbry.schema.mime_types import guess_stream_type
|
||||
from lbry.wallet.ledger import MainNetLedger, RegTestLedger
|
||||
from lbry.wallet.transaction import Transaction, Output
|
||||
|
@@ -34,8 +35,8 @@ STREAM_TYPES = {
|
|||
}
|
||||
|
||||
|
||||
def _apply_constraints_for_array_attributes(constraints, attr):
|
||||
any_items = constraints.pop(f'any_{attr}s', [])[:ATTRIBUTE_ARRAY_MAX_LENGTH]
|
||||
def _apply_constraints_for_array_attributes(constraints, attr, cleaner):
|
||||
any_items = cleaner(constraints.pop(f'any_{attr}s', []))[:ATTRIBUTE_ARRAY_MAX_LENGTH]
|
||||
if any_items:
|
||||
constraints.update({
|
||||
f'$any_{attr}{i}': item for i, item in enumerate(any_items)
|
||||
|
@@ -47,7 +48,7 @@ def _apply_constraints_for_array_attributes(constraints, attr):
|
|||
SELECT DISTINCT claim_hash FROM {attr} WHERE {attr} IN ({values})
|
||||
"""
|
||||
|
||||
all_items = constraints.pop(f'all_{attr}s', [])[:ATTRIBUTE_ARRAY_MAX_LENGTH]
|
||||
all_items = cleaner(constraints.pop(f'all_{attr}s', []))[:ATTRIBUTE_ARRAY_MAX_LENGTH]
|
||||
if all_items:
|
||||
constraints[f'$all_{attr}_count'] = len(all_items)
|
||||
constraints.update({
|
||||
|
@@ -61,7 +62,7 @@ def _apply_constraints_for_array_attributes(constraints, attr):
|
|||
GROUP BY claim_hash HAVING COUNT({attr}) = :$all_{attr}_count
|
||||
"""
|
||||
|
||||
not_items = constraints.pop(f'not_{attr}s', [])[:ATTRIBUTE_ARRAY_MAX_LENGTH]
|
||||
not_items = cleaner(constraints.pop(f'not_{attr}s', []))[:ATTRIBUTE_ARRAY_MAX_LENGTH]
|
||||
if not_items:
|
||||
constraints.update({
|
||||
f'$not_{attr}{i}': item for i, item in enumerate(not_items)
|
||||
|
@@ -293,7 +294,7 @@ class SQLDB:
|
|||
elif claim.is_channel:
|
||||
claim_record['claim_type'] = CLAIM_TYPES['channel']
|
||||
|
||||
for tag in claim.message.tags:
|
||||
for tag in clean_tags(claim.message.tags):
|
||||
tags.append((tag, claim_hash, tx.height))
|
||||
|
||||
if clear_first:
|
||||
|
@@ -820,9 +821,9 @@ class SQLDB:
|
|||
if media_types:
|
||||
constraints['claim.media_type__in'] = media_types
|
||||
|
||||
_apply_constraints_for_array_attributes(constraints, 'tag')
|
||||
_apply_constraints_for_array_attributes(constraints, 'language')
|
||||
_apply_constraints_for_array_attributes(constraints, 'location')
|
||||
_apply_constraints_for_array_attributes(constraints, 'tag', clean_tags)
|
||||
_apply_constraints_for_array_attributes(constraints, 'language', lambda _: _)
|
||||
_apply_constraints_for_array_attributes(constraints, 'location', lambda _: _)
|
||||
|
||||
select = f"SELECT {cols} FROM claim"
|
||||
|
||||
|
|
|
@@ -180,14 +180,14 @@ class ClaimSearchCommand(ClaimTestCase):
|
|||
self.assertEqual(out_of_bounds, [])
|
||||
|
||||
async def test_tag_search(self):
|
||||
claim1 = await self.stream_create('claim1', tags=['abc'])
|
||||
claim2 = await self.stream_create('claim2', tags=['abc', 'def'])
|
||||
claim1 = await self.stream_create('claim1', tags=['aBc'])
|
||||
claim2 = await self.stream_create('claim2', tags=['#abc', 'def'])
|
||||
claim3 = await self.stream_create('claim3', tags=['abc', 'ghi', 'jkl'])
|
||||
claim4 = await self.stream_create('claim4', tags=['abc', 'ghi', 'mno'])
|
||||
claim4 = await self.stream_create('claim4', tags=['abc\t', 'ghi', 'mno'])
|
||||
claim5 = await self.stream_create('claim5', tags=['pqr'])
|
||||
|
||||
# any_tags
|
||||
await self.assertFindsClaims([claim5, claim4, claim3, claim2, claim1], any_tags=['abc', 'pqr'])
|
||||
await self.assertFindsClaims([claim5, claim4, claim3, claim2, claim1], any_tags=['\tabc', 'pqr'])
|
||||
await self.assertFindsClaims([claim4, claim3, claim2, claim1], any_tags=['abc'])
|
||||
await self.assertFindsClaims([claim4, claim3, claim2, claim1], any_tags=['abc', 'ghi'])
|
||||
await self.assertFindsClaims([claim4, claim3], any_tags=['ghi'])
|
||||
|
@@ -196,7 +196,7 @@ class ClaimSearchCommand(ClaimTestCase):
|
|||
|
||||
# all_tags
|
||||
await self.assertFindsClaims([], all_tags=['abc', 'pqr'])
|
||||
await self.assertFindsClaims([claim4, claim3, claim2, claim1], all_tags=['abc'])
|
||||
await self.assertFindsClaims([claim4, claim3, claim2, claim1], all_tags=['ABC'])
|
||||
await self.assertFindsClaims([claim4, claim3], all_tags=['abc', 'ghi'])
|
||||
await self.assertFindsClaims([claim4, claim3], all_tags=['ghi'])
|
||||
await self.assertFindsClaims([], all_tags=['ghi', 'xyz'])
|
||||
|
@@ -204,7 +204,7 @@ class ClaimSearchCommand(ClaimTestCase):
|
|||
|
||||
# not_tags
|
||||
await self.assertFindsClaims([], not_tags=['abc', 'pqr'])
|
||||
await self.assertFindsClaims([claim5], not_tags=['abc'])
|
||||
await self.assertFindsClaims([claim5], not_tags=['abC'])
|
||||
await self.assertFindsClaims([claim5], not_tags=['abc', 'ghi'])
|
||||
await self.assertFindsClaims([claim5, claim2, claim1], not_tags=['ghi'])
|
||||
await self.assertFindsClaims([claim5, claim2, claim1], not_tags=['ghi', 'xyz'])
|
||||
|
|
19
lbry/tests/unit/schema/test_tags.py
Normal file
19
lbry/tests/unit/schema/test_tags.py
Normal file
|
@@ -0,0 +1,19 @@
|
|||
import unittest
|
||||
|
||||
from lbry.schema.tags import normalize_tag, clean_tags
|
||||
|
||||
|
||||
class TestTagNormalization(unittest.TestCase):
    """Unit tests for the tag normalization helpers in lbry.schema.tags."""

    def assertNormalizedTag(self, clean, dirty):
        """Assert that normalize_tag(dirty) yields exactly `clean`."""
        self.assertEqual(clean, normalize_tag(dirty))

    def test_normalize_tag(self):
        # (expected, raw) pairs covering lowercasing, weird-character
        # replacement and whitespace collapsing.
        cases = [
            ('', ' \t #!~'),
            ('tag', 'Tag'),
            ('t ag', '\tT \nAG '),
            ('tag hash', '#tag~#hash!'),
        ]
        for clean, dirty in cases:
            self.assertNormalizedTag(clean, dirty)

    def test_clean_tags(self):
        # Tags normalizing to '' are dropped; the rest are normalized.
        self.assertEqual(['tag'], clean_tags([' \t #!~', '!taG', '\t']))
|
Loading…
Reference in a new issue