add tag normalization on storage and on search
This commit is contained in:
parent
8ca8591ed6
commit
a8b590ecbb
4 changed files with 47 additions and 14 deletions
13
lbry/lbry/schema/tags.py
Normal file
13
lbry/lbry/schema/tags.py
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
from typing import List
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Matches any run of whitespace, including a single character. A lone tab or
# newline inside a tag has to be normalized as well, otherwise "a<TAB>b" and
# "a b" would end up as two different tags.
MULTI_SPACE_RE = re.compile(r"\s+")
# Decoration characters that are treated as separators rather than content.
WEIRD_CHARS_RE = re.compile(r"[#!~]")


def normalize_tag(tag: str) -> str:
    """Return the canonical form of a single tag.

    The tag is lower-cased, every ``#``, ``!`` and ``~`` is replaced by a
    space, each run of whitespace is collapsed into one space, and leading
    and trailing whitespace is removed.

    :param tag: raw tag text.
    :return: the normalized tag; an empty string when nothing but
        whitespace and decoration characters was supplied.
    """
    return MULTI_SPACE_RE.sub(' ', WEIRD_CHARS_RE.sub(' ', tag.lower())).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def clean_tags(tags: List[str]) -> List[str]:
    """Normalize a list of tags, dropping empty and duplicate results.

    Every tag is passed through ``normalize_tag``. Tags that normalize to an
    empty string are discarded. Tags that normalize to the same value are
    reported once, at the position of their first occurrence, so that the
    length of the result equals the number of distinct tags.

    :param tags: raw tags.
    :return: list of normalized, non-empty, unique tags in first-seen order.
    """
    normalized = (normalize_tag(tag) for tag in tags)
    # dict.fromkeys() removes duplicates while keeping insertion order.
    return list(dict.fromkeys(tag for tag in normalized if tag))
|
|
@ -10,6 +10,7 @@ from torba.server.util import class_logger
|
||||||
from torba.client.basedatabase import query, constraints_to_sql
|
from torba.client.basedatabase import query, constraints_to_sql
|
||||||
|
|
||||||
from lbry.schema.url import URL, normalize_name
|
from lbry.schema.url import URL, normalize_name
|
||||||
|
from lbry.schema.tags import clean_tags
|
||||||
from lbry.schema.mime_types import guess_stream_type
|
from lbry.schema.mime_types import guess_stream_type
|
||||||
from lbry.wallet.ledger import MainNetLedger, RegTestLedger
|
from lbry.wallet.ledger import MainNetLedger, RegTestLedger
|
||||||
from lbry.wallet.transaction import Transaction, Output
|
from lbry.wallet.transaction import Transaction, Output
|
||||||
|
@ -34,8 +35,8 @@ STREAM_TYPES = {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _apply_constraints_for_array_attributes(constraints, attr):
|
def _apply_constraints_for_array_attributes(constraints, attr, cleaner):
|
||||||
any_items = constraints.pop(f'any_{attr}s', [])[:ATTRIBUTE_ARRAY_MAX_LENGTH]
|
any_items = cleaner(constraints.pop(f'any_{attr}s', []))[:ATTRIBUTE_ARRAY_MAX_LENGTH]
|
||||||
if any_items:
|
if any_items:
|
||||||
constraints.update({
|
constraints.update({
|
||||||
f'$any_{attr}{i}': item for i, item in enumerate(any_items)
|
f'$any_{attr}{i}': item for i, item in enumerate(any_items)
|
||||||
|
@ -47,7 +48,7 @@ def _apply_constraints_for_array_attributes(constraints, attr):
|
||||||
SELECT DISTINCT claim_hash FROM {attr} WHERE {attr} IN ({values})
|
SELECT DISTINCT claim_hash FROM {attr} WHERE {attr} IN ({values})
|
||||||
"""
|
"""
|
||||||
|
|
||||||
all_items = constraints.pop(f'all_{attr}s', [])[:ATTRIBUTE_ARRAY_MAX_LENGTH]
|
all_items = cleaner(constraints.pop(f'all_{attr}s', []))[:ATTRIBUTE_ARRAY_MAX_LENGTH]
|
||||||
if all_items:
|
if all_items:
|
||||||
constraints[f'$all_{attr}_count'] = len(all_items)
|
constraints[f'$all_{attr}_count'] = len(all_items)
|
||||||
constraints.update({
|
constraints.update({
|
||||||
|
@ -61,7 +62,7 @@ def _apply_constraints_for_array_attributes(constraints, attr):
|
||||||
GROUP BY claim_hash HAVING COUNT({attr}) = :$all_{attr}_count
|
GROUP BY claim_hash HAVING COUNT({attr}) = :$all_{attr}_count
|
||||||
"""
|
"""
|
||||||
|
|
||||||
not_items = constraints.pop(f'not_{attr}s', [])[:ATTRIBUTE_ARRAY_MAX_LENGTH]
|
not_items = cleaner(constraints.pop(f'not_{attr}s', []))[:ATTRIBUTE_ARRAY_MAX_LENGTH]
|
||||||
if not_items:
|
if not_items:
|
||||||
constraints.update({
|
constraints.update({
|
||||||
f'$not_{attr}{i}': item for i, item in enumerate(not_items)
|
f'$not_{attr}{i}': item for i, item in enumerate(not_items)
|
||||||
|
@ -293,7 +294,7 @@ class SQLDB:
|
||||||
elif claim.is_channel:
|
elif claim.is_channel:
|
||||||
claim_record['claim_type'] = CLAIM_TYPES['channel']
|
claim_record['claim_type'] = CLAIM_TYPES['channel']
|
||||||
|
|
||||||
for tag in claim.message.tags:
|
for tag in clean_tags(claim.message.tags):
|
||||||
tags.append((tag, claim_hash, tx.height))
|
tags.append((tag, claim_hash, tx.height))
|
||||||
|
|
||||||
if clear_first:
|
if clear_first:
|
||||||
|
@ -820,9 +821,9 @@ class SQLDB:
|
||||||
if media_types:
|
if media_types:
|
||||||
constraints['claim.media_type__in'] = media_types
|
constraints['claim.media_type__in'] = media_types
|
||||||
|
|
||||||
_apply_constraints_for_array_attributes(constraints, 'tag')
|
_apply_constraints_for_array_attributes(constraints, 'tag', clean_tags)
|
||||||
_apply_constraints_for_array_attributes(constraints, 'language')
|
_apply_constraints_for_array_attributes(constraints, 'language', lambda _: _)
|
||||||
_apply_constraints_for_array_attributes(constraints, 'location')
|
_apply_constraints_for_array_attributes(constraints, 'location', lambda _: _)
|
||||||
|
|
||||||
select = f"SELECT {cols} FROM claim"
|
select = f"SELECT {cols} FROM claim"
|
||||||
|
|
||||||
|
|
|
@ -180,14 +180,14 @@ class ClaimSearchCommand(ClaimTestCase):
|
||||||
self.assertEqual(out_of_bounds, [])
|
self.assertEqual(out_of_bounds, [])
|
||||||
|
|
||||||
async def test_tag_search(self):
|
async def test_tag_search(self):
|
||||||
claim1 = await self.stream_create('claim1', tags=['abc'])
|
claim1 = await self.stream_create('claim1', tags=['aBc'])
|
||||||
claim2 = await self.stream_create('claim2', tags=['abc', 'def'])
|
claim2 = await self.stream_create('claim2', tags=['#abc', 'def'])
|
||||||
claim3 = await self.stream_create('claim3', tags=['abc', 'ghi', 'jkl'])
|
claim3 = await self.stream_create('claim3', tags=['abc', 'ghi', 'jkl'])
|
||||||
claim4 = await self.stream_create('claim4', tags=['abc', 'ghi', 'mno'])
|
claim4 = await self.stream_create('claim4', tags=['abc\t', 'ghi', 'mno'])
|
||||||
claim5 = await self.stream_create('claim5', tags=['pqr'])
|
claim5 = await self.stream_create('claim5', tags=['pqr'])
|
||||||
|
|
||||||
# any_tags
|
# any_tags
|
||||||
await self.assertFindsClaims([claim5, claim4, claim3, claim2, claim1], any_tags=['abc', 'pqr'])
|
await self.assertFindsClaims([claim5, claim4, claim3, claim2, claim1], any_tags=['\tabc', 'pqr'])
|
||||||
await self.assertFindsClaims([claim4, claim3, claim2, claim1], any_tags=['abc'])
|
await self.assertFindsClaims([claim4, claim3, claim2, claim1], any_tags=['abc'])
|
||||||
await self.assertFindsClaims([claim4, claim3, claim2, claim1], any_tags=['abc', 'ghi'])
|
await self.assertFindsClaims([claim4, claim3, claim2, claim1], any_tags=['abc', 'ghi'])
|
||||||
await self.assertFindsClaims([claim4, claim3], any_tags=['ghi'])
|
await self.assertFindsClaims([claim4, claim3], any_tags=['ghi'])
|
||||||
|
@ -196,7 +196,7 @@ class ClaimSearchCommand(ClaimTestCase):
|
||||||
|
|
||||||
# all_tags
|
# all_tags
|
||||||
await self.assertFindsClaims([], all_tags=['abc', 'pqr'])
|
await self.assertFindsClaims([], all_tags=['abc', 'pqr'])
|
||||||
await self.assertFindsClaims([claim4, claim3, claim2, claim1], all_tags=['abc'])
|
await self.assertFindsClaims([claim4, claim3, claim2, claim1], all_tags=['ABC'])
|
||||||
await self.assertFindsClaims([claim4, claim3], all_tags=['abc', 'ghi'])
|
await self.assertFindsClaims([claim4, claim3], all_tags=['abc', 'ghi'])
|
||||||
await self.assertFindsClaims([claim4, claim3], all_tags=['ghi'])
|
await self.assertFindsClaims([claim4, claim3], all_tags=['ghi'])
|
||||||
await self.assertFindsClaims([], all_tags=['ghi', 'xyz'])
|
await self.assertFindsClaims([], all_tags=['ghi', 'xyz'])
|
||||||
|
@ -204,7 +204,7 @@ class ClaimSearchCommand(ClaimTestCase):
|
||||||
|
|
||||||
# not_tags
|
# not_tags
|
||||||
await self.assertFindsClaims([], not_tags=['abc', 'pqr'])
|
await self.assertFindsClaims([], not_tags=['abc', 'pqr'])
|
||||||
await self.assertFindsClaims([claim5], not_tags=['abc'])
|
await self.assertFindsClaims([claim5], not_tags=['abC'])
|
||||||
await self.assertFindsClaims([claim5], not_tags=['abc', 'ghi'])
|
await self.assertFindsClaims([claim5], not_tags=['abc', 'ghi'])
|
||||||
await self.assertFindsClaims([claim5, claim2, claim1], not_tags=['ghi'])
|
await self.assertFindsClaims([claim5, claim2, claim1], not_tags=['ghi'])
|
||||||
await self.assertFindsClaims([claim5, claim2, claim1], not_tags=['ghi', 'xyz'])
|
await self.assertFindsClaims([claim5, claim2, claim1], not_tags=['ghi', 'xyz'])
|
||||||
|
|
19
lbry/tests/unit/schema/test_tags.py
Normal file
19
lbry/tests/unit/schema/test_tags.py
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from lbry.schema.tags import normalize_tag, clean_tags
|
||||||
|
|
||||||
|
|
||||||
|
class TestTagNormalization(unittest.TestCase):
    """Checks for ``normalize_tag`` and ``clean_tags``."""

    def assertNormalizedTag(self, clean, dirty):
        """Assert that normalizing ``dirty`` yields exactly ``clean``."""
        self.assertEqual(clean, normalize_tag(dirty))

    def test_normalize_tag(self):
        # (expected normalized form, raw input) pairs, checked in order.
        cases = (
            ('', ' \t #!~'),
            ('tag', 'Tag'),
            ('t ag', '\tT \nAG '),
            ('tag hash', '#tag~#hash!'),
        )
        for expected, raw in cases:
            self.assertNormalizedTag(expected, raw)

    def test_clean_tags(self):
        # Only the middle entry survives normalization; the others are empty.
        dirty_tags = [' \t #!~', '!taG', '\t']
        self.assertEqual(['tag'], clean_tags(dirty_tags))
|
Loading…
Reference in a new issue