Merge pull request #3301 from lbryio/no_repeat_claim_id
add `--remove_duplicates` to the search api
This commit is contained in:
commit
935adfb51a
4 changed files with 213 additions and 155 deletions
282
docs/api.json
282
docs/api.json
File diff suppressed because one or more lines are too long
|
@ -2352,7 +2352,7 @@ class Daemon(metaclass=JSONRPCServerType):
|
|||
[--not_locations=<not_locations>...]
|
||||
[--order_by=<order_by>...] [--no_totals] [--page=<page>] [--page_size=<page_size>]
|
||||
[--wallet_id=<wallet_id>] [--include_purchase_receipt] [--include_is_my_output]
|
||||
[--has_source | --has_no_source]
|
||||
[--remove_duplicates] [--has_source | --has_no_source]
|
||||
[--new_sdk_server=<new_sdk_server>]
|
||||
|
||||
Options:
|
||||
|
@ -2461,6 +2461,8 @@ class Daemon(metaclass=JSONRPCServerType):
|
|||
has purchased the claim
|
||||
--include_is_my_output : (bool) lookup and include a boolean indicating
|
||||
if claim being resolved is yours
|
||||
--remove_duplicates : (bool) removes duplicated content from search by picking either the
|
||||
original claim or the oldest matching repost
|
||||
--has_source : (bool) find claims containing a source field
|
||||
--has_no_source : (bool) find claims not containing a source field
|
||||
--new_sdk_server=<new_sdk_server> : (str) URL of the new SDK server (EXPERIMENTAL)
|
||||
|
|
|
@ -252,20 +252,15 @@ class SearchIndex:
|
|||
if not kwargs['channel_id'] or not isinstance(kwargs['channel_id'], str):
|
||||
return [], 0, 0
|
||||
try:
|
||||
if 'limit_claims_per_channel' in kwargs:
|
||||
return await self.search_ahead(**kwargs), 0, 0
|
||||
else:
|
||||
result = (await self.search_client.search(
|
||||
expand_query(**kwargs), index=self.index,
|
||||
track_total_hits=False if kwargs.get('no_totals') else 10_000
|
||||
))['hits']
|
||||
return await self.search_ahead(**kwargs)
|
||||
except NotFoundError:
|
||||
return [], 0, 0
|
||||
return expand_result(result['hits']), 0, result.get('total', {}).get('value', 0)
|
||||
|
||||
async def search_ahead(self, **kwargs):
|
||||
# 'limit_claims_per_channel' case. Fetch 1000 results, reorder, slice, inflate and return
|
||||
per_channel_per_page = kwargs.pop('limit_claims_per_channel')
|
||||
per_channel_per_page = kwargs.pop('limit_claims_per_channel', 0) or 0
|
||||
remove_duplicates = kwargs.pop('remove_duplicates', False)
|
||||
page_size = kwargs.pop('limit', 10)
|
||||
offset = kwargs.pop('offset', 0)
|
||||
kwargs['limit'] = 1000
|
||||
|
@ -278,14 +273,37 @@ class SearchIndex:
|
|||
reordered_hits = cache_item.result
|
||||
else:
|
||||
query = expand_query(**kwargs)
|
||||
reordered_hits = await self.__search_ahead(query, page_size, per_channel_per_page)
|
||||
search_hits = deque((await self.search_client.search(
|
||||
query, index=self.index, track_total_hits=False,
|
||||
_source_includes=['_id', 'channel_id', 'reposted_claim_id', 'creation_height']
|
||||
))['hits']['hits'])
|
||||
if remove_duplicates:
|
||||
search_hits = self.__remove_duplicates(search_hits)
|
||||
if per_channel_per_page > 0:
|
||||
reordered_hits = self.__search_ahead(search_hits, page_size, per_channel_per_page)
|
||||
else:
|
||||
reordered_hits = [(hit['_id'], hit['_source']['channel_id']) for hit in search_hits]
|
||||
cache_item.result = reordered_hits
|
||||
return list(await self.get_many(*(claim_id for claim_id, _ in reordered_hits[offset:(offset + page_size)])))
|
||||
result = list(await self.get_many(*(claim_id for claim_id, _ in reordered_hits[offset:(offset + page_size)])))
|
||||
return result, 0, len(reordered_hits)
|
||||
|
||||
async def __search_ahead(self, query: dict, page_size: int, per_channel_per_page: int):
|
||||
search_hits = deque((await self.search_client.search(
|
||||
query, index=self.index, track_total_hits=False, _source_includes=['_id', 'channel_id']
|
||||
))['hits']['hits'])
|
||||
def __remove_duplicates(self, search_hits: deque) -> deque:
    """Collapse hits that point at the same content down to a single hit.

    Hits are grouped by their ``reposted_claim_id`` when it is truthy,
    otherwise by their own ``_id``. Within a group only the hit with the
    lowest ``creation_height`` survives; when heights are equal, the hit
    seen first is kept.

    :param search_hits: hits carrying ``_id`` plus a ``_source`` mapping with
        ``creation_height`` and ``reposted_claim_id`` keys.
    :return: a new deque with the surviving hits in their original order;
        ``search_hits`` itself is left untouched.
    """
    # content id (reposted claim id, or the hit's own id) -> (creation_height, _id of the hit kept so far)
    winners = {}
    discarded = set()
    for candidate in search_hits:
        own_id = candidate['_id']
        height = candidate['_source']['creation_height']
        content_id = candidate['_source']['reposted_claim_id'] or own_id
        if content_id in winners:
            kept_height, kept_id = winners[content_id]
            if height < kept_height:
                # strictly older than the current winner: swap them
                winners[content_id] = (height, own_id)
                discarded.add(kept_id)
            else:
                discarded.add(own_id)
            continue
        winners[content_id] = (height, own_id)
    return deque(filter(lambda candidate: candidate['_id'] not in discarded, search_hits))
|
||||
|
||||
def __search_ahead(self, search_hits: list, page_size: int, per_channel_per_page: int):
|
||||
reordered_hits = []
|
||||
channel_counters = Counter()
|
||||
next_page_hits_maybe_check_later = deque()
|
||||
|
@ -298,7 +316,7 @@ class SearchIndex:
|
|||
break # means last page was incomplete and we are left with bad replacements
|
||||
for _ in range(len(next_page_hits_maybe_check_later)):
|
||||
claim_id, channel_id = next_page_hits_maybe_check_later.popleft()
|
||||
if channel_counters[channel_id] < per_channel_per_page:
|
||||
if per_channel_per_page > 0 and channel_counters[channel_id] < per_channel_per_page:
|
||||
reordered_hits.append((claim_id, channel_id))
|
||||
channel_counters[channel_id] += 1
|
||||
else:
|
||||
|
@ -306,7 +324,7 @@ class SearchIndex:
|
|||
while search_hits:
|
||||
hit = search_hits.popleft()
|
||||
hit_id, hit_channel_id = hit['_id'], hit['_source']['channel_id']
|
||||
if hit_channel_id is None:
|
||||
if hit_channel_id is None or per_channel_per_page <= 0:
|
||||
reordered_hits.append((hit_id, hit_channel_id))
|
||||
elif channel_counters[hit_channel_id] < per_channel_per_page:
|
||||
reordered_hits.append((hit_id, hit_channel_id))
|
||||
|
|
|
@ -398,6 +398,32 @@ class ClaimSearchCommand(ClaimTestCase):
|
|||
limit_claims_per_channel=3, claim_type='stream'
|
||||
)
|
||||
|
||||
async def test_no_duplicates(self):
    """Check that `remove_duplicates` keeps one claim per piece of content.

    Two original streams are created and then reposted ten times, each
    repost from its own freshly created channel, alternating between the
    two originals. The searches below assert which claims remain once
    duplicates are removed, under both height orderings.
    """
    await self.generate(10)
    assert_finds = self.assertFindsClaims
    reposts, owners = [], []
    original_a = await self.stream_create('original_claim0')
    original_b = await self.stream_create('original_claim1')
    for index in range(10):
        # even iterations repost the second original, odd ones the first
        target = original_b if index % 2 == 0 else original_a
        target_id = self.get_claim_id(target)
        owner = await self.channel_create(f'@chan{index}', bid='0.001')
        owners.append(owner)
        repost = await self.stream_repost(
            target_id, f'claim{index}', bid='0.001', channel_id=self.get_claim_id(owner)
        )
        reposts.append(repost)

    await assert_finds(
        [original_a, original_b] + owners,
        remove_duplicates=True, order_by=['^height']
    )
    await assert_finds(
        owners[::-1] + [original_b, original_a],
        remove_duplicates=True, order_by=['height']
    )
    # the original claims don't show up, so we pick the oldest reposts
    oldest_first = [owners[0], reposts[0], owners[1], reposts[1]] + owners[2:]
    await assert_finds(
        oldest_first,
        height='>218',
        remove_duplicates=True, order_by=['^height']
    )
    # limit claims per channel, invert order, oldest ones are still chosen
    newest_first = list(reversed(owners[2:])) + [reposts[1], owners[1], reposts[0], owners[0]]
    await assert_finds(
        newest_first,
        height='>218', limit_claims_per_channel=1,
        remove_duplicates=True, order_by=['height']
    )
|
||||
|
||||
async def test_limit_claims_per_channel_across_sorted_pages(self):
|
||||
await self.generate(10)
|
||||
match = self.assertFindsClaims
|
||||
|
@ -429,6 +455,12 @@ class ClaimSearchCommand(ClaimTestCase):
|
|||
[claims[6], claims[7], last], page_size=4, page=3,
|
||||
limit_claims_per_channel=1, claim_type='stream', order_by=['^height']
|
||||
)
|
||||
# feature disabled on None, 0 or negative values
|
||||
for limit in [None, 0, -1]:
|
||||
await match(
|
||||
[first, second] + claims + [last],
|
||||
limit_claims_per_channel=limit, claim_type='stream', order_by=['^height']
|
||||
)
|
||||
|
||||
async def test_claim_type_and_media_type_search(self):
|
||||
# create an invalid/unknown claim
|
||||
|
|
Loading…
Reference in a new issue