Merge pull request #3301 from lbryio/no_repeat_claim_id

add `--remove_duplicates` to the search api
2021-05-28 11:01:15 -04:00 · 2021-05-28 11:01:15 -04:00 · 935adfb51a
commit 935adfb51a
parent 099f3b6a62 3974df4a62
4 changed files with 213 additions and 155 deletions
--- a/docs/api.json
+++ b/docs/api.json
--- a/lbry/extras/daemon/daemon.py
+++ b/lbry/extras/daemon/daemon.py
@ -2352,7 +2352,7 @@ class Daemon(metaclass=JSONRPCServerType):
                         [--not_locations=<not_locations>...]
                         [--order_by=<order_by>...] [--no_totals] [--page=<page>] [--page_size=<page_size>]
                         [--wallet_id=<wallet_id>] [--include_purchase_receipt] [--include_is_my_output]
-                         [--has_source | --has_no_source]
+                         [--remove_duplicates] [--has_source | --has_no_source]
                         [--new_sdk_server=<new_sdk_server>]

        Options:
@ -2461,6 +2461,8 @@ class Daemon(metaclass=JSONRPCServerType):
                                                     has purchased the claim
            --include_is_my_output          : (bool) lookup and include a boolean indicating
                                                     if claim being resolved is yours
+            --remove_duplicates             : (bool) removes duplicated content from search by picking either the
+                                                     original claim or the oldest matching repost
            --has_source                    : (bool) find claims containing a source field
            --has_no_source                 : (bool) find claims not containing a source field
           --new_sdk_server=<new_sdk_server> : (str) URL of the new SDK server (EXPERIMENTAL)
--- a/lbry/wallet/server/db/elasticsearch/search.py
+++ b/lbry/wallet/server/db/elasticsearch/search.py
@ -252,20 +252,15 @@ class SearchIndex:
            if not kwargs['channel_id'] or not isinstance(kwargs['channel_id'], str):
                return [], 0, 0
        try:
-            if 'limit_claims_per_channel' in kwargs:
-                return await self.search_ahead(**kwargs), 0, 0
-            else:
-                result = (await self.search_client.search(
-                    expand_query(**kwargs), index=self.index,
-                    track_total_hits=False if kwargs.get('no_totals') else 10_000
-                ))['hits']
+            return await self.search_ahead(**kwargs)
        except NotFoundError:
            return [], 0, 0
        return expand_result(result['hits']), 0, result.get('total', {}).get('value', 0)

    async def search_ahead(self, **kwargs):
        # 'limit_claims_per_channel' case. Fetch 1000 results, reorder, slice, inflate and return
-        per_channel_per_page = kwargs.pop('limit_claims_per_channel')
+        per_channel_per_page = kwargs.pop('limit_claims_per_channel', 0) or 0
+        remove_duplicates = kwargs.pop('remove_duplicates', False)
        page_size = kwargs.pop('limit', 10)
        offset = kwargs.pop('offset', 0)
        kwargs['limit'] = 1000
@ -278,14 +273,37 @@ class SearchIndex:
                    reordered_hits = cache_item.result
                else:
                    query = expand_query(**kwargs)
-                    reordered_hits = await self.__search_ahead(query, page_size, per_channel_per_page)
-                    cache_item.result = reordered_hits
-        return list(await self.get_many(*(claim_id for claim_id, _ in reordered_hits[offset:(offset + page_size)])))
-
-    async def __search_ahead(self, query: dict, page_size: int, per_channel_per_page: int):
                    search_hits = deque((await self.search_client.search(
-            query, index=self.index, track_total_hits=False, _source_includes=['_id', 'channel_id']
+                        query, index=self.index, track_total_hits=False,
+                        _source_includes=['_id', 'channel_id', 'reposted_claim_id', 'creation_height']
                    ))['hits']['hits'])
+                    if remove_duplicates:
+                        search_hits = self.__remove_duplicates(search_hits)
+                    if per_channel_per_page > 0:
+                        reordered_hits = self.__search_ahead(search_hits, page_size, per_channel_per_page)
+                    else:
+                        reordered_hits = [(hit['_id'], hit['_source']['channel_id']) for hit in search_hits]
+                    cache_item.result = reordered_hits
+        result = list(await self.get_many(*(claim_id for claim_id, _ in reordered_hits[offset:(offset + page_size)])))
+        return result, 0, len(reordered_hits)
+
+    def __remove_duplicates(self, search_hits: deque) -> deque:
+        known_ids = {}  # content id (reposted claim id, or the hit's own id) -> (creation_height, _id of the kept hit)
+        dropped = set()
+        for hit in search_hits:
+            hit_height, hit_id = hit['_source']['creation_height'], hit['_source']['reposted_claim_id'] or hit['_id']
+            if hit_id not in known_ids:
+                known_ids[hit_id] = (hit_height, hit['_id'])
+            else:
+                previous_height, previous_id = known_ids[hit_id]
+                if hit_height < previous_height:
+                    known_ids[hit_id] = (hit_height, hit['_id'])
+                    dropped.add(previous_id)
+                else:
+                    dropped.add(hit['_id'])
+        return deque(hit for hit in search_hits if hit['_id'] not in dropped)
+
+    def __search_ahead(self, search_hits: list, page_size: int, per_channel_per_page: int):
        reordered_hits = []
        channel_counters = Counter()
        next_page_hits_maybe_check_later = deque()
@ -298,7 +316,7 @@ class SearchIndex:
                break  # means last page was incomplete and we are left with bad replacements
            for _ in range(len(next_page_hits_maybe_check_later)):
                claim_id, channel_id = next_page_hits_maybe_check_later.popleft()
-                if channel_counters[channel_id] < per_channel_per_page:
+                if per_channel_per_page > 0 and channel_counters[channel_id] < per_channel_per_page:
                    reordered_hits.append((claim_id, channel_id))
                    channel_counters[channel_id] += 1
                else:
@ -306,7 +324,7 @@ class SearchIndex:
            while search_hits:
                hit = search_hits.popleft()
                hit_id, hit_channel_id = hit['_id'], hit['_source']['channel_id']
-                if hit_channel_id is None:
+                if hit_channel_id is None or per_channel_per_page <= 0:
                    reordered_hits.append((hit_id, hit_channel_id))
                elif channel_counters[hit_channel_id] < per_channel_per_page:
                    reordered_hits.append((hit_id, hit_channel_id))
--- a/tests/integration/blockchain/test_claim_commands.py
+++ b/tests/integration/blockchain/test_claim_commands.py
@ -398,6 +398,32 @@ class ClaimSearchCommand(ClaimTestCase):
            limit_claims_per_channel=3, claim_type='stream'
        )

+    async def test_no_duplicates(self):
+        await self.generate(10)
+        match = self.assertFindsClaims
+        claims = []
+        channels = []
+        first = await self.stream_create('original_claim0')
+        second = await self.stream_create('original_claim1')
+        for i in range(10):
+            repost_id = self.get_claim_id(second if i % 2 == 0 else first)
+            channel = await self.channel_create(f'@chan{i}', bid='0.001')
+            channels.append(channel)
+            claims.append(
+                await self.stream_repost(repost_id, f'claim{i}', bid='0.001', channel_id=self.get_claim_id(channel)))
+        await match([first, second] + channels,
+                    remove_duplicates=True, order_by=['^height'])
+        await match(list(reversed(channels)) + [second, first],
+                    remove_duplicates=True, order_by=['height'])
+        # the original claims don't show up, so we pick the oldest reposts
+        await match([channels[0], claims[0], channels[1], claims[1]] + channels[2:],
+                    height='>218',
+                    remove_duplicates=True, order_by=['^height'])
+        # limit claims per channel, invert order, oldest ones are still chosen
+        await match(channels[2:][::-1] + [claims[1], channels[1], claims[0], channels[0]],
+                    height='>218', limit_claims_per_channel=1,
+                    remove_duplicates=True, order_by=['height'])
+
    async def test_limit_claims_per_channel_across_sorted_pages(self):
        await self.generate(10)
        match = self.assertFindsClaims
@ -429,6 +455,12 @@ class ClaimSearchCommand(ClaimTestCase):
            [claims[6], claims[7], last], page_size=4, page=3,
            limit_claims_per_channel=1, claim_type='stream', order_by=['^height']
        )
+        # feature disabled on None, 0 or negative values
+        for limit in [None, 0, -1]:
+            await match(
+                [first, second] + claims + [last],
+                limit_claims_per_channel=limit, claim_type='stream', order_by=['^height']
+            )

    async def test_claim_type_and_media_type_search(self):
        # create an invalid/unknown claim