From 03455310aeea44e258e9871bee51218a51e45cc0 Mon Sep 17 00:00:00 2001 From: Lex Berezhny Date: Sun, 28 Apr 2019 15:21:01 -0400 Subject: [PATCH] rewrote URL parser --- lbrynet/schema/uri.py | 185 ---------------------------------- lbrynet/schema/url.py | 107 ++++++++++++++++++++ tests/unit/schema/test_uri.py | 113 --------------------- tests/unit/schema/test_url.py | 85 ++++++++++++++++ 4 files changed, 192 insertions(+), 298 deletions(-) delete mode 100644 lbrynet/schema/uri.py create mode 100644 lbrynet/schema/url.py delete mode 100644 tests/unit/schema/test_uri.py create mode 100644 tests/unit/schema/test_url.py diff --git a/lbrynet/schema/uri.py b/lbrynet/schema/uri.py deleted file mode 100644 index 60e9ef022..000000000 --- a/lbrynet/schema/uri.py +++ /dev/null @@ -1,185 +0,0 @@ -import re - -PROTOCOL = 'lbry://' -CHANNEL_CHAR = '@' -CLAIM_ID_CHAR = '#' -CLAIM_SEQUENCE_CHAR = ':' -BID_POSITION_CHAR = '$' -PATH_CHAR = '/' -QUERY_CHAR = '?' - -CLAIM_ID_MAX_LENGTH = 40 -CHANNEL_NAME_MIN_LENGTH = 1 - - -class URIParseError(Exception): - pass - - -class URI(object): - __slots__ = ['name', 'claim_sequence', 'bid_position', 'claim_id', 'path'] - - def __init__(self, name, claim_sequence=None, bid_position=None, claim_id=None, path=None): - if len([v for v in [claim_sequence, bid_position, claim_id] if v is not None]) > 1: - raise ValueError( - "Only one of these may be present at a time: claim_sequence, bid_position, claim_id" - ) - - self.name = name - self.claim_sequence = claim_sequence - self.bid_position = bid_position - self.claim_id = claim_id - self.path = path - - if self.path is not None and not self.contains_channel: - raise ValueError("Content claims cannot have paths") - - def __str__(self): - return self.to_uri_string() - - def __eq__(self, other): - for prop in self.__slots__: - if not hasattr(other, prop) or getattr(self, prop) != getattr(other, prop): - return False - return self.__class__ == other.__class__ - @property - def channel_name(self): - 
return self.name if self.contains_channel else None - - @property - def claim_name(self): - return self.name if not self.contains_channel else self.path - - @property - def contains_channel(self): - return self.name.startswith(CHANNEL_CHAR) - - @property - def is_channel(self): - return self.contains_channel and not self.path - - def to_uri_string(self): - uri_string = PROTOCOL + "%s" % self.name - - if self.claim_sequence is not None: - uri_string += CLAIM_SEQUENCE_CHAR + "%i" % self.claim_sequence - elif self.bid_position is not None: - uri_string += BID_POSITION_CHAR + "%i" % self.bid_position - elif self.claim_id is not None: - uri_string += CLAIM_ID_CHAR + "%s" % self.claim_id - - if self.path is not None: - uri_string += PATH_CHAR + "%s" % self.path - - return uri_string - - def to_dict(self): - return { - "name": self.name, - 'claim_sequence': self.claim_sequence, - 'bid_position': self.bid_position, - 'claim_id': self.claim_id, - 'path': self.path, - } - - @classmethod - def from_uri_string(cls, uri_string): - """ - Parses LBRY uri into its components - - :param uri_string: format - lbry://name:n$rank#id/path - optional modifiers: - claim_sequence (int): the nth claim to the name - bid_position (int): the bid queue position of the claim for the name - claim_id (str): the claim id for the claim - path (str): claim within a channel - :return: URI - """ - match = re.match(get_schema_regex(), uri_string) - - if match is None: - raise URIParseError('Invalid URI') - - if match.group('content_name') and match.group('path'): - raise URIParseError('Only channels may have paths') - - return cls( - name=match.group("content_or_channel_name"), - claim_sequence=int(match.group("claim_sequence")) if match.group( - "claim_sequence") is not None else None, - bid_position=int(match.group("bid_position")) if match.group( - "bid_position") is not None else None, - claim_id=match.group("claim_id"), - path=match.group("path") - ) - - @classmethod - def from_dict(cls, uri_dict): 
- """ - Creates URI from dict - - :return: URI - """ - return cls(**uri_dict) - - -def get_schema_regex(): - def _named(name, regex): - return "(?P<" + name + ">" + regex + ")" - - def _group(regex): - return "(?:" + regex + ")" - - # TODO: regex should include the fact that content names cannot have paths - # right now this is only enforced in code, not in the regex - - # Escape constants - claim_id_char = re.escape(CLAIM_ID_CHAR) - claim_sequence_char = re.escape(CLAIM_SEQUENCE_CHAR) - bid_position_char = re.escape(BID_POSITION_CHAR) - channel_char = re.escape(CHANNEL_CHAR) - path_char = re.escape(PATH_CHAR) - protocol = _named("protocol", re.escape(PROTOCOL)) - - # Define basic building blocks - valid_name_char = "[^=&#:$@%?/]" # from the grammar section of https://spec.lbry.io/ - name_content = valid_name_char + '+' - name_min_channel_length = valid_name_char + '{' + str(CHANNEL_NAME_MIN_LENGTH) + ',}' - - positive_number = "[1-9][0-9]*" - number = '\-?' + positive_number - - # Define URI components - content_name = _named("content_name", name_content) - channel_name = _named("channel_name", channel_char + name_min_channel_length) - content_or_channel_name = _named("content_or_channel_name", content_name + "|" + channel_name) - - claim_id_piece = _named("claim_id", "[0-9a-f]{1," + str(CLAIM_ID_MAX_LENGTH) + "}") - claim_id = _group(claim_id_char + claim_id_piece) - - bid_position_piece = _named("bid_position", number) - bid_position = _group(bid_position_char + bid_position_piece) - - claim_sequence_piece = _named("claim_sequence", number) - claim_sequence = _group(claim_sequence_char + claim_sequence_piece) - - modifier = _named("modifier", claim_id + "|" + bid_position + "|" + claim_sequence) - - path_piece = _named("path", name_content) - path = _group(path_char + path_piece) - - # Combine components - uri = _named("uri", ( - '^' + - protocol + '?' + - content_or_channel_name + - modifier + '?' + - path + '?' 
# lbrynet/schema/url.py
"""Parsing and formatting of LBRY URLs such as ``lbry://@channel#id/stream:1``."""
import re
from typing import NamedTuple, Tuple


def _create_url_regex():
    """Build the regular expression source that recognises a LBRY URL.

    The expression accepts an optional ``lbry://`` scheme followed by exactly
    one of: ``@channel/stream``, ``@channel`` or ``stream``.  Every claim may
    carry at most one modifier (``#claim_id``, ``:sequence`` or
    ``$amount_order``).  Each alternative uses its own group-name prefix
    (``channel_with_stream``, ``stream_in_channel``, ``channel``, ``stream``)
    because a group name may only appear once in a pattern.

    :return: the pattern as a ``str``
    """
    # see https://spec.lbry.io/

    def _named(name, regex):
        return "(?P<" + name + ">" + regex + ")"

    def _group(regex):
        return "(?:" + regex + ")"

    def _oneof(*choices):
        return _group('|'.join(choices))

    def _claim(name, prefix=""):
        # <prefix><name chars> followed by an optional, single modifier.
        return _group(
            _named(name + "_name", prefix + "[^=&#:$@%?/]+") +
            _oneof(
                _group('#' + _named(name + "_claim_id", "[0-9a-f]{1,40}")),
                _group(':' + _named(name + "_sequence", '[1-9][0-9]*')),
                _group(r'\$' + _named(name + "_amount_order", '[1-9][0-9]*'))
            ) + '?'
        )

    return (
        '^' +
        _named("scheme", "lbry://") + '?' +
        _oneof(
            _group(_claim("channel_with_stream", "@") + "/" + _claim("stream_in_channel")),
            _claim("channel", "@"),
            _claim("stream")
        ) +
        # ``\Z`` rather than ``$``: ``$`` also matches just before a trailing
        # newline, which let inputs such as "test:1\n" through.
        r'\Z'
    )


URL_REGEX = _create_url_regex()

# Compiled once at import time; URL.parse() is the only user in this module.
_URL_PATTERN = re.compile(URL_REGEX)


class PathSegment(NamedTuple):
    """A single claim reference: a name plus at most one modifier."""

    # Claim name; channel names keep their leading '@'.
    name: str
    # Full or partial lowercase-hex claim id (the '#' modifier).
    claim_id: str = None
    # The ':' modifier.  NOTE: URL.parse() stores the matched digits as text,
    # it does not convert them to int.
    sequence: int = None
    # The '$' modifier.  Same note as for ``sequence``.
    amount_order: int = None

    def __str__(self):
        """Render the segment the way it appears inside a URL."""
        if self.claim_id is not None:
            return f"{self.name}#{self.claim_id}"
        elif self.sequence is not None:
            return f"{self.name}:{self.sequence}"
        elif self.amount_order is not None:
            return f"{self.name}${self.amount_order}"
        return self.name


class URL(NamedTuple):
    """A parsed LBRY URL made of an optional channel and an optional stream."""

    stream: PathSegment
    channel: PathSegment

    @property
    def has_channel(self):
        """True when a channel segment is present."""
        return self.channel is not None

    @property
    def has_stream(self):
        """True when a stream segment is present."""
        return self.stream is not None

    @property
    def parts(self) -> Tuple:
        """Segments in URL order: the channel (if any) first, then the stream."""
        if self.has_channel:
            if self.has_stream:
                return self.channel, self.stream
            return self.channel,
        return self.stream,

    @property
    def first(self):
        """The leftmost segment of the URL."""
        return self.parts[0]

    def __str__(self):
        """Render the URL; the ``lbry://`` scheme is always included."""
        return f"lbry://{'/'.join(str(p) for p in self.parts)}"

    @classmethod
    def parse(cls, url):
        """Parse *url* into a :class:`URL`.

        :param url: URL text, with or without the ``lbry://`` scheme
        :return: URL whose ``channel`` / ``stream`` are ``PathSegment`` or None
        :raises ValueError: with message ``'Invalid LBRY URL'`` when *url*
            does not match the grammar
        """
        match = _URL_PATTERN.match(url)

        if match is None:
            raise ValueError('Invalid LBRY URL')

        segments = {}
        parts = match.groupdict()
        for segment in ('channel', 'stream', 'channel_with_stream', 'stream_in_channel'):
            if parts[f'{segment}_name'] is not None:
                segments[segment] = PathSegment(
                    parts[f'{segment}_name'],
                    parts[f'{segment}_claim_id'],
                    parts[f'{segment}_sequence'],
                    parts[f'{segment}_amount_order']
                )

        # The "@channel/stream" alternative captures under its own group
        # names; fold those back onto the public channel/stream slots.
        if 'channel_with_stream' in segments:
            segments['channel'] = segments['channel_with_stream']
            segments['stream'] = segments['stream_in_channel']

        return cls(segments.get('stream', None), segments.get('channel', None))
("lbry://@test1:1/fakepath", URI("@test1", claim_sequence=1, path="fakepath"), True, False, "fakepath", "@test1"), - ("lbry://@test1$1/fakepath", URI("@test1", bid_position=1, path="fakepath"), True, False, "fakepath", "@test1"), - ("lbry://@test1#abcdef/fakepath", URI("@test1", claim_id="abcdef", path="fakepath"), True, False, "fakepath", - "@test1"), - ("@z", URI("@z"), True, True, None, "@z"), - ("@yx", URI("@yx"), True, True, None, "@yx"), - ("@abc", URI("@abc"), True, True, None, "@abc") -] - -parsed_uri_raises = [ - ("lbry://", URIParseError), - ("lbry://test:3$1", URIParseError), - ("lbry://test$1:1", URIParseError), - ("lbry://test#x", URIParseError), - ("lbry://test#x/page", URIParseError), - ("lbry://test$", URIParseError), - ("lbry://test#", URIParseError), - ("lbry://test:", URIParseError), - ("lbry://test$x", URIParseError), - ("lbry://test:x", URIParseError), - ("lbry://@test@", URIParseError), - ("lbry://@test:", URIParseError), - ("lbry://test@", URIParseError), - ("lbry://tes@t", URIParseError), - ("lbry://test:1#%s" % claim_id_1, URIParseError), - ("lbry://test:0", URIParseError), - ("lbry://test$0", URIParseError), - ("lbry://test/path", URIParseError), - ("lbry://@test1#abcdef/fakepath:1", URIParseError), - ("lbry://@test1:1/fakepath:1", URIParseError), - ("lbry://@test1:1ab/fakepath", URIParseError), - ("lbry://test:1:1:1", URIParseError), - ("whatever/lbry://test", URIParseError), - ("lbry://lbry://test", URIParseError), - ("lbry://@/what", URIParseError), - ("lbry://abc:0x123", URIParseError), - ("lbry://abc:0x123/page", URIParseError), - ("lbry://@test1#ABCDEF/fakepath", URIParseError), - ("test:0001", URIParseError), - ("lbry://@test1$1/fakepath?arg1&arg2&arg3", URIParseError) -] - - -class TestURIParser(unittest.TestCase): - - maxDiff = 4000 - longMessage = True - - def test_uri_parse(self): - for test_string, expected_uri_obj, contains_channel, is_channel, claim_name, channel_name in parsed_uri_matches: - try: - # string -> URI - 
# tests/unit/schema/test_url.py
import unittest

from lbrynet.schema.url import URL


claim_id = "63f2da17b0d90042c559cc73b6b17f853945c43e"


class TestURLParsing(unittest.TestCase):
    """Round-trip and rejection tests for ``URL.parse``."""

    segments = 'stream', 'channel'
    fields = 'name', 'claim_id', 'sequence', 'amount_order'

    def _assert_url(self, url_string, **kwargs):
        """Parse *url_string* and compare every segment field with *kwargs*.

        Keyword names are ``<segment>_<field>`` (for example
        ``channel_sequence``); a segment with no keyword at all must be
        ``None`` on the parsed URL, and unspecified fields must be ``None``.
        """
        url = URL.parse(url_string)
        # str(URL) always emits the scheme, so scheme-less inputs round-trip
        # to their "lbry://"-prefixed form rather than to themselves.
        if url_string.startswith('lbry://'):
            expected = url_string
        else:
            expected = f'lbry://{url_string}'
        self.assertEqual(expected, str(url))
        present = {}
        for key in kwargs:
            for segment_name in self.segments:
                if key.startswith(segment_name):
                    present[segment_name] = True
                    break
        for segment_name in self.segments:
            segment = getattr(url, segment_name)
            if segment_name not in present:
                self.assertIsNone(segment)
            else:
                for field in self.fields:
                    self.assertEqual(
                        getattr(segment, field),
                        kwargs.get(f'{segment_name}_{field}', None)
                    )

    def _fail_url(self, url):
        """Assert that parsing *url* raises the 'Invalid LBRY URL' error."""
        with self.assertRaisesRegex(ValueError, 'Invalid LBRY URL'):
            URL.parse(url)

    def test_parser_valid_urls(self):
        url = self._assert_url
        # stream
        url('test', stream_name='test')
        url('test:1', stream_name='test', stream_sequence='1')
        url('test$1', stream_name='test', stream_amount_order='1')
        url(f'test#{claim_id}', stream_name='test', stream_claim_id=claim_id)
        # channel
        url('@test', channel_name='@test')
        url('@test:1', channel_name='@test', channel_sequence='1')
        url('@test$1', channel_name='@test', channel_amount_order='1')
        url(f'@test#{claim_id}', channel_name='@test', channel_claim_id=claim_id)
        # channel/stream
        url('lbry://@test/stuff', channel_name='@test', stream_name='stuff')
        url('lbry://@test:1/stuff', channel_name='@test', channel_sequence='1', stream_name='stuff')
        url('lbry://@test$1/stuff', channel_name='@test', channel_amount_order='1', stream_name='stuff')
        url(f'lbry://@test#{claim_id}/stuff', channel_name='@test', channel_claim_id=claim_id, stream_name='stuff')

    def test_parser_invalid_urls(self):
        fail = self._fail_url
        fail("lbry://")
        fail("lbry://test:3$1")
        fail("lbry://test$1:1")
        fail("lbry://test#x")
        fail("lbry://test#x/page")
        fail("lbry://test$")
        fail("lbry://test#")
        fail("lbry://test:")
        fail("lbry://test$x")
        fail("lbry://test:x")
        fail("lbry://@test@")
        fail("lbry://@test:")
        fail("lbry://test@")
        fail("lbry://tes@t")
        fail(f"lbry://test:1#{claim_id}")
        fail("lbry://test:0")
        fail("lbry://test$0")
        fail("lbry://test/path")
        fail("lbry://@test1:1ab/fakepath")
        fail("lbry://test:1:1:1")
        fail("whatever/lbry://test")
        fail("lbry://lbry://test")
        fail("lbry://@/what")
        fail("lbry://abc:0x123")
        fail("lbry://abc:0x123/page")
        fail("lbry://@test1#ABCDEF/fakepath")
        fail("test:0001")
        fail("lbry://@test1$1/fakepath?arg1&arg2&arg3")