From 557348e345a979e7fefd09147e96876f9461f7f0 Mon Sep 17 00:00:00 2001 From: Eugene Dubinin Date: Thu, 13 Jan 2022 18:26:07 +0200 Subject: [PATCH 1/4] detect media_type from the file contents --- lbry/schema/mime_types.py | 33 +++++++++++++++++++++++++++++++++ setup.py | 3 ++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/lbry/schema/mime_types.py b/lbry/schema/mime_types.py index 95e6c08dc..5dee9ccbc 100644 --- a/lbry/schema/mime_types.py +++ b/lbry/schema/mime_types.py @@ -1,4 +1,6 @@ import os +import filetype +import logging types_map = { # http://www.iana.org/assignments/media-types @@ -166,10 +168,41 @@ types_map = { '.wmv': ('video/x-ms-wmv', 'video') } +# maps detected extensions to the possible analogs +# i.e. .cbz file is actually a .zip +synonyms_map = { + '.zip': ['.cbz'], + '.rar': ['.cbr'], + '.ar': ['.a'] +} + +log = logging.getLogger(__name__) + def guess_media_type(path): _, ext = os.path.splitext(path) extension = ext.strip().lower() + + # try detecting real file format if path points to a readable file + try: + kind = filetype.guess(path) + if kind: + realext = f".{kind.extension}" + + # override extension parsed from file... + if extension != realext: + if extension: + log.warning(f"file extension does not match it's contents {path}, identified as {realext}") + else: + log.debug(f"file {path} does not have extension, identified by contents as {realext}") + + # don't do anything if extension is in synonyms + if not extension in synonyms_map[realext]: + extension = realext + + except OSError as error: + pass + if extension[1:]: if extension in types_map: return types_map[extension] diff --git a/setup.py b/setup.py index 56832e8eb..da749bfd9 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,8 @@ setup( 'attrs==18.2.0', 'pylru==1.1.0', 'elasticsearch==7.10.1', - 'grpcio==1.38.0' + 'grpcio==1.38.0', + 'filetype==1.0.9' ] + PLYVEL, extras_require={ 'torrent': ['lbry-libtorrent'], From 5836a93b21070dda6704af40f782a0e2ba0a4a59 Mon Sep 17 00:00:00 2001 From: Eugene Dubinin Date: Thu, 13 Jan 2022 18:52:49 +0200 Subject: [PATCH 2/4] fixes KeyError on missing synonyms --- lbry/schema/mime_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lbry/schema/mime_types.py b/lbry/schema/mime_types.py index 5dee9ccbc..daaf202e2 100644 --- a/lbry/schema/mime_types.py +++ b/lbry/schema/mime_types.py @@ -197,7 +197,7 @@ def guess_media_type(path): log.debug(f"file {path} does not have extension, identified by contents as {realext}") # don't do anything if extension is in synonyms - if not extension in synonyms_map[realext]: + if not extension in synonyms_map.get(realext, []): extension = realext except OSError as error: From babc54a240b01e9ac961d10bd9b81eeb2450a878 Mon Sep 17 00:00:00 2001 From: Eugene Dubinin Date: Sat, 29 Jan 2022 15:22:02 +0200 Subject: [PATCH 3/4] adjusts code style --- lbry/schema/mime_types.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lbry/schema/mime_types.py b/lbry/schema/mime_types.py index daaf202e2..0d75268db 100644 --- a/lbry/schema/mime_types.py +++ b/lbry/schema/mime_types.py @@ -187,18 +187,18 @@ def guess_media_type(path): try: kind = filetype.guess(path) if kind: - realext = f".{kind.extension}" + real_extension = f".{kind.extension}" # override extension parsed from file... - if extension != realext: + if extension != real_extension: if extension: - log.warning(f"file extension does not match it's contents {path}, identified as {realext}") + log.warning(f"file extension does not match it's contents: {path}, identified as {real_extension}") else: - log.debug(f"file {path} does not have extension, identified by contents as {realext}") + log.debug(f"file {path} does not have extension, identified by it's contents as {real_extension}") # don't do anything if extension is in synonyms - if not extension in synonyms_map.get(realext, []): - extension = realext + if extension not in synonyms_map.get(real_extension, []): + extension = real_extension except OSError as error: pass From 9b463a8cabba5e7403cf36ad6a7573e19e992447 Mon Sep 17 00:00:00 2001 From: Eugene Dubinin Date: Sat, 29 Jan 2022 20:49:42 +0200 Subject: [PATCH 4/4] adds tests for guess_media_type removes unnecessary comments --- lbry/schema/mime_types.py | 3 -- tests/unit/schema/test_mime_types.py | 51 ++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) create mode 100644 tests/unit/schema/test_mime_types.py diff --git a/lbry/schema/mime_types.py b/lbry/schema/mime_types.py index 0d75268db..62505be04 100644 --- a/lbry/schema/mime_types.py +++ b/lbry/schema/mime_types.py @@ -183,20 +183,17 @@ def guess_media_type(path): _, ext = os.path.splitext(path) extension = ext.strip().lower() - # try detecting real file format if path points to a readable file try: kind = filetype.guess(path) if kind: real_extension = f".{kind.extension}" - # override extension parsed from file... if extension != real_extension: if extension: log.warning(f"file extension does not match it's contents: {path}, identified as {real_extension}") else: log.debug(f"file {path} does not have extension, identified by it's contents as {real_extension}") - # don't do anything if extension is in synonyms if extension not in synonyms_map.get(real_extension, []): extension = real_extension diff --git a/tests/unit/schema/test_mime_types.py b/tests/unit/schema/test_mime_types.py new file mode 100644 index 000000000..6d5beed2b --- /dev/null +++ b/tests/unit/schema/test_mime_types.py @@ -0,0 +1,51 @@ +import unittest +import tempfile +import os + +from lbry.schema.mime_types import guess_media_type + +class MediaTypeTests(unittest.TestCase): + def test_guess_media_type_from_path_only(self): + kind = guess_media_type('/tmp/test.mkv') + self.assertEqual(kind, ('video/x-matroska', 'video')) + + def test_defaults_for_no_extension(self): + kind = guess_media_type('/tmp/test') + self.assertEqual(kind, ('application/octet-stream', 'binary')) + + def test_defaults_for_unknown_extension(self): + kind = guess_media_type('/tmp/test.unk') + self.assertEqual(kind, ('application/x-ext-unk', 'binary')) + + def test_spoofed_unknown(self): + with tempfile.TemporaryDirectory() as temp_dir: + file = os.path.join(temp_dir, 'spoofed_unknown.txt') + with open(file, 'wb') as fd: + bytes_lz4 = bytearray([0x04,0x22,0x4d,0x18]) + fd.write(bytes_lz4) + fd.close() + + kind = guess_media_type(file) + self.assertEqual(kind, ('application/x-ext-lz4', 'binary')) + + def test_spoofed_known(self): + with tempfile.TemporaryDirectory() as temp_dir: + file = os.path.join(temp_dir, 'spoofed_known.avi') + with open(file, 'wb') as fd: + bytes_zip = bytearray([0x50,0x4b,0x03,0x06]) + fd.write(bytes_zip) + fd.close() + + kind = guess_media_type(file) + self.assertEqual(kind, ('application/zip', 'binary')) + + def test_spoofed_synonym(self): + with tempfile.TemporaryDirectory() as temp_dir: + file = os.path.join(temp_dir, 'spoofed_known.cbz') + with open(file, 'wb') as fd: + bytes_zip = bytearray([0x50,0x4b,0x03,0x06]) + fd.write(bytes_zip) + fd.close() + + kind = guess_media_type(file) + self.assertEqual(kind, ('application/vnd.comicbook+zip', 'document'))