detect media_type from the file contents

This commit is contained in:
Eugene Dubinin 2022-01-13 18:26:07 +02:00
parent 9adfec6b00
commit 557348e345
2 changed files with 35 additions and 1 deletions

View file

@ -1,4 +1,6 @@
import os import os
import filetype
import logging
types_map = { types_map = {
# http://www.iana.org/assignments/media-types # http://www.iana.org/assignments/media-types
@ -166,10 +168,41 @@ types_map = {
'.wmv': ('video/x-ms-wmv', 'video') '.wmv': ('video/x-ms-wmv', 'video')
} }
# Aliases for content-detected formats.
# Key: the extension reported by content detection.
# Value: other extensions that share the same underlying format and are
# therefore kept as-is instead of being replaced by the detected one,
# e.g. a .cbz file is detected as .zip.
synonyms_map = {
    '.zip': ['.cbz'],
    '.rar': ['.cbr'],
    '.ar': ['.a'],
}
log = logging.getLogger(__name__)
def guess_media_type(path): def guess_media_type(path):
_, ext = os.path.splitext(path) _, ext = os.path.splitext(path)
extension = ext.strip().lower() extension = ext.strip().lower()
# try detecting real file format if path points to a readable file
try:
kind = filetype.guess(path)
if kind:
realext = f".{kind.extension}"
# override the extension parsed from the path when the contents say otherwise...
if extension != realext:
if extension:
log.warning(f"file extension does not match it's contents {path}, identified as {realext}")
else:
log.debug(f"file {path} does not have extension, identified by contents as {realext}")
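# keep the original extension if it is a known synonym of the detected one
# NOTE(review): synonyms_map[realext] raises KeyError when realext has no
# entry in synonyms_map, and only OSError is caught below; a lookup with a
# default (synonyms_map.get(realext, [])) would avoid that.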
if not extension in synonyms_map[realext]:
extension = realext
except OSError as error:
pass
if extension[1:]: if extension[1:]:
if extension in types_map: if extension in types_map:
return types_map[extension] return types_map[extension]

View file

@ -56,7 +56,8 @@ setup(
'attrs==18.2.0', 'attrs==18.2.0',
'pylru==1.1.0', 'pylru==1.1.0',
'elasticsearch==7.10.1', 'elasticsearch==7.10.1',
'grpcio==1.38.0' 'grpcio==1.38.0',
'filetype==1.0.9'
] + PLYVEL, ] + PLYVEL,
extras_require={ extras_require={
'torrent': ['lbry-libtorrent'], 'torrent': ['lbry-libtorrent'],