63 lines
1.9 KiB
Python
63 lines
1.9 KiB
Python
|
# -*- coding: utf-8 -*-

"""
binaryornot.helpers
-------------------

Helper utilities used by BinaryOrNot.
"""
|
||
|
def print_as_hex(s):
    """
    Print a string as hex bytes.

    Each element of ``s`` is written as lowercase hexadecimal without zero
    padding, and the values are joined with ``:`` on a single output line.

    :param s: Text string or bytes-like object to dump. Elements may be
        one-character strings or integers (iterating over ``bytes`` on
        Python 3 yields integers).
    """
    # ord() rejects integers, so only call it on one-character strings.
    print(":".join(
        "{0:x}".format(c if isinstance(c, int) else ord(c)) for c in s
    ))
|
||
|
|
||
|
|
||
|
def get_starting_chunk(filename, length=1024):
    """
    Read the leading bytes of a file.

    :param filename: Path of the file to open.
    :param length: Number of bytes to request from the start of the file,
        default 1024.
    :returns: The bytes read from the beginning of the file.
    """
    # Binary mode so the raw bytes come back without any text decoding
    with open(filename, 'rb') as handle:
        return handle.read(length)
|
||
|
|
||
|
|
||
|
# Byte values treated as text: a handful of whitespace/control escapes plus
# every value from 32 through 255.
_printable_extended_ascii = b'\n\r\t\f\b' + (
    # Python 2 (bytes is str): build the string by calling chr() explicitly
    b''.join(map(chr, range(32, 256)))
    if bytes is str
    # Python 3: bytes() accepts an iterable of integers directly
    else bytes(range(32, 256))
)
|
||
|
|
||
|
def is_binary_string(bytes_to_check):
    """
    Guess whether a chunk of bytes comes from a binary file.

    :param bytes_to_check: A chunk of bytes to check.
    :returns: True if the chunk appears to be binary, otherwise False.
    """
    # Simplified version of the Perl detection algorithm, based roughly on
    # Eli Bendersky's translation to Python:
    # http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/
    #
    # Compared with the Perl algorithm this leans slightly towards calling
    # a file text, because every ASCII compatible character set is accepted
    # as text rather than only utf-8.

    # An empty chunk counts as text
    if not bytes_to_check:
        return False

    # Any NUL byte marks the chunk as binary straight away
    if b'\x00' in bytes_to_check:
        return True

    # Delete every byte regarded as printable; what is left over are the
    # control characters.
    leftover = bytes_to_check.translate(None, _printable_extended_ascii)
    total = float(len(bytes_to_check))

    # Binary when control characters make up more than 30% of the chunk
    return float(len(leftover)) / total > 0.3
|