RA.Aid/ra_aid/utils/file_utils.py

61 lines
2.0 KiB
Python

"""Utility functions for file operations."""
import os
try:
import magic
except ImportError:
magic = None
def is_binary_file(filepath):
    """Return True when *filepath* looks like a binary file.

    Uses the optional ``magic`` module when it was imported successfully and
    delegates to ``_is_binary_fallback`` otherwise, or when the magic-based
    detection raises.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        False for empty files and for files whose MIME type or libmagic
        description looks textual; True otherwise. When the fallback is used,
        its result is returned unchanged.

    Raises:
        OSError: If the size of *filepath* cannot be read (for example the
            file does not exist); the size check is not guarded.
    """
    # Empty files are never reported as binary, and need no content sniffing.
    if os.path.getsize(filepath) == 0:
        return False

    if not magic:
        return _is_binary_fallback(filepath)

    # MIME prefixes treated as text: anything under text/, plus a couple of
    # script types that are reported under application/.
    text_mime_prefixes = (
        "text/",
        "application/x-python",
        "application/javascript",
    )
    # Lower-case substrings searched for in the libmagic description.
    text_indicators = (
        "text", "script", "xml", "json", "yaml", "markdown", "html",
    )

    try:
        mime = magic.from_file(filepath, mime=True)
        if mime.startswith(text_mime_prefixes):
            return False
        # Only ask for the human-readable description when the MIME type was
        # not decisive; this avoids a second libmagic pass for most text files.
        description = magic.from_file(filepath).lower()
    except Exception:
        # Deliberately broad: any failure inside the magic binding must
        # degrade to the pure-Python heuristic instead of aborting the caller.
        return _is_binary_fallback(filepath)

    # No textual hint in the description means the file is assumed binary.
    return not any(indicator in description for indicator in text_indicators)
def _is_binary_fallback(filepath):
    """Guess whether *filepath* is binary without using the magic library.

    The file is opened as UTF-8 text and up to 1024 characters are read from
    its start.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        False for an empty file, or when the sample decodes and contains no
        NUL character. True when the sample contains a NUL character or the
        read fails with ``UnicodeDecodeError``.

    Raises:
        OSError: If the file cannot be stat'ed or opened; only decoding
            errors are handled here.
    """
    try:
        # A zero-length file has nothing that could be binary.
        if not os.path.getsize(filepath):
            return False

        with open(filepath, "r", encoding="utf-8") as handle:
            sample = handle.read(1024)
    except UnicodeDecodeError:
        # Content that is not valid UTF-8 is treated as binary.
        return True

    # Decoded cleanly: a NUL character is the only remaining binary marker.
    return "\0" in sample