fix binary file detection

This commit is contained in:
AI Christianson 2025-03-03 18:43:49 -05:00
parent 5202d2e7f3
commit 6562b6c332
4 changed files with 379 additions and 6 deletions

View File

@ -168,6 +168,8 @@ Decision on Implementation
If this is a top-level README.md or docs folder, start there.
If the user explicitly requested implementation, that means you should first perform all the background research for that task, then call request_implementation where the implementation will be carried out.
NEVER ANNOUNCE WHAT YOU ARE DOING, JUST DO IT!
AS THE RESEARCH AGENT, YOU MUST NOT WRITE OR MODIFY ANY FILES. IF FILE MODIFICATION OR IMPLEMENTATION IS REQUIRED, CALL request_implementation.

View File

@ -500,12 +500,20 @@ def is_binary_file(filepath):
mime = magic.from_file(filepath, mime=True)
file_type = magic.from_file(filepath)
if not mime.startswith("text/"):
return True
if "ASCII text" in file_type:
# If MIME type starts with 'text/', it's likely a text file
if mime.startswith("text/"):
return False
# Also consider 'application/x-python' and similar script types as text
if any(mime.startswith(prefix) for prefix in ['application/x-python', 'application/javascript']):
return False
# Check for common text file descriptors
text_indicators = ["text", "script", "xml", "json", "yaml", "markdown", "HTML"]
if any(indicator.lower() in file_type.lower() for indicator in text_indicators):
return False
# If none of the text indicators are present, assume it's binary
return True
except Exception:
return _is_binary_fallback(filepath)

View File

@ -0,0 +1,311 @@
"""Simplified mock of agent_utils.py for testing binary file detection.
This file includes typical Python constructs like imports, functions, classes, and docstrings
to replicate the characteristics of the real agent_utils.py that's causing issues with
binary file detection.
"""
import os
import sys
import time
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional, Literal, Sequence
from rich.console import Console
from rich.markdown import Markdown
from rich.panel import Panel
# Module-level logger placeholder. It is deliberately left as None in this
# mock; nothing in this file calls it.
logger = None # In real code, this would be an actual logger
class MockAgent:
    """Stand-in agent that mirrors the shape of the real agent class."""

    def __init__(self, model=None, tools=None, max_tokens=4096, config=None):
        """Store the agent's settings.

        Args:
            model: The language model to use
            tools: List of tools available to the agent; a falsy value is
                replaced by a new empty list
            max_tokens: Maximum tokens to use in context
            config: Additional configuration; a falsy value is replaced by a
                new empty dict
        """
        self.model = model
        self.tools = tools if tools else []
        self.max_tokens = max_tokens
        self.config = config if config else {}
        self._initialized = True

    def run(self, input_text, config=None):
        """Pretend to process the input and return a short summary string.

        Args:
            input_text: The text to process
            config: Optional runtime configuration (accepted but not used)

        Returns:
            "Processed: " followed by the first 20 characters of the input
            and a trailing "..."
        """
        # A short pause stands in for real processing time.
        time.sleep(0.1)
        return f"Processed: {input_text[:20]}..."

    @staticmethod
    def _estimate_tokens(text):
        """Approximate the token count of a piece of text.

        Args:
            text: Text to estimate tokens for

        Returns:
            The text length floor-divided by 4 (about one token per four
            characters)
        """
        chars_per_token = 4
        return len(text) // chars_per_token
def run_mock_agent(task: str, model=None, **kwargs) -> Optional[str]:
    """Run a MockAgent on a single task and report progress on the console.

    The function prints a start panel, builds some bookkeeping structures,
    runs the agent, and prints either a completion panel or an error panel.

    Args:
        task: The task to process
        model: The model to use
        **kwargs: Additional keyword arguments; only "config" is read and it
            is forwarded to the MockAgent constructor

    Returns:
        Optional[str]: "Exit requested" when the task is "exit" or "quit"
        (case-insensitive), otherwise the agent's result, or None if any
        exception was raised while processing
    """
    # Each invocation gets its own identifier for the start banner.
    run_id = str(uuid.uuid4())

    console = Console()
    console.print(Panel(Markdown(f"Starting agent with ID: {run_id}"), title="🤖 Agent"))

    # Nested bookkeeping data, present to mimic the structure of real code.
    memory = {
        "task_history": [],
        "agent_state": {"initialized": True, "tools_enabled": True},
        "config": {"max_retries": 3, "timeout": 30, "debug": False},
    }

    # Simple run metrics; the end time and duration are filled in later.
    metrics = {"start_time": datetime.now(), "steps": 0, "tokens_used": 0}

    agent = MockAgent(model=model, config=kwargs.get("config"))

    try:
        memory["task_history"].append(task)
        metrics["steps"] += 1
        metrics["tokens_used"] += MockAgent._estimate_tokens(task)

        # Short-circuit before running the agent when asked to stop.
        if task.lower() in ("exit", "quit"):
            return "Exit requested"

        result = agent.run(task)

        finished_at = datetime.now()
        metrics["end_time"] = finished_at
        metrics["duration"] = (finished_at - metrics["start_time"]).total_seconds()

        # The literal is kept flush-left so no indentation ends up in the
        # rendered Markdown.
        completion_message = f"""
## Agent Run Complete
- **Task**: {task[:50]}{"..." if len(task) > 50 else ""}
- **Duration**: {metrics["duration"]:.2f}s
- **Tokens**: {metrics["tokens_used"]}
- **Steps**: {metrics["steps"]}
- **Result**: Success
"""
        console.print(Panel(Markdown(completion_message), title="✅ Complete"))
        return result
    except Exception as e:
        # Any failure is reported on the console and signalled with None.
        error_message = f"Agent failed: {str(e)}"
        console.print(Panel(error_message, title="❌ Error", style="red"))
        return None
def calculate_something_complex(a: int, b: int, operation: str = "add") -> float:
    """Apply a basic arithmetic operation to two numbers.

    Args:
        a: First number
        b: Second number
        operation: Operation to perform (add, subtract, multiply, divide)

    Returns:
        Result of the calculation. "divide" uses true division, so its
        result is a float even when both operands are ints; the other
        operations return an int for int operands.

    Raises:
        ValueError: If operation is invalid, or if operation is "divide"
            and b is zero
    """
    if operation == "add":
        return a + b
    if operation == "subtract":
        return a - b
    if operation == "multiply":
        return a * b
    if operation == "divide":
        # Raise a ValueError rather than letting ZeroDivisionError escape,
        # so callers only need to handle one exception type.
        if b == 0:
            raise ValueError("Cannot divide by zero")
        return a / b
    raise ValueError(f"Unknown operation: {operation}")
class DataProcessor:
    """Demo class that applies simple processing methods to a list."""

    def __init__(self, data: List[Any]):
        """Keep a reference to the data and reset the bookkeeping.

        Args:
            data: List of data to process
        """
        self.data = data
        self.processed = False
        self.results = {}

    def process(self, method: str = "default"):
        """Process the data with the named method and remember the result.

        Args:
            method: "default" drops None items, "sum" adds the items,
                "count" returns the number of items; any other name returns
                the data unchanged

        Returns:
            The outcome of the chosen method, also stored in
            self.results[method]
        """
        if method == "sum":
            outcome = sum(self.data)
        elif method == "count":
            outcome = len(self.data)
        elif method == "default":
            outcome = [entry for entry in self.data if entry is not None]
        else:
            outcome = self.data

        self.results[method] = outcome
        self.processed = True
        return outcome

    def get_stats(self) -> Dict[str, Any]:
        """Summarise the data and the methods applied so far.

        Returns:
            {"count": 0, "empty": True} for empty data; otherwise a dict with
            the item count, an "empty" flag, the names of the methods used,
            and whether any item is None
        """
        if not self.data:
            return {"count": 0, "empty": True}

        size = len(self.data)
        contains_none = any(entry is None for entry in self.data)
        return {
            "count": size,
            "empty": size == 0,
            "methods_used": list(self.results),
            "has_nulls": contains_none,
        }

    def __str__(self):
        """Describe the processor by item count and processed flag.

        Returns:
            String describing the processor
        """
        return f"DataProcessor(items={len(self.data)}, processed={self.processed})"
# Multi-line string fixtures using both triple-quote styles.

# Markdown report skeleton; the {task}, {date}, {status} and {summary}
# placeholders are suitable for str.format.
TEMPLATE = """
# Agent Report
## Overview
This report was generated by the agent system.
## Details
- Task: {task}
- Date: {date}
- Status: {status}
## Summary
{summary}
"""

# Sample SQL text containing embedded single-quoted literals.
SQL_QUERY = '''
SELECT *
FROM users
WHERE status = 'active'
AND last_login > '2023-01-01'
ORDER BY last_login DESC
LIMIT 10;
'''

# Raw-string regular expression: matches "def <identifier>(" and captures
# the identifier.
PATTERN = r"def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\("

# Global dictionary mixing booleans, integers, a list and nested dicts.
GLOBAL_CONFIG = {
    "debug": False,
    "max_retries": 3,
    "timeout": 30,
    "endpoints": ["api/v1", "api/v2"],
    "rate_limits": {
        "minute": 60,
        "hour": 3600,
        "day": 86400,
    },
    "features": {
        "experimental": True,
        "beta_tools": False,
    },
}

# Non-ASCII characters, included to exercise encoding handling.
UNICODE_EXAMPLE = "Hello 世界! This has unicode: ™ ® © ♥ ⚡ ☁ ☀"
def main():
    """Exercise each part of the module and print what it returns."""
    # 1. Mock agent run
    result = run_mock_agent("Test the mock agent")
    print(f"Result: {result}")

    # 2. Data processor: filter the sample list, then summarise it
    demo_processor = DataProcessor([1, 2, 3, None, 5])
    demo_processor.process("default")
    stats = demo_processor.get_stats()
    print(f"Stats: {stats}")

    # 3. Arithmetic helper
    calc = calculate_something_complex(10, 5, "multiply")
    print(f"Calculation: {calc}")


# Run the demonstration only when executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@ -1,4 +1,5 @@
import sys
import os
import types
import importlib
import pytest
@ -19,6 +20,8 @@ from ra_aid.tools.memory import (
log_work_event,
reset_work_log,
swap_task_order,
is_binary_file,
_is_binary_fallback,
)
from ra_aid.database.repositories.key_fact_repository import get_key_fact_repository
from ra_aid.database.repositories.key_snippet_repository import get_key_snippet_repository
@ -955,4 +958,53 @@ def test_is_binary_file_with_null_bytes(reset_memory, monkeypatch):
finally:
# Clean up
if os.path.exists(binary_file.name):
os.unlink(binary_file.name)
os.unlink(binary_file.name)
def test_python_file_detection():
    """Test that Python files are correctly identified as text files.

    Regression test: the magic library can describe a Python source file as
    "Python script text executable" (MIME type "text/x-python"), a
    description that does not contain "ASCII text". is_binary_file must
    still report such a file as text.
    """
    # Path to our mock Python file
    mock_file_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..', 'mocks', 'agent_utils_mock.py')
    )

    # Verify the file exists
    assert os.path.exists(mock_file_path), f"Test file not found: {mock_file_path}"

    # The fallback detector must identify the file as text
    is_binary_fallback = _is_binary_fallback(mock_file_path)
    assert not is_binary_fallback, "Fallback method should identify Python file as text"

    # Only the import is guarded: an ImportError raised later by the code
    # under test must fail the test rather than turn into a skip.
    try:
        import magic  # noqa: F401  (availability probe only)
    except ImportError:
        pytest.skip("magic library not available, skipping magic-specific test")

    with patch('ra_aid.tools.memory.magic') as mock_magic:
        # Simulate the magic output that used to trigger the misdetection
        mock_magic.from_file.side_effect = [
            "text/x-python",  # First call with mime=True
            "Python script text executable",  # Second call without mime=True
        ]

        is_binary = is_binary_file(mock_file_path)

        # Verify the magic library was called correctly
        mock_magic.from_file.assert_any_call(mock_file_path, mime=True)
        mock_magic.from_file.assert_any_call(mock_file_path)

        assert not is_binary, (
            "Python file incorrectly identified as binary. "
            "A 'text/x-python' MIME type with a 'Python script text' description "
            "must be treated as text even without 'ASCII text' in the description."
        )