fix binary file detection
This commit is contained in:
parent
5202d2e7f3
commit
6562b6c332
|
|
@ -168,6 +168,8 @@ Decision on Implementation
|
||||||
|
|
||||||
If this is a top-level README.md or docs folder, start there.
|
If this is a top-level README.md or docs folder, start there.
|
||||||
|
|
||||||
|
If the user explicitly requested implementation, that means you should first perform all the background research for that task, then call request_implementation where the implementation will be carried out.
|
||||||
|
|
||||||
NEVER ANNOUNCE WHAT YOU ARE DOING, JUST DO IT!
|
NEVER ANNOUNCE WHAT YOU ARE DOING, JUST DO IT!
|
||||||
|
|
||||||
AS THE RESEARCH AGENT, YOU MUST NOT WRITE OR MODIFY ANY FILES. IF FILE MODIFICATION OR IMPLEMENTATION IS REQUIRED, CALL request_implementation.
|
AS THE RESEARCH AGENT, YOU MUST NOT WRITE OR MODIFY ANY FILES. IF FILE MODIFICATION OR IMPLEMENTATION IS REQUIRED, CALL request_implementation.
|
||||||
|
|
|
||||||
|
|
@ -500,12 +500,20 @@ def is_binary_file(filepath):
|
||||||
mime = magic.from_file(filepath, mime=True)
|
mime = magic.from_file(filepath, mime=True)
|
||||||
file_type = magic.from_file(filepath)
|
file_type = magic.from_file(filepath)
|
||||||
|
|
||||||
if not mime.startswith("text/"):
|
# If MIME type starts with 'text/', it's likely a text file
|
||||||
return True
|
if mime.startswith("text/"):
|
||||||
|
|
||||||
if "ASCII text" in file_type:
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# Also consider 'application/x-python' and similar script types as text
|
||||||
|
if any(mime.startswith(prefix) for prefix in ['application/x-python', 'application/javascript']):
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Check for common text file descriptors
|
||||||
|
text_indicators = ["text", "script", "xml", "json", "yaml", "markdown", "HTML"]
|
||||||
|
if any(indicator.lower() in file_type.lower() for indicator in text_indicators):
|
||||||
|
return False
|
||||||
|
|
||||||
|
# If none of the text indicators are present, assume it's binary
|
||||||
return True
|
return True
|
||||||
except Exception:
|
except Exception:
|
||||||
return _is_binary_fallback(filepath)
|
return _is_binary_fallback(filepath)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,311 @@
|
||||||
|
"""Simplified mock of agent_utils.py for testing binary file detection.
|
||||||
|
|
||||||
|
This file includes typical Python constructs like imports, functions, classes, and docstrings
|
||||||
|
to replicate the characteristics of the real agent_utils.py that's causing issues with
|
||||||
|
binary file detection.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Any, Dict, List, Optional, Literal, Sequence
|
||||||
|
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.markdown import Markdown
|
||||||
|
from rich.panel import Panel
|
||||||
|
|
||||||
|
# Define a logger
|
||||||
|
logger = None # In real code, this would be an actual logger
|
||||||
|
|
||||||
|
|
||||||
|
class MockAgent:
    """Stand-in agent that mirrors the shape of the real agent class."""

    def __init__(self, model=None, tools=None, max_tokens=4096, config=None):
        """Store the agent settings, swapping falsy containers for empty ones.

        Args:
            model: The language model to use
            tools: Tools available to the agent; a falsy value becomes ``[]``
            max_tokens: Maximum tokens to use in context
            config: Additional configuration; a falsy value becomes ``{}``
        """
        self.model = model
        self.max_tokens = max_tokens
        self.tools = tools if tools else []
        self.config = config if config else {}
        self._initialized = True

    def run(self, input_text, config=None):
        """Pretend to process ``input_text`` and return a short summary.

        Args:
            input_text: The text to process
            config: Optional runtime configuration (accepted but not read)

        Returns:
            ``"Processed: "`` followed by the first 20 characters of
            ``input_text`` and a literal ``"..."``
        """
        # A short pause stands in for real processing time.
        time.sleep(0.1)
        preview = input_text[:20]
        return f"Processed: {preview}..."

    @staticmethod
    def _estimate_tokens(text):
        """Approximate the token count of ``text``.

        Args:
            text: Text to estimate tokens for

        Returns:
            ``len(text)`` floor-divided by 4 (roughly 1 token per 4 characters)
        """
        char_count = len(text)
        return char_count // 4
|
||||||
|
|
||||||
|
|
||||||
|
def run_mock_agent(task: str, model=None, **kwargs) -> Optional[str]:
    """Drive a ``MockAgent`` through a single task and report the outcome.

    A start banner is printed, bookkeeping dictionaries are built, and the
    task is handed to a freshly created agent. The nesting and formatting are
    deliberately elaborate to mimic the complexity of the real agent_utils.py.

    Args:
        task: The task to process
        model: The model to use
        **kwargs: Additional keyword arguments; only ``config`` is read, and
            it is forwarded to ``MockAgent``

    Returns:
        Optional[str]: ``"Exit requested"`` when the task is "exit" or "quit"
        (case-insensitive), the agent's result on success, or ``None`` when
        any exception is raised while processing
    """
    # Unique identifier for this run, shown in the start banner
    run_id = str(uuid.uuid4())

    console = Console()
    console.print(Panel(Markdown(f"Starting agent with ID: {run_id}"), title="🤖 Agent"))

    # Nested bookkeeping structures that imitate real agent state
    memory = {
        "task_history": [],
        "agent_state": {"initialized": True, "tools_enabled": True},
        "config": {"max_retries": 3, "timeout": 30, "debug": False},
    }
    metrics = {"start_time": datetime.now(), "steps": 0, "tokens_used": 0}

    agent = MockAgent(model=model, config=kwargs.get("config"))

    try:
        memory["task_history"].append(task)
        metrics["steps"] += 1
        metrics["tokens_used"] += MockAgent._estimate_tokens(task)

        # Short-circuit before doing any agent work
        if task.lower() in ("exit", "quit"):
            return "Exit requested"

        result = agent.run(task)

        finished_at = datetime.now()
        metrics["end_time"] = finished_at
        metrics["duration"] = (finished_at - metrics["start_time"]).total_seconds()

        # Long tasks are cut to 50 characters and marked with an ellipsis
        task_preview = task[:50] + ("..." if len(task) > 50 else "")
        completion_message = f"""
## Agent Run Complete

- **Task**: {task_preview}
- **Duration**: {metrics["duration"]:.2f}s
- **Tokens**: {metrics["tokens_used"]}
- **Steps**: {metrics["steps"]}
- **Result**: Success
"""
        console.print(Panel(Markdown(completion_message), title="✅ Complete"))
        return result
    except Exception as e:
        # Best-effort runner: report the failure on the console and return None
        error_message = f"Agent failed: {e!s}"
        console.print(Panel(error_message, title="❌ Error", style="red"))
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_something_complex(a: int, b: int, operation: str = "add") -> float:
    """Apply a named arithmetic operation to two numbers.

    Args:
        a: First number
        b: Second number
        operation: Operation to perform (add, subtract, multiply, divide)

    Returns:
        Result of the calculation. "divide" uses true division, so it yields
        a float even for integer operands; the other operations return
        whatever the operands' own arithmetic produces.

    Raises:
        ValueError: If operation is invalid, or if "divide" is requested
            with ``b == 0``
    """
    if operation == "add":
        return a + b
    if operation == "subtract":
        return a - b
    if operation == "multiply":
        return a * b
    if operation == "divide":
        # Guard explicitly so callers see ValueError, not ZeroDivisionError.
        if b == 0:
            raise ValueError("Cannot divide by zero")
        return a / b
    raise ValueError(f"Unknown operation: {operation}")
|
||||||
|
|
||||||
|
|
||||||
|
class DataProcessor:
    """Holds a list of items and records the outcome of each processing pass."""

    def __init__(self, data: List[Any]):
        """Wrap ``data`` and start with no recorded results.

        Args:
            data: List of data to process
        """
        self.data = data
        self.results = {}
        self.processed = False

    def process(self, method: str = "default"):
        """Run one processing method over the data and remember its result.

        Args:
            method: "default" drops ``None`` items, "sum" adds the items,
                "count" returns the number of items; any other name returns
                the data unchanged

        Returns:
            Processed data, which is also stored in ``self.results[method]``
        """
        handlers = {
            "default": lambda items: [item for item in items if item is not None],
            "sum": sum,
            "count": len,
        }
        handler = handlers.get(method)
        # Unknown methods fall through to the untouched data.
        outcome = self.data if handler is None else handler(self.data)

        self.results[method] = outcome
        self.processed = True
        return outcome

    def get_stats(self) -> Dict[str, Any]:
        """Summarise the data and the processing methods used so far.

        Returns:
            Dictionary of statistics. Empty data yields only ``count`` and
            ``empty``; otherwise ``methods_used`` and ``has_nulls`` are
            included as well.
        """
        if not self.data:
            return {"count": 0, "empty": True}

        item_count = len(self.data)
        stats = {"count": item_count, "empty": item_count == 0}
        stats["methods_used"] = list(self.results)
        stats["has_nulls"] = any(entry is None for entry in self.data)
        return stats

    def __str__(self):
        """Describe the processor by item count and processed flag.

        Returns:
            String describing the processor
        """
        item_count = len(self.data)
        return f"DataProcessor(items={item_count}, processed={self.processed})"
|
||||||
|
|
||||||
|
|
||||||
|
# Multi-line string constants that mix quote styles and format placeholders
TEMPLATE = """
# Agent Report

## Overview
This report was generated by the agent system.

## Details
- Task: {task}
- Date: {date}
- Status: {status}

## Summary
{summary}
"""

SQL_QUERY = '''
SELECT *
FROM users
WHERE status = 'active'
AND last_login > '2023-01-01'
ORDER BY last_login DESC
LIMIT 10;
'''

# Regex with backslash escapes, kept as a raw string
PATTERN = r"def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\("

# Module-wide settings holding booleans, integers, a list, and nested dicts
GLOBAL_CONFIG = dict(
    debug=False,
    max_retries=3,
    timeout=30,
    endpoints=["api/v1", "api/v2"],
    rate_limits=dict(minute=60, hour=3600, day=86400),
    features=dict(experimental=True, beta_tools=False),
)

# Non-ASCII characters to make sure encoding is handled
UNICODE_EXAMPLE = "Hello 世界! This has unicode: ™ ® © ♥ ⚡ ☁ ☀"
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Exercise the agent runner, the data processor, and the calculator once each."""
    # Agent round-trip
    agent_result = run_mock_agent("Test the mock agent")
    print(f"Result: {agent_result}")

    # Data processor: run the default pass, then summarise
    data_processor = DataProcessor([1, 2, 3, None, 5])
    data_processor.process("default")
    summary = data_processor.get_stats()
    print(f"Stats: {summary}")

    # Calculator
    product = calculate_something_complex(10, 5, "multiply")
    print(f"Calculation: {product}")


if __name__ == "__main__":
    main()
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
import sys
|
import sys
|
||||||
|
import os
|
||||||
import types
|
import types
|
||||||
import importlib
|
import importlib
|
||||||
import pytest
|
import pytest
|
||||||
|
|
@ -19,6 +20,8 @@ from ra_aid.tools.memory import (
|
||||||
log_work_event,
|
log_work_event,
|
||||||
reset_work_log,
|
reset_work_log,
|
||||||
swap_task_order,
|
swap_task_order,
|
||||||
|
is_binary_file,
|
||||||
|
_is_binary_fallback,
|
||||||
)
|
)
|
||||||
from ra_aid.database.repositories.key_fact_repository import get_key_fact_repository
|
from ra_aid.database.repositories.key_fact_repository import get_key_fact_repository
|
||||||
from ra_aid.database.repositories.key_snippet_repository import get_key_snippet_repository
|
from ra_aid.database.repositories.key_snippet_repository import get_key_snippet_repository
|
||||||
|
|
@ -956,3 +959,52 @@ def test_is_binary_file_with_null_bytes(reset_memory, monkeypatch):
|
||||||
# Clean up
|
# Clean up
|
||||||
if os.path.exists(binary_file.name):
|
if os.path.exists(binary_file.name):
|
||||||
os.unlink(binary_file.name)
|
os.unlink(binary_file.name)
|
||||||
|
|
||||||
|
|
||||||
|
def test_python_file_detection():
    """Regression test: Python source files must be classified as text.

    libmagic can describe a Python source as "Python script text executable"
    rather than "ASCII text". A detector that only accepts the literal
    "ASCII text" description misreports such a file as binary; this test
    pins the behavior that a ``text/*`` MIME type is treated as text.
    """
    # Path to our mock Python file
    mock_file_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "..", "mocks", "agent_utils_mock.py")
    )
    assert os.path.exists(mock_file_path), f"Test file not found: {mock_file_path}"

    # The fallback detector must treat the file as text.
    assert not _is_binary_fallback(mock_file_path), (
        "Fallback method should identify Python file as text"
    )

    # Only the import is guarded, so an ImportError raised by the code under
    # test cannot be mistaken for "magic is not installed".
    try:
        import magic  # noqa: F401 - imported only to check availability
    except ImportError:
        pytest.skip("magic library not available, skipping magic-specific test")

    def fake_from_file(path, mime=False):
        # Answer by argument rather than by call order, so the test does not
        # depend on which magic query is_binary_file issues first.
        return "text/x-python" if mime else "Python script text executable"

    with patch("ra_aid.tools.memory.magic") as mock_magic:
        mock_magic.from_file.side_effect = fake_from_file
        is_binary = is_binary_file(mock_file_path)

    # Both the MIME query and the description query must have been made.
    mock_magic.from_file.assert_any_call(mock_file_path, mime=True)
    mock_magic.from_file.assert_any_call(mock_file_path)

    assert not is_binary, (
        "Python file incorrectly identified as binary. "
        "A 'text/x-python' MIME type with a 'Python script text' description "
        "must be treated as text even though it lacks 'ASCII text'."
    )
|
||||||
Loading…
Reference in New Issue