fix binary file detection

2025-03-03 18:43:49 -05:00 · 2025-03-03 18:43:49 -05:00 · 6562b6c332
parent 5202d2e7f3
commit 6562b6c332
4 changed files with 379 additions and 6 deletions
--- a/ra_aid/prompts/research_prompts.py
+++ b/ra_aid/prompts/research_prompts.py
@ -168,6 +168,8 @@ Decision on Implementation
 If this is a top-level README.md or docs folder, start there.
 If the user explicitly requested implementation, that means you should first perform all the background research for that task, then call request_implementation where the implementation will be carried out.
 NEVER ANNOUNCE WHAT YOU ARE DOING, JUST DO IT!
 AS THE RESEARCH AGENT, YOU MUST NOT WRITE OR MODIFY ANY FILES. IF FILE MODIFICATION OR IMPLEMENTATION IS REQUIRED, CALL request_implementation.
--- a/ra_aid/tools/memory.py
+++ b/ra_aid/tools/memory.py
@ -500,12 +500,20 @@ def is_binary_file(filepath):
            mime = magic.from_file(filepath, mime=True)
            file_type = magic.from_file(filepath)
-            if not mime.startswith("text/"):
+            # If MIME type starts with 'text/', it's likely a text file
-                return True
+            if mime.startswith("text/"):
            if "ASCII text" in file_type:
                return False
            # Also consider 'application/x-python' and similar script types as text
            if any(mime.startswith(prefix) for prefix in ['application/x-python', 'application/javascript']):
                return False
            # Check for common text file descriptors
            text_indicators = ["text", "script", "xml", "json", "yaml", "markdown", "HTML"]
            if any(indicator.lower() in file_type.lower() for indicator in text_indicators):
                return False
            # If none of the text indicators are present, assume it's binary
            return True
        except Exception:
            return _is_binary_fallback(filepath)
--- a/tests/ra_aid/mocks/agent_utils_mock.py
+++ b/tests/ra_aid/mocks/agent_utils_mock.py
@ -0,0 +1,311 @@
 """Simplified mock of agent_utils.py for testing binary file detection.
 This file includes typical Python constructs like imports, functions, classes, and docstrings
 to replicate the characteristics of the real agent_utils.py that's causing issues with
 binary file detection.
 """
 import os
 import sys
 import time
 import uuid
 from datetime import datetime
 from typing import Any, Dict, List, Optional, Literal, Sequence
 from rich.console import Console
 from rich.markdown import Markdown
 from rich.panel import Panel
 # Placeholder for a module-level logger; this mock binds the name to None
 # instead of configuring a real logger.
 logger = None  # In real code, this would be an actual logger
 class MockAgent:
    """Lightweight stand-in that mirrors the shape of the real agent class."""
    def __init__(self, model=None, tools=None, max_tokens=4096, config=None):
        """Store the constructor arguments on the instance.
        Args:
            model: The language model to use
            tools: Tools available to the agent; a falsy value becomes a new empty list
            max_tokens: Maximum tokens to use in context
            config: Additional configuration; a falsy value becomes a new empty dict
        """
        self.model = model
        self.tools = tools if tools else []
        self.max_tokens = max_tokens
        self.config = config if config else {}
        self._initialized = True
    def run(self, input_text, config=None):
        """Pretend to process the input and echo back a short preview of it.
        Args:
            input_text: The text to process
            config: Optional runtime configuration (accepted but not used)
        Returns:
            "Processed: " followed by the first 20 characters of input_text and "..."
        """
        # Brief pause standing in for real processing time
        time.sleep(0.1)
        preview = input_text[:20]
        return f"Processed: {preview}..."
    @staticmethod
    def _estimate_tokens(text):
        """Approximate the token count of a piece of text.
        Args:
            text: Text to estimate tokens for
        Returns:
            len(text) divided by 4, rounded down (about 1 token per 4 characters)
        """
        whole_tokens, _remainder = divmod(len(text), 4)
        return whole_tokens
 def run_mock_agent(task: str, model=None, **kwargs) -> Optional[str]:
    """Drive a MockAgent through a single task and report progress on a console.
    The function prints a start banner, builds some bookkeeping structures,
    runs the agent inside a try block and prints either a completion summary
    or an error panel.
    Args:
        task: The task to process
        model: The model handed to the MockAgent
        **kwargs: Additional keyword arguments; only "config" is read
    Returns:
        Optional[str]: The agent's result, "Exit requested" when the task is
        "exit" or "quit" (case-insensitive), or None if an exception was caught
    """
    # Unique identifier for this run, shown in the start banner
    session_id = str(uuid.uuid4())
    out = Console()
    banner = Markdown(f"Starting agent with ID: {session_id}")
    out.print(Panel(banner, title="🤖 Agent"))
    # Nested bookkeeping structure; only "task_history" is touched below
    state = dict(
        task_history=[],
        agent_state=dict(initialized=True, tools_enabled=True),
        config=dict(max_retries=3, timeout=30, debug=False),
    )
    # Run metrics; "end_time" and "duration" are added once the agent finishes
    metrics = dict(start_time=datetime.now(), steps=0, tokens_used=0)
    worker = MockAgent(model=model, config=kwargs.get("config"))
    try:
        state["task_history"].append(task)
        metrics["steps"] += 1
        metrics["tokens_used"] += MockAgent._estimate_tokens(task)
        # Short-circuit before running the agent when the task asks to stop
        if task.lower() in ("exit", "quit"):
            return "Exit requested"
        outcome = worker.run(task)
        metrics["end_time"] = datetime.now()
        elapsed = metrics["end_time"] - metrics["start_time"]
        metrics["duration"] = elapsed.total_seconds()
        # Markdown summary rendered in the completion panel
        completion_message = f"""
 ## Agent Run Complete
 - **Task**: {task[:50]}{"..." if len(task) > 50 else ""}
 - **Duration**: {metrics["duration"]:.2f}s
 - **Tokens**: {metrics["tokens_used"]}
 - **Steps**: {metrics["steps"]}
 - **Result**: Success
        """
        summary_panel = Panel(Markdown(completion_message), title="✅ Complete")
        out.print(summary_panel)
        return outcome
    except Exception as e:
        # Any failure is reported on the console and signalled with None
        error_message = f"Agent failed: {str(e)}"
        failure_panel = Panel(error_message, title="❌ Error", style="red")
        out.print(failure_panel)
        return None
 def calculate_something_complex(a: int, b: int, operation: str = "add") -> float:
    """Apply a basic arithmetic operation to two numbers.
    Args:
        a: First number
        b: Second number
        operation: Operation to perform (add, subtract, multiply, divide)
    Returns:
        Result of the calculation. "divide" uses true division and therefore
        returns a float, which is why the return type is float rather than int.
    Raises:
        ValueError: If operation is invalid, or if b is zero for "divide"
    """
    if operation == "add":
        return a + b
    if operation == "subtract":
        return a - b
    if operation == "multiply":
        return a * b
    if operation == "divide":
        # Raise a descriptive ValueError instead of letting ZeroDivisionError escape
        if b == 0:
            raise ValueError("Cannot divide by zero")
        return a / b
    raise ValueError(f"Unknown operation: {operation}")
 class DataProcessor:
    """Small example class that applies simple processing methods to a list."""
    def __init__(self, data: List[Any]):
        """Keep a reference to the data and start with no recorded results.
        Args:
            data: List of data to process
        """
        self.data = data
        self.processed = False
        self.results = {}
    def process(self, method: str = "default"):
        """Run one processing method and record its result under the method name.
        Args:
            method: "default" drops None items, "sum" adds the items up,
                "count" returns the number of items; any other value
                returns the data unchanged
        Returns:
            Processed data
        """
        items = self.data
        if method == "sum":
            outcome = sum(items)
        elif method == "count":
            outcome = len(items)
        elif method == "default":
            outcome = list(filter(lambda entry: entry is not None, items))
        else:
            outcome = items
        self.results[method] = outcome
        self.processed = True
        return outcome
    def get_stats(self) -> Dict[str, Any]:
        """Summarise the data and the processing methods used so far.
        Returns:
            Dictionary of statistics; for empty data only "count" and "empty"
            are present
        """
        records = self.data
        if not records:
            return dict(count=0, empty=True)
        size = len(records)
        contains_none = False
        for entry in records:
            if entry is None:
                contains_none = True
                break
        summary = {"count": size}
        summary["empty"] = size == 0
        summary["methods_used"] = list(self.results)
        summary["has_nulls"] = contains_none
        return summary
    def __str__(self):
        """Describe the processor by item count and processed flag.
        Returns:
            String describing the processor
        """
        item_count = len(self.data)
        done = self.processed
        return f"DataProcessor(items={item_count}, processed={done})"
 # Multi-line string constants that mix triple-double and triple-single quotes,
 # Markdown-style headings and brace placeholders.
 TEMPLATE = """
 # Agent Report
 ## Overview
 This report was generated by the agent system.
 ## Details
 - Task: {task}
 - Date: {date}
 - Status: {status}
 ## Summary
 {summary}
 """
 SQL_QUERY = '''
 SELECT *
 FROM users
 WHERE status = 'active'
  AND last_login > '2023-01-01'
 ORDER BY last_login DESC
 LIMIT 10;
 '''
 # Raw-string regular expression with escapes: matches "def", whitespace, an
 # identifier (captured as group 1), optional whitespace and an opening parenthesis.
 PATTERN = r"def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\("
 # Global dictionary with mixed value types: booleans, integers, a list and
 # nested dictionaries.
 GLOBAL_CONFIG = {
    "debug": False,
    "max_retries": 3,
    "timeout": 30,
    "endpoints": ["api/v1", "api/v2"],
    "rate_limits": {
        "minute": 60,
        "hour": 3600,
        "day": 86400
    },
    "features": {
        "experimental": True,
        "beta_tools": False
    }
 }
 # Non-ASCII characters (CJK text and assorted symbols) to ensure encoding handling
 UNICODE_EXAMPLE = "Hello 世界! This has unicode: ™ ® © ♥ ⚡ ☁ ☀"
 def main():
    """Exercise each public piece of this module once and print the outcomes."""
    # 1. Agent run
    result = run_mock_agent("Test the mock agent")
    print(f"Result: {result}")
    # 2. Data processor: apply the default method, then report statistics
    processor = DataProcessor([1, 2, 3, None, 5])
    processor.process()
    stats = processor.get_stats()
    print(f"Stats: {stats}")
    # 3. Arithmetic helper
    calc = calculate_something_complex(a=10, b=5, operation="multiply")
    print(f"Calculation: {calc}")
 # Run the demonstration only when executed as a script, not on import
 if __name__ == "__main__":
    main()
--- a/tests/ra_aid/tools/test_memory.py
+++ b/tests/ra_aid/tools/test_memory.py
@ -1,4 +1,5 @@
 import sys
 import os
 import types
 import importlib
 import pytest
@ -19,6 +20,8 @@ from ra_aid.tools.memory import (
    log_work_event,
    reset_work_log,
    swap_task_order,
    is_binary_file,
    _is_binary_fallback,
 )
 from ra_aid.database.repositories.key_fact_repository import get_key_fact_repository
 from ra_aid.database.repositories.key_snippet_repository import get_key_snippet_repository
@ -956,3 +959,52 @@ def test_is_binary_file_with_null_bytes(reset_memory, monkeypatch):
        # Clean up
        if os.path.exists(binary_file.name):
            os.unlink(binary_file.name)
 def test_python_file_detection():
    """Test that Python files are correctly identified as text files.
    Regression test: the magic library can describe a Python file as
    "Python script text executable" (MIME type "text/x-python"), with no
    "ASCII text" in the description. Such files were previously reported as
    binary even though they are valid text files.
    """
    # Path to our mock Python file
    mock_file_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "..", "mocks", "agent_utils_mock.py")
    )
    # Verify the file exists
    assert os.path.exists(mock_file_path), f"Test file not found: {mock_file_path}"
    # The fallback method must identify the file as text on its own
    assert not _is_binary_fallback(mock_file_path), (
        "Fallback method should identify Python file as text"
    )
    # Keep the try body to the import alone, so that an ImportError raised while
    # exercising is_binary_file() fails the test instead of silently skipping it.
    try:
        import magic  # noqa: F401
    except ImportError:
        pytest.skip("magic library not available, skipping magic-specific test")
    def fake_from_file(path, mime=False):
        """Return the MIME type or the description this test simulates."""
        return "text/x-python" if mime else "Python script text executable"
    with patch("ra_aid.tools.memory.magic") as mock_magic:
        # Answer according to the `mime` flag rather than the call order, so the
        # test does not depend on which lookup is_binary_file() performs first.
        mock_magic.from_file.side_effect = fake_from_file
        is_binary = is_binary_file(mock_file_path)
        # Verify the magic library was called correctly
        mock_magic.from_file.assert_any_call(mock_file_path, mime=True)
        mock_magic.from_file.assert_any_call(mock_file_path)
        assert not is_binary, (
            "Python file incorrectly identified as binary. "
            "Files described as 'Python script text' must be treated as text "
            "even though the description does not contain 'ASCII text'."
        )