improve work logging; use reasoning_effort=high for openai expert models; improve prompts
commit f40e11ee21
parent 5fad3fc755
@@ -62,6 +62,7 @@ from ra_aid.tools.memory import (
     _global_memory,
     get_memory_value,
     get_related_files,
+    log_work_event,
 )
 
 console = Console()
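`log_work_event` is only imported here; its implementation lives in `ra_aid.tools.memory` and is not part of this diff. A minimal sketch of the shape such a helper might have (the body below is an assumption, not the real implementation):

```python
# Hypothetical sketch of log_work_event; the actual helper in
# ra_aid.tools.memory may store or format events differently.
from datetime import datetime

_global_memory: dict = {"work_log": []}

def log_work_event(event: str) -> None:
    """Append a timestamped entry to the in-memory work log."""
    _global_memory.setdefault("work_log", []).append(
        {"timestamp": datetime.now().isoformat(), "event": event}
    )
```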
@@ -405,7 +406,11 @@ def run_research_agent(
 
         if agent is not None:
             logger.debug("Research agent completed successfully")
-            return run_agent_with_retry(agent, prompt, run_config)
+            _result = run_agent_with_retry(agent, prompt, run_config)
+            if _result:
+                # Log research completion
+                log_work_event(f"Completed research phase for: {base_task_or_query}")
+            return _result
         else:
             logger.debug("No model provided, running web research tools directly")
             return run_web_research_agent(
@@ -517,7 +522,11 @@ def run_web_research_agent(
         console.print(Panel(Markdown(console_message), title="🔬 Researching..."))
 
         logger.debug("Web research agent completed successfully")
-        return run_agent_with_retry(agent, prompt, run_config)
+        _result = run_agent_with_retry(agent, prompt, run_config)
+        if _result:
+            # Log web research completion
+            log_work_event(f"Completed web research phase for: {query}")
+        return _result
 
     except (KeyboardInterrupt, AgentInterrupt):
         raise
@@ -618,7 +627,11 @@ def run_planning_agent(
     try:
         print_stage_header("Planning Stage")
         logger.debug("Planning agent completed successfully")
-        return run_agent_with_retry(agent, planning_prompt, run_config)
+        _result = run_agent_with_retry(agent, planning_prompt, run_config)
+        if _result:
+            # Log planning completion
+            log_work_event(f"Completed planning phase for: {base_task}")
+        return _result
     except (KeyboardInterrupt, AgentInterrupt):
         raise
     except Exception as e:
@@ -719,7 +732,11 @@ def run_task_implementation_agent(
 
     try:
         logger.debug("Implementation agent completed successfully")
-        return run_agent_with_retry(agent, prompt, run_config)
+        _result = run_agent_with_retry(agent, prompt, run_config)
+        if _result:
+            # Log task implementation completion
+            log_work_event(f"Completed implementation of task: {task}")
+        return _result
     except (KeyboardInterrupt, AgentInterrupt):
         raise
     except Exception as e:
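All four agent runners now follow the same run-then-log sequence. A sketch of how that repetition could be factored out (this helper is not part of the commit):

```python
# Hypothetical consolidation of the repeated pattern above; assumes the
# run_agent_with_retry and log_work_event names from the surrounding code.
def _run_agent_and_log(agent, prompt, run_config, completion_event: str):
    result = run_agent_with_retry(agent, prompt, run_config)
    if result:
        log_work_event(completion_event)
    return result
```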
@@ -181,11 +181,14 @@ def create_llm_client(
             is_expert=is_expert,
         )
     elif provider == "openai":
-        return ChatOpenAI(
-            api_key=config["api_key"],
-            model=model_name,
+        openai_kwargs = {
+            "api_key": config["api_key"],
+            "model": model_name,
             **temp_kwargs,
-        )
+        }
+        if is_expert:
+            openai_kwargs["reasoning_effort"] = "high"
+        return ChatOpenAI(**openai_kwargs)
     elif provider == "anthropic":
         return ChatAnthropic(
             api_key=config["api_key"],
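With this change every expert OpenAI client is constructed with `reasoning_effort="high"`. Note that OpenAI only accepts `reasoning_effort` on its reasoning (o-series) models, so this implicitly assumes expert models are always reasoning models. Illustrative resulting calls (values made up):

```python
# Illustrative only; api_key values come from the caller's config.
expert = create_llm_client("openai", "o1", is_expert=True)
# -> ChatOpenAI(api_key="expert-key", model="o1", reasoning_effort="high")

regular = create_llm_client("openai", "gpt-4o")
# -> ChatOpenAI(api_key="api-key", model="gpt-4o", **temp_kwargs)
```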
@@ -17,6 +17,8 @@ Expert Consultation:
 - Use emit_expert_context to provide all relevant context about what you've found
 - Wait for the expert response before proceeding with research
 - The expert can help analyze complex codebases, unclear patterns, or subtle edge cases
+
+The expert is really good at logic, debugging and planning, but it only has access to the context you give it, and it is unable to access the outside world.
 """
 
 EXPERT_PROMPT_SECTION_PLANNING = """
@@ -25,6 +27,10 @@ Expert Consultation:
 - First use emit_expert_context to provide all relevant context
 - Wait for the expert's response before defining tasks in non-trivial scenarios
 - The expert can help with architectural decisions, correctness checks, and detailed planning
+
+The expert is really good at logic, debugging and planning, but it only has access to the context you give it, and it is unable to access the outside world.
+
+**ALWAYS** use the expert to come up with the high level plan.
 """
 
 EXPERT_PROMPT_SECTION_IMPLEMENTATION = """
@@ -33,6 +39,8 @@ Expert Consultation:
 - Use emit_expert_context to provide context about your specific concern
 - Ask the expert to perform deep analysis or correctness checks
 - Wait for expert guidance before proceeding with implementation
+
+The expert is really good at logic, debugging and planning, but it only has access to the context you give it, and it is unable to access the outside world.
 """
 
 EXPERT_PROMPT_SECTION_CHAT = """
@@ -41,6 +49,8 @@ Expert Consultation:
 - Use emit_expert_context to provide the current conversation state, user requirements, and discovered details
 - Ask the expert for advice on handling ambiguous user requests or complex technical challenges, and to verify correctness
 - Wait for the expert’s guidance before making decisions that significantly alter the approach or final outcome
+
+The expert is really good at logic, debugging and planning, but it only has access to the context you give it, and it is unable to access the outside world.
 """
 
 # Human-specific prompt sections
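How these `EXPERT_PROMPT_SECTION_*` constants reach the model is outside this diff; a hypothetical sketch of the kind of assembly they imply (the function and flag below are assumptions):

```python
# Hypothetical prompt assembly; ra_aid's real composition code is not shown here.
def build_stage_prompt(base_prompt: str, expert_section: str, expert_enabled: bool) -> str:
    """Append the expert-consultation section only when the expert tool is configured."""
    return base_prompt + (expert_section if expert_enabled else "")

# e.g. build_stage_prompt(planning_base, EXPERT_PROMPT_SECTION_PLANNING, True)
```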
@@ -124,6 +134,8 @@ Because this is a new project
 Remember, this is the research phase. Your main focus right now is research and creating instructions for the implementation which will be handed off to the implementation team.
 Focus on finding best practices, idiomatic approaches, and using all available research tools as well as the expert, if available.
 Remember, our scope and capabilities are limited --unless the user specifically asks, we do not want to set up servers like postgres. We want to use sqlite or similar for initial implementation, but make it extensible.
+
+If the expert tool is available, **ALWAYS** ask the expert to review and refine your research before requesting implementation.
 """
 
 # Research stage prompt - guides initial codebase analysis
@@ -597,7 +609,11 @@ IMPLEMENTATION_PROMPT = """Current Date: {current_date}
 Working Directory: {working_directory}
 
 Base-level task (for reference only):
-{base_task} --keep it simple
+<base task>
+{base_task}
+</base task>
+
+keep it simple. if the expert tool is available, use it frequently for high level logic and planning.
 
 Plan Overview (for reference only, remember you are only implementing your specific task):
 {plan}
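The `<base task>` delimiters give the model an unambiguous boundary even when the task text itself contains colons or blank lines. Rendered with an illustrative task value:

```python
# Illustrative rendering of the new delimiter style; the task text is invented.
snippet = """Base-level task (for reference only):
<base task>
{base_task}
</base task>
"""
print(snippet.format(base_task="add a /health endpoint to the API"))
```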
@@ -10,12 +10,19 @@ from rich.text import Text
 from ra_aid.logging_config import get_logger
 from ra_aid.proc.interactive import run_interactive_command
 from ra_aid.text.processing import truncate_output
-from ra_aid.tools.memory import _global_memory
+from ra_aid.tools.memory import _global_memory, log_work_event
 
 console = Console()
 logger = get_logger(__name__)
 
 
+def _truncate_for_log(text: str, max_length: int = 300) -> str:
+    """Truncate text for logging, adding [truncated] if necessary."""
+    if len(text) <= max_length:
+        return text
+    return text[:max_length] + "... [truncated]"
+
+
 @tool
 def run_programming_task(
     instructions: str, files: List[str] = []
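`_truncate_for_log` is defined here and duplicated verbatim in the shell tool below. A sketch of hoisting it next to `log_work_event` instead (the placement is an assumption, not part of this commit):

```python
# Hypothetical shared home, e.g. in ra_aid.tools.memory, so both tool
# modules could import it alongside log_work_event.
def _truncate_for_log(text: str, max_length: int = 300) -> str:
    """Truncate text for logging, adding [truncated] if necessary."""
    if len(text) <= max_length:
        return text
    return text[:max_length] + "... [truncated]"
```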
@@ -101,14 +108,17 @@ def run_programming_task(
     try:
         # Run the command interactively
         print()
-        output, return_code = run_interactive_command(command)
+        result = run_interactive_command(command)
         print()
 
+        # Log the programming task
+        log_work_event(f"Executed programming task: {_truncate_for_log(instructions)}")
+
         # Return structured output
         return {
-            "output": truncate_output(output.decode() if output else ""),
-            "return_code": return_code,
-            "success": return_code == 0,
+            "output": truncate_output(result[0].decode()) if result[0] else "",
+            "return_code": result[1],
+            "success": result[1] == 0,
         }
 
     except Exception as e:
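Per the original unpacking, `run_interactive_command` returns an `(output_bytes, return_code)` tuple; the new code indexes it as `result[0]`/`result[1]`. An equivalent sketch that keeps the clearer unpacking (not part of the commit):

```python
# Equivalent to the indexed version above; assumes the module's existing
# imports (run_interactive_command, truncate_output).
def _structured_result(command: list) -> dict:
    output, return_code = run_interactive_command(command)
    return {
        "output": truncate_output(output.decode()) if output else "",
        "return_code": return_code,
        "success": return_code == 0,
    }
```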
@@ -8,11 +8,18 @@ from rich.prompt import Prompt
 from ra_aid.console.cowboy_messages import get_cowboy_message
 from ra_aid.proc.interactive import run_interactive_command
 from ra_aid.text.processing import truncate_output
-from ra_aid.tools.memory import _global_memory
+from ra_aid.tools.memory import _global_memory, log_work_event
 
 console = Console()
 
 
+def _truncate_for_log(text: str, max_length: int = 300) -> str:
+    """Truncate text for logging, adding [truncated] if necessary."""
+    if len(text) <= max_length:
+        return text
+    return text[:max_length] + "... [truncated]"
+
+
 @tool
 def run_shell_command(command: str) -> Dict[str, Union[str, int, bool]]:
     """Execute a shell command and return its output.
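For reference, the helper's boundary behavior (a quick check, not taken from the repo's tests):

```python
# Sanity checks for _truncate_for_log as defined above.
assert _truncate_for_log("short") == "short"
assert _truncate_for_log("x" * 300) == "x" * 300  # exactly at the limit
assert _truncate_for_log("x" * 301) == "x" * 300 + "... [truncated]"
```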
@@ -68,11 +75,13 @@ def run_shell_command(command: str) -> Dict[str, Union[str, int, bool]]:
         print()
         output, return_code = run_interactive_command(["/bin/bash", "-c", command])
         print()
-        return {
+        result = {
             "output": truncate_output(output.decode()) if output else "",
             "return_code": return_code,
             "success": return_code == 0,
         }
+        log_work_event(f"Executed shell command: {_truncate_for_log(command)}")
+        return result
     except Exception as e:
         print()
         console.print(Panel(str(e), title="❌ Error", border_style="red"))
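Because `run_shell_command` is a LangChain `@tool`, callers invoke it with a dict of arguments; a sketch of the structured result (values illustrative):

```python
# Illustrative invocation; LangChain tools are Runnables, so .invoke works.
result = run_shell_command.invoke({"command": "echo hello"})
# e.g. {"output": "hello\n", "return_code": 0, "success": True}
assert result["success"] and result["return_code"] == 0
```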
@@ -54,7 +54,7 @@ def test_initialize_expert_defaults(clean_env, mock_openai, monkeypatch):
     monkeypatch.setenv("EXPERT_OPENAI_API_KEY", "test-key")
     _llm = initialize_expert_llm()
 
-    mock_openai.assert_called_once_with(api_key="test-key", model="o1")
+    mock_openai.assert_called_once_with(api_key="test-key", model="o1", reasoning_effort="high")
 
 
 def test_initialize_expert_openai_custom(clean_env, mock_openai, monkeypatch):
@@ -63,7 +63,7 @@ def test_initialize_expert_openai_custom(clean_env, mock_openai, monkeypatch):
     _llm = initialize_expert_llm("openai", "gpt-4-preview")
 
     mock_openai.assert_called_once_with(
-        api_key="test-key", model="gpt-4-preview", temperature=0
+        api_key="test-key", model="gpt-4-preview", temperature=0, reasoning_effort="high"
     )
 
 
@@ -348,7 +348,7 @@ def test_environment_variable_precedence(clean_env, mock_openai, monkeypatch):
 
     # Test LLM client creation with expert mode
     _llm = create_llm_client("openai", "o1", is_expert=True)
-    mock_openai.assert_called_with(api_key="expert-key", model="o1")
+    mock_openai.assert_called_with(api_key="expert-key", model="o1", reasoning_effort="high")
 
     # Test environment validation
     monkeypatch.setenv("EXPERT_OPENAI_API_KEY", "")
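These assertions pin `reasoning_effort="high"` to the expert path. A complementary test for the non-expert path might look like this (hypothetical, not in this commit):

```python
# Hypothetical test: non-expert OpenAI clients should not get reasoning_effort.
def test_non_expert_omits_reasoning_effort(clean_env, mock_openai, monkeypatch):
    monkeypatch.setenv("OPENAI_API_KEY", "test-key")
    _llm = create_llm_client("openai", "gpt-4o")
    assert "reasoning_effort" not in mock_openai.call_args.kwargs
```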