feat(fallback): implement automatic fallback to alternative LLM models on consecutive failures to enhance user experience and prevent infinite error loops

refactor(ciayn_agent): restructure tool failure handling to track consecutive failures and fallback attempts more effectively
fix(logging): add pretty logging option for improved log readability
chore(config): define valid providers for LLM selection and update fallback model loading logic
test(ciayn_agent): add unit tests for fallback logic and tool failure handling to ensure reliability and correctness
Ariel Frischer 2025-02-10 14:13:19 -08:00
parent 45b993cfd0
commit d8ee4e04f4
7 changed files with 688 additions and 97 deletions


@@ -4,15 +4,7 @@
 Add functionality to automatically fallback to alternative LLM models when a tool call experiences multiple consecutive failures.
 ## Background
-Currently, when a tool call fails due to LLM-related errors (e.g., API timeouts, rate limits, context length issues), there is no automatic fallback mechanism. This can lead to interrupted workflows and poor user experience.
+Currently, when a tool call fails due to LLM-related errors (e.g., invalid format), there is no automatic fallback mechanism. This often causes an infinite loop of erroring tool calls.
-## Relevant Files
-- ra_aid/agents/ciayn_agent.py
-- ra_aid/llm.py
-- ra_aid/agent_utils.py
-- ra_aid/__main__.py
-- ra_aid/models_params.py
 ## Implementation Details
@@ -59,32 +51,25 @@ The prompt passed to `try_fallback_model`, should be the failed last few failing
 Define fallback sequences for each provider based on model capabilities:
 1. Try same provider's smaller models
-2. Try alternative providers' equivalent models
+2. Try alternative providers' similar models
 3. Raise final error if all fallbacks fail
-### Provider Strategy Updates
-Update provider strategies to support fallback configuration:
-- Add provider-specific fallback sequences
-- Handle model capability validation during fallback
-- Track successful/failed attempts
 ## Risks and Mitigations
-1. **Performance Impact**
-   - Risk: Multiple fallback attempts could increase latency
-   - Mitigation: Set reasonable max_failures limit and timeouts
-2. **Consistency**
-   - Risk: Different models may give slightly different outputs
-   - Mitigation: Validate output schema consistency across models
-3. **Cost**
+1. **Cost**
    - Risk: Fallback to more expensive models
    - Mitigation: Configure cost limits and preferred fallback sequences
-4. **State Management**
+2. **State Management**
    - Risk: Loss of context during fallbacks
    - Mitigation: Preserve conversation state and tool context
+## Relevant Files
+- ra_aid/agents/ciayn_agent.py
+- ra_aid/llm.py
+- ra_aid/agent_utils.py
+- ra_aid/__main__.py
+- ra_aid/models_params.py
 ## Acceptance Criteria
 1. Tool calls automatically attempt fallback models after N consecutive failures
 2. `--no-fallback-tool` argument successfully disables fallback behavior
@@ -93,16 +78,6 @@ Update provider strategies to support fallback configuration:
 5. Unit tests cover fallback scenarios and edge cases
 6. README.md updated to reflect new behavior
-## Testing
-1. Unit tests for fallback wrapper
-2. Integration tests with mock LLM failures
-3. Provider strategy fallback tests
-4. Command line argument handling
-5. Error preservation and reporting
-6. Performance impact measurement
-7. Edge cases (e.g., partial failures, timeout handling)
-8. State preservation during fallbacks
 ## Documentation Updates
 1. Add fallback feature to main README
 2. Document `--no-fallback-tool` in CLI help
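
To make the intended control flow concrete before the implementation diffs below, here is a minimal illustrative sketch of the fallback policy (the names here are hypothetical, not the committed API):

```python
# Hypothetical helper: walk the configured fallback chain, skipping models
# already attempted, and give up once the chain is exhausted.
def next_fallback(fallbacks: list[str], used: set[str]) -> str:
    for candidate in fallbacks:
        if candidate not in used:
            used.add(candidate)
            return candidate
    raise RuntimeError("All fallback models failed")
```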

ra_aid/__main__.py

@@ -17,7 +17,11 @@ from ra_aid.agent_utils import (
     run_planning_agent,
     run_research_agent,
 )
-from ra_aid.config import DEFAULT_MAX_TEST_CMD_RETRIES, DEFAULT_RECURSION_LIMIT
+from ra_aid.config import (
+    DEFAULT_MAX_TEST_CMD_RETRIES,
+    DEFAULT_RECURSION_LIMIT,
+    VALID_PROVIDERS,
+)
 from ra_aid.dependencies import check_dependencies
 from ra_aid.env import validate_environment
 from ra_aid.llm import initialize_llm
@@ -40,14 +44,6 @@ def launch_webui(host: str, port: int):
 def parse_arguments(args=None):
-    VALID_PROVIDERS = [
-        "anthropic",
-        "openai",
-        "openrouter",
-        "openai-compatible",
-        "deepseek",
-        "gemini",
-    ]
     ANTHROPIC_DEFAULT_MODEL = "claude-3-5-sonnet-20241022"
     OPENAI_DEFAULT_MODEL = "gpt-4o"
@@ -80,9 +76,11 @@ Examples:
     parser.add_argument(
         "--provider",
         type=str,
-        default="openai"
-        if (os.getenv("OPENAI_API_KEY") and not os.getenv("ANTHROPIC_API_KEY"))
-        else "anthropic",
+        default=(
+            "openai"
+            if (os.getenv("OPENAI_API_KEY") and not os.getenv("ANTHROPIC_API_KEY"))
+            else "anthropic"
+        ),
         choices=VALID_PROVIDERS,
         help="The LLM provider to use",
     )
@@ -138,6 +136,9 @@ Examples:
     parser.add_argument(
         "--verbose", action="store_true", help="Enable verbose logging output"
     )
+    parser.add_argument(
+        "--pretty-logger", action="store_true", help="Enable pretty logging output"
+    )
     parser.add_argument(
         "--temperature",
         type=float,
@@ -276,7 +277,7 @@ def is_stage_requested(stage: str) -> bool:
 def main():
     """Main entry point for the ra-aid command line tool."""
     args = parse_arguments()
-    setup_logging(args.verbose)
+    setup_logging(args.verbose, args.pretty_logger)
     logger.debug("Starting RA.Aid with arguments: %s", args)
     # Launch web interface if requested
@@ -378,9 +379,9 @@ def main():
                 chat_agent,
                 CHAT_PROMPT.format(
                     initial_request=initial_request,
-                    web_research_section=WEB_RESEARCH_PROMPT_SECTION_CHAT
-                    if web_research_enabled
-                    else "",
+                    web_research_section=(
+                        WEB_RESEARCH_PROMPT_SECTION_CHAT if web_research_enabled else ""
+                    ),
                     working_directory=working_directory,
                     current_date=current_date,
                     project_info=formatted_project_info,
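
A quick sketch of the new flag's wiring (assuming the parser accepts an explicit argv list, per `parse_arguments(args=None)` above, and that no other options are required in this configuration):

```python
from ra_aid.__main__ import parse_arguments

# Hypothetical invocation: --pretty-logger surfaces as args.pretty_logger,
# which main() forwards to setup_logging(args.verbose, args.pretty_logger).
args = parse_arguments(["--provider", "anthropic", "--verbose", "--pretty-logger"])
assert args.verbose and args.pretty_logger
```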

ra_aid/agents/ciayn_agent.py

@@ -4,6 +4,7 @@ from typing import Any, Dict, Generator, List, Optional, Union
 from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage
+from ra_aid.config import DEFAULT_MAX_TOOL_FAILURES
 from ra_aid.exceptions import ToolExecutionError
 from ra_aid.logging_config import get_logger
 from ra_aid.models_params import DEFAULT_TOKEN_LIMIT
@@ -68,22 +69,6 @@ class CiaynAgent:
     - Memory management with configurable limits
     """
-    class ToolCallFailure:
-        """Tracks consecutive failures and fallback model usage for tool calls.
-
-        Attributes:
-            consecutive_failures (int): Count of consecutive failures for current model
-            current_provider (Optional[str]): Current provider being used
-            current_model (Optional[str]): Current model being used
-            used_fallbacks (Set[str]): Set of fallback models already attempted
-        """
-
-        def __init__(self):
-            self.consecutive_failures = 0
-            self.current_provider = None
-            self.current_model = None
-            self.used_fallbacks = set()
-
     def __init__(
         self,
         model,
@@ -106,10 +91,8 @@ class CiaynAgent:
         self.config = config
         self.provider = config.get("provider", "openai")
         self.fallback_enabled = config.get("fallback_tool_enabled", True)
-        fallback_models_str = config.get("fallback_tool_models", "gpt-3.5-turbo,gpt-4")
-        self.fallback_tool_models = [
-            m.strip() for m in fallback_models_str.split(",") if m.strip()
-        ]
+        self.fallback_tool_models = self._load_fallback_tool_models(config)
         self.model = model
         self.tools = tools
         self.max_history_messages = max_history_messages
@@ -117,7 +100,18 @@ class CiaynAgent:
         self.available_functions = []
         for t in tools:
             self.available_functions.append(get_function_info(t.func))
-        self._tool_failure = CiaynAgent.ToolCallFailure()
+        self.tool_failure_consecutive_failures = 0
+        self.tool_failure_current_provider = None
+        self.tool_failure_current_model = None
+        self.tool_failure_used_fallbacks = set()
+
+    def _load_fallback_tool_models(self, config: dict) -> list:
+        fallback_tool_models_config = config.get("fallback_tool_models")
+        if fallback_tool_models_config:
+            return [m.strip() for m in fallback_tool_models_config.split(",") if m.strip()]
+        else:
+            from ra_aid.tool_leaderboard import supported_top_tool_models
+
+            return [item["model"] for item in supported_top_tool_models[:5]]

     def _build_prompt(self, last_result: Optional[str] = None) -> str:
         """Build the prompt for the agent including available tools and context."""
@@ -255,48 +249,85 @@ Output **ONLY THE CODE** and **NO MARKDOWN BACKTICKS**"""
         last_error = None
         while retries < max_retries:
             try:
+                logger.debug(
+                    f"_execute_tool: attempt {retries+1}, original code: {code}"
+                )
                 code = code.strip()
                 if validate_function_call_pattern(code):
                     functions_list = "\n\n".join(self.available_functions)
                     code = _extract_tool_call(code, functions_list)
                 globals_dict = {tool.func.__name__: tool.func for tool in self.tools}
+                logger.debug(f"_execute_tool: evaluating code: {code}")
                 result = eval(code, globals_dict)
-                self._tool_failure.consecutive_failures = 0
+                logger.debug(
+                    f"_execute_tool: tool executed successfully with result: {result}"
+                )
+                self.tool_failure_consecutive_failures = 0
                 return result
             except Exception as e:
+                logger.debug(f"_execute_tool: exception caught: {e}")
                 self._handle_tool_failure(code, e)
                 last_error = e
                 retries += 1
+                logger.debug(f"_execute_tool: retrying, new attempt count: {retries}")
         raise ToolExecutionError(
             f"Error executing code after {max_retries} attempts: {str(last_error)}"
         )

     def _handle_tool_failure(self, code: str, error: Exception) -> None:
-        self._tool_failure.consecutive_failures += 1
-        max_failures = self.config.get("max_tool_failures", 3)
+        logger.debug(
+            f"_handle_tool_failure: tool failure encountered for code '{code}' with error: {error}"
+        )
+        self.tool_failure_consecutive_failures += 1
+        max_failures = self.config.get("max_tool_failures", DEFAULT_MAX_TOOL_FAILURES)
+        logger.debug(
+            f"_handle_tool_failure: failure count {self.tool_failure_consecutive_failures}, max_failures {max_failures}"
+        )
         if (
             self.fallback_enabled
-            and self._tool_failure.consecutive_failures >= max_failures
+            and self.tool_failure_consecutive_failures >= max_failures
             and self.fallback_tool_models
         ):
+            logger.debug(
+                "_handle_tool_failure: threshold reached, invoking fallback mechanism."
+            )
            self._attempt_fallback(code)

     def _attempt_fallback(self, code: str) -> None:
+        logger.debug(f"_attempt_fallback: initiating fallback for code: {code}")
         new_model = self.fallback_tool_models[0]
-        failed_tool_call_name = code.split('(')[0].strip()
+        failed_tool_call_name = code.split("(")[0].strip()
         logger.error(
-            f"Tool call failed {self._tool_failure.consecutive_failures} times. Attempting fallback to model: {new_model} for tool: {failed_tool_call_name}"
+            f"Tool call failed {self.tool_failure_consecutive_failures} times. Attempting fallback to model: {new_model} for tool: {failed_tool_call_name}"
         )
         try:
-            from ra_aid.llm import initialize_llm, merge_chat_history, validate_provider_env
+            from ra_aid.llm import (
+                initialize_llm,
+                merge_chat_history,
+                validate_provider_env,
+            )
+
+            logger.debug(f"_attempt_fallback: validating provider {self.provider}")
             if not validate_provider_env(self.provider):
-                logger.error(f"Missing environment configuration for provider {self.provider}. Cannot fallback.")
+                logger.error(
+                    f"Missing environment configuration for provider {self.provider}. Cannot fallback."
+                )
             else:
+                logger.debug(
+                    f"_attempt_fallback: initializing fallback model {new_model}"
+                )
                 self.model = initialize_llm(self.provider, new_model)
+                logger.debug(
+                    f"_attempt_fallback: binding tools to new model using tool: {failed_tool_call_name}"
+                )
                 self.model.bind_tools(self.tools, tool_choice=failed_tool_call_name)
-                self._tool_failure.used_fallbacks.add(new_model)
-                merge_chat_history()  # Assuming merge_chat_history handles merging fallback history
-                self._tool_failure.consecutive_failures = 0
+                self.tool_failure_used_fallbacks.add(new_model)
+                logger.debug("_attempt_fallback: merging chat history for fallback")
+                merge_chat_history()
+                self.tool_failure_consecutive_failures = 0
+                logger.debug(
+                    "_attempt_fallback: fallback successful and tool failure counter reset"
+                )
         except Exception as switch_e:
             logger.error(f"Fallback model switching failed: {switch_e}")

ra_aid/config.py

@@ -3,4 +3,12 @@
 DEFAULT_RECURSION_LIMIT = 100
 DEFAULT_MAX_TEST_CMD_RETRIES = 3
 DEFAULT_MAX_TOOL_FAILURES = 3
+MAX_TOOL_FAILURES = 3
+VALID_PROVIDERS = [
+    "anthropic",
+    "openai",
+    "openrouter",
+    "openai-compatible",
+    "deepseek",
+    "gemini",
+]

ra_aid/logging_config.py

@@ -1,13 +1,48 @@
 import logging
 import sys
 from typing import Optional
+
+from rich.console import Console
+from rich.panel import Panel
+from rich.markdown import Markdown

-def setup_logging(verbose: bool = False) -> None:
+
+class PrettyHandler(logging.Handler):
+    def __init__(self, level=logging.NOTSET):
+        super().__init__(level)
+        self.console = Console()
+
+    def emit(self, record):
+        try:
+            msg = self.format(record)
+            # Determine title and style based on log level
+            if record.levelno >= logging.CRITICAL:
+                title = "🔥 CRITICAL"
+                style = "bold red"
+            elif record.levelno >= logging.ERROR:
+                title = "❌ ERROR"
+                style = "red"
+            elif record.levelno >= logging.WARNING:
+                title = "⚠️ WARNING"
+                style = "yellow"
+            elif record.levelno >= logging.INFO:
+                title = " INFO"
+                style = "green"
+            else:
+                title = "🐞 DEBUG"
+                style = "blue"
+            self.console.print(Panel(Markdown(msg.strip()), title=title, style=style))
+        except Exception:
+            self.handleError(record)
+
+
+def setup_logging(verbose: bool = False, pretty: bool = False) -> None:
     logger = logging.getLogger("ra_aid")
     logger.setLevel(logging.DEBUG if verbose else logging.INFO)

     if not logger.handlers:
-        handler = logging.StreamHandler(sys.stdout)
+        if pretty:
+            handler = PrettyHandler()
+        else:
+            handler = logging.StreamHandler(sys.stdout)
         formatter = logging.Formatter(
             "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

ra_aid/tool_leaderboard.py Normal file (529 lines added)

@@ -0,0 +1,529 @@
from ra_aid.config import VALID_PROVIDERS
# Data extracted at 2/10/2025:
# https://gorilla.cs.berkeley.edu/leaderboard.html
# In order of overall_acc
leaderboard_data = [
{
"overall_acc": 74.31,
"model": "watt-tool-70B",
"type": "FC",
"link": "https://huggingface.co/watt-ai/watt-tool-70B/",
"cost": "N/A",
"latency": 3.4,
"ast_summary": 84.06,
"exec_summary": 89.39,
"live_ast_acc": 77.74,
"multi_turn_acc": 58.75,
"relevance": 94.44,
"irrelevance": 76.32,
"organization": "Watt AI Lab",
"license": "Apache-2.0",
"provider": "unknown",
},
{
"overall_acc": 72.08,
"model": "gpt-4o-2024-11-20",
"type": "Prompt",
"link": "https://openai.com/index/hello-gpt-4o/",
"cost": 13.54,
"latency": 0.78,
"ast_summary": 88.1,
"exec_summary": 89.38,
"live_ast_acc": 79.83,
"multi_turn_acc": 47.62,
"relevance": 83.33,
"irrelevance": 83.76,
"organization": "OpenAI",
"license": "Proprietary",
"provider": "openai",
},
{
"overall_acc": 69.58,
"model": "gpt-4o-2024-11-20",
"type": "FC",
"link": "https://openai.com/index/hello-gpt-4o/",
"cost": 8.23,
"latency": 1.11,
"ast_summary": 87.42,
"exec_summary": 89.2,
"live_ast_acc": 79.65,
"multi_turn_acc": 41,
"relevance": 83.33,
"irrelevance": 83.15,
"organization": "OpenAI",
"license": "Proprietary",
"provider": "openai",
},
{
"overall_acc": 67.98,
"model": "watt-tool-8B",
"type": "FC",
"link": "https://huggingface.co/watt-ai/watt-tool-8B/",
"cost": "N/A",
"latency": 1.31,
"ast_summary": 86.56,
"exec_summary": 89.34,
"live_ast_acc": 76.5,
"multi_turn_acc": 39.12,
"relevance": 83.33,
"irrelevance": 83.15,
"organization": "Watt AI Lab",
"license": "Apache-2.0",
"provider": "unknown",
},
{
"overall_acc": 67.88,
"model": "GPT-4-turbo-2024-04-09",
"type": "FC",
"link": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
"cost": 33.22,
"latency": 2.47,
"ast_summary": 84.73,
"exec_summary": 85.21,
"live_ast_acc": 80.5,
"multi_turn_acc": 38.12,
"relevance": 72.22,
"irrelevance": 83.81,
"organization": "OpenAI",
"license": "Proprietary",
"provider": "openai",
},
{
"overall_acc": 66.73,
"model": "o1-2024-12-17",
"type": "Prompt",
"link": "https://openai.com/o1/",
"cost": 102.47,
"latency": 5.3,
"ast_summary": 85.67,
"exec_summary": 79.77,
"live_ast_acc": 80.63,
"multi_turn_acc": 36,
"relevance": 72.22,
"irrelevance": 87.78,
"organization": "OpenAI",
"license": "Proprietary",
"provider": "openai",
},
{
"overall_acc": 64.1,
"model": "GPT-4o-mini-2024-07-18",
"type": "FC",
"link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
"cost": 0.51,
"latency": 1.49,
"ast_summary": 85.21,
"exec_summary": 83.57,
"live_ast_acc": 74.41,
"multi_turn_acc": 34.12,
"relevance": 83.33,
"irrelevance": 74.75,
"organization": "OpenAI",
"license": "Proprietary",
"provider": "openai",
},
{
"overall_acc": 62.79,
"model": "o1-mini-2024-09-12",
"type": "Prompt",
"link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
"cost": 29.76,
"latency": 8.44,
"ast_summary": 78.92,
"exec_summary": 82.7,
"live_ast_acc": 78.14,
"multi_turn_acc": 28.25,
"relevance": 61.11,
"irrelevance": 89.62,
"organization": "OpenAI",
"license": "Proprietary",
"provider": "openai",
},
{
"overall_acc": 62.73,
"model": "Functionary-Medium-v3.1",
"type": "FC",
"link": "https://huggingface.co/meetkai/functionary-medium-v3.1",
"cost": "N/A",
"latency": 14.06,
"ast_summary": 89.88,
"exec_summary": 91.32,
"live_ast_acc": 76.63,
"multi_turn_acc": 21.62,
"relevance": 72.22,
"irrelevance": 76.08,
"organization": "MeetKai",
"license": "MIT",
"provider": "unknown",
},
{
"overall_acc": 62.19,
"model": "Gemini-1.5-Pro-002",
"type": "Prompt",
"link": "https://deepmind.google/technologies/gemini/pro/",
"cost": 7.05,
"latency": 5.94,
"ast_summary": 88.58,
"exec_summary": 91.27,
"live_ast_acc": 76.72,
"multi_turn_acc": 20.75,
"relevance": 72.22,
"irrelevance": 78.15,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
{
"overall_acc": 61.83,
"model": "Hammer2.1-7b",
"type": "FC",
"link": "https://huggingface.co/MadeAgents/Hammer2.1-7b",
"cost": "N/A",
"latency": 2.08,
"ast_summary": 88.65,
"exec_summary": 85.48,
"live_ast_acc": 75.11,
"multi_turn_acc": 23.5,
"relevance": 82.35,
"irrelevance": 78.59,
"organization": "MadeAgents",
"license": "cc-by-nc-4.0",
"provider": "unknown",
},
{
"overall_acc": 61.74,
"model": "Gemini-2.0-Flash-Exp",
"type": "Prompt",
"link": "https://deepmind.google/technologies/gemini/flash/",
"cost": 0.0,
"latency": 1.18,
"ast_summary": 89.96,
"exec_summary": 79.89,
"live_ast_acc": 82.01,
"multi_turn_acc": 17.88,
"relevance": 77.78,
"irrelevance": 86.44,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
{
"overall_acc": 61.38,
"model": "Amazon-Nova-Pro-v1:0",
"type": "FC",
"link": "https://aws.amazon.com/cn/ai/generative-ai/nova/",
"cost": 5.26,
"latency": 2.67,
"ast_summary": 84.46,
"exec_summary": 85.64,
"live_ast_acc": 74.32,
"multi_turn_acc": 26.12,
"relevance": 77.78,
"irrelevance": 70.98,
"organization": "Amazon",
"license": "Proprietary",
"provider": "unknown",
},
{
"overall_acc": 61.31,
"model": "Qwen2.5-72B-Instruct",
"type": "Prompt",
"link": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
"cost": "N/A",
"latency": 3.72,
"ast_summary": 90.81,
"exec_summary": 92.7,
"live_ast_acc": 75.3,
"multi_turn_acc": 18,
"relevance": 100,
"irrelevance": 72.81,
"organization": "Qwen",
"license": "qwen",
"provider": "unknown",
},
{
"overall_acc": 60.97,
"model": "Gemini-1.5-Pro-002",
"type": "FC",
"link": "https://deepmind.google/technologies/gemini/pro/",
"cost": 5.39,
"latency": 2.07,
"ast_summary": 87.29,
"exec_summary": 84.61,
"live_ast_acc": 76.28,
"multi_turn_acc": 21.62,
"relevance": 72.22,
"irrelevance": 76.9,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
{
"overall_acc": 60.89,
"model": "GPT-4o-mini-2024-07-18",
"type": "Prompt",
"link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
"cost": 0.84,
"latency": 1.31,
"ast_summary": 86.77,
"exec_summary": 80.84,
"live_ast_acc": 76.5,
"multi_turn_acc": 22,
"relevance": 83.33,
"irrelevance": 80.67,
"organization": "OpenAI",
"license": "Proprietary",
"provider": "openai",
},
{
"overall_acc": 60.59,
"model": "Gemini-2.0-Flash-Exp",
"type": "FC",
"link": "https://deepmind.google/technologies/gemini/flash/",
"cost": 0.0,
"latency": 0.85,
"ast_summary": 85.1,
"exec_summary": 77.46,
"live_ast_acc": 79.03,
"multi_turn_acc": 20.25,
"relevance": 55.56,
"irrelevance": 91.51,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
{
"overall_acc": 60.46,
"model": "Gemini-1.5-Pro-001",
"type": "Prompt",
"link": "https://deepmind.google/technologies/gemini/pro/",
"cost": 7.0,
"latency": 1.54,
"ast_summary": 85.56,
"exec_summary": 85.77,
"live_ast_acc": 76.68,
"multi_turn_acc": 18.88,
"relevance": 55.56,
"irrelevance": 84.81,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
{
"overall_acc": 60.38,
"model": "Gemini-Exp-1206",
"type": "FC",
"link": "https://blog.google/feed/gemini-exp-1206/",
"cost": 0.0,
"latency": 3.42,
"ast_summary": 85.17,
"exec_summary": 80.86,
"live_ast_acc": 78.54,
"multi_turn_acc": 20.25,
"relevance": 77.78,
"irrelevance": 79.64,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
{
"overall_acc": 59.67,
"model": "Qwen2.5-32B-Instruct",
"type": "Prompt",
"link": "https://huggingface.co/Qwen/Qwen2.5-32B-Instruct",
"cost": "N/A",
"latency": 2.26,
"ast_summary": 85.81,
"exec_summary": 89.79,
"live_ast_acc": 74.23,
"multi_turn_acc": 17.75,
"relevance": 100,
"irrelevance": 73.75,
"organization": "Qwen",
"license": "apache-2.0",
"provider": "unknown",
},
{
"overall_acc": 59.57,
"model": "GPT-4-turbo-2024-04-09",
"type": "Prompt",
"link": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
"cost": 58.87,
"latency": 1.24,
"ast_summary": 90.88,
"exec_summary": 89.45,
"live_ast_acc": 63.84,
"multi_turn_acc": 30.25,
"relevance": 100,
"irrelevance": 35.57,
"organization": "OpenAI",
"license": "Proprietary",
"provider": "openai",
},
{
"overall_acc": 59.42,
"model": "Gemini-1.5-Pro-001",
"type": "FC",
"link": "https://deepmind.google/technologies/gemini/pro/",
"cost": 5.1,
"latency": 1.43,
"ast_summary": 84.33,
"exec_summary": 87.95,
"live_ast_acc": 76.23,
"multi_turn_acc": 16,
"relevance": 50,
"irrelevance": 84.39,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
{
"overall_acc": 59.07,
"model": "Hammer2.1-3b",
"type": "FC",
"link": "https://huggingface.co/MadeAgents/Hammer2.1-3b",
"cost": "N/A",
"latency": 1.95,
"ast_summary": 86.85,
"exec_summary": 84.09,
"live_ast_acc": 74.04,
"multi_turn_acc": 17.38,
"relevance": 82.35,
"irrelevance": 81.87,
"organization": "MadeAgents",
"license": "qwen-research",
"provider": "unknown",
},
{
"overall_acc": 58.45,
"model": "mistral-large-2407",
"type": "FC",
"link": "https://mistral.ai/news/mistral-large-2407/",
"cost": 12.68,
"latency": 3.12,
"ast_summary": 86.81,
"exec_summary": 84.38,
"live_ast_acc": 69.88,
"multi_turn_acc": 23.75,
"relevance": 72.22,
"irrelevance": 52.85,
"organization": "Mistral AI",
"license": "Proprietary",
"provider": "mistral",
},
{
"overall_acc": 58.42,
"model": "ToolACE-8B",
"type": "FC",
"link": "https://huggingface.co/Team-ACE/ToolACE-8B",
"cost": "N/A",
"latency": 5.24,
"ast_summary": 87.54,
"exec_summary": 89.21,
"live_ast_acc": 78.59,
"multi_turn_acc": 7.75,
"relevance": 83.33,
"irrelevance": 87.88,
"organization": "Huawei Noah & USTC",
"license": "Apache-2.0",
"provider": "unknown",
},
{
"overall_acc": 57.78,
"model": "xLAM-8x22b-r",
"type": "FC",
"link": "https://huggingface.co/Salesforce/xLAM-8x22b-r",
"cost": "N/A",
"latency": 9.26,
"ast_summary": 83.69,
"exec_summary": 87.88,
"live_ast_acc": 72.59,
"multi_turn_acc": 16.25,
"relevance": 88.89,
"irrelevance": 67.81,
"organization": "Salesforce",
"license": "cc-by-nc-4.0",
"provider": "unknown",
},
{
"overall_acc": 57.68,
"model": "Qwen2.5-14B-Instruct",
"type": "Prompt",
"link": "https://huggingface.co/Qwen/Qwen2.5-14B-Instruct",
"cost": "N/A",
"latency": 2.02,
"ast_summary": 85.69,
"exec_summary": 88.84,
"live_ast_acc": 74.14,
"multi_turn_acc": 12.25,
"relevance": 77.78,
"irrelevance": 77.06,
"organization": "Qwen",
"license": "apache-2.0",
"provider": "unknown",
},
{
"overall_acc": 57.23,
"model": "DeepSeek-V3",
"type": "FC",
"link": "https://api-docs.deepseek.com/news/news1226",
"cost": "N/A",
"latency": 2.58,
"ast_summary": 89.17,
"exec_summary": 83.39,
"live_ast_acc": 68.41,
"multi_turn_acc": 18.62,
"relevance": 88.89,
"irrelevance": 59.36,
"organization": "DeepSeek",
"license": "DeepSeek License",
"provider": "unknown",
},
{
"overall_acc": 57.09,
"model": "Gemini-1.5-Flash-001",
"type": "Prompt",
"link": "https://deepmind.google/technologies/gemini/flash/",
"cost": 0.48,
"latency": 0.71,
"ast_summary": 85.69,
"exec_summary": 83.59,
"live_ast_acc": 68.9,
"multi_turn_acc": 19.5,
"relevance": 83.33,
"irrelevance": 62.78,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
{
"overall_acc": 56.79,
"model": "Gemini-1.5-Flash-002",
"type": "Prompt",
"link": "https://deepmind.google/technologies/gemini/flash/",
"cost": 0.46,
"latency": 0.81,
"ast_summary": 81.65,
"exec_summary": 80.64,
"live_ast_acc": 76.72,
"multi_turn_acc": 12.5,
"relevance": 83.33,
"irrelevance": 78.49,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
]
supported_top_tool_models = [
{
"cost": item["cost"],
"model": item["model"],
"type": item["type"],
"provider": item["provider"],
}
for item in leaderboard_data
if item["provider"] in VALID_PROVIDERS
]
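
Of the provider values present in this snapshot ("openai", "google", "mistral", "unknown"), only "openai" is in VALID_PROVIDERS, so the filtered list, whose first five entries become the agent's default fallback chain, currently contains only OpenAI models:

```python
from ra_aid.tool_leaderboard import supported_top_tool_models

# The default fallback chain used by CiaynAgent._load_fallback_tool_models
# when no fallback_tool_models string is configured.
print([item["model"] for item in supported_top_tool_models[:5]])
# ['gpt-4o-2024-11-20', 'gpt-4o-2024-11-20', 'GPT-4-turbo-2024-04-09',
#  'o1-2024-12-17', 'GPT-4o-mini-2024-07-18']
```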


@@ -31,6 +31,7 @@ class DummyModel:
             content = "dummy_tool()"
         return Response()
+
     def bind_tools(self, tools, tool_choice):
         pass
@@ -267,6 +268,7 @@ class TestCiaynAgentNewMethods(unittest.TestCase):
         # Create a dummy tool that always fails for testing fallback
         def always_fail():
             raise Exception("Failure for fallback test")
+
         self.always_fail_tool = DummyTool(always_fail)
         # Create a dummy model that does minimal work for fallback tests
         self.dummy_model = DummyModel()
@@ -274,24 +276,33 @@ class TestCiaynAgentNewMethods(unittest.TestCase):
         self.agent = CiaynAgent(
             self.dummy_model,
             [self.always_fail_tool],
-            config={"max_tool_failures": 2, "fallback_tool_models": "dummy-fallback-model"}
+            config={
+                "max_tool_failures": 2,
+                "fallback_tool_models": "dummy-fallback-model",
+            },
         )

     def test_handle_tool_failure_increments_counter(self):
-        initial_failures = self.agent._tool_failure.consecutive_failures
+        initial_failures = self.agent.tool_failure_consecutive_failures
         self.agent._handle_tool_failure("dummy_call()", Exception("Test error"))
-        self.assertEqual(self.agent._tool_failure.consecutive_failures, initial_failures + 1)
+        self.assertEqual(
+            self.agent.tool_failure_consecutive_failures, initial_failures + 1
+        )

     def test_attempt_fallback_invokes_fallback_logic(self):
         # Monkey-patch initialize_llm, merge_chat_history, and validate_provider_env
         # to simulate fallback switching without external dependencies.
         def dummy_initialize_llm(provider, model_name, temperature=None):
             return self.dummy_model

         def dummy_merge_chat_history():
             return ["merged"]

         def dummy_validate_provider_env(provider):
             return True

         import ra_aid.llm as llm

         original_initialize = llm.initialize_llm
         original_merge = llm.merge_chat_history
         original_validate = llm.validate_provider_env
@@ -300,14 +311,15 @@ class TestCiaynAgentNewMethods(unittest.TestCase):
         llm.validate_provider_env = dummy_validate_provider_env

         # Set failure counter high enough to trigger fallback in _handle_tool_failure
-        self.agent._tool_failure.consecutive_failures = 2
+        self.agent.tool_failure_consecutive_failures = 2

         # Call _attempt_fallback; it should reset the failure counter to 0 on success.
         self.agent._attempt_fallback("always_fail_tool()")
-        self.assertEqual(self.agent._tool_failure.consecutive_failures, 0)
+        self.assertEqual(self.agent.tool_failure_consecutive_failures, 0)

         # Restore original functions
         llm.initialize_llm = original_initialize
         llm.merge_chat_history = original_merge
         llm.validate_provider_env = original_validate

 if __name__ == "__main__":
     unittest.main()