feat(fallback): implement automatic fallback to alternative LLM models on consecutive failures to enhance user experience and prevent infinite error loops
refactor(ciayn_agent): restructure tool failure handling to track consecutive failures and fallback attempts more effectively
fix(logging): add pretty logging option for improved log readability
chore(config): define valid providers for LLM selection and update fallback model loading logic
test(ciayn_agent): add unit tests for fallback logic and tool failure handling to ensure reliability and correctness
parent 45b993cfd0
commit d8ee4e04f4

issue.md (47 lines changed)
@@ -4,15 +4,7 @@

Add functionality to automatically fall back to alternative LLM models when a tool call experiences multiple consecutive failures.

## Background

Currently, when a tool call fails due to LLM-related errors (e.g., API timeouts, rate limits, context length issues), there is no automatic fallback mechanism. This can lead to interrupted workflows and poor user experience.

## Relevant Files

- ra_aid/agents/ciayn_agent.py
- ra_aid/llm.py
- ra_aid/agent_utils.py
- ra_aid/__main__.py
- ra_aid/models_params.py

Currently, when a tool call fails due to LLM-related errors (e.g., invalid format), there is no automatic fallback mechanism. This often causes an infinite loop of erroring tool calls.

## Implementation Details
@@ -59,32 +51,25 @@ The prompt passed to `try_fallback_model` should be the last few failing

Define fallback sequences for each provider based on model capabilities:

1. Try same provider's smaller models
2. Try alternative providers' equivalent models
2. Try alternative providers' similar models
3. Raise final error if all fallbacks fail

### Provider Strategy Updates

Update provider strategies to support fallback configuration (a minimal sketch follows this list):

- Add provider-specific fallback sequences
- Handle model capability validation during fallback
- Track successful/failed attempts
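A minimal sketch of what a provider fallback sequence could look like. The `FALLBACK_SEQUENCES` name, the `next_fallback` helper, and the model choices are illustrative assumptions, not part of this change:

```python
from typing import Optional

# Illustrative only: ordered fallback candidates per provider.
FALLBACK_SEQUENCES = {
    "anthropic": ["claude-3-5-haiku-20241022", "gpt-4o", "gpt-4o-mini"],
    "openai": ["gpt-4o-mini", "claude-3-5-sonnet-20241022"],
}


def next_fallback(provider: str, used_fallbacks: set) -> Optional[str]:
    """Return the first fallback model not yet attempted, or None if exhausted."""
    for model in FALLBACK_SEQUENCES.get(provider, []):
        if model not in used_fallbacks:
            return model
    return None  # all candidates exhausted; caller raises the final error (step 3 above)
```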
## Risks and Mitigations

1. **Performance Impact**
   - Risk: Multiple fallback attempts could increase latency
   - Mitigation: Set reasonable max_failures limit and timeouts

2. **Consistency**
   - Risk: Different models may give slightly different outputs
   - Mitigation: Validate output schema consistency across models

3. **Cost**
1. **Cost**
   - Risk: Fallback to more expensive models
   - Mitigation: Configure cost limits and preferred fallback sequences

4. **State Management**
2. **State Management**
   - Risk: Loss of context during fallbacks
   - Mitigation: Preserve conversation state and tool context
## Relevant Files

- ra_aid/agents/ciayn_agent.py
- ra_aid/llm.py
- ra_aid/agent_utils.py
- ra_aid/__main__.py
- ra_aid/models_params.py

## Acceptance Criteria

1. Tool calls automatically attempt fallback models after N consecutive failures
2. `--no-fallback-tool` argument successfully disables fallback behavior
@@ -93,16 +78,6 @@ Update provider strategies to support fallback configuration:

5. Unit tests cover fallback scenarios and edge cases
6. README.md updated to reflect new behavior

## Testing

1. Unit tests for fallback wrapper
2. Integration tests with mock LLM failures
3. Provider strategy fallback tests
4. Command line argument handling
5. Error preservation and reporting
6. Performance impact measurement
7. Edge cases (e.g., partial failures, timeout handling)
8. State preservation during fallbacks

## Documentation Updates

1. Add fallback feature to main README
2. Document `--no-fallback-tool` in CLI help (see the sketch below)
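The `--no-fallback-tool` flag itself does not appear in the hunks below; this is a hedged sketch of how the documented behavior could be wired, with names taken from the issue text and the `fallback_tool_enabled` config key used in ciayn_agent.py later in this diff:

```python
import argparse

parser = argparse.ArgumentParser(prog="ra-aid")
parser.add_argument(
    "--no-fallback-tool",
    action="store_true",
    help="Disable automatic fallback to alternative models on tool failure",
)

args = parser.parse_args(["--no-fallback-tool"])
# The agent reads this through its config dict (key matches ciayn_agent.py).
config = {"fallback_tool_enabled": not args.no_fallback_tool}
assert config["fallback_tool_enabled"] is False
```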
ra_aid/__main__.py
@@ -17,7 +17,11 @@ from ra_aid.agent_utils import (
    run_planning_agent,
    run_research_agent,
)
from ra_aid.config import DEFAULT_MAX_TEST_CMD_RETRIES, DEFAULT_RECURSION_LIMIT
from ra_aid.config import (
    DEFAULT_MAX_TEST_CMD_RETRIES,
    DEFAULT_RECURSION_LIMIT,
    VALID_PROVIDERS,
)
from ra_aid.dependencies import check_dependencies
from ra_aid.env import validate_environment
from ra_aid.llm import initialize_llm

@@ -40,14 +44,6 @@ def launch_webui(host: str, port: int):


def parse_arguments(args=None):
    VALID_PROVIDERS = [
        "anthropic",
        "openai",
        "openrouter",
        "openai-compatible",
        "deepseek",
        "gemini",
    ]
    ANTHROPIC_DEFAULT_MODEL = "claude-3-5-sonnet-20241022"
    OPENAI_DEFAULT_MODEL = "gpt-4o"

@@ -80,9 +76,11 @@ Examples:
    parser.add_argument(
        "--provider",
        type=str,
        default="openai"
        if (os.getenv("OPENAI_API_KEY") and not os.getenv("ANTHROPIC_API_KEY"))
        else "anthropic",
        default=(
            "openai"
            if (os.getenv("OPENAI_API_KEY") and not os.getenv("ANTHROPIC_API_KEY"))
            else "anthropic"
        ),
        choices=VALID_PROVIDERS,
        help="The LLM provider to use",
    )

@@ -138,6 +136,9 @@ Examples:
    parser.add_argument(
        "--verbose", action="store_true", help="Enable verbose logging output"
    )
    parser.add_argument(
        "--pretty-logger", action="store_true", help="Enable pretty logging output"
    )
    parser.add_argument(
        "--temperature",
        type=float,

@@ -276,7 +277,7 @@ def is_stage_requested(stage: str) -> bool:
def main():
    """Main entry point for the ra-aid command line tool."""
    args = parse_arguments()
    setup_logging(args.verbose)
    setup_logging(args.verbose, args.pretty_logger)
    logger.debug("Starting RA.Aid with arguments: %s", args)

    # Launch web interface if requested

@@ -378,9 +379,9 @@ def main():
        chat_agent,
        CHAT_PROMPT.format(
            initial_request=initial_request,
            web_research_section=WEB_RESEARCH_PROMPT_SECTION_CHAT
            if web_research_enabled
            else "",
            web_research_section=(
                WEB_RESEARCH_PROMPT_SECTION_CHAT if web_research_enabled else ""
            ),
            working_directory=working_directory,
            current_date=current_date,
            project_info=formatted_project_info,
ra_aid/agents/ciayn_agent.py
@@ -4,6 +4,7 @@ from typing import Any, Dict, Generator, List, Optional, Union

from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage

from ra_aid.config import DEFAULT_MAX_TOOL_FAILURES
from ra_aid.exceptions import ToolExecutionError
from ra_aid.logging_config import get_logger
from ra_aid.models_params import DEFAULT_TOKEN_LIMIT

@@ -68,22 +69,6 @@ class CiaynAgent:
    - Memory management with configurable limits
    """

    class ToolCallFailure:
        """Tracks consecutive failures and fallback model usage for tool calls.

        Attributes:
            consecutive_failures (int): Count of consecutive failures for current model
            current_provider (Optional[str]): Current provider being used
            current_model (Optional[str]): Current model being used
            used_fallbacks (Set[str]): Set of fallback models already attempted
        """

        def __init__(self):
            self.consecutive_failures = 0
            self.current_provider = None
            self.current_model = None
            self.used_fallbacks = set()

    def __init__(
        self,
        model,

@@ -106,10 +91,8 @@ class CiaynAgent:
        self.config = config
        self.provider = config.get("provider", "openai")
        self.fallback_enabled = config.get("fallback_tool_enabled", True)
        fallback_models_str = config.get("fallback_tool_models", "gpt-3.5-turbo,gpt-4")
        self.fallback_tool_models = [
            m.strip() for m in fallback_models_str.split(",") if m.strip()
        ]
        self.fallback_tool_models = self._load_fallback_tool_models(config)

        self.model = model
        self.tools = tools
        self.max_history_messages = max_history_messages

@@ -117,7 +100,18 @@ class CiaynAgent:
        self.available_functions = []
        for t in tools:
            self.available_functions.append(get_function_info(t.func))
        self._tool_failure = CiaynAgent.ToolCallFailure()
        self.tool_failure_consecutive_failures = 0
        self.tool_failure_current_provider = None
        self.tool_failure_current_model = None
        self.tool_failure_used_fallbacks = set()

    def _load_fallback_tool_models(self, config: dict) -> list:
        fallback_tool_models_config = config.get("fallback_tool_models")
        if fallback_tool_models_config:
            return [m.strip() for m in fallback_tool_models_config.split(",") if m.strip()]
        else:
            from ra_aid.tool_leaderboard import supported_top_tool_models

            return [item["model"] for item in supported_top_tool_models[:5]]
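To make the two branches above concrete, here is a standalone sketch of the same logic with the leaderboard import stubbed out (illustrative, not repository code):

```python
def load_fallback_tool_models(config: dict) -> list:
    # Same shape as CiaynAgent._load_fallback_tool_models above.
    configured = config.get("fallback_tool_models")
    if configured:
        # Comma-separated string, whitespace-tolerant; empty items dropped.
        return [m.strip() for m in configured.split(",") if m.strip()]
    # Default path: top five entries from the tool leaderboard (stubbed here).
    supported_top_tool_models = [{"model": f"model-{i}"} for i in range(7)]
    return [item["model"] for item in supported_top_tool_models[:5]]


assert load_fallback_tool_models({"fallback_tool_models": " a , b ,"}) == ["a", "b"]
assert len(load_fallback_tool_models({})) == 5
```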

    def _build_prompt(self, last_result: Optional[str] = None) -> str:
        """Build the prompt for the agent including available tools and context."""

@@ -255,48 +249,85 @@ Output **ONLY THE CODE** and **NO MARKDOWN BACKTICKS**"""
        last_error = None
        while retries < max_retries:
            try:
                logger.debug(
                    f"_execute_tool: attempt {retries+1}, original code: {code}"
                )
                code = code.strip()
                if validate_function_call_pattern(code):
                    functions_list = "\n\n".join(self.available_functions)
                    code = _extract_tool_call(code, functions_list)
                globals_dict = {tool.func.__name__: tool.func for tool in self.tools}
                logger.debug(f"_execute_tool: evaluating code: {code}")
                result = eval(code, globals_dict)
                self._tool_failure.consecutive_failures = 0
                logger.debug(
                    f"_execute_tool: tool executed successfully with result: {result}"
                )
                self.tool_failure_consecutive_failures = 0
                return result
            except Exception as e:
                logger.debug(f"_execute_tool: exception caught: {e}")
                self._handle_tool_failure(code, e)
                last_error = e
                retries += 1
                logger.debug(f"_execute_tool: retrying, new attempt count: {retries}")
        raise ToolExecutionError(
            f"Error executing code after {max_retries} attempts: {str(last_error)}"
        )

    def _handle_tool_failure(self, code: str, error: Exception) -> None:
        self._tool_failure.consecutive_failures += 1
        max_failures = self.config.get("max_tool_failures", 3)
        logger.debug(
            f"_handle_tool_failure: tool failure encountered for code '{code}' with error: {error}"
        )
        self.tool_failure_consecutive_failures += 1
        max_failures = self.config.get("max_tool_failures", DEFAULT_MAX_TOOL_FAILURES)
        logger.debug(
            f"_handle_tool_failure: failure count {self.tool_failure_consecutive_failures}, max_failures {max_failures}"
        )
        if (
            self.fallback_enabled
            and self._tool_failure.consecutive_failures >= max_failures
            and self.tool_failure_consecutive_failures >= max_failures
            and self.fallback_tool_models
        ):
            logger.debug(
                "_handle_tool_failure: threshold reached, invoking fallback mechanism."
            )
            self._attempt_fallback(code)

    def _attempt_fallback(self, code: str) -> None:
        logger.debug(f"_attempt_fallback: initiating fallback for code: {code}")
        new_model = self.fallback_tool_models[0]
        failed_tool_call_name = code.split('(')[0].strip()
        failed_tool_call_name = code.split("(")[0].strip()
        logger.error(
            f"Tool call failed {self._tool_failure.consecutive_failures} times. Attempting fallback to model: {new_model} for tool: {failed_tool_call_name}"
            f"Tool call failed {self.tool_failure_consecutive_failures} times. Attempting fallback to model: {new_model} for tool: {failed_tool_call_name}"
        )
        try:
            from ra_aid.llm import initialize_llm, merge_chat_history, validate_provider_env
            from ra_aid.llm import (
                initialize_llm,
                merge_chat_history,
                validate_provider_env,
            )

            logger.debug(f"_attempt_fallback: validating provider {self.provider}")
            if not validate_provider_env(self.provider):
                logger.error(f"Missing environment configuration for provider {self.provider}. Cannot fallback.")
                logger.error(
                    f"Missing environment configuration for provider {self.provider}. Cannot fallback."
                )
            else:
                logger.debug(
                    f"_attempt_fallback: initializing fallback model {new_model}"
                )
                self.model = initialize_llm(self.provider, new_model)
                logger.debug(
                    f"_attempt_fallback: binding tools to new model using tool: {failed_tool_call_name}"
                )
                self.model.bind_tools(self.tools, tool_choice=failed_tool_call_name)
                self._tool_failure.used_fallbacks.add(new_model)
                merge_chat_history()  # Assuming merge_chat_history handles merging fallback history
                self._tool_failure.consecutive_failures = 0
                self.tool_failure_used_fallbacks.add(new_model)
                logger.debug("_attempt_fallback: merging chat history for fallback")
                merge_chat_history()
                self.tool_failure_consecutive_failures = 0
                logger.debug(
                    "_attempt_fallback: fallback successful and tool failure counter reset"
                )
        except Exception as switch_e:
            logger.error(f"Fallback model switching failed: {switch_e}")
ra_aid/config.py
@@ -3,4 +3,12 @@
DEFAULT_RECURSION_LIMIT = 100
DEFAULT_MAX_TEST_CMD_RETRIES = 3
DEFAULT_MAX_TOOL_FAILURES = 3
MAX_TOOL_FAILURES = 3

VALID_PROVIDERS = [
    "anthropic",
    "openai",
    "openrouter",
    "openai-compatible",
    "deepseek",
    "gemini",
]
ra_aid/logging_config.py
@@ -1,18 +1,53 @@
import logging
import sys
from typing import Optional

from rich.console import Console
from rich.panel import Panel
from rich.markdown import Markdown


def setup_logging(verbose: bool = False) -> None:
class PrettyHandler(logging.Handler):
    def __init__(self, level=logging.NOTSET):
        super().__init__(level)
        self.console = Console()

    def emit(self, record):
        try:
            msg = self.format(record)
            # Determine title and style based on log level
            if record.levelno >= logging.CRITICAL:
                title = "🔥 CRITICAL"
                style = "bold red"
            elif record.levelno >= logging.ERROR:
                title = "❌ ERROR"
                style = "red"
            elif record.levelno >= logging.WARNING:
                title = "⚠️ WARNING"
                style = "yellow"
            elif record.levelno >= logging.INFO:
                title = "ℹ️ INFO"
                style = "green"
            else:
                title = "🐞 DEBUG"
                style = "blue"
            self.console.print(Panel(Markdown(msg.strip()), title=title, style=style))
        except Exception:
            self.handleError(record)


def setup_logging(verbose: bool = False, pretty: bool = False) -> None:
    logger = logging.getLogger("ra_aid")
    logger.setLevel(logging.DEBUG if verbose else logging.INFO)

    if not logger.handlers:
        handler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )
        handler.setFormatter(formatter)
        if pretty:
            handler = PrettyHandler()
        else:
            handler = logging.StreamHandler(sys.stdout)
            formatter = logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            )
            handler.setFormatter(formatter)
        logger.addHandler(handler)
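A short usage sketch; the `ra_aid.logging_config` module path is an assumption based on the `from ra_aid.logging_config import get_logger` line in ciayn_agent.py above:

```python
import logging

from ra_aid.logging_config import setup_logging  # assumed module path

setup_logging(verbose=True, pretty=True)
logger = logging.getLogger("ra_aid")
logger.info("Shown as a green rich Panel titled 'ℹ️ INFO'")
logger.error("Shown as a red rich Panel titled '❌ ERROR'")
```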
ra_aid/tool_leaderboard.py
@@ -0,0 +1,529 @@
from ra_aid.config import VALID_PROVIDERS

# Data extracted at 2/10/2025:
# https://gorilla.cs.berkeley.edu/leaderboard.html
# In order of overall_acc
leaderboard_data = [
    {
        "overall_acc": 74.31,
        "model": "watt-tool-70B",
        "type": "FC",
        "link": "https://huggingface.co/watt-ai/watt-tool-70B/",
        "cost": "N/A",
        "latency": 3.4,
        "ast_summary": 84.06,
        "exec_summary": 89.39,
        "live_ast_acc": 77.74,
        "multi_turn_acc": 58.75,
        "relevance": 94.44,
        "irrelevance": 76.32,
        "organization": "Watt AI Lab",
        "license": "Apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 72.08,
        "model": "gpt-4o-2024-11-20",
        "type": "Prompt",
        "link": "https://openai.com/index/hello-gpt-4o/",
        "cost": 13.54,
        "latency": 0.78,
        "ast_summary": 88.1,
        "exec_summary": 89.38,
        "live_ast_acc": 79.83,
        "multi_turn_acc": 47.62,
        "relevance": 83.33,
        "irrelevance": 83.76,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 69.58,
        "model": "gpt-4o-2024-11-20",
        "type": "FC",
        "link": "https://openai.com/index/hello-gpt-4o/",
        "cost": 8.23,
        "latency": 1.11,
        "ast_summary": 87.42,
        "exec_summary": 89.2,
        "live_ast_acc": 79.65,
        "multi_turn_acc": 41,
        "relevance": 83.33,
        "irrelevance": 83.15,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 67.98,
        "model": "watt-tool-8B",
        "type": "FC",
        "link": "https://huggingface.co/watt-ai/watt-tool-8B/",
        "cost": "N/A",
        "latency": 1.31,
        "ast_summary": 86.56,
        "exec_summary": 89.34,
        "live_ast_acc": 76.5,
        "multi_turn_acc": 39.12,
        "relevance": 83.33,
        "irrelevance": 83.15,
        "organization": "Watt AI Lab",
        "license": "Apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 67.88,
        "model": "GPT-4-turbo-2024-04-09",
        "type": "FC",
        "link": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
        "cost": 33.22,
        "latency": 2.47,
        "ast_summary": 84.73,
        "exec_summary": 85.21,
        "live_ast_acc": 80.5,
        "multi_turn_acc": 38.12,
        "relevance": 72.22,
        "irrelevance": 83.81,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 66.73,
        "model": "o1-2024-12-17",
        "type": "Prompt",
        "link": "https://openai.com/o1/",
        "cost": 102.47,
        "latency": 5.3,
        "ast_summary": 85.67,
        "exec_summary": 79.77,
        "live_ast_acc": 80.63,
        "multi_turn_acc": 36,
        "relevance": 72.22,
        "irrelevance": 87.78,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 64.1,
        "model": "GPT-4o-mini-2024-07-18",
        "type": "FC",
        "link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
        "cost": 0.51,
        "latency": 1.49,
        "ast_summary": 85.21,
        "exec_summary": 83.57,
        "live_ast_acc": 74.41,
        "multi_turn_acc": 34.12,
        "relevance": 83.33,
        "irrelevance": 74.75,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 62.79,
        "model": "o1-mini-2024-09-12",
        "type": "Prompt",
        "link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
        "cost": 29.76,
        "latency": 8.44,
        "ast_summary": 78.92,
        "exec_summary": 82.7,
        "live_ast_acc": 78.14,
        "multi_turn_acc": 28.25,
        "relevance": 61.11,
        "irrelevance": 89.62,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 62.73,
        "model": "Functionary-Medium-v3.1",
        "type": "FC",
        "link": "https://huggingface.co/meetkai/functionary-medium-v3.1",
        "cost": "N/A",
        "latency": 14.06,
        "ast_summary": 89.88,
        "exec_summary": 91.32,
        "live_ast_acc": 76.63,
        "multi_turn_acc": 21.62,
        "relevance": 72.22,
        "irrelevance": 76.08,
        "organization": "MeetKai",
        "license": "MIT",
        "provider": "unknown",
    },
    {
        "overall_acc": 62.19,
        "model": "Gemini-1.5-Pro-002",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/pro/",
        "cost": 7.05,
        "latency": 5.94,
        "ast_summary": 88.58,
        "exec_summary": 91.27,
        "live_ast_acc": 76.72,
        "multi_turn_acc": 20.75,
        "relevance": 72.22,
        "irrelevance": 78.15,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 61.83,
        "model": "Hammer2.1-7b",
        "type": "FC",
        "link": "https://huggingface.co/MadeAgents/Hammer2.1-7b",
        "cost": "N/A",
        "latency": 2.08,
        "ast_summary": 88.65,
        "exec_summary": 85.48,
        "live_ast_acc": 75.11,
        "multi_turn_acc": 23.5,
        "relevance": 82.35,
        "irrelevance": 78.59,
        "organization": "MadeAgents",
        "license": "cc-by-nc-4.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 61.74,
        "model": "Gemini-2.0-Flash-Exp",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/flash/",
        "cost": 0.0,
        "latency": 1.18,
        "ast_summary": 89.96,
        "exec_summary": 79.89,
        "live_ast_acc": 82.01,
        "multi_turn_acc": 17.88,
        "relevance": 77.78,
        "irrelevance": 86.44,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 61.38,
        "model": "Amazon-Nova-Pro-v1:0",
        "type": "FC",
        "link": "https://aws.amazon.com/cn/ai/generative-ai/nova/",
        "cost": 5.26,
        "latency": 2.67,
        "ast_summary": 84.46,
        "exec_summary": 85.64,
        "live_ast_acc": 74.32,
        "multi_turn_acc": 26.12,
        "relevance": 77.78,
        "irrelevance": 70.98,
        "organization": "Amazon",
        "license": "Proprietary",
        "provider": "unknown",
    },
    {
        "overall_acc": 61.31,
        "model": "Qwen2.5-72B-Instruct",
        "type": "Prompt",
        "link": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
        "cost": "N/A",
        "latency": 3.72,
        "ast_summary": 90.81,
        "exec_summary": 92.7,
        "live_ast_acc": 75.3,
        "multi_turn_acc": 18,
        "relevance": 100,
        "irrelevance": 72.81,
        "organization": "Qwen",
        "license": "qwen",
        "provider": "unknown",
    },
    {
        "overall_acc": 60.97,
        "model": "Gemini-1.5-Pro-002",
        "type": "FC",
        "link": "https://deepmind.google/technologies/gemini/pro/",
        "cost": 5.39,
        "latency": 2.07,
        "ast_summary": 87.29,
        "exec_summary": 84.61,
        "live_ast_acc": 76.28,
        "multi_turn_acc": 21.62,
        "relevance": 72.22,
        "irrelevance": 76.9,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 60.89,
        "model": "GPT-4o-mini-2024-07-18",
        "type": "Prompt",
        "link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
        "cost": 0.84,
        "latency": 1.31,
        "ast_summary": 86.77,
        "exec_summary": 80.84,
        "live_ast_acc": 76.5,
        "multi_turn_acc": 22,
        "relevance": 83.33,
        "irrelevance": 80.67,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 60.59,
        "model": "Gemini-2.0-Flash-Exp",
        "type": "FC",
        "link": "https://deepmind.google/technologies/gemini/flash/",
        "cost": 0.0,
        "latency": 0.85,
        "ast_summary": 85.1,
        "exec_summary": 77.46,
        "live_ast_acc": 79.03,
        "multi_turn_acc": 20.25,
        "relevance": 55.56,
        "irrelevance": 91.51,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 60.46,
        "model": "Gemini-1.5-Pro-001",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/pro/",
        "cost": 7.0,
        "latency": 1.54,
        "ast_summary": 85.56,
        "exec_summary": 85.77,
        "live_ast_acc": 76.68,
        "multi_turn_acc": 18.88,
        "relevance": 55.56,
        "irrelevance": 84.81,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 60.38,
        "model": "Gemini-Exp-1206",
        "type": "FC",
        "link": "https://blog.google/feed/gemini-exp-1206/",
        "cost": 0.0,
        "latency": 3.42,
        "ast_summary": 85.17,
        "exec_summary": 80.86,
        "live_ast_acc": 78.54,
        "multi_turn_acc": 20.25,
        "relevance": 77.78,
        "irrelevance": 79.64,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 59.67,
        "model": "Qwen2.5-32B-Instruct",
        "type": "Prompt",
        "link": "https://huggingface.co/Qwen/Qwen2.5-32B-Instruct",
        "cost": "N/A",
        "latency": 2.26,
        "ast_summary": 85.81,
        "exec_summary": 89.79,
        "live_ast_acc": 74.23,
        "multi_turn_acc": 17.75,
        "relevance": 100,
        "irrelevance": 73.75,
        "organization": "Qwen",
        "license": "apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 59.57,
        "model": "GPT-4-turbo-2024-04-09",
        "type": "Prompt",
        "link": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
        "cost": 58.87,
        "latency": 1.24,
        "ast_summary": 90.88,
        "exec_summary": 89.45,
        "live_ast_acc": 63.84,
        "multi_turn_acc": 30.25,
        "relevance": 100,
        "irrelevance": 35.57,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 59.42,
        "model": "Gemini-1.5-Pro-001",
        "type": "FC",
        "link": "https://deepmind.google/technologies/gemini/pro/",
        "cost": 5.1,
        "latency": 1.43,
        "ast_summary": 84.33,
        "exec_summary": 87.95,
        "live_ast_acc": 76.23,
        "multi_turn_acc": 16,
        "relevance": 50,
        "irrelevance": 84.39,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 59.07,
        "model": "Hammer2.1-3b",
        "type": "FC",
        "link": "https://huggingface.co/MadeAgents/Hammer2.1-3b",
        "cost": "N/A",
        "latency": 1.95,
        "ast_summary": 86.85,
        "exec_summary": 84.09,
        "live_ast_acc": 74.04,
        "multi_turn_acc": 17.38,
        "relevance": 82.35,
        "irrelevance": 81.87,
        "organization": "MadeAgents",
        "license": "qwen-research",
        "provider": "unknown",
    },
    {
        "overall_acc": 58.45,
        "model": "mistral-large-2407",
        "type": "FC",
        "link": "https://mistral.ai/news/mistral-large-2407/",
        "cost": 12.68,
        "latency": 3.12,
        "ast_summary": 86.81,
        "exec_summary": 84.38,
        "live_ast_acc": 69.88,
        "multi_turn_acc": 23.75,
        "relevance": 72.22,
        "irrelevance": 52.85,
        "organization": "Mistral AI",
        "license": "Proprietary",
        "provider": "mistral",
    },
    {
        "overall_acc": 58.42,
        "model": "ToolACE-8B",
        "type": "FC",
        "link": "https://huggingface.co/Team-ACE/ToolACE-8B",
        "cost": "N/A",
        "latency": 5.24,
        "ast_summary": 87.54,
        "exec_summary": 89.21,
        "live_ast_acc": 78.59,
        "multi_turn_acc": 7.75,
        "relevance": 83.33,
        "irrelevance": 87.88,
        "organization": "Huawei Noah & USTC",
        "license": "Apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 57.78,
        "model": "xLAM-8x22b-r",
        "type": "FC",
        "link": "https://huggingface.co/Salesforce/xLAM-8x22b-r",
        "cost": "N/A",
        "latency": 9.26,
        "ast_summary": 83.69,
        "exec_summary": 87.88,
        "live_ast_acc": 72.59,
        "multi_turn_acc": 16.25,
        "relevance": 88.89,
        "irrelevance": 67.81,
        "organization": "Salesforce",
        "license": "cc-by-nc-4.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 57.68,
        "model": "Qwen2.5-14B-Instruct",
        "type": "Prompt",
        "link": "https://huggingface.co/Qwen/Qwen2.5-14B-Instruct",
        "cost": "N/A",
        "latency": 2.02,
        "ast_summary": 85.69,
        "exec_summary": 88.84,
        "live_ast_acc": 74.14,
        "multi_turn_acc": 12.25,
        "relevance": 77.78,
        "irrelevance": 77.06,
        "organization": "Qwen",
        "license": "apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 57.23,
        "model": "DeepSeek-V3",
        "type": "FC",
        "link": "https://api-docs.deepseek.com/news/news1226",
        "cost": "N/A",
        "latency": 2.58,
        "ast_summary": 89.17,
        "exec_summary": 83.39,
        "live_ast_acc": 68.41,
        "multi_turn_acc": 18.62,
        "relevance": 88.89,
        "irrelevance": 59.36,
        "organization": "DeepSeek",
        "license": "DeepSeek License",
        "provider": "unknown",
    },
    {
        "overall_acc": 57.09,
        "model": "Gemini-1.5-Flash-001",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/flash/",
        "cost": 0.48,
        "latency": 0.71,
        "ast_summary": 85.69,
        "exec_summary": 83.59,
        "live_ast_acc": 68.9,
        "multi_turn_acc": 19.5,
        "relevance": 83.33,
        "irrelevance": 62.78,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 56.79,
        "model": "Gemini-1.5-Flash-002",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/flash/",
        "cost": 0.46,
        "latency": 0.81,
        "ast_summary": 81.65,
        "exec_summary": 80.64,
        "live_ast_acc": 76.72,
        "multi_turn_acc": 12.5,
        "relevance": 83.33,
        "irrelevance": 78.49,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
]


supported_top_tool_models = [
    {
        "cost": item["cost"],
        "model": item["model"],
        "type": item["type"],
        "provider": item["provider"],
    }
    for item in leaderboard_data
    if item["provider"] in VALID_PROVIDERS
]
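This filtered list feeds `_load_fallback_tool_models` above: with no explicit config, its first five entries become the default fallback sequence. Only rows whose `provider` is in `VALID_PROVIDERS` survive; in the portion shown here, the `google`, `mistral`, and `unknown` rows are all filtered out, leaving only `openai` entries (if Gemini models are meant to be fallback candidates, the leaderboard's `google` tag may need mapping to the `gemini` provider key). A toy version of the comprehension, with abbreviated rows:

```python
# Toy subset of leaderboard_data illustrating the filter above.
VALID_PROVIDERS = ["anthropic", "openai", "openrouter", "openai-compatible", "deepseek", "gemini"]
rows = [
    {"cost": "N/A", "model": "watt-tool-70B", "type": "FC", "provider": "unknown"},
    {"cost": 13.54, "model": "gpt-4o-2024-11-20", "type": "Prompt", "provider": "openai"},
    {"cost": 7.05, "model": "Gemini-1.5-Pro-002", "type": "Prompt", "provider": "google"},
]
supported = [
    {"cost": r["cost"], "model": r["model"], "type": r["type"], "provider": r["provider"]}
    for r in rows
    if r["provider"] in VALID_PROVIDERS
]
assert [r["model"] for r in supported] == ["gpt-4o-2024-11-20"]  # 'unknown'/'google' dropped
```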
@@ -31,6 +31,7 @@ class DummyModel:
            content = "dummy_tool()"

        return Response()

    def bind_tools(self, tools, tool_choice):
        pass

@@ -267,6 +268,7 @@ class TestCiaynAgentNewMethods(unittest.TestCase):
        # Create a dummy tool that always fails for testing fallback
        def always_fail():
            raise Exception("Failure for fallback test")

        self.always_fail_tool = DummyTool(always_fail)
        # Create a dummy model that does minimal work for fallback tests
        self.dummy_model = DummyModel()

@@ -274,24 +276,33 @@ class TestCiaynAgentNewMethods(unittest.TestCase):
        self.agent = CiaynAgent(
            self.dummy_model,
            [self.always_fail_tool],
            config={"max_tool_failures": 2, "fallback_tool_models": "dummy-fallback-model"}
            config={
                "max_tool_failures": 2,
                "fallback_tool_models": "dummy-fallback-model",
            },
        )

    def test_handle_tool_failure_increments_counter(self):
        initial_failures = self.agent._tool_failure.consecutive_failures
        initial_failures = self.agent.tool_failure_consecutive_failures
        self.agent._handle_tool_failure("dummy_call()", Exception("Test error"))
        self.assertEqual(self.agent._tool_failure.consecutive_failures, initial_failures + 1)
        self.assertEqual(
            self.agent.tool_failure_consecutive_failures, initial_failures + 1
        )

    def test_attempt_fallback_invokes_fallback_logic(self):
        # Monkey-patch initialize_llm, merge_chat_history, and validate_provider_env
        # to simulate fallback switching without external dependencies.
        def dummy_initialize_llm(provider, model_name, temperature=None):
            return self.dummy_model

        def dummy_merge_chat_history():
            return ["merged"]

        def dummy_validate_provider_env(provider):
            return True

        import ra_aid.llm as llm

        original_initialize = llm.initialize_llm
        original_merge = llm.merge_chat_history
        original_validate = llm.validate_provider_env

@@ -300,14 +311,15 @@ class TestCiaynAgentNewMethods(unittest.TestCase):
        llm.validate_provider_env = dummy_validate_provider_env

        # Set failure counter high enough to trigger fallback in _handle_tool_failure
        self.agent._tool_failure.consecutive_failures = 2
        self.agent.tool_failure_consecutive_failures = 2
        # Call _attempt_fallback; it should reset the failure counter to 0 on success.
        self.agent._attempt_fallback("always_fail_tool()")
        self.assertEqual(self.agent._tool_failure.consecutive_failures, 0)
        self.assertEqual(self.agent.tool_failure_consecutive_failures, 0)
        # Restore original functions
        llm.initialize_llm = original_initialize
        llm.merge_chat_history = original_merge
        llm.validate_provider_env = original_validate
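As a design note, the manual save/restore above could equivalently use `unittest.mock.patch`, which undoes the patching even when an assertion fails. This works because `_attempt_fallback` imports these names from `ra_aid.llm` at call time, so patched module attributes take effect. A sketch, not the repository's code:

```python
from unittest.mock import patch

# Method body sketch, inside TestCiaynAgentNewMethods.
def test_attempt_fallback_with_mock(self):
    with patch("ra_aid.llm.initialize_llm", return_value=self.dummy_model), \
         patch("ra_aid.llm.merge_chat_history", return_value=["merged"]), \
         patch("ra_aid.llm.validate_provider_env", return_value=True):
        self.agent.tool_failure_consecutive_failures = 2
        self.agent._attempt_fallback("always_fail_tool()")
        self.assertEqual(self.agent.tool_failure_consecutive_failures, 0)
```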

if __name__ == "__main__":
    unittest.main()