From d8ee4e04f4c7ccaec4e8f78a6fbf9f0e25135ae3 Mon Sep 17 00:00:00 2001
From: Ariel Frischer <arielfrischer@gmail.com>
Date: Mon, 10 Feb 2025 14:13:19 -0800
Subject: [PATCH] feat(fallback): implement automatic fallback to alternative
 LLM models on consecutive failures to enhance user experience and prevent
 infinite error loops refactor(ciayn_agent): restructure tool failure handling
 to track consecutive failures and fallback attempts more effectively
 fix(logging): add pretty logging option for improved log readability
 chore(config): define valid providers for LLM selection and update fallback
 model loading logic test(ciayn_agent): add unit tests for fallback logic and
 tool failure handling to ensure reliability and correctness

---
 issue.md                         |  47 +--
 ra_aid/__main__.py               |  33 +-
 ra_aid/agents/ciayn_agent.py     |  95 ++++--
 ra_aid/config.py                 |  10 +-
 ra_aid/logging_config.py         |  47 ++-
 ra_aid/tool_leaderboard.py       | 529 +++++++++++++++++++++++++++++++
 tests/ra_aid/test_ciayn_agent.py |  24 +-
 7 files changed, 688 insertions(+), 97 deletions(-)
 create mode 100644 ra_aid/tool_leaderboard.py

diff --git a/issue.md b/issue.md
index 62e246b..3bd0988 100644
--- a/issue.md
+++ b/issue.md
@@ -4,15 +4,7 @@
 Add functionality to automatically fallback to alternative LLM models when a tool call experiences multiple consecutive failures.
 
 ## Background
-Currently, when a tool call fails due to LLM-related errors (e.g., API timeouts, rate limits, context length issues), there is no automatic fallback mechanism. This can lead to interrupted workflows and poor user experience.
-
-## Relevant Files
-- ra_aid/agents/ciayn_agent.py
-- ra_aid/llm.py
-- ra_aid/agent_utils.py
-- ra_aid/__main__.py
-- ra_aid/models_params.py
-
+Currently, when a tool call fails due to LLM-related errors (e.g., invalid format), there is no automatic fallback mechanism. This often causes an infinite loop of failing tool calls.
 
 ## Implementation Details
 
@@ -59,32 +51,25 @@ The prompt passed to `try_fallback_model`, should be the failed last few failing
 Define fallback sequences for each provider based on model capabilities:
 
 1. Try same provider's smaller models
-2. Try alternative providers' equivalent models
+2. Try alternative providers' similar models
 3. Raise final error if all fallbacks fail
 
-### Provider Strategy Updates
-Update provider strategies to support fallback configuration:
-- Add provider-specific fallback sequences
-- Handle model capability validation during fallback
-- Track successful/failed attempts
-
 ## Risks and Mitigations
-1. **Performance Impact**
-   - Risk: Multiple fallback attempts could increase latency
-   - Mitigation: Set reasonable max_failures limit and timeouts
-
-2. **Consistency**
-   - Risk: Different models may give slightly different outputs
-   - Mitigation: Validate output schema consistency across models
-
-3. **Cost**
+1. **Cost**
    - Risk: Fallback to more expensive models
    - Mitigation: Configure cost limits and preferred fallback sequences
 
-4. **State Management** 
+2. **State Management** 
    - Risk: Loss of context during fallbacks
    - Mitigation: Preserve conversation state and tool context
 
+## Relevant Files
+- ra_aid/agents/ciayn_agent.py
+- ra_aid/llm.py
+- ra_aid/agent_utils.py
+- ra_aid/__main__.py
+- ra_aid/models_params.py
+
 ## Acceptance Criteria
 1. Tool calls automatically attempt fallback models after N consecutive failures
 2. `--no-fallback-tool` argument successfully disables fallback behavior
@@ -93,16 +78,6 @@ Update provider strategies to support fallback configuration:
 5. Unit tests cover fallback scenarios and edge cases
 6. README.md updated to reflect new behavior
 
-## Testing
-1. Unit tests for fallback wrapper
-2. Integration tests with mock LLM failures 
-3. Provider strategy fallback tests
-4. Command line argument handling
-5. Error preservation and reporting
-6. Performance impact measurement
-7. Edge cases (e.g., partial failures, timeout handling)
-8. State preservation during fallbacks
-
 ## Documentation Updates
 1. Add fallback feature to main README
 2. Document `--no-fallback-tool` in CLI help
diff --git a/ra_aid/__main__.py b/ra_aid/__main__.py
index 16fd5df..654dd60 100644
--- a/ra_aid/__main__.py
+++ b/ra_aid/__main__.py
@@ -17,7 +17,11 @@ from ra_aid.agent_utils import (
     run_planning_agent,
     run_research_agent,
 )
-from ra_aid.config import DEFAULT_MAX_TEST_CMD_RETRIES, DEFAULT_RECURSION_LIMIT
+from ra_aid.config import (
+    DEFAULT_MAX_TEST_CMD_RETRIES,
+    DEFAULT_RECURSION_LIMIT,
+    VALID_PROVIDERS,
+)
 from ra_aid.dependencies import check_dependencies
 from ra_aid.env import validate_environment
 from ra_aid.llm import initialize_llm
@@ -40,14 +44,6 @@ def launch_webui(host: str, port: int):
 
 
 def parse_arguments(args=None):
-    VALID_PROVIDERS = [
-        "anthropic",
-        "openai",
-        "openrouter",
-        "openai-compatible",
-        "deepseek",
-        "gemini",
-    ]
     ANTHROPIC_DEFAULT_MODEL = "claude-3-5-sonnet-20241022"
     OPENAI_DEFAULT_MODEL = "gpt-4o"
 
@@ -80,9 +76,11 @@ Examples:
     parser.add_argument(
         "--provider",
         type=str,
-        default="openai"
-        if (os.getenv("OPENAI_API_KEY") and not os.getenv("ANTHROPIC_API_KEY"))
-        else "anthropic",
+        default=(
+            "openai"
+            if (os.getenv("OPENAI_API_KEY") and not os.getenv("ANTHROPIC_API_KEY"))
+            else "anthropic"
+        ),
         choices=VALID_PROVIDERS,
         help="The LLM provider to use",
     )
@@ -138,6 +136,9 @@ Examples:
     parser.add_argument(
         "--verbose", action="store_true", help="Enable verbose logging output"
     )
+    parser.add_argument(
+        "--pretty-logger", action="store_true", help="Enable pretty logging output"
+    )
     parser.add_argument(
         "--temperature",
         type=float,
@@ -276,7 +277,7 @@ def is_stage_requested(stage: str) -> bool:
 def main():
     """Main entry point for the ra-aid command line tool."""
     args = parse_arguments()
-    setup_logging(args.verbose)
+    setup_logging(args.verbose, args.pretty_logger)
     logger.debug("Starting RA.Aid with arguments: %s", args)
 
     # Launch web interface if requested
@@ -378,9 +379,9 @@ def main():
                 chat_agent,
                 CHAT_PROMPT.format(
                     initial_request=initial_request,
-                    web_research_section=WEB_RESEARCH_PROMPT_SECTION_CHAT
-                    if web_research_enabled
-                    else "",
+                    web_research_section=(
+                        WEB_RESEARCH_PROMPT_SECTION_CHAT if web_research_enabled else ""
+                    ),
                     working_directory=working_directory,
                     current_date=current_date,
                     project_info=formatted_project_info,
diff --git a/ra_aid/agents/ciayn_agent.py b/ra_aid/agents/ciayn_agent.py
index cbe52bf..a651ab8 100644
--- a/ra_aid/agents/ciayn_agent.py
+++ b/ra_aid/agents/ciayn_agent.py
@@ -4,6 +4,7 @@ from typing import Any, Dict, Generator, List, Optional, Union
 
 from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage
 
+from ra_aid.config import DEFAULT_MAX_TOOL_FAILURES
 from ra_aid.exceptions import ToolExecutionError
 from ra_aid.logging_config import get_logger
 from ra_aid.models_params import DEFAULT_TOKEN_LIMIT
@@ -68,22 +69,6 @@ class CiaynAgent:
     - Memory management with configurable limits
     """
 
-    class ToolCallFailure:
-        """Tracks consecutive failures and fallback model usage for tool calls.
-
-        Attributes:
-            consecutive_failures (int): Count of consecutive failures for current model
-            current_provider (Optional[str]): Current provider being used
-            current_model (Optional[str]): Current model being used
-            used_fallbacks (Set[str]): Set of fallback models already attempted
-        """
-
-        def __init__(self):
-            self.consecutive_failures = 0
-            self.current_provider = None
-            self.current_model = None
-            self.used_fallbacks = set()
-
     def __init__(
         self,
         model,
@@ -106,10 +91,8 @@ class CiaynAgent:
         self.config = config
         self.provider = config.get("provider", "openai")
         self.fallback_enabled = config.get("fallback_tool_enabled", True)
-        fallback_models_str = config.get("fallback_tool_models", "gpt-3.5-turbo,gpt-4")
-        self.fallback_tool_models = [
-            m.strip() for m in fallback_models_str.split(",") if m.strip()
-        ]
+        self.fallback_tool_models = self._load_fallback_tool_models(config)
+
         self.model = model
         self.tools = tools
         self.max_history_messages = max_history_messages
@@ -117,7 +100,18 @@ class CiaynAgent:
         self.available_functions = []
         for t in tools:
             self.available_functions.append(get_function_info(t.func))
-        self._tool_failure = CiaynAgent.ToolCallFailure()
+        self.tool_failure_consecutive_failures = 0
+        self.tool_failure_current_provider = None
+        self.tool_failure_current_model = None
+        self.tool_failure_used_fallbacks = set()
+
+    def _load_fallback_tool_models(self, config: dict) -> list:
+        """Return configured fallback models, else the top 5 distinct leaderboard models."""
+        configured = config.get("fallback_tool_models")
+        if configured:
+            return [m.strip() for m in configured.split(",") if m.strip()]
+        from ra_aid.tool_leaderboard import supported_top_tool_models  # function-local import
+        return list(dict.fromkeys(m["model"] for m in supported_top_tool_models))[:5]  # drop repeated FC/Prompt rows
 
     def _build_prompt(self, last_result: Optional[str] = None) -> str:
         """Build the prompt for the agent including available tools and context."""
@@ -255,48 +249,85 @@ Output **ONLY THE CODE** and **NO MARKDOWN BACKTICKS**"""
         last_error = None
         while retries < max_retries:
             try:
+                logger.debug(
+                    f"_execute_tool: attempt {retries+1}, original code: {code}"
+                )
                 code = code.strip()
                 if validate_function_call_pattern(code):
                     functions_list = "\n\n".join(self.available_functions)
                     code = _extract_tool_call(code, functions_list)
                 globals_dict = {tool.func.__name__: tool.func for tool in self.tools}
+                logger.debug(f"_execute_tool: evaluating code: {code}")
                 result = eval(code, globals_dict)
-                self._tool_failure.consecutive_failures = 0
+                logger.debug(
+                    f"_execute_tool: tool executed successfully with result: {result}"
+                )
+                self.tool_failure_consecutive_failures = 0
                 return result
             except Exception as e:
+                logger.debug(f"_execute_tool: exception caught: {e}")
                 self._handle_tool_failure(code, e)
                 last_error = e
                 retries += 1
+                logger.debug(f"_execute_tool: retrying, new attempt count: {retries}")
         raise ToolExecutionError(
             f"Error executing code after {max_retries} attempts: {str(last_error)}"
         )
 
     def _handle_tool_failure(self, code: str, error: Exception) -> None:
-        self._tool_failure.consecutive_failures += 1
-        max_failures = self.config.get("max_tool_failures", 3)
+        logger.debug(
+            f"_handle_tool_failure: tool failure encountered for code '{code}' with error: {error}"
+        )
+        self.tool_failure_consecutive_failures += 1
+        max_failures = self.config.get("max_tool_failures", DEFAULT_MAX_TOOL_FAILURES)
+        logger.debug(
+            f"_handle_tool_failure: failure count {self.tool_failure_consecutive_failures}, max_failures {max_failures}"
+        )
         if (
             self.fallback_enabled
-            and self._tool_failure.consecutive_failures >= max_failures
+            and self.tool_failure_consecutive_failures >= max_failures
             and self.fallback_tool_models
         ):
+            logger.debug(
+                "_handle_tool_failure: threshold reached, invoking fallback mechanism."
+            )
             self._attempt_fallback(code)
 
     def _attempt_fallback(self, code: str) -> None:
+        logger.debug(f"_attempt_fallback: initiating fallback for code: {code}")
         new_model = self.fallback_tool_models[0]
-        failed_tool_call_name = code.split('(')[0].strip()
+        failed_tool_call_name = code.split("(")[0].strip()
         logger.error(
-            f"Tool call failed {self._tool_failure.consecutive_failures} times. Attempting fallback to model: {new_model} for tool: {failed_tool_call_name}"
+            f"Tool call failed {self.tool_failure_consecutive_failures} times. Attempting fallback to model: {new_model} for tool: {failed_tool_call_name}"
         )
         try:
-            from ra_aid.llm import initialize_llm, merge_chat_history, validate_provider_env
+            from ra_aid.llm import (
+                initialize_llm,
+                merge_chat_history,
+                validate_provider_env,
+            )
+
+            logger.debug(f"_attempt_fallback: validating provider {self.provider}")
             if not validate_provider_env(self.provider):
-                logger.error(f"Missing environment configuration for provider {self.provider}. Cannot fallback.")
+                logger.error(
+                    f"Missing environment configuration for provider {self.provider}. Cannot fallback."
+                )
             else:
+                logger.debug(
+                    f"_attempt_fallback: initializing fallback model {new_model}"
+                )
                 self.model = initialize_llm(self.provider, new_model)
+                logger.debug(
+                    f"_attempt_fallback: binding tools to new model using tool: {failed_tool_call_name}"
+                )
                 self.model.bind_tools(self.tools, tool_choice=failed_tool_call_name)
-                self._tool_failure.used_fallbacks.add(new_model)
-                merge_chat_history()  # Assuming merge_chat_history handles merging fallback history
-                self._tool_failure.consecutive_failures = 0
+                self.tool_failure_used_fallbacks.add(new_model)
+                logger.debug("_attempt_fallback: merging chat history for fallback")
+                merge_chat_history()
+                self.tool_failure_consecutive_failures = 0
+                logger.debug(
+                    "_attempt_fallback: fallback successful and tool failure counter reset"
+                )
         except Exception as switch_e:
             logger.error(f"Fallback model switching failed: {switch_e}")
 
diff --git a/ra_aid/config.py b/ra_aid/config.py
index 6c12a93..41868dd 100644
--- a/ra_aid/config.py
+++ b/ra_aid/config.py
@@ -3,4 +3,12 @@
 DEFAULT_RECURSION_LIMIT = 100
 DEFAULT_MAX_TEST_CMD_RETRIES = 3
 DEFAULT_MAX_TOOL_FAILURES = 3
-MAX_TOOL_FAILURES = 3
+
+VALID_PROVIDERS = [
+    "anthropic",
+    "openai",
+    "openrouter",
+    "openai-compatible",
+    "deepseek",
+    "gemini",
+]
diff --git a/ra_aid/logging_config.py b/ra_aid/logging_config.py
index a40aa3a..fb3bf63 100644
--- a/ra_aid/logging_config.py
+++ b/ra_aid/logging_config.py
@@ -1,18 +1,53 @@
 import logging
 import sys
 from typing import Optional
+from rich.console import Console
+from rich.panel import Panel
+from rich.markdown import Markdown
 
 
-def setup_logging(verbose: bool = False) -> None:
+class PrettyHandler(logging.Handler):
+    def __init__(self, level=logging.NOTSET):
+        super().__init__(level)
+        self.console = Console()
+
+    def emit(self, record):
+        try:
+            msg = self.format(record)
+            # Determine title and style based on log level
+            if record.levelno >= logging.CRITICAL:
+                title = "🔥 CRITICAL"
+                style = "bold red"
+            elif record.levelno >= logging.ERROR:
+                title = "❌ ERROR"
+                style = "red"
+            elif record.levelno >= logging.WARNING:
+                title = "⚠️ WARNING"
+                style = "yellow"
+            elif record.levelno >= logging.INFO:
+                title = "ℹ️ INFO"
+                style = "green"
+            else:
+                title = "🐞 DEBUG"
+                style = "blue"
+            self.console.print(Panel(Markdown(msg.strip()), title=title, style=style))
+        except Exception:
+            self.handleError(record)
+
+
+def setup_logging(verbose: bool = False, pretty: bool = False) -> None:
     logger = logging.getLogger("ra_aid")
     logger.setLevel(logging.DEBUG if verbose else logging.INFO)
 
     if not logger.handlers:
-        handler = logging.StreamHandler(sys.stdout)
-        formatter = logging.Formatter(
-            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-        )
-        handler.setFormatter(formatter)
+        if pretty:
+            handler = PrettyHandler()
+        else:
+            handler = logging.StreamHandler(sys.stdout)
+            formatter = logging.Formatter(
+                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+            )
+            handler.setFormatter(formatter)
         logger.addHandler(handler)
 
 
diff --git a/ra_aid/tool_leaderboard.py b/ra_aid/tool_leaderboard.py
new file mode 100644
index 0000000..04d7573
--- /dev/null
+++ b/ra_aid/tool_leaderboard.py
@@ -0,0 +1,529 @@
+from ra_aid.config import VALID_PROVIDERS
+
+# Data extracted on 2025-02-10 from the Berkeley Function-Calling Leaderboard:
+# https://gorilla.cs.berkeley.edu/leaderboard.html
+# Entries are sorted by overall_acc, highest first.
+leaderboard_data = [
+    {
+        "overall_acc": 74.31,
+        "model": "watt-tool-70B",
+        "type": "FC",
+        "link": "https://huggingface.co/watt-ai/watt-tool-70B/",
+        "cost": "N/A",
+        "latency": 3.4,
+        "ast_summary": 84.06,
+        "exec_summary": 89.39,
+        "live_ast_acc": 77.74,
+        "multi_turn_acc": 58.75,
+        "relevance": 94.44,
+        "irrelevance": 76.32,
+        "organization": "Watt AI Lab",
+        "license": "Apache-2.0",
+        "provider": "unknown",
+    },
+    {
+        "overall_acc": 72.08,
+        "model": "gpt-4o-2024-11-20",
+        "type": "Prompt",
+        "link": "https://openai.com/index/hello-gpt-4o/",
+        "cost": 13.54,
+        "latency": 0.78,
+        "ast_summary": 88.1,
+        "exec_summary": 89.38,
+        "live_ast_acc": 79.83,
+        "multi_turn_acc": 47.62,
+        "relevance": 83.33,
+        "irrelevance": 83.76,
+        "organization": "OpenAI",
+        "license": "Proprietary",
+        "provider": "openai",
+    },
+    {
+        "overall_acc": 69.58,
+        "model": "gpt-4o-2024-11-20",
+        "type": "FC",
+        "link": "https://openai.com/index/hello-gpt-4o/",
+        "cost": 8.23,
+        "latency": 1.11,
+        "ast_summary": 87.42,
+        "exec_summary": 89.2,
+        "live_ast_acc": 79.65,
+        "multi_turn_acc": 41,
+        "relevance": 83.33,
+        "irrelevance": 83.15,
+        "organization": "OpenAI",
+        "license": "Proprietary",
+        "provider": "openai",
+    },
+    {
+        "overall_acc": 67.98,
+        "model": "watt-tool-8B",
+        "type": "FC",
+        "link": "https://huggingface.co/watt-ai/watt-tool-8B/",
+        "cost": "N/A",
+        "latency": 1.31,
+        "ast_summary": 86.56,
+        "exec_summary": 89.34,
+        "live_ast_acc": 76.5,
+        "multi_turn_acc": 39.12,
+        "relevance": 83.33,
+        "irrelevance": 83.15,
+        "organization": "Watt AI Lab",
+        "license": "Apache-2.0",
+        "provider": "unknown",
+    },
+    {
+        "overall_acc": 67.88,
+        "model": "GPT-4-turbo-2024-04-09",
+        "type": "FC",
+        "link": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
+        "cost": 33.22,
+        "latency": 2.47,
+        "ast_summary": 84.73,
+        "exec_summary": 85.21,
+        "live_ast_acc": 80.5,
+        "multi_turn_acc": 38.12,
+        "relevance": 72.22,
+        "irrelevance": 83.81,
+        "organization": "OpenAI",
+        "license": "Proprietary",
+        "provider": "openai",
+    },
+    {
+        "overall_acc": 66.73,
+        "model": "o1-2024-12-17",
+        "type": "Prompt",
+        "link": "https://openai.com/o1/",
+        "cost": 102.47,
+        "latency": 5.3,
+        "ast_summary": 85.67,
+        "exec_summary": 79.77,
+        "live_ast_acc": 80.63,
+        "multi_turn_acc": 36,
+        "relevance": 72.22,
+        "irrelevance": 87.78,
+        "organization": "OpenAI",
+        "license": "Proprietary",
+        "provider": "openai",
+    },
+    {
+        "overall_acc": 64.1,
+        "model": "GPT-4o-mini-2024-07-18",
+        "type": "FC",
+        "link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
+        "cost": 0.51,
+        "latency": 1.49,
+        "ast_summary": 85.21,
+        "exec_summary": 83.57,
+        "live_ast_acc": 74.41,
+        "multi_turn_acc": 34.12,
+        "relevance": 83.33,
+        "irrelevance": 74.75,
+        "organization": "OpenAI",
+        "license": "Proprietary",
+        "provider": "openai",
+    },
+    {
+        "overall_acc": 62.79,
+        "model": "o1-mini-2024-09-12",
+        "type": "Prompt",
+        "link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
+        "cost": 29.76,
+        "latency": 8.44,
+        "ast_summary": 78.92,
+        "exec_summary": 82.7,
+        "live_ast_acc": 78.14,
+        "multi_turn_acc": 28.25,
+        "relevance": 61.11,
+        "irrelevance": 89.62,
+        "organization": "OpenAI",
+        "license": "Proprietary",
+        "provider": "openai",
+    },
+    {
+        "overall_acc": 62.73,
+        "model": "Functionary-Medium-v3.1",
+        "type": "FC",
+        "link": "https://huggingface.co/meetkai/functionary-medium-v3.1",
+        "cost": "N/A",
+        "latency": 14.06,
+        "ast_summary": 89.88,
+        "exec_summary": 91.32,
+        "live_ast_acc": 76.63,
+        "multi_turn_acc": 21.62,
+        "relevance": 72.22,
+        "irrelevance": 76.08,
+        "organization": "MeetKai",
+        "license": "MIT",
+        "provider": "unknown",
+    },
+    {
+        "overall_acc": 62.19,
+        "model": "Gemini-1.5-Pro-002",
+        "type": "Prompt",
+        "link": "https://deepmind.google/technologies/gemini/pro/",
+        "cost": 7.05,
+        "latency": 5.94,
+        "ast_summary": 88.58,
+        "exec_summary": 91.27,
+        "live_ast_acc": 76.72,
+        "multi_turn_acc": 20.75,
+        "relevance": 72.22,
+        "irrelevance": 78.15,
+        "organization": "Google",
+        "license": "Proprietary",
+        "provider": "google",
+    },
+    {
+        "overall_acc": 61.83,
+        "model": "Hammer2.1-7b",
+        "type": "FC",
+        "link": "https://huggingface.co/MadeAgents/Hammer2.1-7b",
+        "cost": "N/A",
+        "latency": 2.08,
+        "ast_summary": 88.65,
+        "exec_summary": 85.48,
+        "live_ast_acc": 75.11,
+        "multi_turn_acc": 23.5,
+        "relevance": 82.35,
+        "irrelevance": 78.59,
+        "organization": "MadeAgents",
+        "license": "cc-by-nc-4.0",
+        "provider": "unknown",
+    },
+    {
+        "overall_acc": 61.74,
+        "model": "Gemini-2.0-Flash-Exp",
+        "type": "Prompt",
+        "link": "https://deepmind.google/technologies/gemini/flash/",
+        "cost": 0.0,
+        "latency": 1.18,
+        "ast_summary": 89.96,
+        "exec_summary": 79.89,
+        "live_ast_acc": 82.01,
+        "multi_turn_acc": 17.88,
+        "relevance": 77.78,
+        "irrelevance": 86.44,
+        "organization": "Google",
+        "license": "Proprietary",
+        "provider": "google",
+    },
+    {
+        "overall_acc": 61.38,
+        "model": "Amazon-Nova-Pro-v1:0",
+        "type": "FC",
+        "link": "https://aws.amazon.com/cn/ai/generative-ai/nova/",
+        "cost": 5.26,
+        "latency": 2.67,
+        "ast_summary": 84.46,
+        "exec_summary": 85.64,
+        "live_ast_acc": 74.32,
+        "multi_turn_acc": 26.12,
+        "relevance": 77.78,
+        "irrelevance": 70.98,
+        "organization": "Amazon",
+        "license": "Proprietary",
+        "provider": "unknown",
+    },
+    {
+        "overall_acc": 61.31,
+        "model": "Qwen2.5-72B-Instruct",
+        "type": "Prompt",
+        "link": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
+        "cost": "N/A",
+        "latency": 3.72,
+        "ast_summary": 90.81,
+        "exec_summary": 92.7,
+        "live_ast_acc": 75.3,
+        "multi_turn_acc": 18,
+        "relevance": 100,
+        "irrelevance": 72.81,
+        "organization": "Qwen",
+        "license": "qwen",
+        "provider": "unknown",
+    },
+    {
+        "overall_acc": 60.97,
+        "model": "Gemini-1.5-Pro-002",
+        "type": "FC",
+        "link": "https://deepmind.google/technologies/gemini/pro/",
+        "cost": 5.39,
+        "latency": 2.07,
+        "ast_summary": 87.29,
+        "exec_summary": 84.61,
+        "live_ast_acc": 76.28,
+        "multi_turn_acc": 21.62,
+        "relevance": 72.22,
+        "irrelevance": 76.9,
+        "organization": "Google",
+        "license": "Proprietary",
+        "provider": "google",
+    },
+    {
+        "overall_acc": 60.89,
+        "model": "GPT-4o-mini-2024-07-18",
+        "type": "Prompt",
+        "link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
+        "cost": 0.84,
+        "latency": 1.31,
+        "ast_summary": 86.77,
+        "exec_summary": 80.84,
+        "live_ast_acc": 76.5,
+        "multi_turn_acc": 22,
+        "relevance": 83.33,
+        "irrelevance": 80.67,
+        "organization": "OpenAI",
+        "license": "Proprietary",
+        "provider": "openai",
+    },
+    {
+        "overall_acc": 60.59,
+        "model": "Gemini-2.0-Flash-Exp",
+        "type": "FC",
+        "link": "https://deepmind.google/technologies/gemini/flash/",
+        "cost": 0.0,
+        "latency": 0.85,
+        "ast_summary": 85.1,
+        "exec_summary": 77.46,
+        "live_ast_acc": 79.03,
+        "multi_turn_acc": 20.25,
+        "relevance": 55.56,
+        "irrelevance": 91.51,
+        "organization": "Google",
+        "license": "Proprietary",
+        "provider": "google",
+    },
+    {
+        "overall_acc": 60.46,
+        "model": "Gemini-1.5-Pro-001",
+        "type": "Prompt",
+        "link": "https://deepmind.google/technologies/gemini/pro/",
+        "cost": 7.0,
+        "latency": 1.54,
+        "ast_summary": 85.56,
+        "exec_summary": 85.77,
+        "live_ast_acc": 76.68,
+        "multi_turn_acc": 18.88,
+        "relevance": 55.56,
+        "irrelevance": 84.81,
+        "organization": "Google",
+        "license": "Proprietary",
+        "provider": "google",
+    },
+    {
+        "overall_acc": 60.38,
+        "model": "Gemini-Exp-1206",
+        "type": "FC",
+        "link": "https://blog.google/feed/gemini-exp-1206/",
+        "cost": 0.0,
+        "latency": 3.42,
+        "ast_summary": 85.17,
+        "exec_summary": 80.86,
+        "live_ast_acc": 78.54,
+        "multi_turn_acc": 20.25,
+        "relevance": 77.78,
+        "irrelevance": 79.64,
+        "organization": "Google",
+        "license": "Proprietary",
+        "provider": "google",
+    },
+    {
+        "overall_acc": 59.67,
+        "model": "Qwen2.5-32B-Instruct",
+        "type": "Prompt",
+        "link": "https://huggingface.co/Qwen/Qwen2.5-32B-Instruct",
+        "cost": "N/A",
+        "latency": 2.26,
+        "ast_summary": 85.81,
+        "exec_summary": 89.79,
+        "live_ast_acc": 74.23,
+        "multi_turn_acc": 17.75,
+        "relevance": 100,
+        "irrelevance": 73.75,
+        "organization": "Qwen",
+        "license": "apache-2.0",
+        "provider": "unknown",
+    },
+    {
+        "overall_acc": 59.57,
+        "model": "GPT-4-turbo-2024-04-09",
+        "type": "Prompt",
+        "link": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
+        "cost": 58.87,
+        "latency": 1.24,
+        "ast_summary": 90.88,
+        "exec_summary": 89.45,
+        "live_ast_acc": 63.84,
+        "multi_turn_acc": 30.25,
+        "relevance": 100,
+        "irrelevance": 35.57,
+        "organization": "OpenAI",
+        "license": "Proprietary",
+        "provider": "openai",
+    },
+    {
+        "overall_acc": 59.42,
+        "model": "Gemini-1.5-Pro-001",
+        "type": "FC",
+        "link": "https://deepmind.google/technologies/gemini/pro/",
+        "cost": 5.1,
+        "latency": 1.43,
+        "ast_summary": 84.33,
+        "exec_summary": 87.95,
+        "live_ast_acc": 76.23,
+        "multi_turn_acc": 16,
+        "relevance": 50,
+        "irrelevance": 84.39,
+        "organization": "Google",
+        "license": "Proprietary",
+        "provider": "google",
+    },
+    {
+        "overall_acc": 59.07,
+        "model": "Hammer2.1-3b",
+        "type": "FC",
+        "link": "https://huggingface.co/MadeAgents/Hammer2.1-3b",
+        "cost": "N/A",
+        "latency": 1.95,
+        "ast_summary": 86.85,
+        "exec_summary": 84.09,
+        "live_ast_acc": 74.04,
+        "multi_turn_acc": 17.38,
+        "relevance": 82.35,
+        "irrelevance": 81.87,
+        "organization": "MadeAgents",
+        "license": "qwen-research",
+        "provider": "unknown",
+    },
+    {
+        "overall_acc": 58.45,
+        "model": "mistral-large-2407",
+        "type": "FC",
+        "link": "https://mistral.ai/news/mistral-large-2407/",
+        "cost": 12.68,
+        "latency": 3.12,
+        "ast_summary": 86.81,
+        "exec_summary": 84.38,
+        "live_ast_acc": 69.88,
+        "multi_turn_acc": 23.75,
+        "relevance": 72.22,
+        "irrelevance": 52.85,
+        "organization": "Mistral AI",
+        "license": "Proprietary",
+        "provider": "mistral",
+    },
+    {
+        "overall_acc": 58.42,
+        "model": "ToolACE-8B",
+        "type": "FC",
+        "link": "https://huggingface.co/Team-ACE/ToolACE-8B",
+        "cost": "N/A",
+        "latency": 5.24,
+        "ast_summary": 87.54,
+        "exec_summary": 89.21,
+        "live_ast_acc": 78.59,
+        "multi_turn_acc": 7.75,
+        "relevance": 83.33,
+        "irrelevance": 87.88,
+        "organization": "Huawei Noah & USTC",
+        "license": "Apache-2.0",
+        "provider": "unknown",
+    },
+    {
+        "overall_acc": 57.78,
+        "model": "xLAM-8x22b-r",
+        "type": "FC",
+        "link": "https://huggingface.co/Salesforce/xLAM-8x22b-r",
+        "cost": "N/A",
+        "latency": 9.26,
+        "ast_summary": 83.69,
+        "exec_summary": 87.88,
+        "live_ast_acc": 72.59,
+        "multi_turn_acc": 16.25,
+        "relevance": 88.89,
+        "irrelevance": 67.81,
+        "organization": "Salesforce",
+        "license": "cc-by-nc-4.0",
+        "provider": "unknown",
+    },
+    {
+        "overall_acc": 57.68,
+        "model": "Qwen2.5-14B-Instruct",
+        "type": "Prompt",
+        "link": "https://huggingface.co/Qwen/Qwen2.5-14B-Instruct",
+        "cost": "N/A",
+        "latency": 2.02,
+        "ast_summary": 85.69,
+        "exec_summary": 88.84,
+        "live_ast_acc": 74.14,
+        "multi_turn_acc": 12.25,
+        "relevance": 77.78,
+        "irrelevance": 77.06,
+        "organization": "Qwen",
+        "license": "apache-2.0",
+        "provider": "unknown",
+    },
+    {
+        "overall_acc": 57.23,
+        "model": "DeepSeek-V3",
+        "type": "FC",
+        "link": "https://api-docs.deepseek.com/news/news1226",
+        "cost": "N/A",
+        "latency": 2.58,
+        "ast_summary": 89.17,
+        "exec_summary": 83.39,
+        "live_ast_acc": 68.41,
+        "multi_turn_acc": 18.62,
+        "relevance": 88.89,
+        "irrelevance": 59.36,
+        "organization": "DeepSeek",
+        "license": "DeepSeek License",
+        "provider": "unknown",
+    },
+    {
+        "overall_acc": 57.09,
+        "model": "Gemini-1.5-Flash-001",
+        "type": "Prompt",
+        "link": "https://deepmind.google/technologies/gemini/flash/",
+        "cost": 0.48,
+        "latency": 0.71,
+        "ast_summary": 85.69,
+        "exec_summary": 83.59,
+        "live_ast_acc": 68.9,
+        "multi_turn_acc": 19.5,
+        "relevance": 83.33,
+        "irrelevance": 62.78,
+        "organization": "Google",
+        "license": "Proprietary",
+        "provider": "google",
+    },
+    {
+        "overall_acc": 56.79,
+        "model": "Gemini-1.5-Flash-002",
+        "type": "Prompt",
+        "link": "https://deepmind.google/technologies/gemini/flash/",
+        "cost": 0.46,
+        "latency": 0.81,
+        "ast_summary": 81.65,
+        "exec_summary": 80.64,
+        "live_ast_acc": 76.72,
+        "multi_turn_acc": 12.5,
+        "relevance": 83.33,
+        "irrelevance": 78.49,
+        "organization": "Google",
+        "license": "Proprietary",
+        "provider": "google",
+    },
+]
+
+
+supported_top_tool_models = [
+    {
+        "cost": item["cost"],
+        "model": item["model"],
+        "type": item["type"],
+        "provider": item["provider"],
+    }
+    for item in leaderboard_data
+    if item["provider"] in VALID_PROVIDERS
+]
diff --git a/tests/ra_aid/test_ciayn_agent.py b/tests/ra_aid/test_ciayn_agent.py
index 65794d0..2e39dfa 100644
--- a/tests/ra_aid/test_ciayn_agent.py
+++ b/tests/ra_aid/test_ciayn_agent.py
@@ -31,6 +31,7 @@ class DummyModel:
             content = "dummy_tool()"
 
         return Response()
+
     def bind_tools(self, tools, tool_choice):
         pass
 
@@ -267,6 +268,7 @@ class TestCiaynAgentNewMethods(unittest.TestCase):
         # Create a dummy tool that always fails for testing fallback
         def always_fail():
             raise Exception("Failure for fallback test")
+
         self.always_fail_tool = DummyTool(always_fail)
         # Create a dummy model that does minimal work for fallback tests
         self.dummy_model = DummyModel()
@@ -274,24 +276,33 @@ class TestCiaynAgentNewMethods(unittest.TestCase):
         self.agent = CiaynAgent(
             self.dummy_model,
             [self.always_fail_tool],
-            config={"max_tool_failures": 2, "fallback_tool_models": "dummy-fallback-model"}
+            config={
+                "max_tool_failures": 2,
+                "fallback_tool_models": "dummy-fallback-model",
+            },
         )
 
     def test_handle_tool_failure_increments_counter(self):
-        initial_failures = self.agent._tool_failure.consecutive_failures
+        """One handled failure raises the consecutive-failure counter by one."""
+        failures_before = self.agent.tool_failure_consecutive_failures
         self.agent._handle_tool_failure("dummy_call()", Exception("Test error"))
-        self.assertEqual(self.agent._tool_failure.consecutive_failures, initial_failures + 1)
+        expected = failures_before + 1
+        self.assertEqual(self.agent.tool_failure_consecutive_failures, expected)
 
     def test_attempt_fallback_invokes_fallback_logic(self):
-        # Monkey-patch initialize_llm, merge_chat_history, and validate_provider_env 
+        # Monkey-patch initialize_llm, merge_chat_history, and validate_provider_env
         # to simulate fallback switching without external dependencies.
         def dummy_initialize_llm(provider, model_name, temperature=None):
             return self.dummy_model
+
         def dummy_merge_chat_history():
             return ["merged"]
+
         def dummy_validate_provider_env(provider):
             return True
+
         import ra_aid.llm as llm
+
         original_initialize = llm.initialize_llm
         original_merge = llm.merge_chat_history
         original_validate = llm.validate_provider_env
@@ -300,14 +311,15 @@ class TestCiaynAgentNewMethods(unittest.TestCase):
         llm.validate_provider_env = dummy_validate_provider_env
 
         # Set failure counter high enough to trigger fallback in _handle_tool_failure
-        self.agent._tool_failure.consecutive_failures = 2
+        self.agent.tool_failure_consecutive_failures = 2
         # Call _attempt_fallback; it should reset the failure counter to 0 on success.
         self.agent._attempt_fallback("always_fail_tool()")
-        self.assertEqual(self.agent._tool_failure.consecutive_failures, 0)
+        self.assertEqual(self.agent.tool_failure_consecutive_failures, 0)
         # Restore original functions
         llm.initialize_llm = original_initialize
         llm.merge_chat_history = original_merge
         llm.validate_provider_env = original_validate
 
+
 if __name__ == "__main__":
     unittest.main()