feat(fallback): implement automatic fallback to alternative LLM models on consecutive failures to enhance user experience and prevent infinite error loops

refactor(ciayn_agent): restructure tool failure handling to track consecutive failures and fallback attempts more effectively
fix(logging): add pretty logging option for improved log readability
chore(config): define valid providers for LLM selection and update fallback model loading logic
test(ciayn_agent): add unit tests for fallback logic and tool failure handling to ensure reliability and correctness
Ariel Frischer 2025-02-10 14:13:19 -08:00
parent 45b993cfd0
commit d8ee4e04f4
7 changed files with 688 additions and 97 deletions

View File

@ -4,15 +4,7 @@
Add functionality to automatically fall back to alternative LLM models when a tool call experiences multiple consecutive failures.
## Background
Currently, when a tool call fails due to LLM-related errors (e.g., API timeouts, rate limits, context length issues), there is no automatic fallback mechanism. This can lead to interrupted workflows and poor user experience.
## Relevant Files
- ra_aid/agents/ciayn_agent.py
- ra_aid/llm.py
- ra_aid/agent_utils.py
- ra_aid/__main__.py
- ra_aid/models_params.py
Currently, when a tool call fails due to LLM-related errors (e.g., invalid format), there is no automatic fallback mechanism. This often causes an infinite loop of failing tool calls.
## Implementation Details
@ -59,32 +51,25 @@ The prompt passed to `try_fallback_model` should be the last few failing messages
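A minimal sketch of how that prompt might be assembled, assuming a plain list of `BaseMessage` history entries; `build_fallback_prompt` is an illustrative name, and the exact signature of `try_fallback_model` is not shown in this commit:

```python
from langchain_core.messages import BaseMessage

def build_fallback_prompt(history: list[BaseMessage], n: int = 3) -> str:
    # Keep only the last few failing exchanges so the fallback model sees
    # the concrete errors without replaying the whole conversation.
    return "\n\n".join(str(m.content) for m in history[-n:])

# e.g. try_fallback_model(new_model, build_fallback_prompt(history))
```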
Define fallback sequences for each provider based on model capabilities (see the sketch after this list):
1. Try same provider's smaller models
2. Try alternative providers' equivalent models
2. Try alternative providers' similar models
3. Raise final error if all fallbacks fail
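Such sequences could be encoded as a simple per-provider mapping; the model names and ordering below are illustrative assumptions, not part of this commit:

```python
# Hypothetical fallback sequences: same-provider smaller models first,
# then similar models from other providers.
FALLBACK_SEQUENCES: dict[str, list[tuple[str, str]]] = {
    "anthropic": [
        ("anthropic", "claude-3-5-haiku-20241022"),  # smaller sibling model
        ("openai", "gpt-4o"),                        # cross-provider equivalent
    ],
    "openai": [
        ("openai", "gpt-4o-mini"),
        ("anthropic", "claude-3-5-sonnet-20241022"),
    ],
}
```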
### Provider Strategy Updates
Update provider strategies to support fallback configuration (a sketch follows this list):
- Add provider-specific fallback sequences
- Handle model capability validation during fallback
- Track successful/failed attempts
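One way such a strategy object might look, covering the three bullets above; this is a sketch under assumed names, not the shipped API:

```python
from dataclasses import dataclass, field

@dataclass
class ProviderFallbackStrategy:
    """Illustrative container for per-provider fallback configuration."""
    provider: str
    fallback_sequence: list[str]  # ordered model names to try
    attempts: dict[str, bool] = field(default_factory=dict)  # model -> succeeded?

    def next_model(self) -> str | None:
        # Return the first model not yet attempted, or None when exhausted.
        for model in self.fallback_sequence:
            if model not in self.attempts:
                return model
        return None

    def record(self, model: str, ok: bool) -> None:
        self.attempts[model] = ok
```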
## Risks and Mitigations
1. **Performance Impact**
- Risk: Multiple fallback attempts could increase latency
- Mitigation: Set reasonable max_failures limit and timeouts
2. **Consistency**
- Risk: Different models may give slightly different outputs
- Mitigation: Validate output schema consistency across models
3. **Cost**
1. **Cost**
- Risk: Fallback to more expensive models
- Mitigation: Configure cost limits and preferred fallback sequences
4. **State Management**
2. **State Management**
- Risk: Loss of context during fallbacks
- Mitigation: Preserve conversation state and tool context
## Relevant Files
- ra_aid/agents/ciayn_agent.py
- ra_aid/llm.py
- ra_aid/agent_utils.py
- ra_aid/__main__.py
- ra_aid/models_params.py
## Acceptance Criteria
1. Tool calls automatically attempt fallback models after N consecutive failures
2. `--no-fallback-tool` argument successfully disables fallback behavior
@ -93,16 +78,6 @@ Update provider strategies to support fallback configuration:
5. Unit tests cover fallback scenarios and edge cases
6. README.md updated to reflect new behavior
## Testing
1. Unit tests for fallback wrapper
2. Integration tests with mock LLM failures
3. Provider strategy fallback tests
4. Command line argument handling
5. Error preservation and reporting
6. Performance impact measurement
7. Edge cases (e.g., partial failures, timeout handling)
8. State preservation during fallbacks
## Documentation Updates
1. Add fallback feature to main README
2. Document `--no-fallback-tool` in CLI help

View File

@ -17,7 +17,11 @@ from ra_aid.agent_utils import (
run_planning_agent,
run_research_agent,
)
from ra_aid.config import DEFAULT_MAX_TEST_CMD_RETRIES, DEFAULT_RECURSION_LIMIT
from ra_aid.config import (
DEFAULT_MAX_TEST_CMD_RETRIES,
DEFAULT_RECURSION_LIMIT,
VALID_PROVIDERS,
)
from ra_aid.dependencies import check_dependencies
from ra_aid.env import validate_environment
from ra_aid.llm import initialize_llm
@ -40,14 +44,6 @@ def launch_webui(host: str, port: int):
def parse_arguments(args=None):
VALID_PROVIDERS = [
"anthropic",
"openai",
"openrouter",
"openai-compatible",
"deepseek",
"gemini",
]
ANTHROPIC_DEFAULT_MODEL = "claude-3-5-sonnet-20241022"
OPENAI_DEFAULT_MODEL = "gpt-4o"
@ -80,9 +76,11 @@ Examples:
parser.add_argument(
"--provider",
type=str,
default="openai"
if (os.getenv("OPENAI_API_KEY") and not os.getenv("ANTHROPIC_API_KEY"))
else "anthropic",
default=(
"openai"
if (os.getenv("OPENAI_API_KEY") and not os.getenv("ANTHROPIC_API_KEY"))
else "anthropic"
),
choices=VALID_PROVIDERS,
help="The LLM provider to use",
)
@ -138,6 +136,9 @@ Examples:
parser.add_argument(
"--verbose", action="store_true", help="Enable verbose logging output"
)
parser.add_argument(
"--pretty-logger", action="store_true", help="Enable pretty logging output"
)
parser.add_argument(
"--temperature",
type=float,
@ -276,7 +277,7 @@ def is_stage_requested(stage: str) -> bool:
def main():
"""Main entry point for the ra-aid command line tool."""
args = parse_arguments()
setup_logging(args.verbose)
setup_logging(args.verbose, args.pretty_logger)
logger.debug("Starting RA.Aid with arguments: %s", args)
# Launch web interface if requested
@ -378,9 +379,9 @@ def main():
chat_agent,
CHAT_PROMPT.format(
initial_request=initial_request,
web_research_section=WEB_RESEARCH_PROMPT_SECTION_CHAT
if web_research_enabled
else "",
web_research_section=(
WEB_RESEARCH_PROMPT_SECTION_CHAT if web_research_enabled else ""
),
working_directory=working_directory,
current_date=current_date,
project_info=formatted_project_info,

View File

@ -4,6 +4,7 @@ from typing import Any, Dict, Generator, List, Optional, Union
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage
from ra_aid.config import DEFAULT_MAX_TOOL_FAILURES
from ra_aid.exceptions import ToolExecutionError
from ra_aid.logging_config import get_logger
from ra_aid.models_params import DEFAULT_TOKEN_LIMIT
@ -68,22 +69,6 @@ class CiaynAgent:
- Memory management with configurable limits
"""
class ToolCallFailure:
"""Tracks consecutive failures and fallback model usage for tool calls.
Attributes:
consecutive_failures (int): Count of consecutive failures for current model
current_provider (Optional[str]): Current provider being used
current_model (Optional[str]): Current model being used
used_fallbacks (Set[str]): Set of fallback models already attempted
"""
def __init__(self):
self.consecutive_failures = 0
self.current_provider = None
self.current_model = None
self.used_fallbacks = set()
def __init__(
self,
model,
@ -106,10 +91,8 @@ class CiaynAgent:
self.config = config
self.provider = config.get("provider", "openai")
self.fallback_enabled = config.get("fallback_tool_enabled", True)
fallback_models_str = config.get("fallback_tool_models", "gpt-3.5-turbo,gpt-4")
self.fallback_tool_models = [
m.strip() for m in fallback_models_str.split(",") if m.strip()
]
self.fallback_tool_models = self._load_fallback_tool_models(config)
self.model = model
self.tools = tools
self.max_history_messages = max_history_messages
@ -117,7 +100,18 @@ class CiaynAgent:
self.available_functions = []
for t in tools:
self.available_functions.append(get_function_info(t.func))
self._tool_failure = CiaynAgent.ToolCallFailure()
self.tool_failure_consecutive_failures = 0
self.tool_failure_current_provider = None
self.tool_failure_current_model = None
self.tool_failure_used_fallbacks = set()
def _load_fallback_tool_models(self, config: dict) -> list:
fallback_tool_models_config = config.get("fallback_tool_models")
if fallback_tool_models_config:
return [m.strip() for m in fallback_tool_models_config.split(",") if m.strip()]
else:
from ra_aid.tool_leaderboard import supported_top_tool_models
return [item["model"] for item in supported_top_tool_models[:5]]
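To illustrate the two branches: since the method does not touch `self`, it can be exercised directly; the values below are worked examples, not captured output:

```python
from ra_aid.agents.ciayn_agent import CiaynAgent

# An explicit comma-separated config takes precedence:
explicit = CiaynAgent._load_fallback_tool_models(None, {"fallback_tool_models": "gpt-4o-mini, gpt-4o"})
# -> ["gpt-4o-mini", "gpt-4o"]

# Without the key, the top five leaderboard models whose provider is in
# VALID_PROVIDERS are used, in leaderboard (overall_acc) order:
default = CiaynAgent._load_fallback_tool_models(None, {})
# -> [item["model"] for item in supported_top_tool_models[:5]]
```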
def _build_prompt(self, last_result: Optional[str] = None) -> str:
"""Build the prompt for the agent including available tools and context."""
@ -255,48 +249,85 @@ Output **ONLY THE CODE** and **NO MARKDOWN BACKTICKS**"""
last_error = None
while retries < max_retries:
try:
logger.debug(
f"_execute_tool: attempt {retries+1}, original code: {code}"
)
code = code.strip()
if validate_function_call_pattern(code):
functions_list = "\n\n".join(self.available_functions)
code = _extract_tool_call(code, functions_list)
globals_dict = {tool.func.__name__: tool.func for tool in self.tools}
logger.debug(f"_execute_tool: evaluating code: {code}")
result = eval(code, globals_dict)
self._tool_failure.consecutive_failures = 0
logger.debug(
f"_execute_tool: tool executed successfully with result: {result}"
)
self.tool_failure_consecutive_failures = 0
return result
except Exception as e:
logger.debug(f"_execute_tool: exception caught: {e}")
self._handle_tool_failure(code, e)
last_error = e
retries += 1
logger.debug(f"_execute_tool: retrying, new attempt count: {retries}")
raise ToolExecutionError(
f"Error executing code after {max_retries} attempts: {str(last_error)}"
)
def _handle_tool_failure(self, code: str, error: Exception) -> None:
self._tool_failure.consecutive_failures += 1
max_failures = self.config.get("max_tool_failures", 3)
logger.debug(
f"_handle_tool_failure: tool failure encountered for code '{code}' with error: {error}"
)
self.tool_failure_consecutive_failures += 1
max_failures = self.config.get("max_tool_failures", DEFAULT_MAX_TOOL_FAILURES)
logger.debug(
f"_handle_tool_failure: failure count {self.tool_failure_consecutive_failures}, max_failures {max_failures}"
)
if (
self.fallback_enabled
and self._tool_failure.consecutive_failures >= max_failures
and self.tool_failure_consecutive_failures >= max_failures
and self.fallback_tool_models
):
logger.debug(
"_handle_tool_failure: threshold reached, invoking fallback mechanism."
)
self._attempt_fallback(code)
def _attempt_fallback(self, code: str) -> None:
logger.debug(f"_attempt_fallback: initiating fallback for code: {code}")
new_model = self.fallback_tool_models[0]
failed_tool_call_name = code.split('(')[0].strip()
failed_tool_call_name = code.split("(")[0].strip()
logger.error(
f"Tool call failed {self._tool_failure.consecutive_failures} times. Attempting fallback to model: {new_model} for tool: {failed_tool_call_name}"
f"Tool call failed {self.tool_failure_consecutive_failures} times. Attempting fallback to model: {new_model} for tool: {failed_tool_call_name}"
)
try:
from ra_aid.llm import initialize_llm, merge_chat_history, validate_provider_env
from ra_aid.llm import (
initialize_llm,
merge_chat_history,
validate_provider_env,
)
logger.debug(f"_attempt_fallback: validating provider {self.provider}")
if not validate_provider_env(self.provider):
logger.error(f"Missing environment configuration for provider {self.provider}. Cannot fallback.")
logger.error(
f"Missing environment configuration for provider {self.provider}. Cannot fallback."
)
else:
logger.debug(
f"_attempt_fallback: initializing fallback model {new_model}"
)
self.model = initialize_llm(self.provider, new_model)
logger.debug(
f"_attempt_fallback: binding tools to new model using tool: {failed_tool_call_name}"
)
self.model.bind_tools(self.tools, tool_choice=failed_tool_call_name)
self._tool_failure.used_fallbacks.add(new_model)
merge_chat_history() # Assuming merge_chat_history handles merging fallback history
self._tool_failure.consecutive_failures = 0
self.tool_failure_used_fallbacks.add(new_model)
logger.debug("_attempt_fallback: merging chat history for fallback")
merge_chat_history()
self.tool_failure_consecutive_failures = 0
logger.debug(
"_attempt_fallback: fallback successful and tool failure counter reset"
)
except Exception as switch_e:
logger.error(f"Fallback model switching failed: {switch_e}")

View File

@ -3,4 +3,12 @@
DEFAULT_RECURSION_LIMIT = 100
DEFAULT_MAX_TEST_CMD_RETRIES = 3
DEFAULT_MAX_TOOL_FAILURES = 3
MAX_TOOL_FAILURES = 3
VALID_PROVIDERS = [
"anthropic",
"openai",
"openrouter",
"openai-compatible",
"deepseek",
"gemini",
]

View File

@ -1,18 +1,53 @@
import logging
import sys
from typing import Optional
from rich.console import Console
from rich.panel import Panel
from rich.markdown import Markdown
def setup_logging(verbose: bool = False) -> None:
class PrettyHandler(logging.Handler):
def __init__(self, level=logging.NOTSET):
super().__init__(level)
self.console = Console()
def emit(self, record):
try:
msg = self.format(record)
# Determine title and style based on log level
if record.levelno >= logging.CRITICAL:
title = "🔥 CRITICAL"
style = "bold red"
elif record.levelno >= logging.ERROR:
title = "❌ ERROR"
style = "red"
elif record.levelno >= logging.WARNING:
title = "⚠️ WARNING"
style = "yellow"
elif record.levelno >= logging.INFO:
title = "ℹ️ INFO"
style = "green"
else:
title = "🐞 DEBUG"
style = "blue"
self.console.print(Panel(Markdown(msg.strip()), title=title, style=style))
except Exception:
self.handleError(record)
def setup_logging(verbose: bool = False, pretty: bool = False) -> None:
logger = logging.getLogger("ra_aid")
logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if not logger.handlers:
handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
handler.setFormatter(formatter)
if pretty:
handler = PrettyHandler()
else:
handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
handler.setFormatter(formatter)
logger.addHandler(handler)
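A short usage sketch of the new option, using only names shown in this diff (the rendered output described in the comments is approximate):

```python
import logging
from ra_aid.logging_config import setup_logging

setup_logging(verbose=True, pretty=True)  # CLI equivalent: --verbose --pretty-logger
logger = logging.getLogger("ra_aid")

logger.info("Starting agent")      # rendered by PrettyHandler as a green "INFO" panel
logger.debug("Tool call details")  # blue "DEBUG" panel; visible because verbose=True
```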

ra_aid/tool_leaderboard.py (new file, +529 lines)
View File

@ -0,0 +1,529 @@
from ra_aid.config import VALID_PROVIDERS
# Data extracted on 2/10/2025:
# https://gorilla.cs.berkeley.edu/leaderboard.html
# In order of overall_acc
leaderboard_data = [
{
"overall_acc": 74.31,
"model": "watt-tool-70B",
"type": "FC",
"link": "https://huggingface.co/watt-ai/watt-tool-70B/",
"cost": "N/A",
"latency": 3.4,
"ast_summary": 84.06,
"exec_summary": 89.39,
"live_ast_acc": 77.74,
"multi_turn_acc": 58.75,
"relevance": 94.44,
"irrelevance": 76.32,
"organization": "Watt AI Lab",
"license": "Apache-2.0",
"provider": "unknown",
},
{
"overall_acc": 72.08,
"model": "gpt-4o-2024-11-20",
"type": "Prompt",
"link": "https://openai.com/index/hello-gpt-4o/",
"cost": 13.54,
"latency": 0.78,
"ast_summary": 88.1,
"exec_summary": 89.38,
"live_ast_acc": 79.83,
"multi_turn_acc": 47.62,
"relevance": 83.33,
"irrelevance": 83.76,
"organization": "OpenAI",
"license": "Proprietary",
"provider": "openai",
},
{
"overall_acc": 69.58,
"model": "gpt-4o-2024-11-20",
"type": "FC",
"link": "https://openai.com/index/hello-gpt-4o/",
"cost": 8.23,
"latency": 1.11,
"ast_summary": 87.42,
"exec_summary": 89.2,
"live_ast_acc": 79.65,
"multi_turn_acc": 41,
"relevance": 83.33,
"irrelevance": 83.15,
"organization": "OpenAI",
"license": "Proprietary",
"provider": "openai",
},
{
"overall_acc": 67.98,
"model": "watt-tool-8B",
"type": "FC",
"link": "https://huggingface.co/watt-ai/watt-tool-8B/",
"cost": "N/A",
"latency": 1.31,
"ast_summary": 86.56,
"exec_summary": 89.34,
"live_ast_acc": 76.5,
"multi_turn_acc": 39.12,
"relevance": 83.33,
"irrelevance": 83.15,
"organization": "Watt AI Lab",
"license": "Apache-2.0",
"provider": "unknown",
},
{
"overall_acc": 67.88,
"model": "GPT-4-turbo-2024-04-09",
"type": "FC",
"link": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
"cost": 33.22,
"latency": 2.47,
"ast_summary": 84.73,
"exec_summary": 85.21,
"live_ast_acc": 80.5,
"multi_turn_acc": 38.12,
"relevance": 72.22,
"irrelevance": 83.81,
"organization": "OpenAI",
"license": "Proprietary",
"provider": "openai",
},
{
"overall_acc": 66.73,
"model": "o1-2024-12-17",
"type": "Prompt",
"link": "https://openai.com/o1/",
"cost": 102.47,
"latency": 5.3,
"ast_summary": 85.67,
"exec_summary": 79.77,
"live_ast_acc": 80.63,
"multi_turn_acc": 36,
"relevance": 72.22,
"irrelevance": 87.78,
"organization": "OpenAI",
"license": "Proprietary",
"provider": "openai",
},
{
"overall_acc": 64.1,
"model": "GPT-4o-mini-2024-07-18",
"type": "FC",
"link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
"cost": 0.51,
"latency": 1.49,
"ast_summary": 85.21,
"exec_summary": 83.57,
"live_ast_acc": 74.41,
"multi_turn_acc": 34.12,
"relevance": 83.33,
"irrelevance": 74.75,
"organization": "OpenAI",
"license": "Proprietary",
"provider": "openai",
},
{
"overall_acc": 62.79,
"model": "o1-mini-2024-09-12",
"type": "Prompt",
"link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
"cost": 29.76,
"latency": 8.44,
"ast_summary": 78.92,
"exec_summary": 82.7,
"live_ast_acc": 78.14,
"multi_turn_acc": 28.25,
"relevance": 61.11,
"irrelevance": 89.62,
"organization": "OpenAI",
"license": "Proprietary",
"provider": "openai",
},
{
"overall_acc": 62.73,
"model": "Functionary-Medium-v3.1",
"type": "FC",
"link": "https://huggingface.co/meetkai/functionary-medium-v3.1",
"cost": "N/A",
"latency": 14.06,
"ast_summary": 89.88,
"exec_summary": 91.32,
"live_ast_acc": 76.63,
"multi_turn_acc": 21.62,
"relevance": 72.22,
"irrelevance": 76.08,
"organization": "MeetKai",
"license": "MIT",
"provider": "unknown",
},
{
"overall_acc": 62.19,
"model": "Gemini-1.5-Pro-002",
"type": "Prompt",
"link": "https://deepmind.google/technologies/gemini/pro/",
"cost": 7.05,
"latency": 5.94,
"ast_summary": 88.58,
"exec_summary": 91.27,
"live_ast_acc": 76.72,
"multi_turn_acc": 20.75,
"relevance": 72.22,
"irrelevance": 78.15,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
{
"overall_acc": 61.83,
"model": "Hammer2.1-7b",
"type": "FC",
"link": "https://huggingface.co/MadeAgents/Hammer2.1-7b",
"cost": "N/A",
"latency": 2.08,
"ast_summary": 88.65,
"exec_summary": 85.48,
"live_ast_acc": 75.11,
"multi_turn_acc": 23.5,
"relevance": 82.35,
"irrelevance": 78.59,
"organization": "MadeAgents",
"license": "cc-by-nc-4.0",
"provider": "unknown",
},
{
"overall_acc": 61.74,
"model": "Gemini-2.0-Flash-Exp",
"type": "Prompt",
"link": "https://deepmind.google/technologies/gemini/flash/",
"cost": 0.0,
"latency": 1.18,
"ast_summary": 89.96,
"exec_summary": 79.89,
"live_ast_acc": 82.01,
"multi_turn_acc": 17.88,
"relevance": 77.78,
"irrelevance": 86.44,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
{
"overall_acc": 61.38,
"model": "Amazon-Nova-Pro-v1:0",
"type": "FC",
"link": "https://aws.amazon.com/cn/ai/generative-ai/nova/",
"cost": 5.26,
"latency": 2.67,
"ast_summary": 84.46,
"exec_summary": 85.64,
"live_ast_acc": 74.32,
"multi_turn_acc": 26.12,
"relevance": 77.78,
"irrelevance": 70.98,
"organization": "Amazon",
"license": "Proprietary",
"provider": "unknown",
},
{
"overall_acc": 61.31,
"model": "Qwen2.5-72B-Instruct",
"type": "Prompt",
"link": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
"cost": "N/A",
"latency": 3.72,
"ast_summary": 90.81,
"exec_summary": 92.7,
"live_ast_acc": 75.3,
"multi_turn_acc": 18,
"relevance": 100,
"irrelevance": 72.81,
"organization": "Qwen",
"license": "qwen",
"provider": "unknown",
},
{
"overall_acc": 60.97,
"model": "Gemini-1.5-Pro-002",
"type": "FC",
"link": "https://deepmind.google/technologies/gemini/pro/",
"cost": 5.39,
"latency": 2.07,
"ast_summary": 87.29,
"exec_summary": 84.61,
"live_ast_acc": 76.28,
"multi_turn_acc": 21.62,
"relevance": 72.22,
"irrelevance": 76.9,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
{
"overall_acc": 60.89,
"model": "GPT-4o-mini-2024-07-18",
"type": "Prompt",
"link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
"cost": 0.84,
"latency": 1.31,
"ast_summary": 86.77,
"exec_summary": 80.84,
"live_ast_acc": 76.5,
"multi_turn_acc": 22,
"relevance": 83.33,
"irrelevance": 80.67,
"organization": "OpenAI",
"license": "Proprietary",
"provider": "openai",
},
{
"overall_acc": 60.59,
"model": "Gemini-2.0-Flash-Exp",
"type": "FC",
"link": "https://deepmind.google/technologies/gemini/flash/",
"cost": 0.0,
"latency": 0.85,
"ast_summary": 85.1,
"exec_summary": 77.46,
"live_ast_acc": 79.03,
"multi_turn_acc": 20.25,
"relevance": 55.56,
"irrelevance": 91.51,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
{
"overall_acc": 60.46,
"model": "Gemini-1.5-Pro-001",
"type": "Prompt",
"link": "https://deepmind.google/technologies/gemini/pro/",
"cost": 7.0,
"latency": 1.54,
"ast_summary": 85.56,
"exec_summary": 85.77,
"live_ast_acc": 76.68,
"multi_turn_acc": 18.88,
"relevance": 55.56,
"irrelevance": 84.81,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
{
"overall_acc": 60.38,
"model": "Gemini-Exp-1206",
"type": "FC",
"link": "https://blog.google/feed/gemini-exp-1206/",
"cost": 0.0,
"latency": 3.42,
"ast_summary": 85.17,
"exec_summary": 80.86,
"live_ast_acc": 78.54,
"multi_turn_acc": 20.25,
"relevance": 77.78,
"irrelevance": 79.64,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
{
"overall_acc": 59.67,
"model": "Qwen2.5-32B-Instruct",
"type": "Prompt",
"link": "https://huggingface.co/Qwen/Qwen2.5-32B-Instruct",
"cost": "N/A",
"latency": 2.26,
"ast_summary": 85.81,
"exec_summary": 89.79,
"live_ast_acc": 74.23,
"multi_turn_acc": 17.75,
"relevance": 100,
"irrelevance": 73.75,
"organization": "Qwen",
"license": "apache-2.0",
"provider": "unknown",
},
{
"overall_acc": 59.57,
"model": "GPT-4-turbo-2024-04-09",
"type": "Prompt",
"link": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
"cost": 58.87,
"latency": 1.24,
"ast_summary": 90.88,
"exec_summary": 89.45,
"live_ast_acc": 63.84,
"multi_turn_acc": 30.25,
"relevance": 100,
"irrelevance": 35.57,
"organization": "OpenAI",
"license": "Proprietary",
"provider": "openai",
},
{
"overall_acc": 59.42,
"model": "Gemini-1.5-Pro-001",
"type": "FC",
"link": "https://deepmind.google/technologies/gemini/pro/",
"cost": 5.1,
"latency": 1.43,
"ast_summary": 84.33,
"exec_summary": 87.95,
"live_ast_acc": 76.23,
"multi_turn_acc": 16,
"relevance": 50,
"irrelevance": 84.39,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
{
"overall_acc": 59.07,
"model": "Hammer2.1-3b",
"type": "FC",
"link": "https://huggingface.co/MadeAgents/Hammer2.1-3b",
"cost": "N/A",
"latency": 1.95,
"ast_summary": 86.85,
"exec_summary": 84.09,
"live_ast_acc": 74.04,
"multi_turn_acc": 17.38,
"relevance": 82.35,
"irrelevance": 81.87,
"organization": "MadeAgents",
"license": "qwen-research",
"provider": "unknown",
},
{
"overall_acc": 58.45,
"model": "mistral-large-2407",
"type": "FC",
"link": "https://mistral.ai/news/mistral-large-2407/",
"cost": 12.68,
"latency": 3.12,
"ast_summary": 86.81,
"exec_summary": 84.38,
"live_ast_acc": 69.88,
"multi_turn_acc": 23.75,
"relevance": 72.22,
"irrelevance": 52.85,
"organization": "Mistral AI",
"license": "Proprietary",
"provider": "mistral",
},
{
"overall_acc": 58.42,
"model": "ToolACE-8B",
"type": "FC",
"link": "https://huggingface.co/Team-ACE/ToolACE-8B",
"cost": "N/A",
"latency": 5.24,
"ast_summary": 87.54,
"exec_summary": 89.21,
"live_ast_acc": 78.59,
"multi_turn_acc": 7.75,
"relevance": 83.33,
"irrelevance": 87.88,
"organization": "Huawei Noah & USTC",
"license": "Apache-2.0",
"provider": "unknown",
},
{
"overall_acc": 57.78,
"model": "xLAM-8x22b-r",
"type": "FC",
"link": "https://huggingface.co/Salesforce/xLAM-8x22b-r",
"cost": "N/A",
"latency": 9.26,
"ast_summary": 83.69,
"exec_summary": 87.88,
"live_ast_acc": 72.59,
"multi_turn_acc": 16.25,
"relevance": 88.89,
"irrelevance": 67.81,
"organization": "Salesforce",
"license": "cc-by-nc-4.0",
"provider": "unknown",
},
{
"overall_acc": 57.68,
"model": "Qwen2.5-14B-Instruct",
"type": "Prompt",
"link": "https://huggingface.co/Qwen/Qwen2.5-14B-Instruct",
"cost": "N/A",
"latency": 2.02,
"ast_summary": 85.69,
"exec_summary": 88.84,
"live_ast_acc": 74.14,
"multi_turn_acc": 12.25,
"relevance": 77.78,
"irrelevance": 77.06,
"organization": "Qwen",
"license": "apache-2.0",
"provider": "unknown",
},
{
"overall_acc": 57.23,
"model": "DeepSeek-V3",
"type": "FC",
"link": "https://api-docs.deepseek.com/news/news1226",
"cost": "N/A",
"latency": 2.58,
"ast_summary": 89.17,
"exec_summary": 83.39,
"live_ast_acc": 68.41,
"multi_turn_acc": 18.62,
"relevance": 88.89,
"irrelevance": 59.36,
"organization": "DeepSeek",
"license": "DeepSeek License",
"provider": "unknown",
},
{
"overall_acc": 57.09,
"model": "Gemini-1.5-Flash-001",
"type": "Prompt",
"link": "https://deepmind.google/technologies/gemini/flash/",
"cost": 0.48,
"latency": 0.71,
"ast_summary": 85.69,
"exec_summary": 83.59,
"live_ast_acc": 68.9,
"multi_turn_acc": 19.5,
"relevance": 83.33,
"irrelevance": 62.78,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
{
"overall_acc": 56.79,
"model": "Gemini-1.5-Flash-002",
"type": "Prompt",
"link": "https://deepmind.google/technologies/gemini/flash/",
"cost": 0.46,
"latency": 0.81,
"ast_summary": 81.65,
"exec_summary": 80.64,
"live_ast_acc": 76.72,
"multi_turn_acc": 12.5,
"relevance": 83.33,
"irrelevance": 78.49,
"organization": "Google",
"license": "Proprietary",
"provider": "google",
},
]
supported_top_tool_models = [
{
"cost": item["cost"],
"model": item["model"],
"type": item["type"],
"provider": item["provider"],
}
for item in leaderboard_data
if item["provider"] in VALID_PROVIDERS
]
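To make the filter concrete: `"google"`, `"mistral"`, and `"unknown"` are not in `VALID_PROVIDERS` (note the Gemini rows carry the `google` provider key), so of the entries above only the OpenAI rows survive, and the head of the filtered list works out to:

```python
supported_top_tool_models[:5]
# [{'cost': 13.54,  'model': 'gpt-4o-2024-11-20',      'type': 'Prompt', 'provider': 'openai'},
#  {'cost': 8.23,   'model': 'gpt-4o-2024-11-20',      'type': 'FC',     'provider': 'openai'},
#  {'cost': 33.22,  'model': 'GPT-4-turbo-2024-04-09', 'type': 'FC',     'provider': 'openai'},
#  {'cost': 102.47, 'model': 'o1-2024-12-17',          'type': 'Prompt', 'provider': 'openai'},
#  {'cost': 0.51,   'model': 'GPT-4o-mini-2024-07-18', 'type': 'FC',     'provider': 'openai'}]
```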

View File

@ -31,6 +31,7 @@ class DummyModel:
content = "dummy_tool()"
return Response()
def bind_tools(self, tools, tool_choice):
pass
@ -267,6 +268,7 @@ class TestCiaynAgentNewMethods(unittest.TestCase):
# Create a dummy tool that always fails for testing fallback
def always_fail():
raise Exception("Failure for fallback test")
self.always_fail_tool = DummyTool(always_fail)
# Create a dummy model that does minimal work for fallback tests
self.dummy_model = DummyModel()
@ -274,24 +276,33 @@ class TestCiaynAgentNewMethods(unittest.TestCase):
self.agent = CiaynAgent(
self.dummy_model,
[self.always_fail_tool],
config={"max_tool_failures": 2, "fallback_tool_models": "dummy-fallback-model"}
config={
"max_tool_failures": 2,
"fallback_tool_models": "dummy-fallback-model",
},
)
def test_handle_tool_failure_increments_counter(self):
initial_failures = self.agent._tool_failure.consecutive_failures
initial_failures = self.agent.tool_failure_consecutive_failures
self.agent._handle_tool_failure("dummy_call()", Exception("Test error"))
self.assertEqual(self.agent._tool_failure.consecutive_failures, initial_failures + 1)
self.assertEqual(
self.agent.tool_failure_consecutive_failures, initial_failures + 1
)
def test_attempt_fallback_invokes_fallback_logic(self):
# Monkey-patch initialize_llm, merge_chat_history, and validate_provider_env
# Monkey-patch initialize_llm, merge_chat_history, and validate_provider_env
# to simulate fallback switching without external dependencies.
def dummy_initialize_llm(provider, model_name, temperature=None):
return self.dummy_model
def dummy_merge_chat_history():
return ["merged"]
def dummy_validate_provider_env(provider):
return True
import ra_aid.llm as llm
original_initialize = llm.initialize_llm
original_merge = llm.merge_chat_history
original_validate = llm.validate_provider_env
@ -300,14 +311,15 @@ class TestCiaynAgentNewMethods(unittest.TestCase):
llm.validate_provider_env = dummy_validate_provider_env
# Set failure counter high enough to trigger fallback in _handle_tool_failure
self.agent._tool_failure.consecutive_failures = 2
self.agent.tool_failure_consecutive_failures = 2
# Call _attempt_fallback; it should reset the failure counter to 0 on success.
self.agent._attempt_fallback("always_fail_tool()")
self.assertEqual(self.agent._tool_failure.consecutive_failures, 0)
self.assertEqual(self.agent.tool_failure_consecutive_failures, 0)
# Restore original functions
llm.initialize_llm = original_initialize
llm.merge_chat_history = original_merge
llm.validate_provider_env = original_validate
if __name__ == "__main__":
unittest.main()