From d8ee4e04f4c7ccaec4e8f78a6fbf9f0e25135ae3 Mon Sep 17 00:00:00 2001 From: Ariel Frischer Date: Mon, 10 Feb 2025 14:13:19 -0800 Subject: [PATCH] feat(fallback): implement automatic fallback to alternative LLM models on consecutive failures to enhance user experience and prevent infinite error loops refactor(ciayn_agent): restructure tool failure handling to track consecutive failures and fallback attempts more effectively fix(logging): add pretty logging option for improved log readability chore(config): define valid providers for LLM selection and update fallback model loading logic test(ciayn_agent): add unit tests for fallback logic and tool failure handling to ensure reliability and correctness --- issue.md | 47 +-- ra_aid/__main__.py | 33 +- ra_aid/agents/ciayn_agent.py | 95 ++++-- ra_aid/config.py | 10 +- ra_aid/logging_config.py | 47 ++- ra_aid/tool_leaderboard.py | 529 +++++++++++++++++++++++++++++++ tests/ra_aid/test_ciayn_agent.py | 24 +- 7 files changed, 688 insertions(+), 97 deletions(-) create mode 100644 ra_aid/tool_leaderboard.py diff --git a/issue.md b/issue.md index 62e246b..3bd0988 100644 --- a/issue.md +++ b/issue.md @@ -4,15 +4,7 @@ Add functionality to automatically fallback to alternative LLM models when a tool call experiences multiple consecutive failures. ## Background -Currently, when a tool call fails due to LLM-related errors (e.g., API timeouts, rate limits, context length issues), there is no automatic fallback mechanism. This can lead to interrupted workflows and poor user experience. - -## Relevant Files -- ra_aid/agents/ciayn_agent.py -- ra_aid/llm.py -- ra_aid/agent_utils.py -- ra_aid/__main__.py -- ra_aid/models_params.py - +Currently, when a tool call fails due to LLM-related errors (e.g., invalid format), there is no automatic fallback mechanism. This often causes an infinite loop of failing tool calls. 
## Implementation Details @@ -59,32 +51,25 @@ The prompt passed to `try_fallback_model`, should be the failed last few failing Define fallback sequences for each provider based on model capabilities: 1. Try same provider's smaller models -2. Try alternative providers' equivalent models +2. Try alternative providers' similar models 3. Raise final error if all fallbacks fail -### Provider Strategy Updates -Update provider strategies to support fallback configuration: -- Add provider-specific fallback sequences -- Handle model capability validation during fallback -- Track successful/failed attempts - ## Risks and Mitigations -1. **Performance Impact** - - Risk: Multiple fallback attempts could increase latency - - Mitigation: Set reasonable max_failures limit and timeouts - -2. **Consistency** - - Risk: Different models may give slightly different outputs - - Mitigation: Validate output schema consistency across models - -3. **Cost** +1. **Cost** - Risk: Fallback to more expensive models - Mitigation: Configure cost limits and preferred fallback sequences -4. **State Management** +2. **State Management** - Risk: Loss of context during fallbacks - Mitigation: Preserve conversation state and tool context +## Relevant Files +- ra_aid/agents/ciayn_agent.py +- ra_aid/llm.py +- ra_aid/agent_utils.py +- ra_aid/__main__.py +- ra_aid/models_params.py + ## Acceptance Criteria 1. Tool calls automatically attempt fallback models after N consecutive failures 2. `--no-fallback-tool` argument successfully disables fallback behavior @@ -93,16 +78,6 @@ Update provider strategies to support fallback configuration: 5. Unit tests cover fallback scenarios and edge cases 6. README.md updated to reflect new behavior -## Testing -1. Unit tests for fallback wrapper -2. Integration tests with mock LLM failures -3. Provider strategy fallback tests -4. Command line argument handling -5. Error preservation and reporting -6. Performance impact measurement -7. 
Edge cases (e.g., partial failures, timeout handling) -8. State preservation during fallbacks - ## Documentation Updates 1. Add fallback feature to main README 2. Document `--no-fallback-tool` in CLI help diff --git a/ra_aid/__main__.py b/ra_aid/__main__.py index 16fd5df..654dd60 100644 --- a/ra_aid/__main__.py +++ b/ra_aid/__main__.py @@ -17,7 +17,11 @@ from ra_aid.agent_utils import ( run_planning_agent, run_research_agent, ) -from ra_aid.config import DEFAULT_MAX_TEST_CMD_RETRIES, DEFAULT_RECURSION_LIMIT +from ra_aid.config import ( + DEFAULT_MAX_TEST_CMD_RETRIES, + DEFAULT_RECURSION_LIMIT, + VALID_PROVIDERS, +) from ra_aid.dependencies import check_dependencies from ra_aid.env import validate_environment from ra_aid.llm import initialize_llm @@ -40,14 +44,6 @@ def launch_webui(host: str, port: int): def parse_arguments(args=None): - VALID_PROVIDERS = [ - "anthropic", - "openai", - "openrouter", - "openai-compatible", - "deepseek", - "gemini", - ] ANTHROPIC_DEFAULT_MODEL = "claude-3-5-sonnet-20241022" OPENAI_DEFAULT_MODEL = "gpt-4o" @@ -80,9 +76,11 @@ Examples: parser.add_argument( "--provider", type=str, - default="openai" - if (os.getenv("OPENAI_API_KEY") and not os.getenv("ANTHROPIC_API_KEY")) - else "anthropic", + default=( + "openai" + if (os.getenv("OPENAI_API_KEY") and not os.getenv("ANTHROPIC_API_KEY")) + else "anthropic" + ), choices=VALID_PROVIDERS, help="The LLM provider to use", ) @@ -138,6 +136,9 @@ Examples: parser.add_argument( "--verbose", action="store_true", help="Enable verbose logging output" ) + parser.add_argument( + "--pretty-logger", action="store_true", help="Enable pretty logging output" + ) parser.add_argument( "--temperature", type=float, @@ -276,7 +277,7 @@ def is_stage_requested(stage: str) -> bool: def main(): """Main entry point for the ra-aid command line tool.""" args = parse_arguments() - setup_logging(args.verbose) + setup_logging(args.verbose, args.pretty_logger) logger.debug("Starting RA.Aid with arguments: %s", args) # 
Launch web interface if requested @@ -378,9 +379,9 @@ def main(): chat_agent, CHAT_PROMPT.format( initial_request=initial_request, - web_research_section=WEB_RESEARCH_PROMPT_SECTION_CHAT - if web_research_enabled - else "", + web_research_section=( + WEB_RESEARCH_PROMPT_SECTION_CHAT if web_research_enabled else "" + ), working_directory=working_directory, current_date=current_date, project_info=formatted_project_info, diff --git a/ra_aid/agents/ciayn_agent.py b/ra_aid/agents/ciayn_agent.py index cbe52bf..a651ab8 100644 --- a/ra_aid/agents/ciayn_agent.py +++ b/ra_aid/agents/ciayn_agent.py @@ -4,6 +4,7 @@ from typing import Any, Dict, Generator, List, Optional, Union from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage +from ra_aid.config import DEFAULT_MAX_TOOL_FAILURES from ra_aid.exceptions import ToolExecutionError from ra_aid.logging_config import get_logger from ra_aid.models_params import DEFAULT_TOKEN_LIMIT @@ -68,22 +69,6 @@ class CiaynAgent: - Memory management with configurable limits """ - class ToolCallFailure: - """Tracks consecutive failures and fallback model usage for tool calls. 
- - Attributes: - consecutive_failures (int): Count of consecutive failures for current model - current_provider (Optional[str]): Current provider being used - current_model (Optional[str]): Current model being used - used_fallbacks (Set[str]): Set of fallback models already attempted - """ - - def __init__(self): - self.consecutive_failures = 0 - self.current_provider = None - self.current_model = None - self.used_fallbacks = set() - def __init__( self, model, @@ -106,10 +91,8 @@ class CiaynAgent: self.config = config self.provider = config.get("provider", "openai") self.fallback_enabled = config.get("fallback_tool_enabled", True) - fallback_models_str = config.get("fallback_tool_models", "gpt-3.5-turbo,gpt-4") - self.fallback_tool_models = [ - m.strip() for m in fallback_models_str.split(",") if m.strip() - ] + self.fallback_tool_models = self._load_fallback_tool_models(config) + self.model = model self.tools = tools self.max_history_messages = max_history_messages @@ -117,7 +100,18 @@ class CiaynAgent: self.available_functions = [] for t in tools: self.available_functions.append(get_function_info(t.func)) - self._tool_failure = CiaynAgent.ToolCallFailure() + self.tool_failure_consecutive_failures = 0 + self.tool_failure_current_provider = None + self.tool_failure_current_model = None + self.tool_failure_used_fallbacks = set() + + def _load_fallback_tool_models(self, config: dict) -> list: + fallback_tool_models_config = config.get("fallback_tool_models") + if fallback_tool_models_config: + return [m.strip() for m in fallback_tool_models_config.split(",") if m.strip()] + else: + from ra_aid.tool_leaderboard import supported_top_tool_models + return [item["model"] for item in supported_top_tool_models[:5]] def _build_prompt(self, last_result: Optional[str] = None) -> str: """Build the prompt for the agent including available tools and context.""" @@ -255,48 +249,85 @@ Output **ONLY THE CODE** and **NO MARKDOWN BACKTICKS**""" last_error = None while retries < 
max_retries: try: + logger.debug( + f"_execute_tool: attempt {retries+1}, original code: {code}" + ) code = code.strip() if validate_function_call_pattern(code): functions_list = "\n\n".join(self.available_functions) code = _extract_tool_call(code, functions_list) globals_dict = {tool.func.__name__: tool.func for tool in self.tools} + logger.debug(f"_execute_tool: evaluating code: {code}") result = eval(code, globals_dict) - self._tool_failure.consecutive_failures = 0 + logger.debug( + f"_execute_tool: tool executed successfully with result: {result}" + ) + self.tool_failure_consecutive_failures = 0 return result except Exception as e: + logger.debug(f"_execute_tool: exception caught: {e}") self._handle_tool_failure(code, e) last_error = e retries += 1 + logger.debug(f"_execute_tool: retrying, new attempt count: {retries}") raise ToolExecutionError( f"Error executing code after {max_retries} attempts: {str(last_error)}" ) def _handle_tool_failure(self, code: str, error: Exception) -> None: - self._tool_failure.consecutive_failures += 1 - max_failures = self.config.get("max_tool_failures", 3) + logger.debug( + f"_handle_tool_failure: tool failure encountered for code '{code}' with error: {error}" + ) + self.tool_failure_consecutive_failures += 1 + max_failures = self.config.get("max_tool_failures", DEFAULT_MAX_TOOL_FAILURES) + logger.debug( + f"_handle_tool_failure: failure count {self.tool_failure_consecutive_failures}, max_failures {max_failures}" + ) if ( self.fallback_enabled - and self._tool_failure.consecutive_failures >= max_failures + and self.tool_failure_consecutive_failures >= max_failures and self.fallback_tool_models ): + logger.debug( + "_handle_tool_failure: threshold reached, invoking fallback mechanism." 
+ ) self._attempt_fallback(code) def _attempt_fallback(self, code: str) -> None: + logger.debug(f"_attempt_fallback: initiating fallback for code: {code}") new_model = self.fallback_tool_models[0] - failed_tool_call_name = code.split('(')[0].strip() + failed_tool_call_name = code.split("(")[0].strip() logger.error( - f"Tool call failed {self._tool_failure.consecutive_failures} times. Attempting fallback to model: {new_model} for tool: {failed_tool_call_name}" + f"Tool call failed {self.tool_failure_consecutive_failures} times. Attempting fallback to model: {new_model} for tool: {failed_tool_call_name}" ) try: - from ra_aid.llm import initialize_llm, merge_chat_history, validate_provider_env + from ra_aid.llm import ( + initialize_llm, + merge_chat_history, + validate_provider_env, + ) + + logger.debug(f"_attempt_fallback: validating provider {self.provider}") if not validate_provider_env(self.provider): - logger.error(f"Missing environment configuration for provider {self.provider}. Cannot fallback.") + logger.error( + f"Missing environment configuration for provider {self.provider}. Cannot fallback." 
+ ) else: + logger.debug( + f"_attempt_fallback: initializing fallback model {new_model}" + ) self.model = initialize_llm(self.provider, new_model) + logger.debug( + f"_attempt_fallback: binding tools to new model using tool: {failed_tool_call_name}" + ) self.model.bind_tools(self.tools, tool_choice=failed_tool_call_name) - self._tool_failure.used_fallbacks.add(new_model) - merge_chat_history() # Assuming merge_chat_history handles merging fallback history - self._tool_failure.consecutive_failures = 0 + self.tool_failure_used_fallbacks.add(new_model) + logger.debug("_attempt_fallback: merging chat history for fallback") + merge_chat_history() + self.tool_failure_consecutive_failures = 0 + logger.debug( + "_attempt_fallback: fallback successful and tool failure counter reset" + ) except Exception as switch_e: logger.error(f"Fallback model switching failed: {switch_e}") diff --git a/ra_aid/config.py b/ra_aid/config.py index 6c12a93..41868dd 100644 --- a/ra_aid/config.py +++ b/ra_aid/config.py @@ -3,4 +3,12 @@ DEFAULT_RECURSION_LIMIT = 100 DEFAULT_MAX_TEST_CMD_RETRIES = 3 DEFAULT_MAX_TOOL_FAILURES = 3 -MAX_TOOL_FAILURES = 3 + +VALID_PROVIDERS = [ + "anthropic", + "openai", + "openrouter", + "openai-compatible", + "deepseek", + "gemini", +] diff --git a/ra_aid/logging_config.py b/ra_aid/logging_config.py index a40aa3a..fb3bf63 100644 --- a/ra_aid/logging_config.py +++ b/ra_aid/logging_config.py @@ -1,18 +1,53 @@ import logging import sys from typing import Optional +from rich.console import Console +from rich.panel import Panel +from rich.markdown import Markdown -def setup_logging(verbose: bool = False) -> None: +class PrettyHandler(logging.Handler): + def __init__(self, level=logging.NOTSET): + super().__init__(level) + self.console = Console() + + def emit(self, record): + try: + msg = self.format(record) + # Determine title and style based on log level + if record.levelno >= logging.CRITICAL: + title = "🔥 CRITICAL" + style = "bold red" + elif record.levelno >= 
logging.ERROR: + title = "❌ ERROR" + style = "red" + elif record.levelno >= logging.WARNING: + title = "⚠️ WARNING" + style = "yellow" + elif record.levelno >= logging.INFO: + title = "ℹ️ INFO" + style = "green" + else: + title = "🐞 DEBUG" + style = "blue" + self.console.print(Panel(Markdown(msg.strip()), title=title, style=style)) + except Exception: + self.handleError(record) + + +def setup_logging(verbose: bool = False, pretty: bool = False) -> None: logger = logging.getLogger("ra_aid") logger.setLevel(logging.DEBUG if verbose else logging.INFO) if not logger.handlers: - handler = logging.StreamHandler(sys.stdout) - formatter = logging.Formatter( - "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - ) - handler.setFormatter(formatter) + if pretty: + handler = PrettyHandler() + else: + handler = logging.StreamHandler(sys.stdout) + formatter = logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + handler.setFormatter(formatter) logger.addHandler(handler) diff --git a/ra_aid/tool_leaderboard.py b/ra_aid/tool_leaderboard.py new file mode 100644 index 0000000..04d7573 --- /dev/null +++ b/ra_aid/tool_leaderboard.py @@ -0,0 +1,529 @@ +from ra_aid.config import VALID_PROVIDERS + +# Data extracted at 2/10/2025: +# https://gorilla.cs.berkeley.edu/leaderboard.html +# In order of overall_acc +leaderboard_data = [ + { + "overall_acc": 74.31, + "model": "watt-tool-70B", + "type": "FC", + "link": "https://huggingface.co/watt-ai/watt-tool-70B/", + "cost": "N/A", + "latency": 3.4, + "ast_summary": 84.06, + "exec_summary": 89.39, + "live_ast_acc": 77.74, + "multi_turn_acc": 58.75, + "relevance": 94.44, + "irrelevance": 76.32, + "organization": "Watt AI Lab", + "license": "Apache-2.0", + "provider": "unknown", + }, + { + "overall_acc": 72.08, + "model": "gpt-4o-2024-11-20", + "type": "Prompt", + "link": "https://openai.com/index/hello-gpt-4o/", + "cost": 13.54, + "latency": 0.78, + "ast_summary": 88.1, + "exec_summary": 89.38, + 
"live_ast_acc": 79.83, + "multi_turn_acc": 47.62, + "relevance": 83.33, + "irrelevance": 83.76, + "organization": "OpenAI", + "license": "Proprietary", + "provider": "openai", + }, + { + "overall_acc": 69.58, + "model": "gpt-4o-2024-11-20", + "type": "FC", + "link": "https://openai.com/index/hello-gpt-4o/", + "cost": 8.23, + "latency": 1.11, + "ast_summary": 87.42, + "exec_summary": 89.2, + "live_ast_acc": 79.65, + "multi_turn_acc": 41, + "relevance": 83.33, + "irrelevance": 83.15, + "organization": "OpenAI", + "license": "Proprietary", + "provider": "openai", + }, + { + "overall_acc": 67.98, + "model": "watt-tool-8B", + "type": "FC", + "link": "https://huggingface.co/watt-ai/watt-tool-8B/", + "cost": "N/A", + "latency": 1.31, + "ast_summary": 86.56, + "exec_summary": 89.34, + "live_ast_acc": 76.5, + "multi_turn_acc": 39.12, + "relevance": 83.33, + "irrelevance": 83.15, + "organization": "Watt AI Lab", + "license": "Apache-2.0", + "provider": "unknown", + }, + { + "overall_acc": 67.88, + "model": "GPT-4-turbo-2024-04-09", + "type": "FC", + "link": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "cost": 33.22, + "latency": 2.47, + "ast_summary": 84.73, + "exec_summary": 85.21, + "live_ast_acc": 80.5, + "multi_turn_acc": 38.12, + "relevance": 72.22, + "irrelevance": 83.81, + "organization": "OpenAI", + "license": "Proprietary", + "provider": "openai", + }, + { + "overall_acc": 66.73, + "model": "o1-2024-12-17", + "type": "Prompt", + "link": "https://openai.com/o1/", + "cost": 102.47, + "latency": 5.3, + "ast_summary": 85.67, + "exec_summary": 79.77, + "live_ast_acc": 80.63, + "multi_turn_acc": 36, + "relevance": 72.22, + "irrelevance": 87.78, + "organization": "OpenAI", + "license": "Proprietary", + "provider": "openai", + }, + { + "overall_acc": 64.1, + "model": "GPT-4o-mini-2024-07-18", + "type": "FC", + "link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/", + "cost": 0.51, + "latency": 1.49, + "ast_summary": 
85.21, + "exec_summary": 83.57, + "live_ast_acc": 74.41, + "multi_turn_acc": 34.12, + "relevance": 83.33, + "irrelevance": 74.75, + "organization": "OpenAI", + "license": "Proprietary", + "provider": "openai", + }, + { + "overall_acc": 62.79, + "model": "o1-mini-2024-09-12", + "type": "Prompt", + "link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/", + "cost": 29.76, + "latency": 8.44, + "ast_summary": 78.92, + "exec_summary": 82.7, + "live_ast_acc": 78.14, + "multi_turn_acc": 28.25, + "relevance": 61.11, + "irrelevance": 89.62, + "organization": "OpenAI", + "license": "Proprietary", + "provider": "openai", + }, + { + "overall_acc": 62.73, + "model": "Functionary-Medium-v3.1", + "type": "FC", + "link": "https://huggingface.co/meetkai/functionary-medium-v3.1", + "cost": "N/A", + "latency": 14.06, + "ast_summary": 89.88, + "exec_summary": 91.32, + "live_ast_acc": 76.63, + "multi_turn_acc": 21.62, + "relevance": 72.22, + "irrelevance": 76.08, + "organization": "MeetKai", + "license": "MIT", + "provider": "unknown", + }, + { + "overall_acc": 62.19, + "model": "Gemini-1.5-Pro-002", + "type": "Prompt", + "link": "https://deepmind.google/technologies/gemini/pro/", + "cost": 7.05, + "latency": 5.94, + "ast_summary": 88.58, + "exec_summary": 91.27, + "live_ast_acc": 76.72, + "multi_turn_acc": 20.75, + "relevance": 72.22, + "irrelevance": 78.15, + "organization": "Google", + "license": "Proprietary", + "provider": "google", + }, + { + "overall_acc": 61.83, + "model": "Hammer2.1-7b", + "type": "FC", + "link": "https://huggingface.co/MadeAgents/Hammer2.1-7b", + "cost": "N/A", + "latency": 2.08, + "ast_summary": 88.65, + "exec_summary": 85.48, + "live_ast_acc": 75.11, + "multi_turn_acc": 23.5, + "relevance": 82.35, + "irrelevance": 78.59, + "organization": "MadeAgents", + "license": "cc-by-nc-4.0", + "provider": "unknown", + }, + { + "overall_acc": 61.74, + "model": "Gemini-2.0-Flash-Exp", + "type": "Prompt", + "link": 
"https://deepmind.google/technologies/gemini/flash/", + "cost": 0.0, + "latency": 1.18, + "ast_summary": 89.96, + "exec_summary": 79.89, + "live_ast_acc": 82.01, + "multi_turn_acc": 17.88, + "relevance": 77.78, + "irrelevance": 86.44, + "organization": "Google", + "license": "Proprietary", + "provider": "google", + }, + { + "overall_acc": 61.38, + "model": "Amazon-Nova-Pro-v1:0", + "type": "FC", + "link": "https://aws.amazon.com/cn/ai/generative-ai/nova/", + "cost": 5.26, + "latency": 2.67, + "ast_summary": 84.46, + "exec_summary": 85.64, + "live_ast_acc": 74.32, + "multi_turn_acc": 26.12, + "relevance": 77.78, + "irrelevance": 70.98, + "organization": "Amazon", + "license": "Proprietary", + "provider": "unknown", + }, + { + "overall_acc": 61.31, + "model": "Qwen2.5-72B-Instruct", + "type": "Prompt", + "link": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct", + "cost": "N/A", + "latency": 3.72, + "ast_summary": 90.81, + "exec_summary": 92.7, + "live_ast_acc": 75.3, + "multi_turn_acc": 18, + "relevance": 100, + "irrelevance": 72.81, + "organization": "Qwen", + "license": "qwen", + "provider": "unknown", + }, + { + "overall_acc": 60.97, + "model": "Gemini-1.5-Pro-002", + "type": "FC", + "link": "https://deepmind.google/technologies/gemini/pro/", + "cost": 5.39, + "latency": 2.07, + "ast_summary": 87.29, + "exec_summary": 84.61, + "live_ast_acc": 76.28, + "multi_turn_acc": 21.62, + "relevance": 72.22, + "irrelevance": 76.9, + "organization": "Google", + "license": "Proprietary", + "provider": "google", + }, + { + "overall_acc": 60.89, + "model": "GPT-4o-mini-2024-07-18", + "type": "Prompt", + "link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/", + "cost": 0.84, + "latency": 1.31, + "ast_summary": 86.77, + "exec_summary": 80.84, + "live_ast_acc": 76.5, + "multi_turn_acc": 22, + "relevance": 83.33, + "irrelevance": 80.67, + "organization": "OpenAI", + "license": "Proprietary", + "provider": "openai", + }, + { + "overall_acc": 60.59, + 
"model": "Gemini-2.0-Flash-Exp", + "type": "FC", + "link": "https://deepmind.google/technologies/gemini/flash/", + "cost": 0.0, + "latency": 0.85, + "ast_summary": 85.1, + "exec_summary": 77.46, + "live_ast_acc": 79.03, + "multi_turn_acc": 20.25, + "relevance": 55.56, + "irrelevance": 91.51, + "organization": "Google", + "license": "Proprietary", + "provider": "google", + }, + { + "overall_acc": 60.46, + "model": "Gemini-1.5-Pro-001", + "type": "Prompt", + "link": "https://deepmind.google/technologies/gemini/pro/", + "cost": 7.0, + "latency": 1.54, + "ast_summary": 85.56, + "exec_summary": 85.77, + "live_ast_acc": 76.68, + "multi_turn_acc": 18.88, + "relevance": 55.56, + "irrelevance": 84.81, + "organization": "Google", + "license": "Proprietary", + "provider": "google", + }, + { + "overall_acc": 60.38, + "model": "Gemini-Exp-1206", + "type": "FC", + "link": "https://blog.google/feed/gemini-exp-1206/", + "cost": 0.0, + "latency": 3.42, + "ast_summary": 85.17, + "exec_summary": 80.86, + "live_ast_acc": 78.54, + "multi_turn_acc": 20.25, + "relevance": 77.78, + "irrelevance": 79.64, + "organization": "Google", + "license": "Proprietary", + "provider": "google", + }, + { + "overall_acc": 59.67, + "model": "Qwen2.5-32B-Instruct", + "type": "Prompt", + "link": "https://huggingface.co/Qwen/Qwen2.5-32B-Instruct", + "cost": "N/A", + "latency": 2.26, + "ast_summary": 85.81, + "exec_summary": 89.79, + "live_ast_acc": 74.23, + "multi_turn_acc": 17.75, + "relevance": 100, + "irrelevance": 73.75, + "organization": "Qwen", + "license": "apache-2.0", + "provider": "unknown", + }, + { + "overall_acc": 59.57, + "model": "GPT-4-turbo-2024-04-09", + "type": "Prompt", + "link": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo", + "cost": 58.87, + "latency": 1.24, + "ast_summary": 90.88, + "exec_summary": 89.45, + "live_ast_acc": 63.84, + "multi_turn_acc": 30.25, + "relevance": 100, + "irrelevance": 35.57, + "organization": "OpenAI", + "license": "Proprietary", + 
"provider": "openai", + }, + { + "overall_acc": 59.42, + "model": "Gemini-1.5-Pro-001", + "type": "FC", + "link": "https://deepmind.google/technologies/gemini/pro/", + "cost": 5.1, + "latency": 1.43, + "ast_summary": 84.33, + "exec_summary": 87.95, + "live_ast_acc": 76.23, + "multi_turn_acc": 16, + "relevance": 50, + "irrelevance": 84.39, + "organization": "Google", + "license": "Proprietary", + "provider": "google", + }, + { + "overall_acc": 59.07, + "model": "Hammer2.1-3b", + "type": "FC", + "link": "https://huggingface.co/MadeAgents/Hammer2.1-3b", + "cost": "N/A", + "latency": 1.95, + "ast_summary": 86.85, + "exec_summary": 84.09, + "live_ast_acc": 74.04, + "multi_turn_acc": 17.38, + "relevance": 82.35, + "irrelevance": 81.87, + "organization": "MadeAgents", + "license": "qwen-research", + "provider": "unknown", + }, + { + "overall_acc": 58.45, + "model": "mistral-large-2407", + "type": "FC", + "link": "https://mistral.ai/news/mistral-large-2407/", + "cost": 12.68, + "latency": 3.12, + "ast_summary": 86.81, + "exec_summary": 84.38, + "live_ast_acc": 69.88, + "multi_turn_acc": 23.75, + "relevance": 72.22, + "irrelevance": 52.85, + "organization": "Mistral AI", + "license": "Proprietary", + "provider": "mistral", + }, + { + "overall_acc": 58.42, + "model": "ToolACE-8B", + "type": "FC", + "link": "https://huggingface.co/Team-ACE/ToolACE-8B", + "cost": "N/A", + "latency": 5.24, + "ast_summary": 87.54, + "exec_summary": 89.21, + "live_ast_acc": 78.59, + "multi_turn_acc": 7.75, + "relevance": 83.33, + "irrelevance": 87.88, + "organization": "Huawei Noah & USTC", + "license": "Apache-2.0", + "provider": "unknown", + }, + { + "overall_acc": 57.78, + "model": "xLAM-8x22b-r", + "type": "FC", + "link": "https://huggingface.co/Salesforce/xLAM-8x22b-r", + "cost": "N/A", + "latency": 9.26, + "ast_summary": 83.69, + "exec_summary": 87.88, + "live_ast_acc": 72.59, + "multi_turn_acc": 16.25, + "relevance": 88.89, + "irrelevance": 67.81, + "organization": "Salesforce", + 
"license": "cc-by-nc-4.0", + "provider": "unknown", + }, + { + "overall_acc": 57.68, + "model": "Qwen2.5-14B-Instruct", + "type": "Prompt", + "link": "https://huggingface.co/Qwen/Qwen2.5-14B-Instruct", + "cost": "N/A", + "latency": 2.02, + "ast_summary": 85.69, + "exec_summary": 88.84, + "live_ast_acc": 74.14, + "multi_turn_acc": 12.25, + "relevance": 77.78, + "irrelevance": 77.06, + "organization": "Qwen", + "license": "apache-2.0", + "provider": "unknown", + }, + { + "overall_acc": 57.23, + "model": "DeepSeek-V3", + "type": "FC", + "link": "https://api-docs.deepseek.com/news/news1226", + "cost": "N/A", + "latency": 2.58, + "ast_summary": 89.17, + "exec_summary": 83.39, + "live_ast_acc": 68.41, + "multi_turn_acc": 18.62, + "relevance": 88.89, + "irrelevance": 59.36, + "organization": "DeepSeek", + "license": "DeepSeek License", + "provider": "unknown", + }, + { + "overall_acc": 57.09, + "model": "Gemini-1.5-Flash-001", + "type": "Prompt", + "link": "https://deepmind.google/technologies/gemini/flash/", + "cost": 0.48, + "latency": 0.71, + "ast_summary": 85.69, + "exec_summary": 83.59, + "live_ast_acc": 68.9, + "multi_turn_acc": 19.5, + "relevance": 83.33, + "irrelevance": 62.78, + "organization": "Google", + "license": "Proprietary", + "provider": "google", + }, + { + "overall_acc": 56.79, + "model": "Gemini-1.5-Flash-002", + "type": "Prompt", + "link": "https://deepmind.google/technologies/gemini/flash/", + "cost": 0.46, + "latency": 0.81, + "ast_summary": 81.65, + "exec_summary": 80.64, + "live_ast_acc": 76.72, + "multi_turn_acc": 12.5, + "relevance": 83.33, + "irrelevance": 78.49, + "organization": "Google", + "license": "Proprietary", + "provider": "google", + }, +] + + +supported_top_tool_models = [ + { + "cost": item["cost"], + "model": item["model"], + "type": item["type"], + "provider": item["provider"], + } + for item in leaderboard_data + if item["provider"] in VALID_PROVIDERS +] diff --git a/tests/ra_aid/test_ciayn_agent.py 
b/tests/ra_aid/test_ciayn_agent.py index 65794d0..2e39dfa 100644 --- a/tests/ra_aid/test_ciayn_agent.py +++ b/tests/ra_aid/test_ciayn_agent.py @@ -31,6 +31,7 @@ class DummyModel: content = "dummy_tool()" return Response() + def bind_tools(self, tools, tool_choice): pass @@ -267,6 +268,7 @@ class TestCiaynAgentNewMethods(unittest.TestCase): # Create a dummy tool that always fails for testing fallback def always_fail(): raise Exception("Failure for fallback test") + self.always_fail_tool = DummyTool(always_fail) # Create a dummy model that does minimal work for fallback tests self.dummy_model = DummyModel() @@ -274,24 +276,33 @@ class TestCiaynAgentNewMethods(unittest.TestCase): self.agent = CiaynAgent( self.dummy_model, [self.always_fail_tool], - config={"max_tool_failures": 2, "fallback_tool_models": "dummy-fallback-model"} + config={ + "max_tool_failures": 2, + "fallback_tool_models": "dummy-fallback-model", + }, ) def test_handle_tool_failure_increments_counter(self): - initial_failures = self.agent._tool_failure.consecutive_failures + initial_failures = self.agent.tool_failure_consecutive_failures self.agent._handle_tool_failure("dummy_call()", Exception("Test error")) - self.assertEqual(self.agent._tool_failure.consecutive_failures, initial_failures + 1) + self.assertEqual( + self.agent.tool_failure_consecutive_failures, initial_failures + 1 + ) def test_attempt_fallback_invokes_fallback_logic(self): - # Monkey-patch initialize_llm, merge_chat_history, and validate_provider_env + # Monkey-patch initialize_llm, merge_chat_history, and validate_provider_env # to simulate fallback switching without external dependencies. 
def dummy_initialize_llm(provider, model_name, temperature=None): return self.dummy_model + def dummy_merge_chat_history(): return ["merged"] + def dummy_validate_provider_env(provider): return True + import ra_aid.llm as llm + original_initialize = llm.initialize_llm original_merge = llm.merge_chat_history original_validate = llm.validate_provider_env @@ -300,14 +311,15 @@ class TestCiaynAgentNewMethods(unittest.TestCase): llm.validate_provider_env = dummy_validate_provider_env # Set failure counter high enough to trigger fallback in _handle_tool_failure - self.agent._tool_failure.consecutive_failures = 2 + self.agent.tool_failure_consecutive_failures = 2 # Call _attempt_fallback; it should reset the failure counter to 0 on success. self.agent._attempt_fallback("always_fail_tool()") - self.assertEqual(self.agent._tool_failure.consecutive_failures, 0) + self.assertEqual(self.agent.tool_failure_consecutive_failures, 0) # Restore original functions llm.initialize_llm = original_initialize llm.merge_chat_history = original_merge llm.validate_provider_env = original_validate + if __name__ == "__main__": unittest.main()