feat(fallback): implement automatic fallback to alternative LLM models on consecutive failures to enhance user experience and prevent infinite error loops
refactor(ciayn_agent): restructure tool failure handling to track consecutive failures and fallback attempts more effectively
fix(logging): add pretty logging option for improved log readability
chore(config): define valid providers for LLM selection and update fallback model loading logic
test(ciayn_agent): add unit tests for fallback logic and tool failure handling to ensure reliability and correctness
parent 45b993cfd0
commit d8ee4e04f4

issue.md (47 lines changed)
@@ -4,15 +4,7 @@

Add functionality to automatically fall back to alternative LLM models when a tool call experiences multiple consecutive failures.

## Background

Currently, when a tool call fails due to LLM-related errors (e.g., API timeouts, rate limits, context length issues), there is no automatic fallback mechanism. This can lead to interrupted workflows and poor user experience.

## Relevant Files

- ra_aid/agents/ciayn_agent.py
- ra_aid/llm.py
- ra_aid/agent_utils.py
- ra_aid/__main__.py
- ra_aid/models_params.py

Currently, when a tool call fails due to LLM-related errors (e.g., invalid format), there is no automatic fallback mechanism. This often causes an infinite loop of erroring tool calls.

## Implementation Details
@@ -59,32 +51,25 @@ The prompt passed to `try_fallback_model` should be the last few failing

Define fallback sequences for each provider based on model capabilities:

1. Try same provider's smaller models
2. Try alternative providers' equivalent models
2. Try alternative providers' similar models
3. Raise final error if all fallbacks fail

### Provider Strategy Updates

Update provider strategies to support fallback configuration (a minimal sketch follows this list):

- Add provider-specific fallback sequences
- Handle model capability validation during fallback
- Track successful/failed attempts
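A minimal sketch of what a provider fallback sequence could look like. The `FALLBACK_SEQUENCES` name, the `next_fallback` helper, and the model choices are illustrative assumptions, not part of this change:

```python
from typing import Optional

# Illustrative only: ordered fallback candidates per provider.
FALLBACK_SEQUENCES = {
    "anthropic": ["claude-3-5-haiku-20241022", "gpt-4o", "gpt-4o-mini"],
    "openai": ["gpt-4o-mini", "claude-3-5-sonnet-20241022"],
}


def next_fallback(provider: str, used_fallbacks: set) -> Optional[str]:
    """Return the first fallback model not yet attempted, or None if exhausted."""
    for model in FALLBACK_SEQUENCES.get(provider, []):
        if model not in used_fallbacks:
            return model
    return None  # all candidates exhausted; caller raises the final error (step 3 above)
```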
## Risks and Mitigations

1. **Performance Impact**
   - Risk: Multiple fallback attempts could increase latency
   - Mitigation: Set reasonable max_failures limit and timeouts

2. **Consistency**
   - Risk: Different models may give slightly different outputs
   - Mitigation: Validate output schema consistency across models

3. **Cost**
1. **Cost**
   - Risk: Fallback to more expensive models
   - Mitigation: Configure cost limits and preferred fallback sequences

4. **State Management**
2. **State Management**
   - Risk: Loss of context during fallbacks
   - Mitigation: Preserve conversation state and tool context
## Relevant Files

- ra_aid/agents/ciayn_agent.py
- ra_aid/llm.py
- ra_aid/agent_utils.py
- ra_aid/__main__.py
- ra_aid/models_params.py

## Acceptance Criteria

1. Tool calls automatically attempt fallback models after N consecutive failures
2. `--no-fallback-tool` argument successfully disables fallback behavior
@@ -93,16 +78,6 @@ Update provider strategies to support fallback configuration:

5. Unit tests cover fallback scenarios and edge cases
6. README.md updated to reflect new behavior

## Testing

1. Unit tests for fallback wrapper
2. Integration tests with mock LLM failures
3. Provider strategy fallback tests
4. Command line argument handling
5. Error preservation and reporting
6. Performance impact measurement
7. Edge cases (e.g., partial failures, timeout handling)
8. State preservation during fallbacks

## Documentation Updates

1. Add fallback feature to main README
2. Document `--no-fallback-tool` in CLI help (see the sketch below)
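The `--no-fallback-tool` flag itself does not appear in the hunks below; this is a hedged sketch of how the documented behavior could be wired, with names taken from the issue text and the `fallback_tool_enabled` config key used in ciayn_agent.py later in this diff:

```python
import argparse

parser = argparse.ArgumentParser(prog="ra-aid")
parser.add_argument(
    "--no-fallback-tool",
    action="store_true",
    help="Disable automatic fallback to alternative models on tool failure",
)

args = parser.parse_args(["--no-fallback-tool"])
# The agent reads this through its config dict (key matches ciayn_agent.py).
config = {"fallback_tool_enabled": not args.no_fallback_tool}
assert config["fallback_tool_enabled"] is False
```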
ra_aid/__main__.py
@@ -17,7 +17,11 @@ from ra_aid.agent_utils import (
    run_planning_agent,
    run_research_agent,
)
from ra_aid.config import DEFAULT_MAX_TEST_CMD_RETRIES, DEFAULT_RECURSION_LIMIT
from ra_aid.config import (
    DEFAULT_MAX_TEST_CMD_RETRIES,
    DEFAULT_RECURSION_LIMIT,
    VALID_PROVIDERS,
)
from ra_aid.dependencies import check_dependencies
from ra_aid.env import validate_environment
from ra_aid.llm import initialize_llm

@@ -40,14 +44,6 @@ def launch_webui(host: str, port: int):


def parse_arguments(args=None):
    VALID_PROVIDERS = [
        "anthropic",
        "openai",
        "openrouter",
        "openai-compatible",
        "deepseek",
        "gemini",
    ]
    ANTHROPIC_DEFAULT_MODEL = "claude-3-5-sonnet-20241022"
    OPENAI_DEFAULT_MODEL = "gpt-4o"

@@ -80,9 +76,11 @@ Examples:
    parser.add_argument(
        "--provider",
        type=str,
        default="openai"
        if (os.getenv("OPENAI_API_KEY") and not os.getenv("ANTHROPIC_API_KEY"))
        else "anthropic",
        default=(
            "openai"
            if (os.getenv("OPENAI_API_KEY") and not os.getenv("ANTHROPIC_API_KEY"))
            else "anthropic"
        ),
        choices=VALID_PROVIDERS,
        help="The LLM provider to use",
    )

@@ -138,6 +136,9 @@ Examples:
    parser.add_argument(
        "--verbose", action="store_true", help="Enable verbose logging output"
    )
    parser.add_argument(
        "--pretty-logger", action="store_true", help="Enable pretty logging output"
    )
    parser.add_argument(
        "--temperature",
        type=float,

@@ -276,7 +277,7 @@ def is_stage_requested(stage: str) -> bool:
def main():
    """Main entry point for the ra-aid command line tool."""
    args = parse_arguments()
    setup_logging(args.verbose)
    setup_logging(args.verbose, args.pretty_logger)
    logger.debug("Starting RA.Aid with arguments: %s", args)

    # Launch web interface if requested

@@ -378,9 +379,9 @@ def main():
        chat_agent,
        CHAT_PROMPT.format(
            initial_request=initial_request,
            web_research_section=WEB_RESEARCH_PROMPT_SECTION_CHAT
            if web_research_enabled
            else "",
            web_research_section=(
                WEB_RESEARCH_PROMPT_SECTION_CHAT if web_research_enabled else ""
            ),
            working_directory=working_directory,
            current_date=current_date,
            project_info=formatted_project_info,
ra_aid/agents/ciayn_agent.py
@@ -4,6 +4,7 @@ from typing import Any, Dict, Generator, List, Optional, Union

from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage

from ra_aid.config import DEFAULT_MAX_TOOL_FAILURES
from ra_aid.exceptions import ToolExecutionError
from ra_aid.logging_config import get_logger
from ra_aid.models_params import DEFAULT_TOKEN_LIMIT

@@ -68,22 +69,6 @@ class CiaynAgent:
    - Memory management with configurable limits
    """

    class ToolCallFailure:
        """Tracks consecutive failures and fallback model usage for tool calls.

        Attributes:
            consecutive_failures (int): Count of consecutive failures for current model
            current_provider (Optional[str]): Current provider being used
            current_model (Optional[str]): Current model being used
            used_fallbacks (Set[str]): Set of fallback models already attempted
        """

        def __init__(self):
            self.consecutive_failures = 0
            self.current_provider = None
            self.current_model = None
            self.used_fallbacks = set()

    def __init__(
        self,
        model,

@@ -106,10 +91,8 @@ class CiaynAgent:
        self.config = config
        self.provider = config.get("provider", "openai")
        self.fallback_enabled = config.get("fallback_tool_enabled", True)
        fallback_models_str = config.get("fallback_tool_models", "gpt-3.5-turbo,gpt-4")
        self.fallback_tool_models = [
            m.strip() for m in fallback_models_str.split(",") if m.strip()
        ]
        self.fallback_tool_models = self._load_fallback_tool_models(config)

        self.model = model
        self.tools = tools
        self.max_history_messages = max_history_messages

@@ -117,7 +100,18 @@ class CiaynAgent:
        self.available_functions = []
        for t in tools:
            self.available_functions.append(get_function_info(t.func))
        self._tool_failure = CiaynAgent.ToolCallFailure()
        self.tool_failure_consecutive_failures = 0
        self.tool_failure_current_provider = None
        self.tool_failure_current_model = None
        self.tool_failure_used_fallbacks = set()

    def _load_fallback_tool_models(self, config: dict) -> list:
        fallback_tool_models_config = config.get("fallback_tool_models")
        if fallback_tool_models_config:
            return [m.strip() for m in fallback_tool_models_config.split(",") if m.strip()]
        else:
            from ra_aid.tool_leaderboard import supported_top_tool_models

            return [item["model"] for item in supported_top_tool_models[:5]]
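To make the two branches above concrete, here is a standalone sketch of the same logic with the leaderboard import stubbed out (illustrative, not repository code):

```python
def load_fallback_tool_models(config: dict) -> list:
    # Same shape as CiaynAgent._load_fallback_tool_models above.
    configured = config.get("fallback_tool_models")
    if configured:
        # Comma-separated string, whitespace-tolerant; empty items dropped.
        return [m.strip() for m in configured.split(",") if m.strip()]
    # Default path: top five entries from the tool leaderboard (stubbed here).
    supported_top_tool_models = [{"model": f"model-{i}"} for i in range(7)]
    return [item["model"] for item in supported_top_tool_models[:5]]


assert load_fallback_tool_models({"fallback_tool_models": " a , b ,"}) == ["a", "b"]
assert len(load_fallback_tool_models({})) == 5
```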

    def _build_prompt(self, last_result: Optional[str] = None) -> str:
        """Build the prompt for the agent including available tools and context."""

@@ -255,48 +249,85 @@ Output **ONLY THE CODE** and **NO MARKDOWN BACKTICKS**"""
        last_error = None
        while retries < max_retries:
            try:
                logger.debug(
                    f"_execute_tool: attempt {retries+1}, original code: {code}"
                )
                code = code.strip()
                if validate_function_call_pattern(code):
                    functions_list = "\n\n".join(self.available_functions)
                    code = _extract_tool_call(code, functions_list)
                globals_dict = {tool.func.__name__: tool.func for tool in self.tools}
                logger.debug(f"_execute_tool: evaluating code: {code}")
                result = eval(code, globals_dict)
                self._tool_failure.consecutive_failures = 0
                logger.debug(
                    f"_execute_tool: tool executed successfully with result: {result}"
                )
                self.tool_failure_consecutive_failures = 0
                return result
            except Exception as e:
                logger.debug(f"_execute_tool: exception caught: {e}")
                self._handle_tool_failure(code, e)
                last_error = e
                retries += 1
                logger.debug(f"_execute_tool: retrying, new attempt count: {retries}")
        raise ToolExecutionError(
            f"Error executing code after {max_retries} attempts: {str(last_error)}"
        )

    def _handle_tool_failure(self, code: str, error: Exception) -> None:
        self._tool_failure.consecutive_failures += 1
        max_failures = self.config.get("max_tool_failures", 3)
        logger.debug(
            f"_handle_tool_failure: tool failure encountered for code '{code}' with error: {error}"
        )
        self.tool_failure_consecutive_failures += 1
        max_failures = self.config.get("max_tool_failures", DEFAULT_MAX_TOOL_FAILURES)
        logger.debug(
            f"_handle_tool_failure: failure count {self.tool_failure_consecutive_failures}, max_failures {max_failures}"
        )
        if (
            self.fallback_enabled
            and self._tool_failure.consecutive_failures >= max_failures
            and self.tool_failure_consecutive_failures >= max_failures
            and self.fallback_tool_models
        ):
            logger.debug(
                "_handle_tool_failure: threshold reached, invoking fallback mechanism."
            )
            self._attempt_fallback(code)

    def _attempt_fallback(self, code: str) -> None:
        logger.debug(f"_attempt_fallback: initiating fallback for code: {code}")
        new_model = self.fallback_tool_models[0]
        failed_tool_call_name = code.split('(')[0].strip()
        failed_tool_call_name = code.split("(")[0].strip()
        logger.error(
            f"Tool call failed {self._tool_failure.consecutive_failures} times. Attempting fallback to model: {new_model} for tool: {failed_tool_call_name}"
            f"Tool call failed {self.tool_failure_consecutive_failures} times. Attempting fallback to model: {new_model} for tool: {failed_tool_call_name}"
        )
        try:
            from ra_aid.llm import initialize_llm, merge_chat_history, validate_provider_env
            from ra_aid.llm import (
                initialize_llm,
                merge_chat_history,
                validate_provider_env,
            )

            logger.debug(f"_attempt_fallback: validating provider {self.provider}")
            if not validate_provider_env(self.provider):
                logger.error(f"Missing environment configuration for provider {self.provider}. Cannot fallback.")
                logger.error(
                    f"Missing environment configuration for provider {self.provider}. Cannot fallback."
                )
            else:
                logger.debug(
                    f"_attempt_fallback: initializing fallback model {new_model}"
                )
                self.model = initialize_llm(self.provider, new_model)
                logger.debug(
                    f"_attempt_fallback: binding tools to new model using tool: {failed_tool_call_name}"
                )
                self.model.bind_tools(self.tools, tool_choice=failed_tool_call_name)
                self._tool_failure.used_fallbacks.add(new_model)
                merge_chat_history()  # Assuming merge_chat_history handles merging fallback history
                self._tool_failure.consecutive_failures = 0
                self.tool_failure_used_fallbacks.add(new_model)
                logger.debug("_attempt_fallback: merging chat history for fallback")
                merge_chat_history()
                self.tool_failure_consecutive_failures = 0
                logger.debug(
                    "_attempt_fallback: fallback successful and tool failure counter reset"
                )
        except Exception as switch_e:
            logger.error(f"Fallback model switching failed: {switch_e}")
ra_aid/config.py
@@ -3,4 +3,12 @@
DEFAULT_RECURSION_LIMIT = 100
DEFAULT_MAX_TEST_CMD_RETRIES = 3
DEFAULT_MAX_TOOL_FAILURES = 3
MAX_TOOL_FAILURES = 3

VALID_PROVIDERS = [
    "anthropic",
    "openai",
    "openrouter",
    "openai-compatible",
    "deepseek",
    "gemini",
]
ra_aid/logging_config.py
@@ -1,18 +1,53 @@
import logging
import sys
from typing import Optional

from rich.console import Console
from rich.panel import Panel
from rich.markdown import Markdown


def setup_logging(verbose: bool = False) -> None:
class PrettyHandler(logging.Handler):
    def __init__(self, level=logging.NOTSET):
        super().__init__(level)
        self.console = Console()

    def emit(self, record):
        try:
            msg = self.format(record)
            # Determine title and style based on log level
            if record.levelno >= logging.CRITICAL:
                title = "🔥 CRITICAL"
                style = "bold red"
            elif record.levelno >= logging.ERROR:
                title = "❌ ERROR"
                style = "red"
            elif record.levelno >= logging.WARNING:
                title = "⚠️ WARNING"
                style = "yellow"
            elif record.levelno >= logging.INFO:
                title = "ℹ️ INFO"
                style = "green"
            else:
                title = "🐞 DEBUG"
                style = "blue"
            self.console.print(Panel(Markdown(msg.strip()), title=title, style=style))
        except Exception:
            self.handleError(record)


def setup_logging(verbose: bool = False, pretty: bool = False) -> None:
    logger = logging.getLogger("ra_aid")
    logger.setLevel(logging.DEBUG if verbose else logging.INFO)

    if not logger.handlers:
        handler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        )
        handler.setFormatter(formatter)
        if pretty:
            handler = PrettyHandler()
        else:
            handler = logging.StreamHandler(sys.stdout)
            formatter = logging.Formatter(
                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
            )
            handler.setFormatter(formatter)
        logger.addHandler(handler)
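A short usage sketch; the `ra_aid.logging_config` module path is an assumption based on the `from ra_aid.logging_config import get_logger` line in ciayn_agent.py above:

```python
import logging

from ra_aid.logging_config import setup_logging  # assumed module path

setup_logging(verbose=True, pretty=True)
logger = logging.getLogger("ra_aid")
logger.info("Shown as a green rich Panel titled 'ℹ️ INFO'")
logger.error("Shown as a red rich Panel titled '❌ ERROR'")
```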
ra_aid/tool_leaderboard.py
@@ -0,0 +1,529 @@
from ra_aid.config import VALID_PROVIDERS

# Data extracted at 2/10/2025:
# https://gorilla.cs.berkeley.edu/leaderboard.html
# In order of overall_acc
leaderboard_data = [
    {
        "overall_acc": 74.31,
        "model": "watt-tool-70B",
        "type": "FC",
        "link": "https://huggingface.co/watt-ai/watt-tool-70B/",
        "cost": "N/A",
        "latency": 3.4,
        "ast_summary": 84.06,
        "exec_summary": 89.39,
        "live_ast_acc": 77.74,
        "multi_turn_acc": 58.75,
        "relevance": 94.44,
        "irrelevance": 76.32,
        "organization": "Watt AI Lab",
        "license": "Apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 72.08,
        "model": "gpt-4o-2024-11-20",
        "type": "Prompt",
        "link": "https://openai.com/index/hello-gpt-4o/",
        "cost": 13.54,
        "latency": 0.78,
        "ast_summary": 88.1,
        "exec_summary": 89.38,
        "live_ast_acc": 79.83,
        "multi_turn_acc": 47.62,
        "relevance": 83.33,
        "irrelevance": 83.76,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 69.58,
        "model": "gpt-4o-2024-11-20",
        "type": "FC",
        "link": "https://openai.com/index/hello-gpt-4o/",
        "cost": 8.23,
        "latency": 1.11,
        "ast_summary": 87.42,
        "exec_summary": 89.2,
        "live_ast_acc": 79.65,
        "multi_turn_acc": 41,
        "relevance": 83.33,
        "irrelevance": 83.15,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 67.98,
        "model": "watt-tool-8B",
        "type": "FC",
        "link": "https://huggingface.co/watt-ai/watt-tool-8B/",
        "cost": "N/A",
        "latency": 1.31,
        "ast_summary": 86.56,
        "exec_summary": 89.34,
        "live_ast_acc": 76.5,
        "multi_turn_acc": 39.12,
        "relevance": 83.33,
        "irrelevance": 83.15,
        "organization": "Watt AI Lab",
        "license": "Apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 67.88,
        "model": "GPT-4-turbo-2024-04-09",
        "type": "FC",
        "link": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
        "cost": 33.22,
        "latency": 2.47,
        "ast_summary": 84.73,
        "exec_summary": 85.21,
        "live_ast_acc": 80.5,
        "multi_turn_acc": 38.12,
        "relevance": 72.22,
        "irrelevance": 83.81,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 66.73,
        "model": "o1-2024-12-17",
        "type": "Prompt",
        "link": "https://openai.com/o1/",
        "cost": 102.47,
        "latency": 5.3,
        "ast_summary": 85.67,
        "exec_summary": 79.77,
        "live_ast_acc": 80.63,
        "multi_turn_acc": 36,
        "relevance": 72.22,
        "irrelevance": 87.78,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 64.1,
        "model": "GPT-4o-mini-2024-07-18",
        "type": "FC",
        "link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
        "cost": 0.51,
        "latency": 1.49,
        "ast_summary": 85.21,
        "exec_summary": 83.57,
        "live_ast_acc": 74.41,
        "multi_turn_acc": 34.12,
        "relevance": 83.33,
        "irrelevance": 74.75,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 62.79,
        "model": "o1-mini-2024-09-12",
        "type": "Prompt",
        "link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
        "cost": 29.76,
        "latency": 8.44,
        "ast_summary": 78.92,
        "exec_summary": 82.7,
        "live_ast_acc": 78.14,
        "multi_turn_acc": 28.25,
        "relevance": 61.11,
        "irrelevance": 89.62,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 62.73,
        "model": "Functionary-Medium-v3.1",
        "type": "FC",
        "link": "https://huggingface.co/meetkai/functionary-medium-v3.1",
        "cost": "N/A",
        "latency": 14.06,
        "ast_summary": 89.88,
        "exec_summary": 91.32,
        "live_ast_acc": 76.63,
        "multi_turn_acc": 21.62,
        "relevance": 72.22,
        "irrelevance": 76.08,
        "organization": "MeetKai",
        "license": "MIT",
        "provider": "unknown",
    },
    {
        "overall_acc": 62.19,
        "model": "Gemini-1.5-Pro-002",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/pro/",
        "cost": 7.05,
        "latency": 5.94,
        "ast_summary": 88.58,
        "exec_summary": 91.27,
        "live_ast_acc": 76.72,
        "multi_turn_acc": 20.75,
        "relevance": 72.22,
        "irrelevance": 78.15,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 61.83,
        "model": "Hammer2.1-7b",
        "type": "FC",
        "link": "https://huggingface.co/MadeAgents/Hammer2.1-7b",
        "cost": "N/A",
        "latency": 2.08,
        "ast_summary": 88.65,
        "exec_summary": 85.48,
        "live_ast_acc": 75.11,
        "multi_turn_acc": 23.5,
        "relevance": 82.35,
        "irrelevance": 78.59,
        "organization": "MadeAgents",
        "license": "cc-by-nc-4.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 61.74,
        "model": "Gemini-2.0-Flash-Exp",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/flash/",
        "cost": 0.0,
        "latency": 1.18,
        "ast_summary": 89.96,
        "exec_summary": 79.89,
        "live_ast_acc": 82.01,
        "multi_turn_acc": 17.88,
        "relevance": 77.78,
        "irrelevance": 86.44,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 61.38,
        "model": "Amazon-Nova-Pro-v1:0",
        "type": "FC",
        "link": "https://aws.amazon.com/cn/ai/generative-ai/nova/",
        "cost": 5.26,
        "latency": 2.67,
        "ast_summary": 84.46,
        "exec_summary": 85.64,
        "live_ast_acc": 74.32,
        "multi_turn_acc": 26.12,
        "relevance": 77.78,
        "irrelevance": 70.98,
        "organization": "Amazon",
        "license": "Proprietary",
        "provider": "unknown",
    },
    {
        "overall_acc": 61.31,
        "model": "Qwen2.5-72B-Instruct",
        "type": "Prompt",
        "link": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
        "cost": "N/A",
        "latency": 3.72,
        "ast_summary": 90.81,
        "exec_summary": 92.7,
        "live_ast_acc": 75.3,
        "multi_turn_acc": 18,
        "relevance": 100,
        "irrelevance": 72.81,
        "organization": "Qwen",
        "license": "qwen",
        "provider": "unknown",
    },
    {
        "overall_acc": 60.97,
        "model": "Gemini-1.5-Pro-002",
        "type": "FC",
        "link": "https://deepmind.google/technologies/gemini/pro/",
        "cost": 5.39,
        "latency": 2.07,
        "ast_summary": 87.29,
        "exec_summary": 84.61,
        "live_ast_acc": 76.28,
        "multi_turn_acc": 21.62,
        "relevance": 72.22,
        "irrelevance": 76.9,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 60.89,
        "model": "GPT-4o-mini-2024-07-18",
        "type": "Prompt",
        "link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
        "cost": 0.84,
        "latency": 1.31,
        "ast_summary": 86.77,
        "exec_summary": 80.84,
        "live_ast_acc": 76.5,
        "multi_turn_acc": 22,
        "relevance": 83.33,
        "irrelevance": 80.67,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 60.59,
        "model": "Gemini-2.0-Flash-Exp",
        "type": "FC",
        "link": "https://deepmind.google/technologies/gemini/flash/",
        "cost": 0.0,
        "latency": 0.85,
        "ast_summary": 85.1,
        "exec_summary": 77.46,
        "live_ast_acc": 79.03,
        "multi_turn_acc": 20.25,
        "relevance": 55.56,
        "irrelevance": 91.51,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 60.46,
        "model": "Gemini-1.5-Pro-001",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/pro/",
        "cost": 7.0,
        "latency": 1.54,
        "ast_summary": 85.56,
        "exec_summary": 85.77,
        "live_ast_acc": 76.68,
        "multi_turn_acc": 18.88,
        "relevance": 55.56,
        "irrelevance": 84.81,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 60.38,
        "model": "Gemini-Exp-1206",
        "type": "FC",
        "link": "https://blog.google/feed/gemini-exp-1206/",
        "cost": 0.0,
        "latency": 3.42,
        "ast_summary": 85.17,
        "exec_summary": 80.86,
        "live_ast_acc": 78.54,
        "multi_turn_acc": 20.25,
        "relevance": 77.78,
        "irrelevance": 79.64,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 59.67,
        "model": "Qwen2.5-32B-Instruct",
        "type": "Prompt",
        "link": "https://huggingface.co/Qwen/Qwen2.5-32B-Instruct",
        "cost": "N/A",
        "latency": 2.26,
        "ast_summary": 85.81,
        "exec_summary": 89.79,
        "live_ast_acc": 74.23,
        "multi_turn_acc": 17.75,
        "relevance": 100,
        "irrelevance": 73.75,
        "organization": "Qwen",
        "license": "apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 59.57,
        "model": "GPT-4-turbo-2024-04-09",
        "type": "Prompt",
        "link": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
        "cost": 58.87,
        "latency": 1.24,
        "ast_summary": 90.88,
        "exec_summary": 89.45,
        "live_ast_acc": 63.84,
        "multi_turn_acc": 30.25,
        "relevance": 100,
        "irrelevance": 35.57,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 59.42,
        "model": "Gemini-1.5-Pro-001",
        "type": "FC",
        "link": "https://deepmind.google/technologies/gemini/pro/",
        "cost": 5.1,
        "latency": 1.43,
        "ast_summary": 84.33,
        "exec_summary": 87.95,
        "live_ast_acc": 76.23,
        "multi_turn_acc": 16,
        "relevance": 50,
        "irrelevance": 84.39,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 59.07,
        "model": "Hammer2.1-3b",
        "type": "FC",
        "link": "https://huggingface.co/MadeAgents/Hammer2.1-3b",
        "cost": "N/A",
        "latency": 1.95,
        "ast_summary": 86.85,
        "exec_summary": 84.09,
        "live_ast_acc": 74.04,
        "multi_turn_acc": 17.38,
        "relevance": 82.35,
        "irrelevance": 81.87,
        "organization": "MadeAgents",
        "license": "qwen-research",
        "provider": "unknown",
    },
    {
        "overall_acc": 58.45,
        "model": "mistral-large-2407",
        "type": "FC",
        "link": "https://mistral.ai/news/mistral-large-2407/",
        "cost": 12.68,
        "latency": 3.12,
        "ast_summary": 86.81,
        "exec_summary": 84.38,
        "live_ast_acc": 69.88,
        "multi_turn_acc": 23.75,
        "relevance": 72.22,
        "irrelevance": 52.85,
        "organization": "Mistral AI",
        "license": "Proprietary",
        "provider": "mistral",
    },
    {
        "overall_acc": 58.42,
        "model": "ToolACE-8B",
        "type": "FC",
        "link": "https://huggingface.co/Team-ACE/ToolACE-8B",
        "cost": "N/A",
        "latency": 5.24,
        "ast_summary": 87.54,
        "exec_summary": 89.21,
        "live_ast_acc": 78.59,
        "multi_turn_acc": 7.75,
        "relevance": 83.33,
        "irrelevance": 87.88,
        "organization": "Huawei Noah & USTC",
        "license": "Apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 57.78,
        "model": "xLAM-8x22b-r",
        "type": "FC",
        "link": "https://huggingface.co/Salesforce/xLAM-8x22b-r",
        "cost": "N/A",
        "latency": 9.26,
        "ast_summary": 83.69,
        "exec_summary": 87.88,
        "live_ast_acc": 72.59,
        "multi_turn_acc": 16.25,
        "relevance": 88.89,
        "irrelevance": 67.81,
        "organization": "Salesforce",
        "license": "cc-by-nc-4.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 57.68,
        "model": "Qwen2.5-14B-Instruct",
        "type": "Prompt",
        "link": "https://huggingface.co/Qwen/Qwen2.5-14B-Instruct",
        "cost": "N/A",
        "latency": 2.02,
        "ast_summary": 85.69,
        "exec_summary": 88.84,
        "live_ast_acc": 74.14,
        "multi_turn_acc": 12.25,
        "relevance": 77.78,
        "irrelevance": 77.06,
        "organization": "Qwen",
        "license": "apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 57.23,
        "model": "DeepSeek-V3",
        "type": "FC",
        "link": "https://api-docs.deepseek.com/news/news1226",
        "cost": "N/A",
        "latency": 2.58,
        "ast_summary": 89.17,
        "exec_summary": 83.39,
        "live_ast_acc": 68.41,
        "multi_turn_acc": 18.62,
        "relevance": 88.89,
        "irrelevance": 59.36,
        "organization": "DeepSeek",
        "license": "DeepSeek License",
        "provider": "unknown",
    },
    {
        "overall_acc": 57.09,
        "model": "Gemini-1.5-Flash-001",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/flash/",
        "cost": 0.48,
        "latency": 0.71,
        "ast_summary": 85.69,
        "exec_summary": 83.59,
        "live_ast_acc": 68.9,
        "multi_turn_acc": 19.5,
        "relevance": 83.33,
        "irrelevance": 62.78,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 56.79,
        "model": "Gemini-1.5-Flash-002",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/flash/",
        "cost": 0.46,
        "latency": 0.81,
        "ast_summary": 81.65,
        "exec_summary": 80.64,
        "live_ast_acc": 76.72,
        "multi_turn_acc": 12.5,
        "relevance": 83.33,
        "irrelevance": 78.49,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
]


supported_top_tool_models = [
    {
        "cost": item["cost"],
        "model": item["model"],
        "type": item["type"],
        "provider": item["provider"],
    }
    for item in leaderboard_data
    if item["provider"] in VALID_PROVIDERS
]
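This filtered list feeds `_load_fallback_tool_models` above: with no explicit config, its first five entries become the default fallback sequence. Only rows whose `provider` is in `VALID_PROVIDERS` survive; in the portion shown here, the `google`, `mistral`, and `unknown` rows are all filtered out, leaving only `openai` entries (if Gemini models are meant to be fallback candidates, the leaderboard's `google` tag may need mapping to the `gemini` provider key). A toy version of the comprehension, with abbreviated rows:

```python
# Toy subset of leaderboard_data illustrating the filter above.
VALID_PROVIDERS = ["anthropic", "openai", "openrouter", "openai-compatible", "deepseek", "gemini"]
rows = [
    {"cost": "N/A", "model": "watt-tool-70B", "type": "FC", "provider": "unknown"},
    {"cost": 13.54, "model": "gpt-4o-2024-11-20", "type": "Prompt", "provider": "openai"},
    {"cost": 7.05, "model": "Gemini-1.5-Pro-002", "type": "Prompt", "provider": "google"},
]
supported = [
    {"cost": r["cost"], "model": r["model"], "type": r["type"], "provider": r["provider"]}
    for r in rows
    if r["provider"] in VALID_PROVIDERS
]
assert [r["model"] for r in supported] == ["gpt-4o-2024-11-20"]  # 'unknown'/'google' dropped
```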
@@ -31,6 +31,7 @@ class DummyModel:
            content = "dummy_tool()"

        return Response()

    def bind_tools(self, tools, tool_choice):
        pass

@@ -267,6 +268,7 @@ class TestCiaynAgentNewMethods(unittest.TestCase):
        # Create a dummy tool that always fails for testing fallback
        def always_fail():
            raise Exception("Failure for fallback test")

        self.always_fail_tool = DummyTool(always_fail)
        # Create a dummy model that does minimal work for fallback tests
        self.dummy_model = DummyModel()

@@ -274,24 +276,33 @@ class TestCiaynAgentNewMethods(unittest.TestCase):
        self.agent = CiaynAgent(
            self.dummy_model,
            [self.always_fail_tool],
            config={"max_tool_failures": 2, "fallback_tool_models": "dummy-fallback-model"}
            config={
                "max_tool_failures": 2,
                "fallback_tool_models": "dummy-fallback-model",
            },
        )

    def test_handle_tool_failure_increments_counter(self):
        initial_failures = self.agent._tool_failure.consecutive_failures
        initial_failures = self.agent.tool_failure_consecutive_failures
        self.agent._handle_tool_failure("dummy_call()", Exception("Test error"))
        self.assertEqual(self.agent._tool_failure.consecutive_failures, initial_failures + 1)
        self.assertEqual(
            self.agent.tool_failure_consecutive_failures, initial_failures + 1
        )

    def test_attempt_fallback_invokes_fallback_logic(self):
        # Monkey-patch initialize_llm, merge_chat_history, and validate_provider_env
        # to simulate fallback switching without external dependencies.
        def dummy_initialize_llm(provider, model_name, temperature=None):
            return self.dummy_model

        def dummy_merge_chat_history():
            return ["merged"]

        def dummy_validate_provider_env(provider):
            return True

        import ra_aid.llm as llm

        original_initialize = llm.initialize_llm
        original_merge = llm.merge_chat_history
        original_validate = llm.validate_provider_env

@@ -300,14 +311,15 @@ class TestCiaynAgentNewMethods(unittest.TestCase):
        llm.validate_provider_env = dummy_validate_provider_env

        # Set failure counter high enough to trigger fallback in _handle_tool_failure
        self.agent._tool_failure.consecutive_failures = 2
        self.agent.tool_failure_consecutive_failures = 2
        # Call _attempt_fallback; it should reset the failure counter to 0 on success.
        self.agent._attempt_fallback("always_fail_tool()")
        self.assertEqual(self.agent._tool_failure.consecutive_failures, 0)
        self.assertEqual(self.agent.tool_failure_consecutive_failures, 0)
        # Restore original functions
        llm.initialize_llm = original_initialize
        llm.merge_chat_history = original_merge
        llm.validate_provider_env = original_validate
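As a design note, the manual save/restore above could equivalently use `unittest.mock.patch`, which undoes the patching even when an assertion fails. This works because `_attempt_fallback` imports these names from `ra_aid.llm` at call time, so patched module attributes take effect. A sketch, not the repository's code:

```python
from unittest.mock import patch

# Method body sketch, inside TestCiaynAgentNewMethods.
def test_attempt_fallback_with_mock(self):
    with patch("ra_aid.llm.initialize_llm", return_value=self.dummy_model), \
         patch("ra_aid.llm.merge_chat_history", return_value=["merged"]), \
         patch("ra_aid.llm.validate_provider_env", return_value=True):
        self.agent.tool_failure_consecutive_failures = 2
        self.agent._attempt_fallback("always_fail_tool()")
        self.assertEqual(self.agent.tool_failure_consecutive_failures, 0)
```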

if __name__ == "__main__":
    unittest.main()