from ra_aid.config import VALID_PROVIDERS

# Snapshot of the leaderboard published at
# https://gorilla.cs.berkeley.edu/leaderboard.html
# (data extracted at 2/10/2025).
#
# Rows are kept in descending order of "overall_acc". Each row carries the
# same fifteen keys. "cost" is either a number or the string "N/A";
# "provider" is a lowercase provider id, or "unknown" where no provider id
# was assigned to the row.
leaderboard_data = [
    {
        "overall_acc": 74.31,
        "model": "watt-tool-70B",
        "type": "FC",
        "link": "https://huggingface.co/watt-ai/watt-tool-70B/",
        "cost": "N/A",
        "latency": 3.4,
        "ast_summary": 84.06,
        "exec_summary": 89.39,
        "live_ast_acc": 77.74,
        "multi_turn_acc": 58.75,
        "relevance": 94.44,
        "irrelevance": 76.32,
        "organization": "Watt AI Lab",
        "license": "Apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 72.08,
        "model": "gpt-4o-2024-11-20",
        "type": "Prompt",
        "link": "https://openai.com/index/hello-gpt-4o/",
        "cost": 13.54,
        "latency": 0.78,
        "ast_summary": 88.1,
        "exec_summary": 89.38,
        "live_ast_acc": 79.83,
        "multi_turn_acc": 47.62,
        "relevance": 83.33,
        "irrelevance": 83.76,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 69.58,
        "model": "gpt-4o-2024-11-20",
        "type": "FC",
        "link": "https://openai.com/index/hello-gpt-4o/",
        "cost": 8.23,
        "latency": 1.11,
        "ast_summary": 87.42,
        "exec_summary": 89.2,
        "live_ast_acc": 79.65,
        "multi_turn_acc": 41,
        "relevance": 83.33,
        "irrelevance": 83.15,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 67.98,
        "model": "watt-tool-8B",
        "type": "FC",
        "link": "https://huggingface.co/watt-ai/watt-tool-8B/",
        "cost": "N/A",
        "latency": 1.31,
        "ast_summary": 86.56,
        "exec_summary": 89.34,
        "live_ast_acc": 76.5,
        "multi_turn_acc": 39.12,
        "relevance": 83.33,
        "irrelevance": 83.15,
        "organization": "Watt AI Lab",
        "license": "Apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 67.88,
        "model": "GPT-4-turbo-2024-04-09",
        "type": "FC",
        "link": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
        "cost": 33.22,
        "latency": 2.47,
        "ast_summary": 84.73,
        "exec_summary": 85.21,
        "live_ast_acc": 80.5,
        "multi_turn_acc": 38.12,
        "relevance": 72.22,
        "irrelevance": 83.81,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 66.73,
        "model": "o1-2024-12-17",
        "type": "Prompt",
        "link": "https://openai.com/o1/",
        "cost": 102.47,
        "latency": 5.3,
        "ast_summary": 85.67,
        "exec_summary": 79.77,
        "live_ast_acc": 80.63,
        "multi_turn_acc": 36,
        "relevance": 72.22,
        "irrelevance": 87.78,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 64.1,
        "model": "GPT-4o-mini-2024-07-18",
        "type": "FC",
        "link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
        "cost": 0.51,
        "latency": 1.49,
        "ast_summary": 85.21,
        "exec_summary": 83.57,
        "live_ast_acc": 74.41,
        "multi_turn_acc": 34.12,
        "relevance": 83.33,
        "irrelevance": 74.75,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 62.79,
        "model": "o1-mini-2024-09-12",
        "type": "Prompt",
        "link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
        "cost": 29.76,
        "latency": 8.44,
        "ast_summary": 78.92,
        "exec_summary": 82.7,
        "live_ast_acc": 78.14,
        "multi_turn_acc": 28.25,
        "relevance": 61.11,
        "irrelevance": 89.62,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 62.73,
        "model": "Functionary-Medium-v3.1",
        "type": "FC",
        "link": "https://huggingface.co/meetkai/functionary-medium-v3.1",
        "cost": "N/A",
        "latency": 14.06,
        "ast_summary": 89.88,
        "exec_summary": 91.32,
        "live_ast_acc": 76.63,
        "multi_turn_acc": 21.62,
        "relevance": 72.22,
        "irrelevance": 76.08,
        "organization": "MeetKai",
        "license": "MIT",
        "provider": "unknown",
    },
    {
        "overall_acc": 62.19,
        "model": "Gemini-1.5-Pro-002",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/pro/",
        "cost": 7.05,
        "latency": 5.94,
        "ast_summary": 88.58,
        "exec_summary": 91.27,
        "live_ast_acc": 76.72,
        "multi_turn_acc": 20.75,
        "relevance": 72.22,
        "irrelevance": 78.15,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 61.83,
        "model": "Hammer2.1-7b",
        "type": "FC",
        "link": "https://huggingface.co/MadeAgents/Hammer2.1-7b",
        "cost": "N/A",
        "latency": 2.08,
        "ast_summary": 88.65,
        "exec_summary": 85.48,
        "live_ast_acc": 75.11,
        "multi_turn_acc": 23.5,
        "relevance": 82.35,
        "irrelevance": 78.59,
        "organization": "MadeAgents",
        "license": "cc-by-nc-4.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 61.74,
        "model": "Gemini-2.0-Flash-Exp",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/flash/",
        "cost": 0.0,
        "latency": 1.18,
        "ast_summary": 89.96,
        "exec_summary": 79.89,
        "live_ast_acc": 82.01,
        "multi_turn_acc": 17.88,
        "relevance": 77.78,
        "irrelevance": 86.44,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 61.38,
        "model": "Amazon-Nova-Pro-v1:0",
        "type": "FC",
        "link": "https://aws.amazon.com/cn/ai/generative-ai/nova/",
        "cost": 5.26,
        "latency": 2.67,
        "ast_summary": 84.46,
        "exec_summary": 85.64,
        "live_ast_acc": 74.32,
        "multi_turn_acc": 26.12,
        "relevance": 77.78,
        "irrelevance": 70.98,
        "organization": "Amazon",
        "license": "Proprietary",
        "provider": "unknown",
    },
    {
        "overall_acc": 61.31,
        "model": "Qwen2.5-72B-Instruct",
        "type": "Prompt",
        "link": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
        "cost": "N/A",
        "latency": 3.72,
        "ast_summary": 90.81,
        "exec_summary": 92.7,
        "live_ast_acc": 75.3,
        "multi_turn_acc": 18,
        "relevance": 100,
        "irrelevance": 72.81,
        "organization": "Qwen",
        "license": "qwen",
        "provider": "unknown",
    },
    {
        "overall_acc": 60.97,
        "model": "Gemini-1.5-Pro-002",
        "type": "FC",
        "link": "https://deepmind.google/technologies/gemini/pro/",
        "cost": 5.39,
        "latency": 2.07,
        "ast_summary": 87.29,
        "exec_summary": 84.61,
        "live_ast_acc": 76.28,
        "multi_turn_acc": 21.62,
        "relevance": 72.22,
        "irrelevance": 76.9,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 60.89,
        "model": "GPT-4o-mini-2024-07-18",
        "type": "Prompt",
        "link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
        "cost": 0.84,
        "latency": 1.31,
        "ast_summary": 86.77,
        "exec_summary": 80.84,
        "live_ast_acc": 76.5,
        "multi_turn_acc": 22,
        "relevance": 83.33,
        "irrelevance": 80.67,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 60.59,
        "model": "Gemini-2.0-Flash-Exp",
        "type": "FC",
        "link": "https://deepmind.google/technologies/gemini/flash/",
        "cost": 0.0,
        "latency": 0.85,
        "ast_summary": 85.1,
        "exec_summary": 77.46,
        "live_ast_acc": 79.03,
        "multi_turn_acc": 20.25,
        "relevance": 55.56,
        "irrelevance": 91.51,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 60.46,
        "model": "Gemini-1.5-Pro-001",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/pro/",
        "cost": 7.0,
        "latency": 1.54,
        "ast_summary": 85.56,
        "exec_summary": 85.77,
        "live_ast_acc": 76.68,
        "multi_turn_acc": 18.88,
        "relevance": 55.56,
        "irrelevance": 84.81,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 60.38,
        "model": "Gemini-Exp-1206",
        "type": "FC",
        "link": "https://blog.google/feed/gemini-exp-1206/",
        "cost": 0.0,
        "latency": 3.42,
        "ast_summary": 85.17,
        "exec_summary": 80.86,
        "live_ast_acc": 78.54,
        "multi_turn_acc": 20.25,
        "relevance": 77.78,
        "irrelevance": 79.64,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 59.67,
        "model": "Qwen2.5-32B-Instruct",
        "type": "Prompt",
        "link": "https://huggingface.co/Qwen/Qwen2.5-32B-Instruct",
        "cost": "N/A",
        "latency": 2.26,
        "ast_summary": 85.81,
        "exec_summary": 89.79,
        "live_ast_acc": 74.23,
        "multi_turn_acc": 17.75,
        "relevance": 100,
        "irrelevance": 73.75,
        "organization": "Qwen",
        "license": "apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 59.57,
        "model": "GPT-4-turbo-2024-04-09",
        "type": "Prompt",
        "link": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
        "cost": 58.87,
        "latency": 1.24,
        "ast_summary": 90.88,
        "exec_summary": 89.45,
        "live_ast_acc": 63.84,
        "multi_turn_acc": 30.25,
        "relevance": 100,
        "irrelevance": 35.57,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 59.42,
        "model": "Gemini-1.5-Pro-001",
        "type": "FC",
        "link": "https://deepmind.google/technologies/gemini/pro/",
        "cost": 5.1,
        "latency": 1.43,
        "ast_summary": 84.33,
        "exec_summary": 87.95,
        "live_ast_acc": 76.23,
        "multi_turn_acc": 16,
        "relevance": 50,
        "irrelevance": 84.39,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 59.07,
        "model": "Hammer2.1-3b",
        "type": "FC",
        "link": "https://huggingface.co/MadeAgents/Hammer2.1-3b",
        "cost": "N/A",
        "latency": 1.95,
        "ast_summary": 86.85,
        "exec_summary": 84.09,
        "live_ast_acc": 74.04,
        "multi_turn_acc": 17.38,
        "relevance": 82.35,
        "irrelevance": 81.87,
        "organization": "MadeAgents",
        "license": "qwen-research",
        "provider": "unknown",
    },
    {
        "overall_acc": 58.45,
        "model": "mistral-large-2407",
        "type": "FC",
        "link": "https://mistral.ai/news/mistral-large-2407/",
        "cost": 12.68,
        "latency": 3.12,
        "ast_summary": 86.81,
        "exec_summary": 84.38,
        "live_ast_acc": 69.88,
        "multi_turn_acc": 23.75,
        "relevance": 72.22,
        "irrelevance": 52.85,
        "organization": "Mistral AI",
        "license": "Proprietary",
        "provider": "mistral",
    },
    {
        "overall_acc": 58.42,
        "model": "ToolACE-8B",
        "type": "FC",
        "link": "https://huggingface.co/Team-ACE/ToolACE-8B",
        "cost": "N/A",
        "latency": 5.24,
        "ast_summary": 87.54,
        "exec_summary": 89.21,
        "live_ast_acc": 78.59,
        "multi_turn_acc": 7.75,
        "relevance": 83.33,
        "irrelevance": 87.88,
        "organization": "Huawei Noah & USTC",
        "license": "Apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 57.78,
        "model": "xLAM-8x22b-r",
        "type": "FC",
        "link": "https://huggingface.co/Salesforce/xLAM-8x22b-r",
        "cost": "N/A",
        "latency": 9.26,
        "ast_summary": 83.69,
        "exec_summary": 87.88,
        "live_ast_acc": 72.59,
        "multi_turn_acc": 16.25,
        "relevance": 88.89,
        "irrelevance": 67.81,
        "organization": "Salesforce",
        "license": "cc-by-nc-4.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 57.68,
        "model": "Qwen2.5-14B-Instruct",
        "type": "Prompt",
        "link": "https://huggingface.co/Qwen/Qwen2.5-14B-Instruct",
        "cost": "N/A",
        "latency": 2.02,
        "ast_summary": 85.69,
        "exec_summary": 88.84,
        "live_ast_acc": 74.14,
        "multi_turn_acc": 12.25,
        "relevance": 77.78,
        "irrelevance": 77.06,
        "organization": "Qwen",
        "license": "apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 57.23,
        "model": "DeepSeek-V3",
        "type": "FC",
        "link": "https://api-docs.deepseek.com/news/news1226",
        "cost": "N/A",
        "latency": 2.58,
        "ast_summary": 89.17,
        "exec_summary": 83.39,
        "live_ast_acc": 68.41,
        "multi_turn_acc": 18.62,
        "relevance": 88.89,
        "irrelevance": 59.36,
        "organization": "DeepSeek",
        "license": "DeepSeek License",
        "provider": "unknown",
    },
    {
        "overall_acc": 57.09,
        "model": "Gemini-1.5-Flash-001",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/flash/",
        "cost": 0.48,
        "latency": 0.71,
        "ast_summary": 85.69,
        "exec_summary": 83.59,
        "live_ast_acc": 68.9,
        "multi_turn_acc": 19.5,
        "relevance": 83.33,
        "irrelevance": 62.78,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 56.79,
        "model": "Gemini-1.5-Flash-002",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/flash/",
        "cost": 0.46,
        "latency": 0.81,
        "ast_summary": 81.65,
        "exec_summary": 80.64,
        "live_ast_acc": 76.72,
        "multi_turn_acc": 12.5,
        "relevance": 83.33,
        "irrelevance": 78.49,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
]

# Trimmed view of the leaderboard: only rows whose provider id appears in
# VALID_PROVIDERS survive, and each surviving row is reduced to the four
# fields below (in this key order). Leaderboard ordering is preserved.
supported_top_tool_models = [
    {field: entry[field] for field in ("cost", "model", "type", "provider")}
    for entry in leaderboard_data
    if entry["provider"] in VALID_PROVIDERS
]