# Source: RA.Aid/ra_aid/tool_leaderboard.py (530 lines, 15 KiB, Python)

from ra_aid.config import VALID_PROVIDERS
# Data extracted on 2/10/2025 from:
# https://gorilla.cs.berkeley.edu/leaderboard.html
# Entries are listed in descending order of overall_acc.
# Column names shared by every leaderboard entry, in the order the values
# appear in each row of _LEADERBOARD_ROWS below.
_LEADERBOARD_FIELDS = (
    "overall_acc",
    "model",
    "type",
    "link",
    "cost",
    "latency",
    "ast_summary",
    "exec_summary",
    "live_ast_acc",
    "multi_turn_acc",
    "relevance",
    "irrelevance",
    "organization",
    "license",
    "provider",
)

# One tuple per leaderboard entry, highest overall_acc first.
# Row layout:
#   line 1: overall_acc, model, type
#   line 2: link
#   line 3: cost, latency, ast_summary, exec_summary, live_ast_acc,
#           multi_turn_acc, relevance, irrelevance
#   line 4: organization, license, provider
# "cost" is the string "N/A" for entries that carry no numeric cost.
# "type" is either "FC" or "Prompt" (presumably function-calling vs.
# prompt-based evaluation -- confirm against the leaderboard's own docs).
_LEADERBOARD_ROWS = (
    (
        74.31, "watt-tool-70B", "FC",
        "https://huggingface.co/watt-ai/watt-tool-70B/",
        "N/A", 3.4, 84.06, 89.39, 77.74, 58.75, 94.44, 76.32,
        "Watt AI Lab", "Apache-2.0", "unknown",
    ),
    (
        72.08, "gpt-4o-2024-11-20", "Prompt",
        "https://openai.com/index/hello-gpt-4o/",
        13.54, 0.78, 88.1, 89.38, 79.83, 47.62, 83.33, 83.76,
        "OpenAI", "Proprietary", "openai",
    ),
    (
        69.58, "gpt-4o-2024-11-20", "FC",
        "https://openai.com/index/hello-gpt-4o/",
        8.23, 1.11, 87.42, 89.2, 79.65, 41, 83.33, 83.15,
        "OpenAI", "Proprietary", "openai",
    ),
    (
        67.98, "watt-tool-8B", "FC",
        "https://huggingface.co/watt-ai/watt-tool-8B/",
        "N/A", 1.31, 86.56, 89.34, 76.5, 39.12, 83.33, 83.15,
        "Watt AI Lab", "Apache-2.0", "unknown",
    ),
    (
        67.88, "GPT-4-turbo-2024-04-09", "FC",
        "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
        33.22, 2.47, 84.73, 85.21, 80.5, 38.12, 72.22, 83.81,
        "OpenAI", "Proprietary", "openai",
    ),
    (
        66.73, "o1-2024-12-17", "Prompt",
        "https://openai.com/o1/",
        102.47, 5.3, 85.67, 79.77, 80.63, 36, 72.22, 87.78,
        "OpenAI", "Proprietary", "openai",
    ),
    (
        64.1, "GPT-4o-mini-2024-07-18", "FC",
        "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
        0.51, 1.49, 85.21, 83.57, 74.41, 34.12, 83.33, 74.75,
        "OpenAI", "Proprietary", "openai",
    ),
    (
        62.79, "o1-mini-2024-09-12", "Prompt",
        "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
        29.76, 8.44, 78.92, 82.7, 78.14, 28.25, 61.11, 89.62,
        "OpenAI", "Proprietary", "openai",
    ),
    (
        62.73, "Functionary-Medium-v3.1", "FC",
        "https://huggingface.co/meetkai/functionary-medium-v3.1",
        "N/A", 14.06, 89.88, 91.32, 76.63, 21.62, 72.22, 76.08,
        "MeetKai", "MIT", "unknown",
    ),
    (
        62.19, "Gemini-1.5-Pro-002", "Prompt",
        "https://deepmind.google/technologies/gemini/pro/",
        7.05, 5.94, 88.58, 91.27, 76.72, 20.75, 72.22, 78.15,
        "Google", "Proprietary", "google",
    ),
    (
        61.83, "Hammer2.1-7b", "FC",
        "https://huggingface.co/MadeAgents/Hammer2.1-7b",
        "N/A", 2.08, 88.65, 85.48, 75.11, 23.5, 82.35, 78.59,
        "MadeAgents", "cc-by-nc-4.0", "unknown",
    ),
    (
        61.74, "Gemini-2.0-Flash-Exp", "Prompt",
        "https://deepmind.google/technologies/gemini/flash/",
        0.0, 1.18, 89.96, 79.89, 82.01, 17.88, 77.78, 86.44,
        "Google", "Proprietary", "google",
    ),
    (
        61.38, "Amazon-Nova-Pro-v1:0", "FC",
        "https://aws.amazon.com/cn/ai/generative-ai/nova/",
        5.26, 2.67, 84.46, 85.64, 74.32, 26.12, 77.78, 70.98,
        "Amazon", "Proprietary", "unknown",
    ),
    (
        61.31, "Qwen2.5-72B-Instruct", "Prompt",
        "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
        "N/A", 3.72, 90.81, 92.7, 75.3, 18, 100, 72.81,
        "Qwen", "qwen", "unknown",
    ),
    (
        60.97, "Gemini-1.5-Pro-002", "FC",
        "https://deepmind.google/technologies/gemini/pro/",
        5.39, 2.07, 87.29, 84.61, 76.28, 21.62, 72.22, 76.9,
        "Google", "Proprietary", "google",
    ),
    (
        60.89, "GPT-4o-mini-2024-07-18", "Prompt",
        "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
        0.84, 1.31, 86.77, 80.84, 76.5, 22, 83.33, 80.67,
        "OpenAI", "Proprietary", "openai",
    ),
    (
        60.59, "Gemini-2.0-Flash-Exp", "FC",
        "https://deepmind.google/technologies/gemini/flash/",
        0.0, 0.85, 85.1, 77.46, 79.03, 20.25, 55.56, 91.51,
        "Google", "Proprietary", "google",
    ),
    (
        60.46, "Gemini-1.5-Pro-001", "Prompt",
        "https://deepmind.google/technologies/gemini/pro/",
        7.0, 1.54, 85.56, 85.77, 76.68, 18.88, 55.56, 84.81,
        "Google", "Proprietary", "google",
    ),
    (
        60.38, "Gemini-Exp-1206", "FC",
        "https://blog.google/feed/gemini-exp-1206/",
        0.0, 3.42, 85.17, 80.86, 78.54, 20.25, 77.78, 79.64,
        "Google", "Proprietary", "google",
    ),
    (
        59.67, "Qwen2.5-32B-Instruct", "Prompt",
        "https://huggingface.co/Qwen/Qwen2.5-32B-Instruct",
        "N/A", 2.26, 85.81, 89.79, 74.23, 17.75, 100, 73.75,
        "Qwen", "apache-2.0", "unknown",
    ),
    (
        59.57, "GPT-4-turbo-2024-04-09", "Prompt",
        "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
        58.87, 1.24, 90.88, 89.45, 63.84, 30.25, 100, 35.57,
        "OpenAI", "Proprietary", "openai",
    ),
    (
        59.42, "Gemini-1.5-Pro-001", "FC",
        "https://deepmind.google/technologies/gemini/pro/",
        5.1, 1.43, 84.33, 87.95, 76.23, 16, 50, 84.39,
        "Google", "Proprietary", "google",
    ),
    (
        59.07, "Hammer2.1-3b", "FC",
        "https://huggingface.co/MadeAgents/Hammer2.1-3b",
        "N/A", 1.95, 86.85, 84.09, 74.04, 17.38, 82.35, 81.87,
        "MadeAgents", "qwen-research", "unknown",
    ),
    (
        58.45, "mistral-large-2407", "FC",
        "https://mistral.ai/news/mistral-large-2407/",
        12.68, 3.12, 86.81, 84.38, 69.88, 23.75, 72.22, 52.85,
        "Mistral AI", "Proprietary", "mistral",
    ),
    (
        58.42, "ToolACE-8B", "FC",
        "https://huggingface.co/Team-ACE/ToolACE-8B",
        "N/A", 5.24, 87.54, 89.21, 78.59, 7.75, 83.33, 87.88,
        "Huawei Noah & USTC", "Apache-2.0", "unknown",
    ),
    (
        57.78, "xLAM-8x22b-r", "FC",
        "https://huggingface.co/Salesforce/xLAM-8x22b-r",
        "N/A", 9.26, 83.69, 87.88, 72.59, 16.25, 88.89, 67.81,
        "Salesforce", "cc-by-nc-4.0", "unknown",
    ),
    (
        57.68, "Qwen2.5-14B-Instruct", "Prompt",
        "https://huggingface.co/Qwen/Qwen2.5-14B-Instruct",
        "N/A", 2.02, 85.69, 88.84, 74.14, 12.25, 77.78, 77.06,
        "Qwen", "apache-2.0", "unknown",
    ),
    (
        57.23, "DeepSeek-V3", "FC",
        "https://api-docs.deepseek.com/news/news1226",
        "N/A", 2.58, 89.17, 83.39, 68.41, 18.62, 88.89, 59.36,
        "DeepSeek", "DeepSeek License", "unknown",
    ),
    (
        57.09, "Gemini-1.5-Flash-001", "Prompt",
        "https://deepmind.google/technologies/gemini/flash/",
        0.48, 0.71, 85.69, 83.59, 68.9, 19.5, 83.33, 62.78,
        "Google", "Proprietary", "google",
    ),
    (
        56.79, "Gemini-1.5-Flash-002", "Prompt",
        "https://deepmind.google/technologies/gemini/flash/",
        0.46, 0.81, 81.65, 80.64, 76.72, 12.5, 83.33, 78.49,
        "Google", "Proprietary", "google",
    ),
)

# Public table: a list of dicts, one per row, keyed by _LEADERBOARD_FIELDS.
leaderboard_data = [
    dict(zip(_LEADERBOARD_FIELDS, row)) for row in _LEADERBOARD_ROWS
]
# Trimmed view of the leaderboard: for every entry whose provider is listed in
# VALID_PROVIDERS, keep only the cost, model, type and provider fields.
# Entries stay in the same order as in leaderboard_data.
supported_top_tool_models = [
    {field: entry[field] for field in ("cost", "model", "type", "provider")}
    for entry in leaderboard_data
    if entry["provider"] in VALID_PROVIDERS
]