# Leaderboard snapshot plus the subset of entries whose provider is in VALID_PROVIDERS.
from ra_aid.config import VALID_PROVIDERS
# Data extracted at 2/10/2025 from:
# https://gorilla.cs.berkeley.edu/leaderboard.html
#
# Entries are sorted by overall_acc, highest first. A model can appear more
# than once, one entry per "type" ("FC" or "Prompt").
#
# Every entry carries the same 15 keys:
#   overall_acc, ast_summary, exec_summary, live_ast_acc, multi_turn_acc,
#   relevance, irrelevance -- numeric scores copied from the leaderboard
#   model, link, organization, license -- descriptive strings
#   type      -- "FC" or "Prompt"
#   cost      -- a number, or the string "N/A" when no figure is listed
#   latency   -- a number (units as reported by the leaderboard)
#   provider  -- "openai", "google", "mistral", or "unknown"
#
# NOTE(review): model names keep the leaderboard's display casing (e.g.
# "GPT-4o-mini-2024-07-18" vs "gpt-4o-2024-11-20"); confirm that consumers
# normalise them before using them as API model identifiers.
leaderboard_data = [
    {
        "overall_acc": 74.31,
        "model": "watt-tool-70B",
        "type": "FC",
        "link": "https://huggingface.co/watt-ai/watt-tool-70B/",
        "cost": "N/A",
        "latency": 3.4,
        "ast_summary": 84.06,
        "exec_summary": 89.39,
        "live_ast_acc": 77.74,
        "multi_turn_acc": 58.75,
        "relevance": 94.44,
        "irrelevance": 76.32,
        "organization": "Watt AI Lab",
        "license": "Apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 72.08,
        "model": "gpt-4o-2024-11-20",
        "type": "Prompt",
        "link": "https://openai.com/index/hello-gpt-4o/",
        "cost": 13.54,
        "latency": 0.78,
        "ast_summary": 88.1,
        "exec_summary": 89.38,
        "live_ast_acc": 79.83,
        "multi_turn_acc": 47.62,
        "relevance": 83.33,
        "irrelevance": 83.76,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 69.58,
        "model": "gpt-4o-2024-11-20",
        "type": "FC",
        "link": "https://openai.com/index/hello-gpt-4o/",
        "cost": 8.23,
        "latency": 1.11,
        "ast_summary": 87.42,
        "exec_summary": 89.2,
        "live_ast_acc": 79.65,
        "multi_turn_acc": 41,
        "relevance": 83.33,
        "irrelevance": 83.15,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 67.98,
        "model": "watt-tool-8B",
        "type": "FC",
        "link": "https://huggingface.co/watt-ai/watt-tool-8B/",
        "cost": "N/A",
        "latency": 1.31,
        "ast_summary": 86.56,
        "exec_summary": 89.34,
        "live_ast_acc": 76.5,
        "multi_turn_acc": 39.12,
        "relevance": 83.33,
        "irrelevance": 83.15,
        "organization": "Watt AI Lab",
        "license": "Apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 67.88,
        "model": "GPT-4-turbo-2024-04-09",
        "type": "FC",
        "link": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
        "cost": 33.22,
        "latency": 2.47,
        "ast_summary": 84.73,
        "exec_summary": 85.21,
        "live_ast_acc": 80.5,
        "multi_turn_acc": 38.12,
        "relevance": 72.22,
        "irrelevance": 83.81,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 66.73,
        "model": "o1-2024-12-17",
        "type": "Prompt",
        "link": "https://openai.com/o1/",
        "cost": 102.47,
        "latency": 5.3,
        "ast_summary": 85.67,
        "exec_summary": 79.77,
        "live_ast_acc": 80.63,
        "multi_turn_acc": 36,
        "relevance": 72.22,
        "irrelevance": 87.78,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 64.1,
        "model": "GPT-4o-mini-2024-07-18",
        "type": "FC",
        "link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
        "cost": 0.51,
        "latency": 1.49,
        "ast_summary": 85.21,
        "exec_summary": 83.57,
        "live_ast_acc": 74.41,
        "multi_turn_acc": 34.12,
        "relevance": 83.33,
        "irrelevance": 74.75,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 62.79,
        "model": "o1-mini-2024-09-12",
        "type": "Prompt",
        "link": "https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/",
        "cost": 29.76,
        "latency": 8.44,
        "ast_summary": 78.92,
        "exec_summary": 82.7,
        "live_ast_acc": 78.14,
        "multi_turn_acc": 28.25,
        "relevance": 61.11,
        "irrelevance": 89.62,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 62.73,
        "model": "Functionary-Medium-v3.1",
        "type": "FC",
        "link": "https://huggingface.co/meetkai/functionary-medium-v3.1",
        "cost": "N/A",
        "latency": 14.06,
        "ast_summary": 89.88,
        "exec_summary": 91.32,
        "live_ast_acc": 76.63,
        "multi_turn_acc": 21.62,
        "relevance": 72.22,
        "irrelevance": 76.08,
        "organization": "MeetKai",
        "license": "MIT",
        "provider": "unknown",
    },
    {
        "overall_acc": 62.19,
        "model": "Gemini-1.5-Pro-002",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/pro/",
        "cost": 7.05,
        "latency": 5.94,
        "ast_summary": 88.58,
        "exec_summary": 91.27,
        "live_ast_acc": 76.72,
        "multi_turn_acc": 20.75,
        "relevance": 72.22,
        "irrelevance": 78.15,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 61.83,
        "model": "Hammer2.1-7b",
        "type": "FC",
        "link": "https://huggingface.co/MadeAgents/Hammer2.1-7b",
        "cost": "N/A",
        "latency": 2.08,
        "ast_summary": 88.65,
        "exec_summary": 85.48,
        "live_ast_acc": 75.11,
        "multi_turn_acc": 23.5,
        "relevance": 82.35,
        "irrelevance": 78.59,
        "organization": "MadeAgents",
        "license": "cc-by-nc-4.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 61.74,
        "model": "Gemini-2.0-Flash-Exp",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/flash/",
        "cost": 0.0,
        "latency": 1.18,
        "ast_summary": 89.96,
        "exec_summary": 79.89,
        "live_ast_acc": 82.01,
        "multi_turn_acc": 17.88,
        "relevance": 77.78,
        "irrelevance": 86.44,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 61.38,
        "model": "Amazon-Nova-Pro-v1:0",
        "type": "FC",
        "link": "https://aws.amazon.com/cn/ai/generative-ai/nova/",
        "cost": 5.26,
        "latency": 2.67,
        "ast_summary": 84.46,
        "exec_summary": 85.64,
        "live_ast_acc": 74.32,
        "multi_turn_acc": 26.12,
        "relevance": 77.78,
        "irrelevance": 70.98,
        "organization": "Amazon",
        "license": "Proprietary",
        "provider": "unknown",
    },
    {
        "overall_acc": 61.31,
        "model": "Qwen2.5-72B-Instruct",
        "type": "Prompt",
        "link": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
        "cost": "N/A",
        "latency": 3.72,
        "ast_summary": 90.81,
        "exec_summary": 92.7,
        "live_ast_acc": 75.3,
        "multi_turn_acc": 18,
        "relevance": 100,
        "irrelevance": 72.81,
        "organization": "Qwen",
        "license": "qwen",
        "provider": "unknown",
    },
    {
        "overall_acc": 60.97,
        "model": "Gemini-1.5-Pro-002",
        "type": "FC",
        "link": "https://deepmind.google/technologies/gemini/pro/",
        "cost": 5.39,
        "latency": 2.07,
        "ast_summary": 87.29,
        "exec_summary": 84.61,
        "live_ast_acc": 76.28,
        "multi_turn_acc": 21.62,
        "relevance": 72.22,
        "irrelevance": 76.9,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 60.89,
        "model": "GPT-4o-mini-2024-07-18",
        "type": "Prompt",
        "link": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
        "cost": 0.84,
        "latency": 1.31,
        "ast_summary": 86.77,
        "exec_summary": 80.84,
        "live_ast_acc": 76.5,
        "multi_turn_acc": 22,
        "relevance": 83.33,
        "irrelevance": 80.67,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 60.59,
        "model": "Gemini-2.0-Flash-Exp",
        "type": "FC",
        "link": "https://deepmind.google/technologies/gemini/flash/",
        "cost": 0.0,
        "latency": 0.85,
        "ast_summary": 85.1,
        "exec_summary": 77.46,
        "live_ast_acc": 79.03,
        "multi_turn_acc": 20.25,
        "relevance": 55.56,
        "irrelevance": 91.51,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 60.46,
        "model": "Gemini-1.5-Pro-001",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/pro/",
        "cost": 7.0,
        "latency": 1.54,
        "ast_summary": 85.56,
        "exec_summary": 85.77,
        "live_ast_acc": 76.68,
        "multi_turn_acc": 18.88,
        "relevance": 55.56,
        "irrelevance": 84.81,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 60.38,
        "model": "Gemini-Exp-1206",
        "type": "FC",
        "link": "https://blog.google/feed/gemini-exp-1206/",
        "cost": 0.0,
        "latency": 3.42,
        "ast_summary": 85.17,
        "exec_summary": 80.86,
        "live_ast_acc": 78.54,
        "multi_turn_acc": 20.25,
        "relevance": 77.78,
        "irrelevance": 79.64,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 59.67,
        "model": "Qwen2.5-32B-Instruct",
        "type": "Prompt",
        "link": "https://huggingface.co/Qwen/Qwen2.5-32B-Instruct",
        "cost": "N/A",
        "latency": 2.26,
        "ast_summary": 85.81,
        "exec_summary": 89.79,
        "live_ast_acc": 74.23,
        "multi_turn_acc": 17.75,
        "relevance": 100,
        "irrelevance": 73.75,
        "organization": "Qwen",
        "license": "apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 59.57,
        "model": "GPT-4-turbo-2024-04-09",
        "type": "Prompt",
        "link": "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
        "cost": 58.87,
        "latency": 1.24,
        "ast_summary": 90.88,
        "exec_summary": 89.45,
        "live_ast_acc": 63.84,
        "multi_turn_acc": 30.25,
        "relevance": 100,
        "irrelevance": 35.57,
        "organization": "OpenAI",
        "license": "Proprietary",
        "provider": "openai",
    },
    {
        "overall_acc": 59.42,
        "model": "Gemini-1.5-Pro-001",
        "type": "FC",
        "link": "https://deepmind.google/technologies/gemini/pro/",
        "cost": 5.1,
        "latency": 1.43,
        "ast_summary": 84.33,
        "exec_summary": 87.95,
        "live_ast_acc": 76.23,
        "multi_turn_acc": 16,
        "relevance": 50,
        "irrelevance": 84.39,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 59.07,
        "model": "Hammer2.1-3b",
        "type": "FC",
        "link": "https://huggingface.co/MadeAgents/Hammer2.1-3b",
        "cost": "N/A",
        "latency": 1.95,
        "ast_summary": 86.85,
        "exec_summary": 84.09,
        "live_ast_acc": 74.04,
        "multi_turn_acc": 17.38,
        "relevance": 82.35,
        "irrelevance": 81.87,
        "organization": "MadeAgents",
        "license": "qwen-research",
        "provider": "unknown",
    },
    {
        "overall_acc": 58.45,
        "model": "mistral-large-2407",
        "type": "FC",
        "link": "https://mistral.ai/news/mistral-large-2407/",
        "cost": 12.68,
        "latency": 3.12,
        "ast_summary": 86.81,
        "exec_summary": 84.38,
        "live_ast_acc": 69.88,
        "multi_turn_acc": 23.75,
        "relevance": 72.22,
        "irrelevance": 52.85,
        "organization": "Mistral AI",
        "license": "Proprietary",
        "provider": "mistral",
    },
    {
        "overall_acc": 58.42,
        "model": "ToolACE-8B",
        "type": "FC",
        "link": "https://huggingface.co/Team-ACE/ToolACE-8B",
        "cost": "N/A",
        "latency": 5.24,
        "ast_summary": 87.54,
        "exec_summary": 89.21,
        "live_ast_acc": 78.59,
        "multi_turn_acc": 7.75,
        "relevance": 83.33,
        "irrelevance": 87.88,
        "organization": "Huawei Noah & USTC",
        "license": "Apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 57.78,
        "model": "xLAM-8x22b-r",
        "type": "FC",
        "link": "https://huggingface.co/Salesforce/xLAM-8x22b-r",
        "cost": "N/A",
        "latency": 9.26,
        "ast_summary": 83.69,
        "exec_summary": 87.88,
        "live_ast_acc": 72.59,
        "multi_turn_acc": 16.25,
        "relevance": 88.89,
        "irrelevance": 67.81,
        "organization": "Salesforce",
        "license": "cc-by-nc-4.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 57.68,
        "model": "Qwen2.5-14B-Instruct",
        "type": "Prompt",
        "link": "https://huggingface.co/Qwen/Qwen2.5-14B-Instruct",
        "cost": "N/A",
        "latency": 2.02,
        "ast_summary": 85.69,
        "exec_summary": 88.84,
        "live_ast_acc": 74.14,
        "multi_turn_acc": 12.25,
        "relevance": 77.78,
        "irrelevance": 77.06,
        "organization": "Qwen",
        "license": "apache-2.0",
        "provider": "unknown",
    },
    {
        "overall_acc": 57.23,
        "model": "DeepSeek-V3",
        "type": "FC",
        "link": "https://api-docs.deepseek.com/news/news1226",
        "cost": "N/A",
        "latency": 2.58,
        "ast_summary": 89.17,
        "exec_summary": 83.39,
        "live_ast_acc": 68.41,
        "multi_turn_acc": 18.62,
        "relevance": 88.89,
        "irrelevance": 59.36,
        "organization": "DeepSeek",
        "license": "DeepSeek License",
        "provider": "unknown",
    },
    {
        "overall_acc": 57.09,
        "model": "Gemini-1.5-Flash-001",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/flash/",
        "cost": 0.48,
        "latency": 0.71,
        "ast_summary": 85.69,
        "exec_summary": 83.59,
        "live_ast_acc": 68.9,
        "multi_turn_acc": 19.5,
        "relevance": 83.33,
        "irrelevance": 62.78,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
    {
        "overall_acc": 56.79,
        "model": "Gemini-1.5-Flash-002",
        "type": "Prompt",
        "link": "https://deepmind.google/technologies/gemini/flash/",
        "cost": 0.46,
        "latency": 0.81,
        "ast_summary": 81.65,
        "exec_summary": 80.64,
        "live_ast_acc": 76.72,
        "multi_turn_acc": 12.5,
        "relevance": 83.33,
        "irrelevance": 78.49,
        "organization": "Google",
        "license": "Proprietary",
        "provider": "google",
    },
]

# Leaderboard entries whose provider is listed in VALID_PROVIDERS, reduced to
# the four keys kept below. Entries stay in leaderboard_data order, and "cost"
# is passed through unchanged (so it may still be the string "N/A").
supported_top_tool_models = [
    {
        "cost": item["cost"],
        "model": item["model"],
        "type": item["type"],
        "provider": item["provider"],
    }
    for item in leaderboard_data
    if item["provider"] in VALID_PROVIDERS
]