Skip to content

Instantly share code, notes, and snippets.

@EricGustin
Created November 26, 2025 01:49
Show Gist options
  • Select an option

  • Save EricGustin/8075a6a038627769e99e2524b53224c0 to your computer and use it in GitHub Desktop.

Select an option

Save EricGustin/8075a6a038627769e99e2524b53224c0 to your computer and use it in GitHub Desktop.
Evaluating the performance of Anthropic's tool search with 4000+ Arcade tools
"""
This script tests Anthropic's tool search functionality across multiple scenarios.
REQUIRED:
- pip install arcadepy
- pip install anthropic
- export ANTHROPIC_API_KEY=<your_anthropic_api_key>
- export ARCADE_API_KEY=<your_arcade_api_key>
"""
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from anthropic import Anthropic
from arcadepy import Arcade
@dataclass
class TestCase:
    """Defines a single test case for tool search evaluation."""

    # Human-readable label; printed when the case runs and in the summary.
    name: str
    # Conversation forwarded unchanged as ``messages`` to the Messages API.
    messages: list[dict]
    # Tool names that must ALL appear in the tool search result for the case
    # to count as a success.
    expected_tools: list[str]
@dataclass
class TestResult:
    """Results from running a test case."""

    # Label copied from the test case that produced this result.
    name: str
    # Tool names the test case required.
    expected_tools: list[str]
    # Tool names extracted from the tool search result in the response.
    found_tools: list[str]
    # True when every expected tool is present in ``found_tools``.
    success: bool
    # Raw object returned by the Anthropic client, kept for later inspection.
    response: object
class ToolSearchEvaluationSuite:
    """Evaluation suite for testing tool search functionality.

    The suite loads Anthropic-formatted tool definitions from Arcade for a
    list of toolkits, marks each definition with ``defer_loading`` and, for
    every test case, sends the conversation to the Anthropic Messages API
    together with the BM25 tool search tool. A test case succeeds when every
    expected tool name appears among the tool references of the first tool
    search result found in the response.
    """

    def __init__(
        self,
        server_names: list[str],
        arcade_client: Arcade,
        anthropic_client: Anthropic,
        max_workers: int = 20,
        model: str = "claude-sonnet-4-5-20250929",
        max_tokens: int = 2048,
    ):
        """Store the clients and configuration; no network calls are made here.

        Args:
            server_names: Arcade toolkit names whose tools should be loaded.
            arcade_client: Client used to list the tools of each toolkit.
            anthropic_client: Client used to run each test conversation.
            max_workers: Thread pool size used by ``load_tools``.
            model: Model identifier passed to the Messages API.
            max_tokens: ``max_tokens`` passed to the Messages API.
        """
        self.server_names = server_names
        self.arcade_client = arcade_client
        self.anthropic_client = anthropic_client
        self.max_workers = max_workers
        self.model = model
        self.max_tokens = max_tokens
        self.arcade_tools: list[dict] = []
        # The search tool is sent ahead of the deferred Arcade tools on every
        # request (see ``run_test_case``).
        self.search_tool = [
            {"type": "tool_search_tool_bm25_20251119", "name": "tool_search_tool_bm25"}
        ]

    def _fetch_server_tools(self, tk_name: str) -> tuple[str, list[dict], int]:
        """Fetch tools for a single server.

        Returns:
            ``(tk_name, tools, len(tools))`` where every tool definition has
            ``defer_loading`` set to ``True``.
        """
        tools = []
        tools_page_iter = self.arcade_client.tools.list(
            toolkit=tk_name, include_format="anthropic"
        )
        for tool in tools_page_iter:
            tool_def = tool.formatted_schema["anthropic"]
            tool_def["defer_loading"] = True
            tools.append(tool_def)
        return tk_name, tools, len(tools)

    def load_tools(self, verbose: bool = True) -> None:
        """Load all tools from the specified servers in parallel.

        Replaces ``self.arcade_tools`` with the tools of every toolkit in
        ``self.server_names``. Toolkits are appended in completion order, so
        the order of the resulting list can differ between runs. An exception
        raised while fetching any toolkit propagates to the caller.
        """
        self.arcade_tools = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all server fetches
            future_to_server = {
                executor.submit(self._fetch_server_tools, tk_name): tk_name
                for tk_name in self.server_names
            }
            # Process results as they complete
            for future in as_completed(future_to_server):
                tk_name, tools, num_tools = future.result()
                self.arcade_tools.extend(tools)
                if verbose:
                    print(f"{tk_name}: {num_tools} tools. Total tools: {len(self.arcade_tools)}")
        if verbose:
            print(f"\nTotal: {len(self.arcade_tools)} tools loaded\n")

    @staticmethod
    def _extract_found_tools(response: object) -> list[str]:
        """Return the tool names referenced by the first tool search result.

        Returns an empty list when the response has no
        ``tool_search_tool_result`` block, or when that block's content has
        no ``tool_references`` (presumably an error payload; confirm the
        exact error shape against the API reference).
        """
        for content_block in response.content:
            if getattr(content_block, "type", None) != "tool_search_tool_result":
                continue
            tool_references = getattr(content_block.content, "tool_references", None) or []
            return [ref.tool_name for ref in tool_references]
        return []

    @staticmethod
    def _missing_tools(expected_tools: list[str], found_tools: list[str]) -> list[str]:
        """Return the expected tool names absent from ``found_tools``, in order."""
        found = set(found_tools)
        return [tool for tool in expected_tools if tool not in found]

    def run_test_case(self, test_case: TestCase, verbose: bool = True) -> TestResult:
        """Run a single test case and return the results.

        Sends ``test_case.messages`` to the Messages API with the search tool
        plus every loaded Arcade tool, then compares the tools surfaced by
        tool search against ``test_case.expected_tools``.
        """
        if verbose:
            print(f"\n{'=' * 80}")
            print(f"Running test: {test_case.name}")
            print(f"{'=' * 80}")
        response = self.anthropic_client.beta.messages.create(
            model=self.model,
            betas=["advanced-tool-use-2025-11-20", "mcp-client-2025-11-20"],
            max_tokens=self.max_tokens,
            tools=self.search_tool + self.arcade_tools,
            messages=test_case.messages,
        )
        # Extract found tools from the response
        found_tools = self._extract_found_tools(response)
        # The case succeeds only if no expected tool is missing
        missing_tools = self._missing_tools(test_case.expected_tools, found_tools)
        success = not missing_tools
        if verbose:
            print(f"\nExpected tools: {test_case.expected_tools}")
            print(f"Found tools: {found_tools}")
            if success:
                print("✅ SUCCESS: All expected tools were found!")
            else:
                print(f"❌ FAILURE: Missing tools: {missing_tools}")
            print(f" Total tools returned: {len(found_tools)}")
        return TestResult(
            name=test_case.name,
            expected_tools=test_case.expected_tools,
            found_tools=found_tools,
            success=success,
            response=response,
        )

    def run_all_tests(self, test_cases: list[TestCase], verbose: bool = True) -> list[TestResult]:
        """Run all test cases sequentially and return the results in order."""
        results = [self.run_test_case(test_case, verbose=verbose) for test_case in test_cases]
        # Print summary
        if verbose:
            self._print_summary(results)
        return results

    def _print_summary(self, results: list[TestResult]) -> None:
        """Print a summary of all test results.

        Prints totals, the success rate and, for each failed test, the
        expected tools that were not found. An empty ``results`` list is
        reported as such instead of dividing by zero.
        """
        print(f"\n{'=' * 80}")
        print("SUMMARY")
        print(f"{'=' * 80}")
        passed = sum(1 for r in results if r.success)
        total = len(results)
        failed = total - passed
        print(f"\nTotal tests: {total}")
        print(f"Passed: {passed}")
        print(f"Failed: {failed}")
        if total:
            print(f"Success rate: {passed / total * 100:.1f}%")
        else:
            print("Success rate: n/a (no tests were run)")
        if failed > 0:
            print("\nFailed tests:")
            for result in results:
                if not result.success:
                    missing_tools = self._missing_tools(
                        result.expected_tools, result.found_tools
                    )
                    print(f" - {result.name}: Missing {missing_tools}")
# Arcade toolkit names whose tools are loaded for the evaluation. Each entry
# is passed as the ``toolkit`` argument when listing tools from Arcade.
SERVER_NAMES = [
    "Asana",
    "ClickUp",
    "Confluence",
    "Dropbox",
    "E2B",
    "Firecrawl",
    "Github",
    "Gmail",
    "GoogleCalendar",
    "GoogleContacts",
    "GoogleDocs",
    "GoogleDrive",
    "GoogleFinance",
    "GoogleFlights",
    "GoogleHotels",
    "GoogleJobs",
    "GoogleMaps",
    "GoogleNews",
    "GoogleSearch",
    "GoogleSheets",
    "GoogleShopping",
    "GoogleSlides",
    "HubSpot",
    "Imgflip",
    "Linear",
    "LinkedIn",
    "SharePoint",
    "MicrosoftTeams",
    "NotionToolkit",
    "OutlookCalendar",
    "OutlookMail",
    "Reddit",
    "Salesforce",
    "Slack",
    "Spotify",
    "Stripe",
    "Walmart",
    "X",
    "Youtube",
    "Zendesk",
    "Zoom",
    # Toolkits whose names carry the "Api" suffix
    "AsanaApi",
    "ArcadeEngineApi",
    "BoxApi",
    "CalendlyApi",
    "ClickupApi",
    "CursorAgentsApi",
    "CustomerioPipelinesApi",
    "CustomerioTrackApi",
    "FigmaApi",
    "GithubApi",
    "HubspotAutomationApi",
    "HubspotCmsApi",
    "HubspotConversationsApi",
    "HubspotCrmApi",
    "HubspotMarketingApi",
    "HubspotMeetingsApi",
    "HubspotUsersApi",
    "MiroApi",
    "SlackApi",
    "StripeApi",
    "TicktickApi",
    "TrelloApi",
    "VercelApi",
    "ZohobooksApi",
    "ZohocreatorApi",
]
# Define test cases
#
# Each case pairs a short conversation with the tool name(s) that tool search
# is expected to surface for it. E-mail addresses in the prompts are
# example.com placeholders.
TEST_CASES = [
    TestCase(
        name="Reddit - Get posts from subreddit",
        messages=[
            {
                "role": "user",
                "content": "I want to read some posts on a specific subreddit.",
            },
            {
                "role": "assistant",
                "content": "I'd be happy to help you with your task. Which subreddit are you interested in?",
            },
            {"role": "user", "content": "mcp"},
        ],
        expected_tools=["Reddit_GetPostsInSubreddit"],
    ),
    TestCase(
        name="Gmail - Send email",
        messages=[
            {
                "role": "user",
                "content": "I need to send an Email to my colleague about the project update.",
            },
        ],
        expected_tools=["Gmail_SendEmail"],
    ),
    TestCase(
        name="Google Calendar - Create event",
        messages=[
            {
                "role": "user",
                "content": "Schedule a meeting for 1:1 tomorrow at 2pm. Invite alex@example.com",
            },
        ],
        expected_tools=["GoogleCalendar_CreateEvent"],
    ),
    TestCase(
        name="Slack - Send message",
        messages=[
            {
                "role": "user",
                "content": "Post a message to the #general channel in Slack that says that I will be OOO tomorrow.",
            },
        ],
        expected_tools=["Slack_SendMessage"],
    ),
    TestCase(
        name="GitHub - Create issue",
        messages=[
            {
                "role": "user",
                "content": "Create a new issue in my GitHub repo arcadeai/arcade-mcp that says 'This isn't an issue, per se, but great work on this!'",
            },
        ],
        expected_tools=["Github_CreateIssue"],
    ),
    TestCase(
        name="Google Docs - Create blank document",
        messages=[
            {
                "role": "user",
                "content": "I need to write a document. Can you help me create a new Google Doc?",
            },
            {
                "role": "assistant",
                "content": "Of course! I can help you create a new Google Doc. What would you like to name it?",
            },
            {
                "role": "user",
                "content": "Call it 'Q1 Planning Notes'",
            },
        ],
        expected_tools=["GoogleDocs_CreateBlankDocument"],
    ),
    TestCase(
        name="Google Drive - Search for files",
        messages=[
            {
                "role": "user",
                "content": "Find all the PDF files in my Google Drive that contain the word 'invoice'",
            },
        ],
        expected_tools=["GoogleDrive_SearchFiles"],
    ),
    TestCase(
        name="Google Sheets - Create spreadsheet",
        messages=[
            {
                "role": "user",
                "content": "Create a new spreadsheet to track my expenses for this month.",
            },
        ],
        expected_tools=["GoogleSheets_CreateSpreadsheet"],
    ),
    TestCase(
        name="Notion - Create page",
        messages=[
            {
                "role": "user",
                "content": "I want to create a new page in Notion.",
            },
            {
                "role": "assistant",
                "content": "I can help with that! What would you like to title the page, and where should it be created?",
            },
            {
                "role": "user",
                "content": "Title it 'Meeting Notes - Nov 26' and have it be a child of the 'Meetings' page",
            },
        ],
        expected_tools=["NotionToolkit_CreatePage"],
    ),
    TestCase(
        # Name matches what the prompt and expected tool actually exercise.
        name="Linear - Get teams",
        messages=[
            {
                "role": "user",
                "content": "I'm a part of so many teams in Linear and it's hard to keep track of them all. Can you help me get a list of all my teams?",
            },
        ],
        expected_tools=["Linear_GetTeams"],
    ),
    TestCase(
        name="Asana - Create task",
        messages=[
            {
                "role": "user",
                "content": "Add a task in Asana to review the pull requests by Friday.",
            },
        ],
        expected_tools=["Asana_CreateTask"],
    ),
    TestCase(
        name="Spotify - Play music",
        messages=[
            {
                "role": "user",
                "content": "Play some music on Spotify.",
            },
            {
                "role": "assistant",
                "content": "I'd be happy to play music for you! What would you like to listen to?",
            },
            {
                "role": "user",
                "content": "Play the 'Helter Skelter' song by The Beatles",
            },
        ],
        expected_tools=["Spotify_PlayTrackByName"],
    ),
    TestCase(
        name="YouTube - Search videos",
        messages=[
            {
                "role": "user",
                "content": "Search YouTube for tutorials on Python async programming.",
            },
        ],
        expected_tools=["Youtube_SearchVideos"],
    ),
    TestCase(
        name="Stripe - Create customer",
        messages=[
            {
                "role": "user",
                "content": "Create a new customer in Stripe for customer@example.com",
            },
        ],
        expected_tools=["Stripe_CreateCustomer"],
    ),
    TestCase(
        name="HubSpot - Create contact",
        messages=[
            {
                "role": "user",
                "content": "I need to add a new contact to HubSpot.",
            },
            {
                "role": "assistant",
                "content": "Sure! I can help you add a contact to HubSpot. What are the contact details?",
            },
            {
                "role": "user",
                "content": "Name is Sarah Johnson, email is sarah.johnson@example.com",
            },
        ],
        expected_tools=["HubSpot_CreateContact"],
    ),
    TestCase(
        # The prompt asks to LIST tickets, so the case name and the expected
        # tool follow the prompt.
        # NOTE(review): confirm the exact tool name against the Arcade
        # Zendesk toolkit reference.
        name="Zendesk - List tickets",
        messages=[
            {
                "role": "user",
                "content": "List recent tickets that I have in Zendesk.",
            },
        ],
        expected_tools=["Zendesk_ListTickets"],
    ),
    TestCase(
        name="Confluence - Create page",
        messages=[
            {
                "role": "user",
                "content": "I want to document our new API endpoints in Confluence.",
            },
            {
                "role": "assistant",
                "content": "Great idea! I can help you create a Confluence page. What should the title be?",
            },
            {
                "role": "user",
                "content": "Call it 'API Documentation - v2.0.0'",
            },
        ],
        expected_tools=["Confluence_CreatePage"],
    ),
    TestCase(
        name="Microsoft Teams - Send message",
        messages=[
            {
                "role": "user",
                "content": "Send a message in the Engineering channel on Teams to remind everyone about standup.",
            },
        ],
        expected_tools=["MicrosoftTeams_SendMessageToChannel"],
    ),
    TestCase(
        # Name matches the drafting prompt and the expected draft tool.
        name="Outlook Mail - Create draft email",
        messages=[
            {
                "role": "user",
                "content": "Draft up an email in Outlook to the team about tomorrow's workshop.",
            },
        ],
        expected_tools=["OutlookMail_CreateDraftEmail"],
    ),
    TestCase(
        name="Dropbox - List files",
        messages=[
            {
                "role": "user",
                "content": "What files do I have in my Dropbox?",
            },
            {
                "role": "assistant",
                "content": "I can check your Dropbox for you. Would you like to see files in a specific folder or all files?",
            },
            {
                "role": "user",
                "content": "Show me what's in the 'Documents' folder",
            },
        ],
        expected_tools=["Dropbox_ListItemsInFolder"],
    ),
    TestCase(
        name="X (Twitter) - Post tweet",
        messages=[
            {
                "role": "user",
                "content": "Post a tweet saying 'Just finished an amazing project with the team! 🚀'",
            },
        ],
        expected_tools=["X_PostTweet"],
    ),
    TestCase(
        name="ClickUp - Create task",
        messages=[
            {
                "role": "user",
                "content": "Add a task in ClickUp to update the documentation.",
            },
        ],
        expected_tools=["ClickUp_CreateTask"],
    ),
    TestCase(
        name="Salesforce - Create contact",
        messages=[
            {
                "role": "user",
                "content": "I met a potential customer at a conference. Can you help me add them to Salesforce?",
            },
            {
                "role": "assistant",
                "content": "Absolutely! I can help you create a contact in Salesforce. What are their details?",
            },
            {
                "role": "user",
                "content": "Name is Michael Chen, company is TechCorp, email is michael.chen@example.com",
            },
        ],
        expected_tools=["Salesforce_CreateContact"],
    ),
    TestCase(
        name="Google Slides - Create presentation",
        messages=[
            {
                "role": "user",
                "content": "Create a new Google Slides presentation for my quarterly review. It should be a 5 slide presentation with a title slide, an overview slide, a section on the past quarter, a section on the current quarter, and a section on the future quarter.",
            },
        ],
        expected_tools=["GoogleSlides_CreatePresentation"],
    ),
    TestCase(
        name="Figma - Get file info",
        messages=[
            {
                "role": "user",
                "content": "I need information about a Figma design file.",
            },
            {
                "role": "assistant",
                "content": "I can help you get information about a Figma file. What's the file key or URL?",
            },
            {
                "role": "user",
                "content": "It's the file with key abc123def456",
            },
        ],
        expected_tools=["FigmaApi_FetchFigmaFile"],
    ),
]
def main() -> None:
    """Load the tools of every toolkit in SERVER_NAMES and run TEST_CASES.

    Progress and the final summary are printed to stdout.
    """
    # Initialize clients. Both are built without arguments; the module
    # docstring lists ANTHROPIC_API_KEY and ARCADE_API_KEY as required
    # environment variables.
    arcade_client = Arcade()
    anthropic_client = Anthropic()

    # Create evaluation suite
    suite = ToolSearchEvaluationSuite(
        server_names=SERVER_NAMES,
        arcade_client=arcade_client,
        anthropic_client=anthropic_client,
    )

    # Load tools once; the same tool list is reused by every test case
    print("Loading tools...")
    suite.load_tools(verbose=True)

    # Run all test cases
    suite.run_all_tests(TEST_CASES, verbose=True)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment