Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save evantahler/5c4cda9d412192de35f9702892fdd657 to your computer and use it in GitHub Desktop.

Select an option

Save evantahler/5c4cda9d412192de35f9702892fdd657 to your computer and use it in GitHub Desktop.
Evaluating the performance of Anthropic's tool search with 4000+ Arcade tools
"""
This script provides testing Anthropic's tool search functionality across multiple scenarios.
REQUIRED:
- pip install arcadepy
- pip install anthropic
- export ANTHROPIC_API_KEY=<your_anthropic_api_key>
- export ARCADE_API_KEY=<your_arcade_api_key>
"""
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from anthropic import Anthropic
from arcadepy import Arcade
@dataclass
class TestCase:
"""Defines a single test case for tool search evaluation."""
name: str
messages: list[dict]
expected_tools: list[str] # List of tool names that should be found
@dataclass
class TestResult:
"""Results from running a test case."""
name: str
expected_tools: list[str]
found_tools: list[str]
success: bool
response: object
class ToolSearchEvaluationSuite:
"""Evaluation suite for testing tool search functionality."""
def __init__(
self,
server_names: list[str],
arcade_client: Arcade,
anthropic_client: Anthropic,
max_workers: int = 20,
):
self.server_names = server_names
self.arcade_client = arcade_client
self.anthropic_client = anthropic_client
self.max_workers = max_workers
self.arcade_tools: list[dict] = []
self.search_tool = [
{"type": "tool_search_tool_bm25_20251119", "name": "tool_search_tool_bm25"}
]
def _fetch_server_tools(self, tk_name: str) -> tuple[str, list[dict], int]:
"""Fetch tools for a single server."""
tools = []
tools_page_iter = self.arcade_client.tools.list(toolkit=tk_name, include_format="anthropic")
for tool in tools_page_iter:
tool_def = tool.formatted_schema["anthropic"]
tool_def["defer_loading"] = True
tools.append(tool_def)
return tk_name, tools, len(tools)
def load_tools(self, verbose: bool = True) -> None:
"""Load all tools from the specified servers in parallel."""
self.arcade_tools = []
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
# Submit all server fetches
future_to_server = {
executor.submit(self._fetch_server_tools, tk_name): tk_name
for tk_name in self.server_names
}
# Process results as they complete
for future in as_completed(future_to_server):
tk_name, tools, num_tools = future.result()
self.arcade_tools.extend(tools)
if verbose:
print(f"{tk_name}: {num_tools} tools. Total tools: {len(self.arcade_tools)}")
if verbose:
print(f"\nTotal: {len(self.arcade_tools)} tools loaded\n")
def run_test_case(self, test_case: TestCase, verbose: bool = True) -> TestResult:
"""Run a single test case and return the results."""
if verbose:
print(f"\n{'=' * 80}")
print(f"Running test: {test_case.name}")
print(f"{'=' * 80}")
response = self.anthropic_client.beta.messages.create(
model="claude-sonnet-4-5-20250929",
betas=["advanced-tool-use-2025-11-20", "mcp-client-2025-11-20"],
max_tokens=2048,
tools=self.search_tool + self.arcade_tools,
messages=test_case.messages,
)
# Extract found tools from the response
found_tools = []
for content_block in response.content:
if hasattr(content_block, "type") and content_block.type == "tool_search_tool_result":
tool_references = content_block.content.tool_references
found_tools = [ref.tool_name for ref in tool_references]
break
# Check if all expected tools were found
success = all(tool in found_tools for tool in test_case.expected_tools)
if verbose:
print(f"\nExpected tools: {test_case.expected_tools}")
print(f"Found tools: {found_tools}")
if success:
print("✅ SUCCESS: All expected tools were found!")
else:
missing_tools = [
tool for tool in test_case.expected_tools if tool not in found_tools
]
print(f"❌ FAILURE: Missing tools: {missing_tools}")
print(f" Total tools returned: {len(found_tools)}")
return TestResult(
name=test_case.name,
expected_tools=test_case.expected_tools,
found_tools=found_tools,
success=success,
response=response,
)
def run_all_tests(self, test_cases: list[TestCase], verbose: bool = True) -> list[TestResult]:
"""Run all test cases and return the results."""
results = []
for test_case in test_cases:
result = self.run_test_case(test_case, verbose=verbose)
results.append(result)
# Print summary
if verbose:
self._print_summary(results)
return results
def _print_summary(self, results: list[TestResult]) -> None:
"""Print a summary of all test results."""
print(f"\n{'=' * 80}")
print("SUMMARY")
print(f"{'=' * 80}")
passed = sum(1 for r in results if r.success)
total = len(results)
print(f"\nTotal tests: {total}")
print(f"Passed: {passed}")
print(f"Failed: {total - passed}")
print(f"Success rate: {passed / total * 100:.1f}%")
if total - passed > 0:
print("\nFailed tests:")
for result in results:
if not result.success:
missing_tools = [
tool for tool in result.expected_tools if tool not in result.found_tools
]
print(f" - {result.name}: Missing {missing_tools}")
# Define the servers to load
SERVER_NAMES = [
"Asana",
"ClickUp",
"Confluence",
"Dropbox",
"E2B",
"Firecrawl",
"Github",
"Gmail",
"GoogleCalendar",
"GoogleContacts",
"GoogleDocs",
"GoogleDrive",
"GoogleFinance",
"GoogleFlights",
"GoogleHotels",
"GoogleJobs",
"GoogleMaps",
"GoogleNews",
"GoogleSearch",
"GoogleSheets",
"GoogleShopping",
"GoogleSlides",
"HubSpot",
"Imgflip",
"Linear",
"LinkedIn",
"SharePoint",
"MicrosoftTeams",
"NotionToolkit",
"OutlookCalendar",
"OutlookMail",
"Reddit",
"Salesforce",
"Slack",
"Spotify",
"Stripe",
"Walmart",
"X",
"Youtube",
"Zendesk",
"Zoom",
"AsanaApi",
"ArcadeEngineApi",
"BoxApi",
"CalendlyApi",
"ClickupApi",
"CursorAgentsApi",
"CustomerioPipelinesApi",
"CustomerioTrackApi",
"FigmaApi",
"GithubApi",
"HubspotAutomationApi",
"HubspotCmsApi",
"HubspotConversationsApi",
"HubspotCrmApi",
"HubspotMarketingApi",
"HubspotMeetingsApi",
"HubspotUsersApi",
"MiroApi",
"SlackApi",
"StripeApi",
"TicktickApi",
"TrelloApi",
"VercelApi",
"ZohobooksApi",
"ZohocreatorApi",
]
# Define test cases
TEST_CASES = [
TestCase(
name="Reddit - Get posts from subreddit",
messages=[
{
"role": "user",
"content": "I want to read some posts on a specific subreddit.",
},
{
"role": "assistant",
"content": "I'd be happy to help you with your task. Which subreddit are you interested in?",
},
{"role": "user", "content": "mcp"},
],
expected_tools=["Reddit_GetPostsInSubreddit"],
),
TestCase(
name="Gmail - Send email",
messages=[
{
"role": "user",
"content": "I need to send an Email to my colleague about the project update.",
},
],
expected_tools=["Gmail_SendEmail"],
),
TestCase(
name="Google Calendar - Create event",
messages=[
{
"role": "user",
"content": "Schedule a meeting for 1:1 tomorrow at 2pm. Invite [email protected]",
},
],
expected_tools=["GoogleCalendar_CreateEvent"],
),
TestCase(
name="Slack - Send message",
messages=[
{
"role": "user",
"content": "Post a message to the #general channel in Slack that says that I will be OOO tomorrow.",
},
],
expected_tools=["Slack_SendMessage"],
),
TestCase(
name="GitHub - Create issue",
messages=[
{
"role": "user",
"content": "Create a new issue in my GitHub repo arcadeai/arcade-mcp that says 'This isn't an issue, per se, but great work on this!'",
},
],
expected_tools=["Github_CreateIssue"],
),
TestCase(
name="Google Docs - Create blank document",
messages=[
{
"role": "user",
"content": "I need to write a document. Can you help me create a new Google Doc?",
},
{
"role": "assistant",
"content": "Of course! I can help you create a new Google Doc. What would you like to name it?",
},
{
"role": "user",
"content": "Call it 'Q1 Planning Notes'",
},
],
expected_tools=["GoogleDocs_CreateBlankDocument"],
),
TestCase(
name="Google Drive - Search for files",
messages=[
{
"role": "user",
"content": "Find all the PDF files in my Google Drive that contain the word 'invoice'",
},
],
expected_tools=["GoogleDrive_SearchFiles"],
),
TestCase(
name="Google Sheets - Create spreadsheet",
messages=[
{
"role": "user",
"content": "Create a new spreadsheet to track my expenses for this month.",
},
],
expected_tools=["GoogleSheets_CreateSpreadsheet"],
),
TestCase(
name="Notion - Create page",
messages=[
{
"role": "user",
"content": "I want to create a new page in Notion.",
},
{
"role": "assistant",
"content": "I can help with that! What would you like to title the page, and where should it be created?",
},
{
"role": "user",
"content": "Title it 'Meeting Notes - Nov 26' and have it be a child of the 'Meetings' page",
},
],
expected_tools=["NotionToolkit_CreatePage"],
),
TestCase(
name="Linear - Create issue",
messages=[
{
"role": "user",
"content": "I'm a part of so many teams in Linear and it's hard to keep track of them all. Can you help me get a list of all my teams?",
},
],
expected_tools=["Linear_GetTeams"],
),
TestCase(
name="Asana - Create task",
messages=[
{
"role": "user",
"content": "Add a task in Asana to review the pull requests by Friday.",
},
],
expected_tools=["Asana_CreateTask"],
),
TestCase(
name="Spotify - Play music",
messages=[
{
"role": "user",
"content": "Play some music on Spotify.",
},
{
"role": "assistant",
"content": "I'd be happy to play music for you! What would you like to listen to?",
},
{
"role": "user",
"content": "Play the 'Helter Skelter' song by The Beatles",
},
],
expected_tools=["Spotify_PlayTrackByName"],
),
TestCase(
name="YouTube - Search videos",
messages=[
{
"role": "user",
"content": "Search YouTube for tutorials on Python async programming.",
},
],
expected_tools=["Youtube_SearchVideos"],
),
TestCase(
name="Stripe - Create customer",
messages=[
{
"role": "user",
"content": "Create a new customer in Stripe for [email protected]",
},
],
expected_tools=["Stripe_CreateCustomer"],
),
TestCase(
name="HubSpot - Create contact",
messages=[
{
"role": "user",
"content": "I need to add a new contact to HubSpot.",
},
{
"role": "assistant",
"content": "Sure! I can help you add a contact to HubSpot. What are the contact details?",
},
{
"role": "user",
"content": "Name is Sarah Johnson, email is [email protected]",
},
],
expected_tools=["HubSpot_CreateContact"],
),
TestCase(
name="Zendesk - Create ticket",
messages=[
{
"role": "user",
"content": "List recent tickets that I have in Zendesk.",
},
],
expected_tools=["Zendesk_CreateTicket"],
),
TestCase(
name="Confluence - Create page",
messages=[
{
"role": "user",
"content": "I want to document our new API endpoints in Confluence.",
},
{
"role": "assistant",
"content": "Great idea! I can help you create a Confluence page. What should the title be?",
},
{
"role": "user",
"content": "Call it 'API Documentation - v2.0.0'",
},
],
expected_tools=["Confluence_CreatePage"],
),
TestCase(
name="Microsoft Teams - Send message",
messages=[
{
"role": "user",
"content": "Send a message in the Engineering channel on Teams to remind everyone about standup.",
},
],
expected_tools=["MicrosoftTeams_SendMessageToChannel"],
),
TestCase(
name="Outlook Mail - Send email",
messages=[
{
"role": "user",
"content": "Draft up an email in Outlook to the team about tomorrow's workshop.",
},
],
expected_tools=["OutlookMail_CreateDraftEmail"],
),
TestCase(
name="Dropbox - List files",
messages=[
{
"role": "user",
"content": "What files do I have in my Dropbox?",
},
{
"role": "assistant",
"content": "I can check your Dropbox for you. Would you like to see files in a specific folder or all files?",
},
{
"role": "user",
"content": "Show me what's in the 'Documents' folder",
},
],
expected_tools=["Dropbox_ListItemsInFolder"],
),
TestCase(
name="X (Twitter) - Post tweet",
messages=[
{
"role": "user",
"content": "Post a tweet saying 'Just finished an amazing project with the team! 🚀'",
},
],
expected_tools=["X_PostTweet"],
),
TestCase(
name="ClickUp - Create task",
messages=[
{
"role": "user",
"content": "Add a task in ClickUp to update the documentation.",
},
],
expected_tools=["ClickUp_CreateTask"],
),
TestCase(
name="Salesforce - Create contact",
messages=[
{
"role": "user",
"content": "I met a potential customer at a conference. Can you help me add them to Salesforce?",
},
{
"role": "assistant",
"content": "Absolutely! I can help you create a contact in Salesforce. What are their details?",
},
{
"role": "user",
"content": "Name is Michael Chen, company is TechCorp, email is [email protected]",
},
],
expected_tools=["Salesforce_CreateContact"],
),
TestCase(
name="Google Slides - Create presentation",
messages=[
{
"role": "user",
"content": "Create a new Google Slides presentation for my quarterly review. It should be a 5 slide presentation with a title slide, an overview slide, a section on the past quarter, a section on the current quarter, and a section on the future quarter.",
},
],
expected_tools=["GoogleSlides_CreatePresentation"],
),
TestCase(
name="Figma - Get file info",
messages=[
{
"role": "user",
"content": "I need information about a Figma design file.",
},
{
"role": "assistant",
"content": "I can help you get information about a Figma file. What's the file key or URL?",
},
{
"role": "user",
"content": "It's the file with key abc123def456",
},
],
expected_tools=["FigmaApi_FetchFigmaFile"],
),
]
if __name__ == "__main__":
# Initialize clients
arcade_client = Arcade()
anthropic_client = Anthropic()
# Create evaluation suite
suite = ToolSearchEvaluationSuite(
server_names=SERVER_NAMES,
arcade_client=arcade_client,
anthropic_client=anthropic_client,
)
# Load tools once
print("Loading tools...")
suite.load_tools(verbose=True)
# Run all test cases
results = suite.run_all_tests(TEST_CASES, verbose=True)
BM25 Results
Total: 4027 tools loaded
================================================================================
Running test: Reddit - Get posts from subreddit
================================================================================
Expected tools: ['Reddit_GetPostsInSubreddit']
Found tools: ['Reddit_SubmitTextPost', 'Reddit_GetPostsInSubreddit', 'Reddit_CheckSubredditAccess', 'Reddit_GetSubredditRules', 'Reddit_GetContentOfPost']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Gmail - Send email
================================================================================
Expected tools: ['Gmail_SendEmail']
Found tools: ['OutlookMail_SendDraftEmail', 'ZohoBooksApi_SendInvitationEmail', 'ZohoBooksApi_SendInvoiceReminders', 'HubspotMarketingApi_SendTransactionalEmail', 'Slack_SendMessage']
❌ FAILURE: Missing tools: ['Gmail_SendEmail']
Total tools returned: 5
================================================================================
Running test: Google Calendar - Create event
================================================================================
Expected tools: ['GoogleCalendar_CreateEvent']
Found tools: ['GoogleCalendar_CreateEvent', 'GoogleCalendar_UpdateEvent', 'OutlookCalendar_CreateEvent', 'GoogleCalendar_DeleteEvent', 'HubspotMeetingsApi_ScheduleMeetingHubspot']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Slack - Send message
================================================================================
Expected tools: ['Slack_SendMessage']
Found tools: ['SlackApi_ScheduleSlackMessage', 'SlackApi_GetSlackMessagePermalink', 'SlackApi_CustomUnfurlSlackUrls', 'SlackApi_AddSlackReaction', 'SlackApi_SendEphemeralMessageSlack']
❌ FAILURE: Missing tools: ['Slack_SendMessage']
Total tools returned: 5
================================================================================
Running test: GitHub - Create issue
================================================================================
Expected tools: ['Github_CreateIssue']
Found tools: ['GithubApi_GithubCreateIssueComment', 'GithubApi_ListGithubIssuesForRepo', 'GithubApi_ListGithubIssues', 'GithubApi_DeleteGithubIssueReaction', 'Github_CreateIssue']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Google Docs - Create blank document
================================================================================
Expected tools: ['GoogleDocs_CreateBlankDocument']
Found tools: ['GoogleDocs_CreateBlankDocument', 'GoogleDocs_CreateDocumentFromText', 'GoogleDocs_GetDocumentById', 'GoogleDocs_InsertTextAtEndOfDocument', 'GoogleDocs_GetDocumentMetadata']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Google Drive - Search for files
================================================================================
Expected tools: ['GoogleDrive_SearchFiles']
Found tools: ['GoogleDrive_SearchFiles', 'Sharepoint_SearchDriveItems', 'GoogleDrive_GetFileTreeStructure', 'GoogleSheets_GenerateGoogleFilePickerUrl', 'GoogleDrive_GenerateGoogleFilePickerUrl']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Google Sheets - Create spreadsheet
================================================================================
Expected tools: ['GoogleSheets_CreateSpreadsheet']
Found tools: ['GoogleSheets_CreateSpreadsheet', 'GoogleSheets_GetSpreadsheetMetadata', 'GoogleSheets_SearchSpreadsheets', 'GoogleSheets_GetSpreadsheet', 'GoogleSheets_WriteToCell']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Notion - Create page
================================================================================
Expected tools: ['NotionToolkit_CreatePage']
Found tools: ['NotionToolkit_CreatePage', 'NotionToolkit_GetPageContentById', 'NotionToolkit_GetPageContentByTitle', 'NotionToolkit_AppendContentToEndOfPage', 'NotionToolkit_GetObjectMetadata']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Linear - Create issue
================================================================================
Expected tools: ['Linear_GetTeams']
Found tools: ['Linear_GetTeams', 'MicrosoftTeams_GetTeam', 'GithubApi_ListChildTeams', 'MicrosoftTeams_GetChannelMessageReplies', 'GithubApi_ListTeamProjectsInOrg']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Asana - Create task
================================================================================
Expected tools: ['Asana_CreateTask']
Found tools: ['AsanaApi_CreateAsanaTask', 'AsanaApi_DeleteAsanaTask', 'AsanaApi_DuplicateAsanaTask', 'AsanaApi_DeleteAsanaStory', 'AsanaApi_AddTaskDependencies']
❌ FAILURE: Missing tools: ['Asana_CreateTask']
Total tools returned: 5
================================================================================
Running test: Spotify - Play music
================================================================================
Expected tools: ['Spotify_PlayTrackByName']
Found tools: ['Spotify_StartTracksPlaybackById', 'Spotify_PlayTrackByName', 'Spotify_PlayArtistByName', 'Spotify_GetTrackFromId', 'Spotify_PausePlayback']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: YouTube - Search videos
================================================================================
Expected tools: ['Youtube_SearchVideos']
Found tools: ['Youtube_SearchForVideos', 'Youtube_GetYoutubeVideoDetails', 'Dropbox_SearchFilesAndFolders', 'GoogleDrive_SearchFiles', 'TrelloApi_UpdateSavedSearch']
❌ FAILURE: Missing tools: ['Youtube_SearchVideos']
Total tools returned: 5
================================================================================
Running test: Stripe - Create customer
================================================================================
Expected tools: ['Stripe_CreateCustomer']
Found tools: ['Stripe_CreateInvoice', 'Stripe_CreateInvoiceItem', 'Stripe_CreateCustomer', 'StripeApi_DeleteStripeCustomer', 'StripeApi_SearchStripeCustomers']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: HubSpot - Create contact
================================================================================
Expected tools: ['HubSpot_CreateContact']
Found tools: ['HubspotCrmApi_CreateHubspotContact', 'HubspotCrmApi_CreateCrmObject', 'HubspotMarketingApi_CreateSmtpApiToken', 'HubspotCrmApi_CreateHubspotCrmProperty', 'ZohoBooksApi_CreateContactPerson']
❌ FAILURE: Missing tools: ['HubSpot_CreateContact']
Total tools returned: 5
================================================================================
Running test: Zendesk - Create ticket
================================================================================
Expected tools: ['Zendesk_CreateTicket']
Found tools: ['Zendesk_MarkTicketSolved', 'Zendesk_AddTicketComment', 'Zendesk_GetTicketComments', 'Zendesk_ListTickets', 'Zendesk_WhoAmI']
❌ FAILURE: Missing tools: ['Zendesk_CreateTicket']
Total tools returned: 5
================================================================================
Running test: Confluence - Create page
================================================================================
Expected tools: ['Confluence_CreatePage']
Found tools: ['Confluence_GetSpaceHierarchy', 'Confluence_SearchContent', 'GoogleDocs_CreateDocumentFromText', 'GoogleDocs_CreateBlankDocument', 'Confluence_WhoAmI']
❌ FAILURE: Missing tools: ['Confluence_CreatePage']
Total tools returned: 5
================================================================================
Running test: Microsoft Teams - Send message
================================================================================
Expected tools: ['MicrosoftTeams_SendMessageToChannel']
Found tools: ['MicrosoftTeams_SendMessageToChannel', 'MicrosoftTeams_ReplyToChannelMessage', 'Slack_SendMessage', 'SlackApi_SendEphemeralMessageSlack', 'MicrosoftTeams_GetChannelMessages']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Outlook Mail - Send email
================================================================================
Expected tools: ['OutlookMail_CreateDraftEmail']
Found tools: ['OutlookMail_CreateDraftEmail', 'Gmail_WriteDraftEmail', 'Gmail_WriteDraftReplyEmail', 'OutlookMail_CreateAndSendEmail', 'OutlookMail_WhoAmI']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Dropbox - List files
================================================================================
Expected tools: ['Dropbox_ListItemsInFolder']
Found tools: ['Dropbox_SearchFilesAndFolders', 'Dropbox_ListItemsInFolder', 'GoogleDrive_GetFileTreeStructure', 'GoogleDrive_SearchFiles', 'FigmaApi_FetchFigmaFile']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: X (Twitter) - Post tweet
================================================================================
Expected tools: ['X_PostTweet']
Found tools: ['X_LookupTweetById', 'X_DeleteTweetById', 'X_PostTweet', 'X_ReplyToTweet', 'X_SearchRecentTweetsByUsername']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: ClickUp - Create task
================================================================================
Expected tools: ['ClickUp_CreateTask']
Found tools: ['Clickup_CreateTaskComment', 'ClickupApi_AddTaskToClickupList', 'ClickupApi_CreateFolderInSpace', 'Clickup_CreateTask', 'ClickupApi_CreateNewClickupTask']
❌ FAILURE: Missing tools: ['ClickUp_CreateTask']
Total tools returned: 5
================================================================================
Running test: Salesforce - Create contact
================================================================================
Expected tools: ['Salesforce_CreateContact']
Found tools: ['Salesforce_CreateContact', 'Salesforce_GetAccountDataByKeywords', 'Salesforce_GetAccountDataById', 'ZohoBooksApi_CreateContactPerson', 'Hubspot_CreateContact']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Google Slides - Create presentation
================================================================================
Expected tools: ['GoogleSlides_CreatePresentation']
Found tools: ['GoogleSlides_CreatePresentation', 'GoogleSlides_CommentOnPresentation', 'GoogleSlides_CreateSlide', 'GoogleSlides_GetPresentationAsMarkdown', 'GoogleSlides_ListPresentationComments']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Figma - Get file info
================================================================================
Expected tools: ['FigmaApi_FetchFigmaFile']
Found tools: ['FigmaApi_GetPublishedComponentSets', 'FigmaApi_GetStyleMetadata', 'FigmaApi_GetFileMetadata', 'FigmaApi_FetchImageFillLinks', 'FigmaApi_GetFigmaFileNodes']
❌ FAILURE: Missing tools: ['FigmaApi_FetchFigmaFile']
Total tools returned: 5
================================================================================
SUMMARY
================================================================================
Total tests: 25
Passed: 16
Failed: 9
Success rate: 64.0%
Failed tests:
- Gmail - Send email: Missing ['Gmail_SendEmail']
- Slack - Send message: Missing ['Slack_SendMessage']
- Asana - Create task: Missing ['Asana_CreateTask']
- YouTube - Search videos: Missing ['Youtube_SearchVideos']
- HubSpot - Create contact: Missing ['HubSpot_CreateContact']
- Zendesk - Create ticket: Missing ['Zendesk_CreateTicket']
- Confluence - Create page: Missing ['Confluence_CreatePage']
- ClickUp - Create task: Missing ['ClickUp_CreateTask']
- Figma - Get file info: Missing ['FigmaApi_FetchFigmaFile']
Regex Results
​​Total: 4027 tools loaded
================================================================================
Running test: Reddit - Get posts from subreddit
================================================================================
Expected tools: ['Reddit_GetPostsInSubreddit']
Found tools: ['Firecrawl_CrawlWebsite', 'Linkedin_CreateTextPost', 'Reddit_CheckSubredditAccess', 'Reddit_CommentOnPost', 'Reddit_GetContentOfMultiplePosts']
❌ FAILURE: Missing tools: ['Reddit_GetPostsInSubreddit']
Total tools returned: 5
================================================================================
Running test: Gmail - Send email
================================================================================
Expected tools: ['Gmail_SendEmail']
Found tools: ['GoogleContacts_CreateContact', 'GoogleContacts_SearchContactsByEmail', 'GoogleContacts_WhoAmI', 'GoogleDrive_WhoAmI', 'GoogleCalendar_CreateEvent']
❌ FAILURE: Missing tools: ['Gmail_SendEmail']
Total tools returned: 5
================================================================================
Running test: Google Calendar - Create event
================================================================================
Expected tools: ['GoogleCalendar_CreateEvent']
Found tools: ['GoogleCalendar_CreateEvent', 'GoogleCalendar_DeleteEvent', 'GoogleCalendar_FindTimeSlotsWhenEveryoneIsFree', 'GoogleCalendar_ListCalendars', 'GoogleCalendar_ListEvents', 'GoogleCalendar_UpdateEvent', 'GoogleCalendar_WhoAmI', 'Github_SubmitPullRequestReview', 'OutlookCalendar_CreateEvent', 'OutlookCalendar_GetEvent']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Slack - Send message
================================================================================
Expected tools: ['Slack_SendMessage']
Found tools: ['Slack_GetConversationMetadata', 'Slack_GetMessages', 'Slack_GetUsersInfo', 'Slack_ListConversations', 'Slack_ListUsers']
❌ FAILURE: Missing tools: ['Slack_SendMessage']
Total tools returned: 5
================================================================================
Running test: GitHub - Create issue
================================================================================
Expected tools: ['Github_CreateIssue']
Found tools: ['Github_CreateIssue', 'Github_CreateIssueComment', 'Github_ListIssues', 'GithubApi_AddLabelsToGithubIssue', 'GithubApi_CreateGithubIssue']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Google Docs - Create blank document
================================================================================
Expected tools: ['GoogleDocs_CreateBlankDocument']
Found tools: ['GoogleDrive_GenerateGoogleFilePickerUrl', 'GoogleDrive_GetFileTreeStructure', 'GoogleDrive_SearchFiles', 'GoogleDocs_CommentOnDocument', 'GoogleDocs_CreateBlankDocument']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Google Drive - Search for files
================================================================================
Expected tools: ['GoogleDrive_SearchFiles']
Found tools: ['GoogleDrive_GenerateGoogleFilePickerUrl', 'GoogleDrive_SearchFiles', 'GoogleDocs_GenerateGoogleFilePickerUrl', 'GoogleDocs_SearchAndRetrieveDocuments', 'GoogleDocs_SearchDocuments']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Google Sheets - Create spreadsheet
================================================================================
Expected tools: ['GoogleSheets_CreateSpreadsheet']
Found tools: ['GoogleSheets_CreateSpreadsheet', 'GoogleSheets_SearchSpreadsheets', 'GoogleSheets_UpdateCells']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Notion - Create page
================================================================================
Expected tools: ['NotionToolkit_CreatePage']
Found tools: ['NotionToolkit_CreatePage']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Linear - Create issue
================================================================================
Expected tools: ['Linear_GetTeams']
Found tools: []
❌ FAILURE: Missing tools: ['Linear_GetTeams']
Total tools returned: 0
================================================================================
Running test: Asana - Create task
================================================================================
Expected tools: ['Asana_CreateTask']
Found tools: ['Asana_AttachFileToTask', 'Asana_CreateTask', 'Asana_GetSubtasksFromATask', 'Asana_GetTaskById', 'Asana_GetTasksWithoutId']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Spotify - Play music
================================================================================
Expected tools: ['Spotify_PlayTrackByName']
Found tools: ['Spotify_AdjustPlaybackPosition', 'Spotify_GetCurrentlyPlaying', 'Spotify_GetPlaybackState', 'Spotify_PausePlayback', 'Spotify_PlayArtistByName', 'Spotify_PlayTrackByName', 'Spotify_ResumePlayback', 'Spotify_Search', 'Spotify_StartTracksPlaybackById']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: YouTube - Search videos
================================================================================
Expected tools: ['Youtube_SearchVideos']
Found tools: ['Youtube_GetYoutubeVideoDetails', 'Youtube_SearchForVideos']
❌ FAILURE: Missing tools: ['Youtube_SearchVideos']
Total tools returned: 2
================================================================================
Running test: Stripe - Create customer
================================================================================
Expected tools: ['Stripe_CreateCustomer']
Found tools: ['Stripe_CreateBillingPortalSession', 'Stripe_CreateCustomer', 'Stripe_CreateInvoice', 'Stripe_CreateInvoiceItem', 'StripeApi_SearchStripeCustomers']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: HubSpot - Create contact
================================================================================
Expected tools: ['HubSpot_CreateContact']
Found tools: ['Hubspot_CreateContact', 'Hubspot_CreateNoteActivity', 'HubspotAutomationApi_EnrollContactInSequence', 'HubspotUsersApi_CreateHubspotUser', 'HubspotMarketingApi_CreateSmtpApiToken']
❌ FAILURE: Missing tools: ['HubSpot_CreateContact']
Total tools returned: 5
================================================================================
Running test: Zendesk - Create ticket
================================================================================
Expected tools: ['Zendesk_CreateTicket']
Found tools: ['Zendesk_AddTicketComment', 'Zendesk_GetTicketComments', 'Zendesk_ListTickets', 'Zendesk_MarkTicketSolved']
❌ FAILURE: Missing tools: ['Zendesk_CreateTicket']
Total tools returned: 4
================================================================================
Running test: Confluence - Create page
================================================================================
Expected tools: ['Confluence_CreatePage']
Found tools: ['Confluence_CreatePage', 'Confluence_ListPages']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Microsoft Teams - Send message
================================================================================
Expected tools: ['MicrosoftTeams_SendMessageToChannel']
Found tools: ['MicrosoftTeams_GetChannelMessageReplies', 'MicrosoftTeams_GetChannelMessages', 'MicrosoftTeams_GetChannelMetadata', 'MicrosoftTeams_GetChatMessageById', 'MicrosoftTeams_GetChatMessages']
❌ FAILURE: Missing tools: ['MicrosoftTeams_SendMessageToChannel']
Total tools returned: 5
================================================================================
Running test: Outlook Mail - Send email
================================================================================
Expected tools: ['OutlookMail_CreateDraftEmail']
Found tools: ['GoogleCalendar_CreateEvent', 'Gmail_DeleteDraftEmail', 'Gmail_ListDraftEmails', 'Gmail_ListEmailsByHeader', 'Gmail_ReplyToEmail']
❌ FAILURE: Missing tools: ['OutlookMail_CreateDraftEmail']
Total tools returned: 5
================================================================================
Running test: Dropbox - List files
================================================================================
Expected tools: ['Dropbox_ListItemsInFolder']
Found tools: ['Dropbox_ListItemsInFolder', 'Dropbox_SearchFilesAndFolders']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: X (Twitter) - Post tweet
================================================================================
Expected tools: ['X_PostTweet']
Found tools: ['Firecrawl_CrawlWebsite', 'Linkedin_CreateTextPost', 'Reddit_CommentOnPost', 'Reddit_GetContentOfMultiplePosts', 'Reddit_GetContentOfPost']
❌ FAILURE: Missing tools: ['X_PostTweet']
Total tools returned: 5
================================================================================
Running test: ClickUp - Create task
================================================================================
Expected tools: ['ClickUp_CreateTask']
Found tools: ['Clickup_CreateTask', 'Clickup_CreateTaskComment', 'Clickup_CreateTaskCommentReply', 'Clickup_FuzzySearchTasksByName', 'Clickup_GetListsForFolder', 'Clickup_GetListsForSpace', 'Clickup_GetStatusesForList', 'Clickup_GetTaskById', 'Clickup_GetTaskCommentReplies', 'Clickup_GetTaskComments']
❌ FAILURE: Missing tools: ['ClickUp_CreateTask']
Total tools returned: 10
================================================================================
Running test: Salesforce - Create contact
================================================================================
Expected tools: ['Salesforce_CreateContact']
Found tools: ['Salesforce_CreateContact']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Google Slides - Create presentation
================================================================================
Expected tools: ['GoogleSlides_CreatePresentation']
Found tools: ['GoogleDrive_SearchFiles', 'GoogleSlides_CommentOnPresentation', 'GoogleSlides_CreatePresentation', 'GoogleSlides_CreateSlide', 'GoogleSlides_GenerateGoogleFilePickerUrl', 'GoogleSlides_GetPresentationAsMarkdown', 'GoogleSlides_ListPresentationComments', 'GoogleSlides_SearchPresentations', 'GoogleSlides_WhoAmI']
✅ SUCCESS: All expected tools were found!
================================================================================
Running test: Figma - Get file info
================================================================================
Expected tools: ['FigmaApi_FetchFigmaFile']
Found tools: ['FigmaApi_AddCommentToFigmaFile', 'FigmaApi_AddFigmaCommentReaction', 'FigmaApi_BulkUpdateFigmaDevResources', 'FigmaApi_CreateBulkDevResources', 'FigmaApi_DeleteDevResource', 'FigmaApi_DeleteFigmaComment', 'FigmaApi_DeleteMyCommentReaction', 'FigmaApi_FetchCommentReactions', 'FigmaApi_FetchComponentUsageData', 'FigmaApi_FetchFigmaFile']
✅ SUCCESS: All expected tools were found!
================================================================================
SUMMARY
================================================================================
Total tests: 25
Passed: 14
Failed: 11
Success rate: 56.0%
Failed tests:
- Reddit - Get posts from subreddit: Missing ['Reddit_GetPostsInSubreddit']
- Gmail - Send email: Missing ['Gmail_SendEmail']
- Slack - Send message: Missing ['Slack_SendMessage']
- Linear - Create issue: Missing ['Linear_GetTeams']
- YouTube - Search videos: Missing ['Youtube_SearchVideos']
- HubSpot - Create contact: Missing ['HubSpot_CreateContact']
- Zendesk - Create ticket: Missing ['Zendesk_CreateTicket']
- Microsoft Teams - Send message: Missing ['MicrosoftTeams_SendMessageToChannel']
- Outlook Mail - Send email: Missing ['OutlookMail_CreateDraftEmail']
- X (Twitter) - Post tweet: Missing ['X_PostTweet']
- ClickUp - Create task: Missing ['ClickUp_CreateTask']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment