Forked from EricGustin/anthropic_tool_search_beta_evals.py
Last active
December 2, 2025 02:26
-
-
Save evantahler/5c4cda9d412192de35f9702892fdd657 to your computer and use it in GitHub Desktop.
Evaluating the performance of Anthropic's tool search with 4000+ Arcade tools
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| This script evaluates Anthropic's tool search functionality across multiple scenarios. | |
| REQUIRED: | |
| - pip install arcadepy | |
| - pip install anthropic | |
| - export ANTHROPIC_API_KEY=<your_anthropic_api_key> | |
| - export ARCADE_API_KEY=<your_arcade_api_key> | |
| """ | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| from dataclasses import dataclass | |
| from anthropic import Anthropic | |
| from arcadepy import Arcade | |
@dataclass
class TestCase:
    """One tool-search scenario: a conversation plus the tools it should surface."""

    # Label printed in the per-test banner and in the summary.
    name: str
    # Conversation turns passed as ``messages`` to the Anthropic request.
    messages: list[dict]
    # Tool names that must all be present in the search result for a pass.
    expected_tools: list[str]
@dataclass
class TestResult:
    """Outcome of one evaluated test case."""

    # Label copied from the test case that produced this result.
    name: str
    # Tool names the test case required.
    expected_tools: list[str]
    # Tool names extracted from the tool-search result in the response.
    found_tools: list[str]
    # True when every expected tool is among the found tools.
    success: bool
    # The raw response object returned by the Anthropic client.
    response: object
class ToolSearchEvaluationSuite:
    """Evaluation suite for testing tool search functionality.

    Typical use: construct the suite, call ``load_tools()`` once, then call
    ``run_all_tests()`` (or ``run_test_case()``) with the scenarios to check.
    """

    def __init__(
        self,
        server_names: list[str],
        arcade_client: Arcade,
        anthropic_client: Anthropic,
        max_workers: int = 20,
    ):
        """Store clients and configuration; no requests are made here.

        Args:
            server_names: Toolkit names passed to the Arcade tool listing.
            arcade_client: Client used to list tool definitions.
            anthropic_client: Client used to send the evaluation requests.
            max_workers: Thread-pool size used by ``load_tools``.
        """
        self.server_names = server_names
        self.arcade_client = arcade_client
        self.anthropic_client = anthropic_client
        self.max_workers = max_workers
        self.arcade_tools: list[dict] = []
        # Search tool definition that is sent ahead of the loaded Arcade tools.
        self.search_tool = [
            {"type": "tool_search_tool_bm25_20251119", "name": "tool_search_tool_bm25"}
        ]

    def _fetch_server_tools(self, tk_name: str) -> tuple[str, list[dict], int]:
        """Fetch tools for a single server.

        Returns:
            ``(tk_name, tools, len(tools))`` where every tool definition has
            ``defer_loading`` set to ``True``.
        """
        tools = []
        tools_page_iter = self.arcade_client.tools.list(toolkit=tk_name, include_format="anthropic")
        for tool in tools_page_iter:
            tool_def = tool.formatted_schema["anthropic"]
            tool_def["defer_loading"] = True
            tools.append(tool_def)
        return tk_name, tools, len(tools)

    def load_tools(self, verbose: bool = True) -> None:
        """Load all tools from the specified servers in parallel.

        Replaces ``self.arcade_tools``. Results are appended in completion
        order, so the order of tools may differ between runs. An exception
        raised by any fetch propagates to the caller.
        """
        self.arcade_tools = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all server fetches
            future_to_server = {
                executor.submit(self._fetch_server_tools, tk_name): tk_name
                for tk_name in self.server_names
            }
            # Process results as they complete
            for future in as_completed(future_to_server):
                tk_name, tools, num_tools = future.result()
                self.arcade_tools.extend(tools)
                if verbose:
                    print(f"{tk_name}: {num_tools} tools. Total tools: {len(self.arcade_tools)}")
        if verbose:
            print(f"\nTotal: {len(self.arcade_tools)} tools loaded\n")

    @staticmethod
    def _missing_tools(expected_tools: list[str], found_tools: list[str]) -> list[str]:
        """Return the expected tool names absent from ``found_tools``, in order."""
        return [tool for tool in expected_tools if tool not in found_tools]

    @staticmethod
    def _extract_found_tools(response: object) -> list[str]:
        """Return the tool names referenced by the first usable search result.

        Scans ``response.content`` for blocks whose ``type`` is
        ``"tool_search_tool_result"``. A result block whose content has no
        ``tool_references`` attribute is skipped instead of raising, so the
        next result block (if any) is used. Returns ``[]`` when no search
        result with references is present.
        """
        for content_block in response.content:
            if getattr(content_block, "type", None) != "tool_search_tool_result":
                continue
            tool_references = getattr(content_block.content, "tool_references", None)
            if tool_references is None:
                # NOTE(review): presumably an error payload; confirm against the API docs.
                continue
            return [ref.tool_name for ref in tool_references]
        return []

    def run_test_case(self, test_case: TestCase, verbose: bool = True) -> TestResult:
        """Run a single test case and return the results.

        Sends the test case's messages together with the search tool and all
        loaded Arcade tools, then checks that every expected tool appears in
        the tool-search result.

        Args:
            test_case: Scenario to evaluate.
            verbose: When true, print the banner, expected/found tools and verdict.
        """
        if verbose:
            print(f"\n{'=' * 80}")
            print(f"Running test: {test_case.name}")
            print(f"{'=' * 80}")
        response = self.anthropic_client.beta.messages.create(
            model="claude-sonnet-4-5-20250929",
            betas=["advanced-tool-use-2025-11-20", "mcp-client-2025-11-20"],
            max_tokens=2048,
            tools=self.search_tool + self.arcade_tools,
            messages=test_case.messages,
        )
        # Extract found tools from the response
        found_tools = self._extract_found_tools(response)
        # The test passes only if no expected tool is missing
        missing_tools = self._missing_tools(test_case.expected_tools, found_tools)
        success = not missing_tools
        if verbose:
            print(f"\nExpected tools: {test_case.expected_tools}")
            print(f"Found tools: {found_tools}")
            if success:
                print("✅ SUCCESS: All expected tools were found!")
            else:
                print(f"❌ FAILURE: Missing tools: {missing_tools}")
                print(f" Total tools returned: {len(found_tools)}")
        return TestResult(
            name=test_case.name,
            expected_tools=test_case.expected_tools,
            found_tools=found_tools,
            success=success,
            response=response,
        )

    def run_all_tests(self, test_cases: list[TestCase], verbose: bool = True) -> list[TestResult]:
        """Run all test cases sequentially and return the results in input order."""
        results = [self.run_test_case(test_case, verbose=verbose) for test_case in test_cases]
        # Print summary
        if verbose:
            self._print_summary(results)
        return results

    def _print_summary(self, results: list[TestResult]) -> None:
        """Print a summary of all test results.

        An empty ``results`` list reports a 0.0% success rate rather than
        dividing by zero.
        """
        print(f"\n{'=' * 80}")
        print("SUMMARY")
        print(f"{'=' * 80}")
        passed = sum(1 for r in results if r.success)
        total = len(results)
        success_rate = passed / total * 100 if total else 0.0
        print(f"\nTotal tests: {total}")
        print(f"Passed: {passed}")
        print(f"Failed: {total - passed}")
        print(f"Success rate: {success_rate:.1f}%")
        if total - passed > 0:
            print("\nFailed tests:")
            for result in results:
                if not result.success:
                    missing_tools = self._missing_tools(result.expected_tools, result.found_tools)
                    print(f" - {result.name}: Missing {missing_tools}")
# Toolkit names handed to the evaluation suite; each one is passed as the
# ``toolkit`` argument when listing tools from Arcade.
SERVER_NAMES = [
    # Toolkits without an "Api" suffix.
    "Asana", "ClickUp", "Confluence", "Dropbox", "E2B", "Firecrawl",
    "Github", "Gmail",
    "GoogleCalendar", "GoogleContacts", "GoogleDocs", "GoogleDrive",
    "GoogleFinance", "GoogleFlights", "GoogleHotels", "GoogleJobs",
    "GoogleMaps", "GoogleNews", "GoogleSearch", "GoogleSheets",
    "GoogleShopping", "GoogleSlides",
    "HubSpot", "Imgflip", "Linear", "LinkedIn", "SharePoint",
    "MicrosoftTeams", "NotionToolkit", "OutlookCalendar", "OutlookMail",
    "Reddit", "Salesforce", "Slack", "Spotify", "Stripe", "Walmart",
    "X", "Youtube", "Zendesk", "Zoom",
    # Toolkits whose names end in "Api".
    "AsanaApi", "ArcadeEngineApi", "BoxApi", "CalendlyApi", "ClickupApi",
    "CursorAgentsApi", "CustomerioPipelinesApi", "CustomerioTrackApi",
    "FigmaApi", "GithubApi",
    "HubspotAutomationApi", "HubspotCmsApi", "HubspotConversationsApi",
    "HubspotCrmApi", "HubspotMarketingApi", "HubspotMeetingsApi",
    "HubspotUsersApi",
    "MiroApi", "SlackApi", "StripeApi", "TicktickApi", "TrelloApi",
    "VercelApi", "ZohobooksApi", "ZohocreatorApi",
]
# Define test cases.
#
# Each case pairs a short conversation with the tool name(s) the search is
# expected to return. Expected names must match the loaded tool names exactly
# (the check is a case-sensitive membership test).
#
# NOTE(review): the e-mail addresses in the prompts below are example.com
# placeholders; the addresses in the available copy of this file had been
# redacted to "[email protected]" and could not be recovered.
TEST_CASES = [
    TestCase(
        name="Reddit - Get posts from subreddit",
        messages=[
            {
                "role": "user",
                "content": "I want to read some posts on a specific subreddit.",
            },
            {
                "role": "assistant",
                "content": "I'd be happy to help you with your task. Which subreddit are you interested in?",
            },
            {"role": "user", "content": "mcp"},
        ],
        expected_tools=["Reddit_GetPostsInSubreddit"],
    ),
    TestCase(
        name="Gmail - Send email",
        messages=[
            {
                "role": "user",
                "content": "I need to send an Email to my colleague about the project update.",
            },
        ],
        expected_tools=["Gmail_SendEmail"],
    ),
    TestCase(
        name="Google Calendar - Create event",
        messages=[
            {
                "role": "user",
                "content": "Schedule a meeting for 1:1 tomorrow at 2pm. Invite teammate@example.com",
            },
        ],
        expected_tools=["GoogleCalendar_CreateEvent"],
    ),
    TestCase(
        name="Slack - Send message",
        messages=[
            {
                "role": "user",
                "content": "Post a message to the #general channel in Slack that says that I will be OOO tomorrow.",
            },
        ],
        expected_tools=["Slack_SendMessage"],
    ),
    TestCase(
        name="GitHub - Create issue",
        messages=[
            {
                "role": "user",
                "content": "Create a new issue in my GitHub repo arcadeai/arcade-mcp that says 'This isn't an issue, per se, but great work on this!'",
            },
        ],
        expected_tools=["Github_CreateIssue"],
    ),
    TestCase(
        name="Google Docs - Create blank document",
        messages=[
            {
                "role": "user",
                "content": "I need to write a document. Can you help me create a new Google Doc?",
            },
            {
                "role": "assistant",
                "content": "Of course! I can help you create a new Google Doc. What would you like to name it?",
            },
            {
                "role": "user",
                "content": "Call it 'Q1 Planning Notes'",
            },
        ],
        expected_tools=["GoogleDocs_CreateBlankDocument"],
    ),
    TestCase(
        name="Google Drive - Search for files",
        messages=[
            {
                "role": "user",
                "content": "Find all the PDF files in my Google Drive that contain the word 'invoice'",
            },
        ],
        expected_tools=["GoogleDrive_SearchFiles"],
    ),
    TestCase(
        name="Google Sheets - Create spreadsheet",
        messages=[
            {
                "role": "user",
                "content": "Create a new spreadsheet to track my expenses for this month.",
            },
        ],
        expected_tools=["GoogleSheets_CreateSpreadsheet"],
    ),
    TestCase(
        name="Notion - Create page",
        messages=[
            {
                "role": "user",
                "content": "I want to create a new page in Notion.",
            },
            {
                "role": "assistant",
                "content": "I can help with that! What would you like to title the page, and where should it be created?",
            },
            {
                "role": "user",
                "content": "Title it 'Meeting Notes - Nov 26' and have it be a child of the 'Meetings' page",
            },
        ],
        expected_tools=["NotionToolkit_CreatePage"],
    ),
    TestCase(
        # Label fixed: the prompt and expected tool are about listing teams.
        name="Linear - Get teams",
        messages=[
            {
                "role": "user",
                "content": "I'm a part of so many teams in Linear and it's hard to keep track of them all. Can you help me get a list of all my teams?",
            },
        ],
        expected_tools=["Linear_GetTeams"],
    ),
    TestCase(
        name="Asana - Create task",
        messages=[
            {
                "role": "user",
                "content": "Add a task in Asana to review the pull requests by Friday.",
            },
        ],
        expected_tools=["Asana_CreateTask"],
    ),
    TestCase(
        name="Spotify - Play music",
        messages=[
            {
                "role": "user",
                "content": "Play some music on Spotify.",
            },
            {
                "role": "assistant",
                "content": "I'd be happy to play music for you! What would you like to listen to?",
            },
            {
                "role": "user",
                "content": "Play the 'Helter Skelter' song by The Beatles",
            },
        ],
        expected_tools=["Spotify_PlayTrackByName"],
    ),
    TestCase(
        name="YouTube - Search videos",
        messages=[
            {
                "role": "user",
                "content": "Search YouTube for tutorials on Python async programming.",
            },
        ],
        # The loaded tool is named Youtube_SearchForVideos (not Youtube_SearchVideos).
        expected_tools=["Youtube_SearchForVideos"],
    ),
    TestCase(
        name="Stripe - Create customer",
        messages=[
            {
                "role": "user",
                "content": "Create a new customer in Stripe for customer@example.com",
            },
        ],
        expected_tools=["Stripe_CreateCustomer"],
    ),
    TestCase(
        name="HubSpot - Create contact",
        messages=[
            {
                "role": "user",
                "content": "I need to add a new contact to HubSpot.",
            },
            {
                "role": "assistant",
                "content": "Sure! I can help you add a contact to HubSpot. What are the contact details?",
            },
            {
                "role": "user",
                "content": "Name is Sarah Johnson, email is sarah.johnson@example.com",
            },
        ],
        # Tool names use the "Hubspot_" prefix (lowercase "s").
        expected_tools=["Hubspot_CreateContact"],
    ),
    TestCase(
        # Label and expectation fixed: the prompt asks to list tickets, not create one.
        name="Zendesk - List tickets",
        messages=[
            {
                "role": "user",
                "content": "List recent tickets that I have in Zendesk.",
            },
        ],
        expected_tools=["Zendesk_ListTickets"],
    ),
    TestCase(
        name="Confluence - Create page",
        messages=[
            {
                "role": "user",
                "content": "I want to document our new API endpoints in Confluence.",
            },
            {
                "role": "assistant",
                "content": "Great idea! I can help you create a Confluence page. What should the title be?",
            },
            {
                "role": "user",
                "content": "Call it 'API Documentation - v2.0.0'",
            },
        ],
        expected_tools=["Confluence_CreatePage"],
    ),
    TestCase(
        name="Microsoft Teams - Send message",
        messages=[
            {
                "role": "user",
                "content": "Send a message in the Engineering channel on Teams to remind everyone about standup.",
            },
        ],
        expected_tools=["MicrosoftTeams_SendMessageToChannel"],
    ),
    TestCase(
        # Label fixed: the prompt asks for a draft and the expected tool creates one.
        name="Outlook Mail - Create draft email",
        messages=[
            {
                "role": "user",
                "content": "Draft up an email in Outlook to the team about tomorrow's workshop.",
            },
        ],
        expected_tools=["OutlookMail_CreateDraftEmail"],
    ),
    TestCase(
        name="Dropbox - List files",
        messages=[
            {
                "role": "user",
                "content": "What files do I have in my Dropbox?",
            },
            {
                "role": "assistant",
                "content": "I can check your Dropbox for you. Would you like to see files in a specific folder or all files?",
            },
            {
                "role": "user",
                "content": "Show me what's in the 'Documents' folder",
            },
        ],
        expected_tools=["Dropbox_ListItemsInFolder"],
    ),
    TestCase(
        name="X (Twitter) - Post tweet",
        messages=[
            {
                "role": "user",
                "content": "Post a tweet saying 'Just finished an amazing project with the team! 🚀'",
            },
        ],
        expected_tools=["X_PostTweet"],
    ),
    TestCase(
        name="ClickUp - Create task",
        messages=[
            {
                "role": "user",
                "content": "Add a task in ClickUp to update the documentation.",
            },
        ],
        # Tool names use the "Clickup_" prefix (lowercase "u").
        expected_tools=["Clickup_CreateTask"],
    ),
    TestCase(
        name="Salesforce - Create contact",
        messages=[
            {
                "role": "user",
                "content": "I met a potential customer at a conference. Can you help me add them to Salesforce?",
            },
            {
                "role": "assistant",
                "content": "Absolutely! I can help you create a contact in Salesforce. What are their details?",
            },
            {
                "role": "user",
                "content": "Name is Michael Chen, company is TechCorp, email is michael.chen@example.com",
            },
        ],
        expected_tools=["Salesforce_CreateContact"],
    ),
    TestCase(
        name="Google Slides - Create presentation",
        messages=[
            {
                "role": "user",
                "content": "Create a new Google Slides presentation for my quarterly review. It should be a 5 slide presentation with a title slide, an overview slide, a section on the past quarter, a section on the current quarter, and a section on the future quarter.",
            },
        ],
        expected_tools=["GoogleSlides_CreatePresentation"],
    ),
    TestCase(
        name="Figma - Get file info",
        messages=[
            {
                "role": "user",
                "content": "I need information about a Figma design file.",
            },
            {
                "role": "assistant",
                "content": "I can help you get information about a Figma file. What's the file key or URL?",
            },
            {
                "role": "user",
                "content": "It's the file with key abc123def456",
            },
        ],
        expected_tools=["FigmaApi_FetchFigmaFile"],
    ),
]
if __name__ == "__main__":
    # Build the suite; both clients are created with their default constructors
    # (the module docstring lists the API-key environment variables to export).
    suite = ToolSearchEvaluationSuite(
        server_names=SERVER_NAMES,
        arcade_client=Arcade(),
        anthropic_client=Anthropic(),
    )

    # Tools are loaded a single time and reused by every test case.
    print("Loading tools...")
    suite.load_tools(verbose=True)

    # Evaluate every scenario; the summary is printed because verbose is on.
    results = suite.run_all_tests(TEST_CASES, verbose=True)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| BM25 Results | |
| Total: 4027 tools loaded | |
| ================================================================================ | |
| Running test: Reddit - Get posts from subreddit | |
| ================================================================================ | |
| Expected tools: ['Reddit_GetPostsInSubreddit'] | |
| Found tools: ['Reddit_SubmitTextPost', 'Reddit_GetPostsInSubreddit', 'Reddit_CheckSubredditAccess', 'Reddit_GetSubredditRules', 'Reddit_GetContentOfPost'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Gmail - Send email | |
| ================================================================================ | |
| Expected tools: ['Gmail_SendEmail'] | |
| Found tools: ['OutlookMail_SendDraftEmail', 'ZohoBooksApi_SendInvitationEmail', 'ZohoBooksApi_SendInvoiceReminders', 'HubspotMarketingApi_SendTransactionalEmail', 'Slack_SendMessage'] | |
| ❌ FAILURE: Missing tools: ['Gmail_SendEmail'] | |
| Total tools returned: 5 | |
| ================================================================================ | |
| Running test: Google Calendar - Create event | |
| ================================================================================ | |
| Expected tools: ['GoogleCalendar_CreateEvent'] | |
| Found tools: ['GoogleCalendar_CreateEvent', 'GoogleCalendar_UpdateEvent', 'OutlookCalendar_CreateEvent', 'GoogleCalendar_DeleteEvent', 'HubspotMeetingsApi_ScheduleMeetingHubspot'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Slack - Send message | |
| ================================================================================ | |
| Expected tools: ['Slack_SendMessage'] | |
| Found tools: ['SlackApi_ScheduleSlackMessage', 'SlackApi_GetSlackMessagePermalink', 'SlackApi_CustomUnfurlSlackUrls', 'SlackApi_AddSlackReaction', 'SlackApi_SendEphemeralMessageSlack'] | |
| ❌ FAILURE: Missing tools: ['Slack_SendMessage'] | |
| Total tools returned: 5 | |
| ================================================================================ | |
| Running test: GitHub - Create issue | |
| ================================================================================ | |
| Expected tools: ['Github_CreateIssue'] | |
| Found tools: ['GithubApi_GithubCreateIssueComment', 'GithubApi_ListGithubIssuesForRepo', 'GithubApi_ListGithubIssues', 'GithubApi_DeleteGithubIssueReaction', 'Github_CreateIssue'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Google Docs - Create blank document | |
| ================================================================================ | |
| Expected tools: ['GoogleDocs_CreateBlankDocument'] | |
| Found tools: ['GoogleDocs_CreateBlankDocument', 'GoogleDocs_CreateDocumentFromText', 'GoogleDocs_GetDocumentById', 'GoogleDocs_InsertTextAtEndOfDocument', 'GoogleDocs_GetDocumentMetadata'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Google Drive - Search for files | |
| ================================================================================ | |
| Expected tools: ['GoogleDrive_SearchFiles'] | |
| Found tools: ['GoogleDrive_SearchFiles', 'Sharepoint_SearchDriveItems', 'GoogleDrive_GetFileTreeStructure', 'GoogleSheets_GenerateGoogleFilePickerUrl', 'GoogleDrive_GenerateGoogleFilePickerUrl'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Google Sheets - Create spreadsheet | |
| ================================================================================ | |
| Expected tools: ['GoogleSheets_CreateSpreadsheet'] | |
| Found tools: ['GoogleSheets_CreateSpreadsheet', 'GoogleSheets_GetSpreadsheetMetadata', 'GoogleSheets_SearchSpreadsheets', 'GoogleSheets_GetSpreadsheet', 'GoogleSheets_WriteToCell'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Notion - Create page | |
| ================================================================================ | |
| Expected tools: ['NotionToolkit_CreatePage'] | |
| Found tools: ['NotionToolkit_CreatePage', 'NotionToolkit_GetPageContentById', 'NotionToolkit_GetPageContentByTitle', 'NotionToolkit_AppendContentToEndOfPage', 'NotionToolkit_GetObjectMetadata'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Linear - Create issue | |
| ================================================================================ | |
| Expected tools: ['Linear_GetTeams'] | |
| Found tools: ['Linear_GetTeams', 'MicrosoftTeams_GetTeam', 'GithubApi_ListChildTeams', 'MicrosoftTeams_GetChannelMessageReplies', 'GithubApi_ListTeamProjectsInOrg'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Asana - Create task | |
| ================================================================================ | |
| Expected tools: ['Asana_CreateTask'] | |
| Found tools: ['AsanaApi_CreateAsanaTask', 'AsanaApi_DeleteAsanaTask', 'AsanaApi_DuplicateAsanaTask', 'AsanaApi_DeleteAsanaStory', 'AsanaApi_AddTaskDependencies'] | |
| ❌ FAILURE: Missing tools: ['Asana_CreateTask'] | |
| Total tools returned: 5 | |
| ================================================================================ | |
| Running test: Spotify - Play music | |
| ================================================================================ | |
| Expected tools: ['Spotify_PlayTrackByName'] | |
| Found tools: ['Spotify_StartTracksPlaybackById', 'Spotify_PlayTrackByName', 'Spotify_PlayArtistByName', 'Spotify_GetTrackFromId', 'Spotify_PausePlayback'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: YouTube - Search videos | |
| ================================================================================ | |
| Expected tools: ['Youtube_SearchVideos'] | |
| Found tools: ['Youtube_SearchForVideos', 'Youtube_GetYoutubeVideoDetails', 'Dropbox_SearchFilesAndFolders', 'GoogleDrive_SearchFiles', 'TrelloApi_UpdateSavedSearch'] | |
| ❌ FAILURE: Missing tools: ['Youtube_SearchVideos'] | |
| Total tools returned: 5 | |
| ================================================================================ | |
| Running test: Stripe - Create customer | |
| ================================================================================ | |
| Expected tools: ['Stripe_CreateCustomer'] | |
| Found tools: ['Stripe_CreateInvoice', 'Stripe_CreateInvoiceItem', 'Stripe_CreateCustomer', 'StripeApi_DeleteStripeCustomer', 'StripeApi_SearchStripeCustomers'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: HubSpot - Create contact | |
| ================================================================================ | |
| Expected tools: ['HubSpot_CreateContact'] | |
| Found tools: ['HubspotCrmApi_CreateHubspotContact', 'HubspotCrmApi_CreateCrmObject', 'HubspotMarketingApi_CreateSmtpApiToken', 'HubspotCrmApi_CreateHubspotCrmProperty', 'ZohoBooksApi_CreateContactPerson'] | |
| ❌ FAILURE: Missing tools: ['HubSpot_CreateContact'] | |
| Total tools returned: 5 | |
| ================================================================================ | |
| Running test: Zendesk - Create ticket | |
| ================================================================================ | |
| Expected tools: ['Zendesk_CreateTicket'] | |
| Found tools: ['Zendesk_MarkTicketSolved', 'Zendesk_AddTicketComment', 'Zendesk_GetTicketComments', 'Zendesk_ListTickets', 'Zendesk_WhoAmI'] | |
| ❌ FAILURE: Missing tools: ['Zendesk_CreateTicket'] | |
| Total tools returned: 5 | |
| ================================================================================ | |
| Running test: Confluence - Create page | |
| ================================================================================ | |
| Expected tools: ['Confluence_CreatePage'] | |
| Found tools: ['Confluence_GetSpaceHierarchy', 'Confluence_SearchContent', 'GoogleDocs_CreateDocumentFromText', 'GoogleDocs_CreateBlankDocument', 'Confluence_WhoAmI'] | |
| ❌ FAILURE: Missing tools: ['Confluence_CreatePage'] | |
| Total tools returned: 5 | |
| ================================================================================ | |
| Running test: Microsoft Teams - Send message | |
| ================================================================================ | |
| Expected tools: ['MicrosoftTeams_SendMessageToChannel'] | |
| Found tools: ['MicrosoftTeams_SendMessageToChannel', 'MicrosoftTeams_ReplyToChannelMessage', 'Slack_SendMessage', 'SlackApi_SendEphemeralMessageSlack', 'MicrosoftTeams_GetChannelMessages'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Outlook Mail - Send email | |
| ================================================================================ | |
| Expected tools: ['OutlookMail_CreateDraftEmail'] | |
| Found tools: ['OutlookMail_CreateDraftEmail', 'Gmail_WriteDraftEmail', 'Gmail_WriteDraftReplyEmail', 'OutlookMail_CreateAndSendEmail', 'OutlookMail_WhoAmI'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Dropbox - List files | |
| ================================================================================ | |
| Expected tools: ['Dropbox_ListItemsInFolder'] | |
| Found tools: ['Dropbox_SearchFilesAndFolders', 'Dropbox_ListItemsInFolder', 'GoogleDrive_GetFileTreeStructure', 'GoogleDrive_SearchFiles', 'FigmaApi_FetchFigmaFile'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: X (Twitter) - Post tweet | |
| ================================================================================ | |
| Expected tools: ['X_PostTweet'] | |
| Found tools: ['X_LookupTweetById', 'X_DeleteTweetById', 'X_PostTweet', 'X_ReplyToTweet', 'X_SearchRecentTweetsByUsername'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: ClickUp - Create task | |
| ================================================================================ | |
| Expected tools: ['ClickUp_CreateTask'] | |
| Found tools: ['Clickup_CreateTaskComment', 'ClickupApi_AddTaskToClickupList', 'ClickupApi_CreateFolderInSpace', 'Clickup_CreateTask', 'ClickupApi_CreateNewClickupTask'] | |
| ❌ FAILURE: Missing tools: ['ClickUp_CreateTask'] | |
| Total tools returned: 5 | |
| ================================================================================ | |
| Running test: Salesforce - Create contact | |
| ================================================================================ | |
| Expected tools: ['Salesforce_CreateContact'] | |
| Found tools: ['Salesforce_CreateContact', 'Salesforce_GetAccountDataByKeywords', 'Salesforce_GetAccountDataById', 'ZohoBooksApi_CreateContactPerson', 'Hubspot_CreateContact'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Google Slides - Create presentation | |
| ================================================================================ | |
| Expected tools: ['GoogleSlides_CreatePresentation'] | |
| Found tools: ['GoogleSlides_CreatePresentation', 'GoogleSlides_CommentOnPresentation', 'GoogleSlides_CreateSlide', 'GoogleSlides_GetPresentationAsMarkdown', 'GoogleSlides_ListPresentationComments'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Figma - Get file info | |
| ================================================================================ | |
| Expected tools: ['FigmaApi_FetchFigmaFile'] | |
| Found tools: ['FigmaApi_GetPublishedComponentSets', 'FigmaApi_GetStyleMetadata', 'FigmaApi_GetFileMetadata', 'FigmaApi_FetchImageFillLinks', 'FigmaApi_GetFigmaFileNodes'] | |
| ❌ FAILURE: Missing tools: ['FigmaApi_FetchFigmaFile'] | |
| Total tools returned: 5 | |
| ================================================================================ | |
| SUMMARY | |
| ================================================================================ | |
| Total tests: 25 | |
| Passed: 16 | |
| Failed: 9 | |
| Success rate: 64.0% | |
| Failed tests: | |
| - Gmail - Send email: Missing ['Gmail_SendEmail'] | |
| - Slack - Send message: Missing ['Slack_SendMessage'] | |
| - Asana - Create task: Missing ['Asana_CreateTask'] | |
| - YouTube - Search videos: Missing ['Youtube_SearchVideos'] | |
| - HubSpot - Create contact: Missing ['HubSpot_CreateContact'] | |
| - Zendesk - Create ticket: Missing ['Zendesk_CreateTicket'] | |
| - Confluence - Create page: Missing ['Confluence_CreatePage'] | |
| - ClickUp - Create task: Missing ['ClickUp_CreateTask'] | |
| - Figma - Get file info: Missing ['FigmaApi_FetchFigmaFile'] | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Regex Results | |
| Total: 4027 tools loaded | |
| ================================================================================ | |
| Running test: Reddit - Get posts from subreddit | |
| ================================================================================ | |
| Expected tools: ['Reddit_GetPostsInSubreddit'] | |
| Found tools: ['Firecrawl_CrawlWebsite', 'Linkedin_CreateTextPost', 'Reddit_CheckSubredditAccess', 'Reddit_CommentOnPost', 'Reddit_GetContentOfMultiplePosts'] | |
| ❌ FAILURE: Missing tools: ['Reddit_GetPostsInSubreddit'] | |
| Total tools returned: 5 | |
| ================================================================================ | |
| Running test: Gmail - Send email | |
| ================================================================================ | |
| Expected tools: ['Gmail_SendEmail'] | |
| Found tools: ['GoogleContacts_CreateContact', 'GoogleContacts_SearchContactsByEmail', 'GoogleContacts_WhoAmI', 'GoogleDrive_WhoAmI', 'GoogleCalendar_CreateEvent'] | |
| ❌ FAILURE: Missing tools: ['Gmail_SendEmail'] | |
| Total tools returned: 5 | |
| ================================================================================ | |
| Running test: Google Calendar - Create event | |
| ================================================================================ | |
| Expected tools: ['GoogleCalendar_CreateEvent'] | |
| Found tools: ['GoogleCalendar_CreateEvent', 'GoogleCalendar_DeleteEvent', 'GoogleCalendar_FindTimeSlotsWhenEveryoneIsFree', 'GoogleCalendar_ListCalendars', 'GoogleCalendar_ListEvents', 'GoogleCalendar_UpdateEvent', 'GoogleCalendar_WhoAmI', 'Github_SubmitPullRequestReview', 'OutlookCalendar_CreateEvent', 'OutlookCalendar_GetEvent'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Slack - Send message | |
| ================================================================================ | |
| Expected tools: ['Slack_SendMessage'] | |
| Found tools: ['Slack_GetConversationMetadata', 'Slack_GetMessages', 'Slack_GetUsersInfo', 'Slack_ListConversations', 'Slack_ListUsers'] | |
| ❌ FAILURE: Missing tools: ['Slack_SendMessage'] | |
| Total tools returned: 5 | |
| ================================================================================ | |
| Running test: GitHub - Create issue | |
| ================================================================================ | |
| Expected tools: ['Github_CreateIssue'] | |
| Found tools: ['Github_CreateIssue', 'Github_CreateIssueComment', 'Github_ListIssues', 'GithubApi_AddLabelsToGithubIssue', 'GithubApi_CreateGithubIssue'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Google Docs - Create blank document | |
| ================================================================================ | |
| Expected tools: ['GoogleDocs_CreateBlankDocument'] | |
| Found tools: ['GoogleDrive_GenerateGoogleFilePickerUrl', 'GoogleDrive_GetFileTreeStructure', 'GoogleDrive_SearchFiles', 'GoogleDocs_CommentOnDocument', 'GoogleDocs_CreateBlankDocument'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Google Drive - Search for files | |
| ================================================================================ | |
| Expected tools: ['GoogleDrive_SearchFiles'] | |
| Found tools: ['GoogleDrive_GenerateGoogleFilePickerUrl', 'GoogleDrive_SearchFiles', 'GoogleDocs_GenerateGoogleFilePickerUrl', 'GoogleDocs_SearchAndRetrieveDocuments', 'GoogleDocs_SearchDocuments'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Google Sheets - Create spreadsheet | |
| ================================================================================ | |
| Expected tools: ['GoogleSheets_CreateSpreadsheet'] | |
| Found tools: ['GoogleSheets_CreateSpreadsheet', 'GoogleSheets_SearchSpreadsheets', 'GoogleSheets_UpdateCells'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Notion - Create page | |
| ================================================================================ | |
| Expected tools: ['NotionToolkit_CreatePage'] | |
| Found tools: ['NotionToolkit_CreatePage'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Linear - Create issue | |
| ================================================================================ | |
| Expected tools: ['Linear_GetTeams'] | |
| Found tools: [] | |
| ❌ FAILURE: Missing tools: ['Linear_GetTeams'] | |
| Total tools returned: 0 | |
| ================================================================================ | |
| Running test: Asana - Create task | |
| ================================================================================ | |
| Expected tools: ['Asana_CreateTask'] | |
| Found tools: ['Asana_AttachFileToTask', 'Asana_CreateTask', 'Asana_GetSubtasksFromATask', 'Asana_GetTaskById', 'Asana_GetTasksWithoutId'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Spotify - Play music | |
| ================================================================================ | |
| Expected tools: ['Spotify_PlayTrackByName'] | |
| Found tools: ['Spotify_AdjustPlaybackPosition', 'Spotify_GetCurrentlyPlaying', 'Spotify_GetPlaybackState', 'Spotify_PausePlayback', 'Spotify_PlayArtistByName', 'Spotify_PlayTrackByName', 'Spotify_ResumePlayback', 'Spotify_Search', 'Spotify_StartTracksPlaybackById'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: YouTube - Search videos | |
| ================================================================================ | |
| Expected tools: ['Youtube_SearchVideos'] | |
| Found tools: ['Youtube_GetYoutubeVideoDetails', 'Youtube_SearchForVideos'] | |
| ❌ FAILURE: Missing tools: ['Youtube_SearchVideos'] | |
| Total tools returned: 2 | |
| ================================================================================ | |
| Running test: Stripe - Create customer | |
| ================================================================================ | |
| Expected tools: ['Stripe_CreateCustomer'] | |
| Found tools: ['Stripe_CreateBillingPortalSession', 'Stripe_CreateCustomer', 'Stripe_CreateInvoice', 'Stripe_CreateInvoiceItem', 'StripeApi_SearchStripeCustomers'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: HubSpot - Create contact | |
| ================================================================================ | |
| Expected tools: ['HubSpot_CreateContact'] | |
| Found tools: ['Hubspot_CreateContact', 'Hubspot_CreateNoteActivity', 'HubspotAutomationApi_EnrollContactInSequence', 'HubspotUsersApi_CreateHubspotUser', 'HubspotMarketingApi_CreateSmtpApiToken'] | |
| ❌ FAILURE: Missing tools: ['HubSpot_CreateContact'] | |
| Total tools returned: 5 | |
| ================================================================================ | |
| Running test: Zendesk - Create ticket | |
| ================================================================================ | |
| Expected tools: ['Zendesk_CreateTicket'] | |
| Found tools: ['Zendesk_AddTicketComment', 'Zendesk_GetTicketComments', 'Zendesk_ListTickets', 'Zendesk_MarkTicketSolved'] | |
| ❌ FAILURE: Missing tools: ['Zendesk_CreateTicket'] | |
| Total tools returned: 4 | |
| ================================================================================ | |
| Running test: Confluence - Create page | |
| ================================================================================ | |
| Expected tools: ['Confluence_CreatePage'] | |
| Found tools: ['Confluence_CreatePage', 'Confluence_ListPages'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Microsoft Teams - Send message | |
| ================================================================================ | |
| Expected tools: ['MicrosoftTeams_SendMessageToChannel'] | |
| Found tools: ['MicrosoftTeams_GetChannelMessageReplies', 'MicrosoftTeams_GetChannelMessages', 'MicrosoftTeams_GetChannelMetadata', 'MicrosoftTeams_GetChatMessageById', 'MicrosoftTeams_GetChatMessages'] | |
| ❌ FAILURE: Missing tools: ['MicrosoftTeams_SendMessageToChannel'] | |
| Total tools returned: 5 | |
| ================================================================================ | |
| Running test: Outlook Mail - Send email | |
| ================================================================================ | |
| Expected tools: ['OutlookMail_CreateDraftEmail'] | |
| Found tools: ['GoogleCalendar_CreateEvent', 'Gmail_DeleteDraftEmail', 'Gmail_ListDraftEmails', 'Gmail_ListEmailsByHeader', 'Gmail_ReplyToEmail'] | |
| ❌ FAILURE: Missing tools: ['OutlookMail_CreateDraftEmail'] | |
| Total tools returned: 5 | |
| ================================================================================ | |
| Running test: Dropbox - List files | |
| ================================================================================ | |
| Expected tools: ['Dropbox_ListItemsInFolder'] | |
| Found tools: ['Dropbox_ListItemsInFolder', 'Dropbox_SearchFilesAndFolders'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: X (Twitter) - Post tweet | |
| ================================================================================ | |
| Expected tools: ['X_PostTweet'] | |
| Found tools: ['Firecrawl_CrawlWebsite', 'Linkedin_CreateTextPost', 'Reddit_CommentOnPost', 'Reddit_GetContentOfMultiplePosts', 'Reddit_GetContentOfPost'] | |
| ❌ FAILURE: Missing tools: ['X_PostTweet'] | |
| Total tools returned: 5 | |
| ================================================================================ | |
| Running test: ClickUp - Create task | |
| ================================================================================ | |
| Expected tools: ['ClickUp_CreateTask'] | |
| Found tools: ['Clickup_CreateTask', 'Clickup_CreateTaskComment', 'Clickup_CreateTaskCommentReply', 'Clickup_FuzzySearchTasksByName', 'Clickup_GetListsForFolder', 'Clickup_GetListsForSpace', 'Clickup_GetStatusesForList', 'Clickup_GetTaskById', 'Clickup_GetTaskCommentReplies', 'Clickup_GetTaskComments'] | |
| ❌ FAILURE: Missing tools: ['ClickUp_CreateTask'] | |
| Total tools returned: 10 | |
| ================================================================================ | |
| Running test: Salesforce - Create contact | |
| ================================================================================ | |
| Expected tools: ['Salesforce_CreateContact'] | |
| Found tools: ['Salesforce_CreateContact'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Google Slides - Create presentation | |
| ================================================================================ | |
| Expected tools: ['GoogleSlides_CreatePresentation'] | |
| Found tools: ['GoogleDrive_SearchFiles', 'GoogleSlides_CommentOnPresentation', 'GoogleSlides_CreatePresentation', 'GoogleSlides_CreateSlide', 'GoogleSlides_GenerateGoogleFilePickerUrl', 'GoogleSlides_GetPresentationAsMarkdown', 'GoogleSlides_ListPresentationComments', 'GoogleSlides_SearchPresentations', 'GoogleSlides_WhoAmI'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| Running test: Figma - Get file info | |
| ================================================================================ | |
| Expected tools: ['FigmaApi_FetchFigmaFile'] | |
| Found tools: ['FigmaApi_AddCommentToFigmaFile', 'FigmaApi_AddFigmaCommentReaction', 'FigmaApi_BulkUpdateFigmaDevResources', 'FigmaApi_CreateBulkDevResources', 'FigmaApi_DeleteDevResource', 'FigmaApi_DeleteFigmaComment', 'FigmaApi_DeleteMyCommentReaction', 'FigmaApi_FetchCommentReactions', 'FigmaApi_FetchComponentUsageData', 'FigmaApi_FetchFigmaFile'] | |
| ✅ SUCCESS: All expected tools were found! | |
| ================================================================================ | |
| SUMMARY | |
| ================================================================================ | |
| Total tests: 25 | |
| Passed: 14 | |
| Failed: 11 | |
| Success rate: 56.0% | |
| Failed tests: | |
| - Reddit - Get posts from subreddit: Missing ['Reddit_GetPostsInSubreddit'] | |
| - Gmail - Send email: Missing ['Gmail_SendEmail'] | |
| - Slack - Send message: Missing ['Slack_SendMessage'] | |
| - Linear - Create issue: Missing ['Linear_GetTeams'] | |
| - YouTube - Search videos: Missing ['Youtube_SearchVideos'] | |
| - HubSpot - Create contact: Missing ['HubSpot_CreateContact'] | |
| - Zendesk - Create ticket: Missing ['Zendesk_CreateTicket'] | |
| - Microsoft Teams - Send message: Missing ['MicrosoftTeams_SendMessageToChannel'] | |
| - Outlook Mail - Send email: Missing ['OutlookMail_CreateDraftEmail'] | |
| - X (Twitter) - Post tweet: Missing ['X_PostTweet'] | |
| - ClickUp - Create task: Missing ['ClickUp_CreateTask'] | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment