EricGustin · November 26, 2025 01:49
diff --git a/anthropic_tool_search_beta_evals.py b/anthropic_tool_search_beta_evals.py
 """
 This script tests Anthropic's tool search functionality across multiple scenarios.

 REQUIRED:
 - pip install arcadepy
 - pip install anthropic
 - export ANTHROPIC_API_KEY=<your_anthropic_api_key>
 - export ARCADE_API_KEY=<your_arcade_api_key>
 """

 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass

 from anthropic import Anthropic
 from arcadepy import Arcade


 @dataclass
 class TestCase:
    """One tool-search evaluation scenario.

    Fields:
        name: Label printed while the scenario runs and in the summary.
        messages: Conversation sent as ``messages`` in the API request.
        expected_tools: Tool names that must all be present in the search
            result for the scenario to count as a success.
    """

    name: str
    messages: list[dict]
    expected_tools: list[str]


 @dataclass
 class TestResult:
    """Outcome of executing a single scenario.

    Fields:
        name: Label copied from the scenario that was run.
        expected_tools: Tool names the scenario required.
        found_tools: Tool names read from the tool-search result.
        success: True when every expected tool is among the found tools.
        response: The API response object, kept for later inspection.
    """

    name: str
    expected_tools: list[str]
    found_tools: list[str]
    success: bool
    response: object


 class ToolSearchEvaluationSuite:
    """Evaluation suite for testing tool search functionality.

    The suite fetches Anthropic-formatted tool definitions from Arcade, marks
    each one with ``defer_loading``, sends every test conversation to the
    Anthropic beta Messages API alongside the BM25 tool-search tool, and
    checks whether the expected tool names appear in the search result.
    """

    def __init__(
        self,
        server_names: list[str],
        arcade_client: Arcade,
        anthropic_client: Anthropic,
        max_workers: int = 20,
        model: str = "claude-sonnet-4-5-20250929",
    ):
        """Store the clients and configuration; no requests are made here.

        Args:
            server_names: Arcade toolkit names whose tools should be loaded.
            arcade_client: Client used to list tool definitions.
            anthropic_client: Client used to send the evaluation requests.
            max_workers: Thread-pool size used by ``load_tools``.
            model: Model identifier sent with every evaluation request.
        """
        self.server_names = server_names
        self.arcade_client = arcade_client
        self.anthropic_client = anthropic_client
        self.max_workers = max_workers
        self.model = model
        self.arcade_tools: list[dict] = []
        self.search_tool = [
            {"type": "tool_search_tool_bm25_20251119", "name": "tool_search_tool_bm25"}
        ]

    def _fetch_server_tools(self, tk_name: str) -> tuple[str, list[dict], int]:
        """Fetch tools for a single server.

        Returns:
            ``(tk_name, tools, len(tools))`` where every tool definition has
            ``defer_loading`` set to True.
        """
        tools = []
        tools_page_iter = self.arcade_client.tools.list(toolkit=tk_name, include_format="anthropic")
        for tool in tools_page_iter:
            tool_def = tool.formatted_schema["anthropic"]
            tool_def["defer_loading"] = True
            tools.append(tool_def)
        return tk_name, tools, len(tools)

    def load_tools(self, verbose: bool = True) -> None:
        """Load all tools from the specified servers in parallel.

        Replaces ``self.arcade_tools``. Tools are appended in the order the
        fetches complete. An exception raised by any fetch propagates.
        """
        self.arcade_tools = []

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all server fetches
            future_to_server = {
                executor.submit(self._fetch_server_tools, tk_name): tk_name
                for tk_name in self.server_names
            }

            # Process results as they complete
            for future in as_completed(future_to_server):
                tk_name, tools, num_tools = future.result()
                self.arcade_tools.extend(tools)
                if verbose:
                    print(f"{tk_name}: {num_tools} tools. Total tools: {len(self.arcade_tools)}")

        if verbose:
            print(f"\nTotal: {len(self.arcade_tools)} tools loaded\n")

    @staticmethod
    def _extract_found_tools(response: object) -> list[str]:
        """Return the tool names from the first usable tool-search result.

        A ``tool_search_tool_result`` block whose content has no
        ``tool_references`` attribute is skipped instead of raising
        ``AttributeError``. Returns an empty list when no usable result
        block is present.
        """
        for content_block in response.content:
            if getattr(content_block, "type", None) != "tool_search_tool_result":
                continue
            tool_references = getattr(content_block.content, "tool_references", None)
            if tool_references is None:
                # Content without references (e.g. an error payload): keep looking.
                continue
            return [ref.tool_name for ref in tool_references]
        return []

    @staticmethod
    def _missing_tools(expected_tools: list[str], found_tools: list[str]) -> list[str]:
        """Return the expected tool names that are absent from ``found_tools``."""
        return [tool for tool in expected_tools if tool not in found_tools]

    def run_test_case(self, test_case: TestCase, verbose: bool = True) -> TestResult:
        """Run a single test case and return the results.

        Sends ``test_case.messages`` with the search tool plus all loaded
        tools, then marks the case successful when every expected tool is in
        the search result.
        """
        if verbose:
            print(f"\n{'=' * 80}")
            print(f"Running test: {test_case.name}")
            print(f"{'=' * 80}")

        response = self.anthropic_client.beta.messages.create(
            model=self.model,
            betas=["advanced-tool-use-2025-11-20", "mcp-client-2025-11-20"],
            max_tokens=2048,
            tools=self.search_tool + self.arcade_tools,
            messages=test_case.messages,
        )

        found_tools = self._extract_found_tools(response)
        missing_tools = self._missing_tools(test_case.expected_tools, found_tools)
        success = not missing_tools

        if verbose:
            print(f"\nExpected tools: {test_case.expected_tools}")
            print(f"Found tools: {found_tools}")
            if success:
                print("✅ SUCCESS: All expected tools were found!")
            else:
                print(f"❌ FAILURE: Missing tools: {missing_tools}")
                print(f"   Total tools returned: {len(found_tools)}")

        return TestResult(
            name=test_case.name,
            expected_tools=test_case.expected_tools,
            found_tools=found_tools,
            success=success,
            response=response,
        )

    def run_all_tests(self, test_cases: list[TestCase], verbose: bool = True) -> list[TestResult]:
        """Run all test cases in order and return their results.

        When ``verbose`` is true a summary is printed after the last case.
        """
        results = [self.run_test_case(test_case, verbose=verbose) for test_case in test_cases]

        if verbose:
            self._print_summary(results)

        return results

    def _print_summary(self, results: list[TestResult]) -> None:
        """Print a summary of all test results.

        An empty ``results`` list reports a 0.0% success rate rather than
        dividing by zero.
        """
        print(f"\n{'=' * 80}")
        print("SUMMARY")
        print(f"{'=' * 80}")

        passed = sum(1 for r in results if r.success)
        total = len(results)
        success_rate = passed / total * 100 if total else 0.0

        print(f"\nTotal tests: {total}")
        print(f"Passed: {passed}")
        print(f"Failed: {total - passed}")
        print(f"Success rate: {success_rate:.1f}%")

        if total - passed > 0:
            print("\nFailed tests:")
            for result in results:
                if not result.success:
                    missing_tools = self._missing_tools(result.expected_tools, result.found_tools)
                    print(f"  - {result.name}: Missing {missing_tools}")


 # Define the servers to load.
 # Each entry is an Arcade toolkit name; the evaluation suite passes it as the
 # ``toolkit`` argument when listing tool definitions.
 SERVER_NAMES = [
    "Asana",
    "ClickUp",
    "Confluence",
    "Dropbox",
    "E2B",
    "Firecrawl",
    "Github",
    "Gmail",
    "GoogleCalendar",
    "GoogleContacts",
    "GoogleDocs",
    "GoogleDrive",
    "GoogleFinance",
    "GoogleFlights",
    "GoogleHotels",
    "GoogleJobs",
    "GoogleMaps",
    "GoogleNews",
    "GoogleSearch",
    "GoogleSheets",
    "GoogleShopping",
    "GoogleSlides",
    "HubSpot",
    "Imgflip",
    "Linear",
    "LinkedIn",
    "SharePoint",
    "MicrosoftTeams",
    "NotionToolkit",
    "OutlookCalendar",
    "OutlookMail",
    "Reddit",
    "Salesforce",
    "Slack",
    "Spotify",
    "Stripe",
    "Walmart",
    "X",
    "Youtube",
    "Zendesk",
    "Zoom",
    # Every entry from here on carries an "Api" suffix.
    "AsanaApi",
    "ArcadeEngineApi",
    "BoxApi",
    "CalendlyApi",
    "ClickupApi",
    "CursorAgentsApi",
    "CustomerioPipelinesApi",
    "CustomerioTrackApi",
    "FigmaApi",
    "GithubApi",
    "HubspotAutomationApi",
    "HubspotCmsApi",
    "HubspotConversationsApi",
    "HubspotCrmApi",
    "HubspotMarketingApi",
    "HubspotMeetingsApi",
    "HubspotUsersApi",
    "MiroApi",
    "SlackApi",
    "StripeApi",
    "TicktickApi",
    "TrelloApi",
    "VercelApi",
    "ZohobooksApi",
    "ZohocreatorApi",
 ]

 def _conversation(*turns: str) -> list[dict]:
    """Build a message list from alternating user/assistant turns.

    The first turn is attributed to the user, the second to the assistant,
    and so on, so an odd number of turns always ends on a user message.
    """
    roles = ("user", "assistant")
    return [{"role": roles[i % 2], "content": text} for i, text in enumerate(turns)]


 # Define test cases.
 # Each case pairs a conversation with the tool names the search must surface.
 # Email addresses use the reserved example.com domain as placeholders.
 TEST_CASES = [
    TestCase(
        name="Reddit - Get posts from subreddit",
        messages=_conversation(
            "I want to read some posts on a specific subreddit.",
            "I'd be happy to help you with your task. Which subreddit are you interested in?",
            "mcp",
        ),
        expected_tools=["Reddit_GetPostsInSubreddit"],
    ),
    TestCase(
        name="Gmail - Send email",
        messages=_conversation(
            "I need to send an Email to my colleague about the project update.",
        ),
        expected_tools=["Gmail_SendEmail"],
    ),
    TestCase(
        name="Google Calendar - Create event",
        messages=_conversation(
            "Schedule a meeting for 1:1 tomorrow at 2pm. Invite alex@example.com",
        ),
        expected_tools=["GoogleCalendar_CreateEvent"],
    ),
    TestCase(
        name="Slack - Send message",
        messages=_conversation(
            "Post a message to the #general channel in Slack that says that I will be OOO tomorrow.",
        ),
        expected_tools=["Slack_SendMessage"],
    ),
    TestCase(
        name="GitHub - Create issue",
        messages=_conversation(
            "Create a new issue in my GitHub repo arcadeai/arcade-mcp that says 'This isn't an issue, per se, but great work on this!'",
        ),
        expected_tools=["Github_CreateIssue"],
    ),
    TestCase(
        name="Google Docs - Create blank document",
        messages=_conversation(
            "I need to write a document. Can you help me create a new Google Doc?",
            "Of course! I can help you create a new Google Doc. What would you like to name it?",
            "Call it 'Q1 Planning Notes'",
        ),
        expected_tools=["GoogleDocs_CreateBlankDocument"],
    ),
    TestCase(
        name="Google Drive - Search for files",
        messages=_conversation(
            "Find all the PDF files in my Google Drive that contain the word 'invoice'",
        ),
        expected_tools=["GoogleDrive_SearchFiles"],
    ),
    TestCase(
        name="Google Sheets - Create spreadsheet",
        messages=_conversation(
            "Create a new spreadsheet to track my expenses for this month.",
        ),
        expected_tools=["GoogleSheets_CreateSpreadsheet"],
    ),
    TestCase(
        name="Notion - Create page",
        messages=_conversation(
            "I want to create a new page in Notion.",
            "I can help with that! What would you like to title the page, and where should it be created?",
            "Title it 'Meeting Notes - Nov 26' and have it be a child of the 'Meetings' page",
        ),
        expected_tools=["NotionToolkit_CreatePage"],
    ),
    TestCase(
        # Label corrected: the prompt asks for the team list, not a new issue.
        name="Linear - Get teams",
        messages=_conversation(
            "I'm a part of so many teams in Linear and it's hard to keep track of them all. Can you help me get a list of all my teams?",
        ),
        expected_tools=["Linear_GetTeams"],
    ),
    TestCase(
        name="Asana - Create task",
        messages=_conversation(
            "Add a task in Asana to review the pull requests by Friday.",
        ),
        expected_tools=["Asana_CreateTask"],
    ),
    TestCase(
        name="Spotify - Play music",
        messages=_conversation(
            "Play some music on Spotify.",
            "I'd be happy to play music for you! What would you like to listen to?",
            "Play the 'Helter Skelter' song by The Beatles",
        ),
        expected_tools=["Spotify_PlayTrackByName"],
    ),
    TestCase(
        name="YouTube - Search videos",
        messages=_conversation(
            "Search YouTube for tutorials on Python async programming.",
        ),
        expected_tools=["Youtube_SearchVideos"],
    ),
    TestCase(
        name="Stripe - Create customer",
        messages=_conversation(
            "Create a new customer in Stripe for jane.doe@example.com",
        ),
        expected_tools=["Stripe_CreateCustomer"],
    ),
    TestCase(
        name="HubSpot - Create contact",
        messages=_conversation(
            "I need to add a new contact to HubSpot.",
            "Sure! I can help you add a contact to HubSpot. What are the contact details?",
            "Name is Sarah Johnson, email is sarah.johnson@example.com",
        ),
        expected_tools=["HubSpot_CreateContact"],
    ),
    TestCase(
        # The prompt asks to list tickets, so a create-ticket expectation
        # could never match the request.
        # NOTE(review): confirm the exact tool name against the Zendesk toolkit.
        name="Zendesk - List tickets",
        messages=_conversation(
            "List recent tickets that I have in Zendesk.",
        ),
        expected_tools=["Zendesk_ListTickets"],
    ),
    TestCase(
        name="Confluence - Create page",
        messages=_conversation(
            "I want to document our new API endpoints in Confluence.",
            "Great idea! I can help you create a Confluence page. What should the title be?",
            "Call it 'API Documentation - v2.0.0'",
        ),
        expected_tools=["Confluence_CreatePage"],
    ),
    TestCase(
        name="Microsoft Teams - Send message",
        messages=_conversation(
            "Send a message in the Engineering channel on Teams to remind everyone about standup.",
        ),
        expected_tools=["MicrosoftTeams_SendMessageToChannel"],
    ),
    TestCase(
        # Label corrected: the prompt and expected tool are about a draft.
        name="Outlook Mail - Create draft email",
        messages=_conversation(
            "Draft up an email in Outlook to the team about tomorrow's workshop.",
        ),
        expected_tools=["OutlookMail_CreateDraftEmail"],
    ),
    TestCase(
        name="Dropbox - List files",
        messages=_conversation(
            "What files do I have in my Dropbox?",
            "I can check your Dropbox for you. Would you like to see files in a specific folder or all files?",
            "Show me what's in the 'Documents' folder",
        ),
        expected_tools=["Dropbox_ListItemsInFolder"],
    ),
    TestCase(
        name="X (Twitter) - Post tweet",
        messages=_conversation(
            "Post a tweet saying 'Just finished an amazing project with the team! 🚀'",
        ),
        expected_tools=["X_PostTweet"],
    ),
    TestCase(
        name="ClickUp - Create task",
        messages=_conversation(
            "Add a task in ClickUp to update the documentation.",
        ),
        expected_tools=["ClickUp_CreateTask"],
    ),
    TestCase(
        name="Salesforce - Create contact",
        messages=_conversation(
            "I met a potential customer at a conference. Can you help me add them to Salesforce?",
            "Absolutely! I can help you create a contact in Salesforce. What are their details?",
            "Name is Michael Chen, company is TechCorp, email is michael.chen@example.com",
        ),
        expected_tools=["Salesforce_CreateContact"],
    ),
    TestCase(
        name="Google Slides - Create presentation",
        messages=_conversation(
            "Create a new Google Slides presentation for my quarterly review. It should be a 5 slide presentation with a title slide, an overview slide, a section on the past quarter, a section on the current quarter, and a section on the future quarter.",
        ),
        expected_tools=["GoogleSlides_CreatePresentation"],
    ),
    TestCase(
        name="Figma - Get file info",
        messages=_conversation(
            "I need information about a Figma design file.",
            "I can help you get information about a Figma file. What's the file key or URL?",
            "It's the file with key abc123def456",
        ),
        expected_tools=["FigmaApi_FetchFigmaFile"],
    ),
 ]


 if __name__ == "__main__":
    # Both SDK clients are constructed without arguments; the module
    # docstring lists the API-key environment variables to export first.
    arcade_client = Arcade()
    anthropic_client = Anthropic()

    # Wire the clients and the toolkit list into one suite instance.
    suite = ToolSearchEvaluationSuite(SERVER_NAMES, arcade_client, anthropic_client)

    # Tool definitions are fetched a single time and reused by every case.
    print("Loading tools...")
    suite.load_tools(verbose=True)

    # Execute every scenario; verbose mode also prints the final summary.
    results = suite.run_all_tests(TEST_CASES, verbose=True)
	"""
	This script tests Anthropic's tool search functionality across multiple scenarios.

	REQUIRED:
	- pip install arcadepy
	- pip install anthropic
	- export ANTHROPIC_API_KEY=<your_anthropic_api_key>
	- export ARCADE_API_KEY=<your_arcade_api_key>
	"""

	from concurrent.futures import ThreadPoolExecutor, as_completed
	from dataclasses import dataclass

	from anthropic import Anthropic
	from arcadepy import Arcade


	@dataclass
	class TestCase:
	    """Defines a single test case for tool search evaluation.

	    Fields:
	        name: Label printed while the scenario runs and in the summary.
	        messages: Conversation sent as ``messages`` in the API request.
	        expected_tools: List of tool names that should be found.
	    """

	    name: str
	    messages: list[dict]
	    expected_tools: list[str]


	@dataclass
	class TestResult:
	    """Results from running a test case.

	    Fields:
	        name: Label copied from the scenario that was run.
	        expected_tools: Tool names the scenario required.
	        found_tools: Tool names read from the tool-search result.
	        success: True when every expected tool is among the found tools.
	        response: The API response object, kept for later inspection.
	    """

	    name: str
	    expected_tools: list[str]
	    found_tools: list[str]
	    success: bool
	    response: object


	class ToolSearchEvaluationSuite:
	    """Evaluation suite for testing tool search functionality.

	    The suite fetches Anthropic-formatted tool definitions from Arcade, marks
	    each one with ``defer_loading``, sends every test conversation to the
	    Anthropic beta Messages API alongside the BM25 tool-search tool, and
	    checks whether the expected tool names appear in the search result.
	    """

	    def __init__(
	        self,
	        server_names: list[str],
	        arcade_client: Arcade,
	        anthropic_client: Anthropic,
	        max_workers: int = 20,
	        model: str = "claude-sonnet-4-5-20250929",
	    ):
	        """Store the clients and configuration; no requests are made here.

	        Args:
	            server_names: Arcade toolkit names whose tools should be loaded.
	            arcade_client: Client used to list tool definitions.
	            anthropic_client: Client used to send the evaluation requests.
	            max_workers: Thread-pool size used by ``load_tools``.
	            model: Model identifier sent with every evaluation request.
	        """
	        self.server_names = server_names
	        self.arcade_client = arcade_client
	        self.anthropic_client = anthropic_client
	        self.max_workers = max_workers
	        self.model = model
	        self.arcade_tools: list[dict] = []
	        self.search_tool = [
	            {"type": "tool_search_tool_bm25_20251119", "name": "tool_search_tool_bm25"}
	        ]

	    def _fetch_server_tools(self, tk_name: str) -> tuple[str, list[dict], int]:
	        """Fetch tools for a single server.

	        Returns:
	            ``(tk_name, tools, len(tools))`` where every tool definition has
	            ``defer_loading`` set to True.
	        """
	        tools = []
	        tools_page_iter = self.arcade_client.tools.list(toolkit=tk_name, include_format="anthropic")
	        for tool in tools_page_iter:
	            tool_def = tool.formatted_schema["anthropic"]
	            tool_def["defer_loading"] = True
	            tools.append(tool_def)
	        return tk_name, tools, len(tools)

	    def load_tools(self, verbose: bool = True) -> None:
	        """Load all tools from the specified servers in parallel.

	        Replaces ``self.arcade_tools``. Tools are appended in the order the
	        fetches complete. An exception raised by any fetch propagates.
	        """
	        self.arcade_tools = []

	        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
	            # Submit all server fetches
	            future_to_server = {
	                executor.submit(self._fetch_server_tools, tk_name): tk_name
	                for tk_name in self.server_names
	            }

	            # Process results as they complete
	            for future in as_completed(future_to_server):
	                tk_name, tools, num_tools = future.result()
	                self.arcade_tools.extend(tools)
	                if verbose:
	                    print(f"{tk_name}: {num_tools} tools. Total tools: {len(self.arcade_tools)}")

	        if verbose:
	            print(f"\nTotal: {len(self.arcade_tools)} tools loaded\n")

	    @staticmethod
	    def _extract_found_tools(response: object) -> list[str]:
	        """Return the tool names from the first usable tool-search result.

	        A ``tool_search_tool_result`` block whose content has no
	        ``tool_references`` attribute is skipped instead of raising
	        ``AttributeError``. Returns an empty list when no usable result
	        block is present.
	        """
	        for content_block in response.content:
	            if getattr(content_block, "type", None) != "tool_search_tool_result":
	                continue
	            tool_references = getattr(content_block.content, "tool_references", None)
	            if tool_references is None:
	                # Content without references (e.g. an error payload): keep looking.
	                continue
	            return [ref.tool_name for ref in tool_references]
	        return []

	    @staticmethod
	    def _missing_tools(expected_tools: list[str], found_tools: list[str]) -> list[str]:
	        """Return the expected tool names that are absent from ``found_tools``."""
	        return [tool for tool in expected_tools if tool not in found_tools]

	    def run_test_case(self, test_case: TestCase, verbose: bool = True) -> TestResult:
	        """Run a single test case and return the results.

	        Sends ``test_case.messages`` with the search tool plus all loaded
	        tools, then marks the case successful when every expected tool is in
	        the search result.
	        """
	        if verbose:
	            print(f"\n{'=' * 80}")
	            print(f"Running test: {test_case.name}")
	            print(f"{'=' * 80}")

	        response = self.anthropic_client.beta.messages.create(
	            model=self.model,
	            betas=["advanced-tool-use-2025-11-20", "mcp-client-2025-11-20"],
	            max_tokens=2048,
	            tools=self.search_tool + self.arcade_tools,
	            messages=test_case.messages,
	        )

	        found_tools = self._extract_found_tools(response)
	        missing_tools = self._missing_tools(test_case.expected_tools, found_tools)
	        success = not missing_tools

	        if verbose:
	            print(f"\nExpected tools: {test_case.expected_tools}")
	            print(f"Found tools: {found_tools}")
	            if success:
	                print("✅ SUCCESS: All expected tools were found!")
	            else:
	                print(f"❌ FAILURE: Missing tools: {missing_tools}")
	                print(f"   Total tools returned: {len(found_tools)}")

	        return TestResult(
	            name=test_case.name,
	            expected_tools=test_case.expected_tools,
	            found_tools=found_tools,
	            success=success,
	            response=response,
	        )

	    def run_all_tests(self, test_cases: list[TestCase], verbose: bool = True) -> list[TestResult]:
	        """Run all test cases in order and return their results.

	        When ``verbose`` is true a summary is printed after the last case.
	        """
	        results = [self.run_test_case(test_case, verbose=verbose) for test_case in test_cases]

	        if verbose:
	            self._print_summary(results)

	        return results

	    def _print_summary(self, results: list[TestResult]) -> None:
	        """Print a summary of all test results.

	        An empty ``results`` list reports a 0.0% success rate rather than
	        dividing by zero.
	        """
	        print(f"\n{'=' * 80}")
	        print("SUMMARY")
	        print(f"{'=' * 80}")

	        passed = sum(1 for r in results if r.success)
	        total = len(results)
	        success_rate = passed / total * 100 if total else 0.0

	        print(f"\nTotal tests: {total}")
	        print(f"Passed: {passed}")
	        print(f"Failed: {total - passed}")
	        print(f"Success rate: {success_rate:.1f}%")

	        if total - passed > 0:
	            print("\nFailed tests:")
	            for result in results:
	                if not result.success:
	                    missing_tools = self._missing_tools(result.expected_tools, result.found_tools)
	                    print(f"  - {result.name}: Missing {missing_tools}")


	# Arcade toolkit names to load tool definitions from.
	SERVER_NAMES = [
	    "Asana",
	    "ClickUp",
	    "Confluence",
	    "Dropbox",
	    "E2B",
	    "Firecrawl",
	    "Github",
	    "Gmail",
	    "GoogleCalendar",
	    "GoogleContacts",
	    "GoogleDocs",
	    "GoogleDrive",
	    "GoogleFinance",
	    "GoogleFlights",
	    "GoogleHotels",
	    "GoogleJobs",
	    "GoogleMaps",
	    "GoogleNews",
	    "GoogleSearch",
	    "GoogleSheets",
	    "GoogleShopping",
	    "GoogleSlides",
	    "HubSpot",
	    "Imgflip",
	    "Linear",
	    "LinkedIn",
	    "SharePoint",
	    "MicrosoftTeams",
	    "NotionToolkit",
	    "OutlookCalendar",
	    "OutlookMail",
	    "Reddit",
	    "Salesforce",
	    "Slack",
	    "Spotify",
	    "Stripe",
	    "Walmart",
	    "X",
	    "Youtube",
	    "Zendesk",
	    "Zoom",
	    # Every entry from here on carries an "Api" suffix.
	    "AsanaApi",
	    "ArcadeEngineApi",
	    "BoxApi",
	    "CalendlyApi",
	    "ClickupApi",
	    "CursorAgentsApi",
	    "CustomerioPipelinesApi",
	    "CustomerioTrackApi",
	    "FigmaApi",
	    "GithubApi",
	    "HubspotAutomationApi",
	    "HubspotCmsApi",
	    "HubspotConversationsApi",
	    "HubspotCrmApi",
	    "HubspotMarketingApi",
	    "HubspotMeetingsApi",
	    "HubspotUsersApi",
	    "MiroApi",
	    "SlackApi",
	    "StripeApi",
	    "TicktickApi",
	    "TrelloApi",
	    "VercelApi",
	    "ZohobooksApi",
	    "ZohocreatorApi",
	]

	def _conversation(*turns: str) -> list[dict]:
	    """Build a message list from alternating user/assistant turns.

	    The first turn is attributed to the user, the second to the assistant,
	    and so on, so an odd number of turns always ends on a user message.
	    """
	    roles = ("user", "assistant")
	    return [{"role": roles[i % 2], "content": text} for i, text in enumerate(turns)]


	# Define test cases.
	# Each case pairs a conversation with the tool names the search must surface.
	# Email addresses use the reserved example.com domain as placeholders.
	TEST_CASES = [
	    TestCase(
	        name="Reddit - Get posts from subreddit",
	        messages=_conversation(
	            "I want to read some posts on a specific subreddit.",
	            "I'd be happy to help you with your task. Which subreddit are you interested in?",
	            "mcp",
	        ),
	        expected_tools=["Reddit_GetPostsInSubreddit"],
	    ),
	    TestCase(
	        name="Gmail - Send email",
	        messages=_conversation(
	            "I need to send an Email to my colleague about the project update.",
	        ),
	        expected_tools=["Gmail_SendEmail"],
	    ),
	    TestCase(
	        name="Google Calendar - Create event",
	        messages=_conversation(
	            "Schedule a meeting for 1:1 tomorrow at 2pm. Invite alex@example.com",
	        ),
	        expected_tools=["GoogleCalendar_CreateEvent"],
	    ),
	    TestCase(
	        name="Slack - Send message",
	        messages=_conversation(
	            "Post a message to the #general channel in Slack that says that I will be OOO tomorrow.",
	        ),
	        expected_tools=["Slack_SendMessage"],
	    ),
	    TestCase(
	        name="GitHub - Create issue",
	        messages=_conversation(
	            "Create a new issue in my GitHub repo arcadeai/arcade-mcp that says 'This isn't an issue, per se, but great work on this!'",
	        ),
	        expected_tools=["Github_CreateIssue"],
	    ),
	    TestCase(
	        name="Google Docs - Create blank document",
	        messages=_conversation(
	            "I need to write a document. Can you help me create a new Google Doc?",
	            "Of course! I can help you create a new Google Doc. What would you like to name it?",
	            "Call it 'Q1 Planning Notes'",
	        ),
	        expected_tools=["GoogleDocs_CreateBlankDocument"],
	    ),
	    TestCase(
	        name="Google Drive - Search for files",
	        messages=_conversation(
	            "Find all the PDF files in my Google Drive that contain the word 'invoice'",
	        ),
	        expected_tools=["GoogleDrive_SearchFiles"],
	    ),
	    TestCase(
	        name="Google Sheets - Create spreadsheet",
	        messages=_conversation(
	            "Create a new spreadsheet to track my expenses for this month.",
	        ),
	        expected_tools=["GoogleSheets_CreateSpreadsheet"],
	    ),
	    TestCase(
	        name="Notion - Create page",
	        messages=_conversation(
	            "I want to create a new page in Notion.",
	            "I can help with that! What would you like to title the page, and where should it be created?",
	            "Title it 'Meeting Notes - Nov 26' and have it be a child of the 'Meetings' page",
	        ),
	        expected_tools=["NotionToolkit_CreatePage"],
	    ),
	    TestCase(
	        # Label corrected: the prompt asks for the team list, not a new issue.
	        name="Linear - Get teams",
	        messages=_conversation(
	            "I'm a part of so many teams in Linear and it's hard to keep track of them all. Can you help me get a list of all my teams?",
	        ),
	        expected_tools=["Linear_GetTeams"],
	    ),
	    TestCase(
	        name="Asana - Create task",
	        messages=_conversation(
	            "Add a task in Asana to review the pull requests by Friday.",
	        ),
	        expected_tools=["Asana_CreateTask"],
	    ),
	    TestCase(
	        name="Spotify - Play music",
	        messages=_conversation(
	            "Play some music on Spotify.",
	            "I'd be happy to play music for you! What would you like to listen to?",
	            "Play the 'Helter Skelter' song by The Beatles",
	        ),
	        expected_tools=["Spotify_PlayTrackByName"],
	    ),
	    TestCase(
	        name="YouTube - Search videos",
	        messages=_conversation(
	            "Search YouTube for tutorials on Python async programming.",
	        ),
	        expected_tools=["Youtube_SearchVideos"],
	    ),
	    TestCase(
	        name="Stripe - Create customer",
	        messages=_conversation(
	            "Create a new customer in Stripe for jane.doe@example.com",
	        ),
	        expected_tools=["Stripe_CreateCustomer"],
	    ),
	    TestCase(
	        name="HubSpot - Create contact",
	        messages=_conversation(
	            "I need to add a new contact to HubSpot.",
	            "Sure! I can help you add a contact to HubSpot. What are the contact details?",
	            "Name is Sarah Johnson, email is sarah.johnson@example.com",
	        ),
	        expected_tools=["HubSpot_CreateContact"],
	    ),
	    TestCase(
	        # The prompt asks to list tickets, so a create-ticket expectation
	        # could never match the request.
	        # NOTE(review): confirm the exact tool name against the Zendesk toolkit.
	        name="Zendesk - List tickets",
	        messages=_conversation(
	            "List recent tickets that I have in Zendesk.",
	        ),
	        expected_tools=["Zendesk_ListTickets"],
	    ),
	    TestCase(
	        name="Confluence - Create page",
	        messages=_conversation(
	            "I want to document our new API endpoints in Confluence.",
	            "Great idea! I can help you create a Confluence page. What should the title be?",
	            "Call it 'API Documentation - v2.0.0'",
	        ),
	        expected_tools=["Confluence_CreatePage"],
	    ),
	    TestCase(
	        name="Microsoft Teams - Send message",
	        messages=_conversation(
	            "Send a message in the Engineering channel on Teams to remind everyone about standup.",
	        ),
	        expected_tools=["MicrosoftTeams_SendMessageToChannel"],
	    ),
	    TestCase(
	        # Label corrected: the prompt and expected tool are about a draft.
	        name="Outlook Mail - Create draft email",
	        messages=_conversation(
	            "Draft up an email in Outlook to the team about tomorrow's workshop.",
	        ),
	        expected_tools=["OutlookMail_CreateDraftEmail"],
	    ),
	    TestCase(
	        name="Dropbox - List files",
	        messages=_conversation(
	            "What files do I have in my Dropbox?",
	            "I can check your Dropbox for you. Would you like to see files in a specific folder or all files?",
	            "Show me what's in the 'Documents' folder",
	        ),
	        expected_tools=["Dropbox_ListItemsInFolder"],
	    ),
	    TestCase(
	        name="X (Twitter) - Post tweet",
	        messages=_conversation(
	            "Post a tweet saying 'Just finished an amazing project with the team! 🚀'",
	        ),
	        expected_tools=["X_PostTweet"],
	    ),
	    TestCase(
	        name="ClickUp - Create task",
	        messages=_conversation(
	            "Add a task in ClickUp to update the documentation.",
	        ),
	        expected_tools=["ClickUp_CreateTask"],
	    ),
	    TestCase(
	        name="Salesforce - Create contact",
	        messages=_conversation(
	            "I met a potential customer at a conference. Can you help me add them to Salesforce?",
	            "Absolutely! I can help you create a contact in Salesforce. What are their details?",
	            "Name is Michael Chen, company is TechCorp, email is michael.chen@example.com",
	        ),
	        expected_tools=["Salesforce_CreateContact"],
	    ),
	    TestCase(
	        name="Google Slides - Create presentation",
	        messages=_conversation(
	            "Create a new Google Slides presentation for my quarterly review. It should be a 5 slide presentation with a title slide, an overview slide, a section on the past quarter, a section on the current quarter, and a section on the future quarter.",
	        ),
	        expected_tools=["GoogleSlides_CreatePresentation"],
	    ),
	    TestCase(
	        name="Figma - Get file info",
	        messages=_conversation(
	            "I need information about a Figma design file.",
	            "I can help you get information about a Figma file. What's the file key or URL?",
	            "It's the file with key abc123def456",
	        ),
	        expected_tools=["FigmaApi_FetchFigmaFile"],
	    ),
	]


	if __name__ == "__main__":
	    # Initialize clients. Both are constructed without arguments; the
	    # module docstring lists the API-key environment variables to export.
	    arcade_client = Arcade()
	    anthropic_client = Anthropic()

	    # Create evaluation suite
	    suite = ToolSearchEvaluationSuite(
	        server_names=SERVER_NAMES,
	        arcade_client=arcade_client,
	        anthropic_client=anthropic_client,
	    )

	    # Load tools once; every test case reuses the same definitions.
	    print("Loading tools...")
	    suite.load_tools(verbose=True)

	    # Run all test cases; verbose mode also prints the final summary.
	    results = suite.run_all_tests(TEST_CASES, verbose=True)
No results found