Run agents against CUA environments without conforming to the BaseAgent class.
# Run locally (Docker + QEMU for Windows)
cb run windows_agent.py
# Run on CUA Cloud
cb run windows_agent.py --on cloud --max-parallel 50

from cua_bench import Benchmark, Image, Secret
# Benchmark named "windows-eval"; its decorators are applied to the task
# function and the local entrypoint further down in this example.
benchmark = Benchmark("windows-eval")
# Image the task function runs with: the trycua/cua-bench:latest registry
# image with the anthropic package added via pip_install.
image = (
    Image.from_registry("trycua/cua-bench:latest")
    .pip_install("anthropic")
)
@benchmark.function(
    image=image,
    timeout=600,
    secrets=[Secret.from_name("anthropic-api-key")],
    environment="windows",  # provisions Windows VM
)
async def run_task(task_index: int, env, session) -> dict:
    """Run one task with a screenshot -> model -> action loop, then score it.

    Args:
        task_index: Task variant to run; passed to ``env.reset`` as ``task_id``.
        env: The cua_bench environment (provides reset/evaluate).
        session: DesktopSession connected to the Windows VM.

    Returns:
        ``{"task_index": task_index, "reward": <result of env.evaluate()>}``.
    """
    # Function-scope imports, as in the original example: anthropic is
    # pip-installed into the image this function runs with.
    import json

    import anthropic

    max_steps = 50  # upper bound on model/action round-trips per task

    # reset() also returns a screenshot, but every step below captures a
    # fresh one, so the initial screenshot is intentionally unused.
    _initial_screenshot, task = await env.reset(task_id=task_index)

    # Async client: a synchronous client would block the event loop for the
    # full duration of every model call inside this coroutine.
    client = anthropic.AsyncAnthropic()

    # The prompt does not change between steps, so build it once.
    prompt = (
        f"Task: {task.description}\n"
        'Return JSON: {"action": "click|type|done", "x": int, "y": int, "text": str}'
    )

    for _ in range(max_steps):
        # NOTE(review): assumes session.screenshot() returns base64-encoded
        # PNG data usable as the image source below -- confirm.
        screenshot = await session.screenshot()
        response = await client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=1024,
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": screenshot,
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            }],
        )
        reply = response.content[0].text

        try:
            action = json.loads(reply)
            kind = action["action"]
        except (json.JSONDecodeError, KeyError, TypeError):
            # A reply that is not the requested JSON object costs one step
            # instead of aborting the task before it can be evaluated.
            continue

        if kind == "done":
            break
        if kind == "click":
            await session.click(action["x"], action["y"])
        elif kind == "type":
            await session.type(action["text"])

    result = await env.evaluate()
    return {"task_index": task_index, "reward": result}
@benchmark.local_entrypoint()
def main():
results = run_task.map(range(100))
avg = sum(r["reward"] for r in results) / len(results)
print(f"Avg reward: {avg:.2%}")| Feature | Description |
|---|---|
| `env` | Environment with `reset()` and `evaluate()` |
| `session` | `DesktopSession` for interacting with the VM |
| `.map()` | Parallel execution across task variants |
| `.remote()` | Single remote execution |

With BaseAgent class:

class MyAgent(BaseAgent):
    @staticmethod
    def name() -> str:
        return "my-agent"

    async def perform_task(self, task_description, session, logging_dir):
        # your logic
        return AgentResult(...)

With Benchmark API:

@benchmark.function(environment="windows")
async def run_task(task_index: int, env, session) -> dict:
    # your logic
    return {"reward": result}

No class, no interface - just a function.