Skip to content

Instantly share code, notes, and snippets.

@Timtech4u
Created March 18, 2026 11:12
Show Gist options
  • Select an option

  • Save Timtech4u/ae08c163f280a5048f7bc3adfef1fb91 to your computer and use it in GitHub Desktop.

Select an option

Save Timtech4u/ae08c163f280a5048f7bc3adfef1fb91 to your computer and use it in GitHub Desktop.
Browser control AI agent using Google ADK + Chrome CDP — companion code for Build With AI Kigali talk
"""Browser control agent using Google ADK and Chrome DevTools Protocol."""
import json
import base64
import requests
import websocket
from google.adk.agents import Agent
# Base URL of Chrome's remote-debugging HTTP endpoint. The helpers in this
# module append "/json/list" and "/json/new" to it.
# NOTE(review): presumably Chrome must be started with
# --remote-debugging-port=9222 for this address to answer — confirm.
CHROME_HOST = "http://localhost:9222"
def _cdp_send(
    ws_url: str,
    method: str,
    params: dict | None = None,
    timeout: float = 30.0,
) -> dict:
    """Send one CDP command over a fresh WebSocket and return its result.

    A new connection is opened for every call and closed before returning.

    Args:
        ws_url: WebSocket debugger URL of the target to talk to.
        method: CDP method name, e.g. "Runtime.evaluate".
        params: Parameters for the method; None (the default) sends ``{}``.
        timeout: Seconds allowed for connecting and for each individual
            socket read. Without it an unresponsive target would block the
            caller forever.

    Returns:
        The "result" object of the reply whose "id" matches the request, or
        ``{}`` when that reply carries no "result" key.

    Raises:
        websocket.WebSocketTimeoutException: if a socket operation exceeds
            ``timeout``.
    """
    request_id = 1
    ws = websocket.create_connection(ws_url, timeout=timeout)
    try:
        ws.send(
            json.dumps(
                {"id": request_id, "method": method, "params": params or {}}
            )
        )
        # Other messages may arrive on the socket before the reply; keep
        # reading until the message carrying our request id shows up.
        while True:
            data = json.loads(ws.recv())
            if data.get("id") == request_id:
                return data.get("result", {})
    finally:
        ws.close()
def _get_tabs() -> list:
    """Return the target list reported by Chrome's ``/json/list`` endpoint.

    Returns:
        The decoded JSON body of the response (a list of target dicts).

    Raises:
        requests.RequestException: if Chrome cannot be reached within the
            timeout or answers with an HTTP error status.
    """
    # Bound the wait so a missing or hung Chrome instance fails fast.
    resp = requests.get(f"{CHROME_HOST}/json/list", timeout=5)
    # Surface HTTP errors directly rather than as a confusing JSON decode
    # failure on an error page.
    resp.raise_for_status()
    return resp.json()
def _get_first_page_ws() -> str | None:
    """Return the WebSocket debugger URL of the first usable page tab.

    Only targets whose "type" is "page" are considered. A page entry that
    has no "webSocketDebuggerUrl" is skipped rather than raising KeyError.

    Returns:
        The WebSocket URL, or None when no page target exposes one.
    """
    for tab in _get_tabs():
        if tab.get("type") != "page":
            continue
        ws_url = tab.get("webSocketDebuggerUrl")
        if ws_url:
            return ws_url
    return None
# --- Tool Functions ---
def list_tabs() -> dict:
    """Report the title and URL of every open page tab.

    Returns:
        A dict with "tabs" (one dict per page target holding "index",
        "title" and "url") and "count" (the number of page targets).
        "index" is the entry's position in the full target list returned
        by Chrome, so it can skip numbers when non-page targets exist.
    """
    page_tabs = [
        {
            "index": position,
            "title": entry.get("title", ""),
            "url": entry.get("url", ""),
        }
        for position, entry in enumerate(_get_tabs())
        if entry.get("type") == "page"
    ]
    return {"tabs": page_tabs, "count": len(page_tabs)}
def open_url(url: str) -> dict:
    """Open a new browser tab with the given URL.

    Args:
        url: The full URL to open (e.g. https://example.com).

    Returns:
        A dict with "status", the requested URL under "opened", and the
        new tab's "title" as reported by Chrome.

    Raises:
        requests.RequestException: if Chrome cannot be reached within the
            timeout or answers with an HTTP error status.
    """
    from urllib.parse import quote

    # The target URL travels as the raw query string of /json/new. Left
    # unescaped, a "#fragment" is stripped from the request and spaces or
    # other reserved characters corrupt it, so percent-encode all of it.
    # NOTE(review): relies on Chrome unescaping the query before
    # navigating — confirm against the Chrome version in use.
    encoded_url = quote(url, safe="")
    resp = requests.put(f"{CHROME_HOST}/json/new?{encoded_url}", timeout=5)
    resp.raise_for_status()
    tab = resp.json()
    return {"status": "success", "opened": url, "title": tab.get("title", "")}
def get_page_text() -> dict:
    """Get the visible text content of the current page (first 3000 characters).

    Returns:
        ``{"error": "No tabs open"}`` when no page tab is available.
        Otherwise a dict with "status", the extracted "text", and its
        "length" in characters.
    """
    socket_url = _get_first_page_ws()
    if socket_url:
        # Truncation happens inside the page, so at most 3000 characters
        # come back over the socket.
        evaluate_args = {
            "expression": "document.body.innerText.substring(0, 3000)"
        }
        reply = _cdp_send(socket_url, "Runtime.evaluate", evaluate_args)
        remote_object = reply.get("result", {})
        page_text = remote_object.get("value", "")
        return {
            "status": "success",
            "text": page_text,
            "length": len(page_text),
        }
    return {"error": "No tabs open"}
def take_screenshot() -> dict:
    """Take a screenshot of the current page and save it to /tmp.

    Returns:
        ``{"error": ...}`` when no page tab is available or the reply held
        no image data. Otherwise a dict with "status", the "saved" file
        path, and the image's "size_bytes".
    """
    ws_url = _get_first_page_ws()
    if not ws_url:
        return {"error": "No tabs open"}
    result = _cdp_send(ws_url, "Page.captureScreenshot")
    encoded = result.get("data")
    if not encoded:
        # Without this guard an empty reply would write a 0-byte file and
        # still be reported as a successful screenshot.
        return {"error": "Screenshot failed: no image data returned"}
    img_data = base64.b64decode(encoded)
    path = "/tmp/agent-screenshot.png"
    with open(path, "wb") as f:
        f.write(img_data)
    return {"status": "success", "saved": path, "size_bytes": len(img_data)}
def click_element(selector: str) -> dict:
    """Click an element on the page using a CSS selector.

    Args:
        selector: CSS selector for the element to click
            (e.g. 'button.submit', '#login').

    Returns:
        ``{"error": ...}`` when no page tab is available, the selector
        matches nothing, or the page script throws (for example on an
        invalid selector). Otherwise a dict with "status" and a "result"
        string describing the clicked element.
    """
    ws_url = _get_first_page_ws()
    if not ws_url:
        return {"error": "No tabs open"}
    # Embed the selector as a JSON string literal. Interpolating it inside
    # quotes would break on selectors such as a[href='x'] and would let a
    # crafted selector inject arbitrary script.
    selector_literal = json.dumps(selector)
    js = f"""(() => {{
    const sel = {selector_literal};
    const el = document.querySelector(sel);
    if (!el) return 'Element not found: ' + sel;
    el.click();
    return 'Clicked: ' + el.tagName + ' ' + (el.textContent || '').substring(0, 50);
}})()"""
    result = _cdp_send(ws_url, "Runtime.evaluate", {"expression": js})
    details = result.get("exceptionDetails")
    if details:
        description = details.get("exception", {}).get("description")
        return {"error": description or details.get("text", "JavaScript error")}
    message = result.get("result", {}).get("value", "")
    # A successful click always starts with "Clicked: ", so this prefix
    # can only come from the not-found branch of the script.
    if message.startswith("Element not found: "):
        return {"error": message}
    return {"status": "success", "result": message}
def run_javascript(code: str) -> dict:
    """Execute JavaScript code on the current page and return the result.

    SECURITY NOTE: ``code`` runs verbatim in the page context. That is the
    purpose of this tool, but the caller must trust whoever supplies it.

    Args:
        code: JavaScript code to evaluate in the page context.

    Returns:
        ``{"error": ...}`` when no page tab is available or the script
        throws. Otherwise a dict with "status" and the evaluated value
        under "result" ("" when the reply carries no value).
    """
    ws_url = _get_first_page_ws()
    if not ws_url:
        return {"error": "No tabs open"}
    # returnByValue asks Chrome to serialise objects and arrays into the
    # reply instead of returning only a remote handle with no "value".
    result = _cdp_send(
        ws_url,
        "Runtime.evaluate",
        {"expression": code, "returnByValue": True},
    )
    details = result.get("exceptionDetails")
    if details:
        # Report thrown exceptions as errors, not as success with "".
        description = details.get("exception", {}).get("description")
        return {"error": description or details.get("text", "JavaScript error")}
    return {"status": "success", "result": result.get("result", {}).get("value", "")}
# --- The Agent ---
# Module-level agent instance wiring the six tool functions defined in this
# module to a Gemini model.
# NOTE(review): presumably ADK tooling discovers the agent through the
# module-level name `root_agent` — confirm against the ADK docs.
root_agent = Agent(
name="browser_agent",
model="gemini-2.5-flash",
description="An AI agent that controls your Chrome browser via CDP.",
# The instruction text is passed to the agent as-is.
instruction="""You are a browser control agent. You can open tabs, read page content,
take screenshots, click elements, and run JavaScript — all in the user's real Chrome browser.
When the user asks you to do something with their browser, use the appropriate tool.
Always confirm what you did after each action. Be concise but informative.""",
# Plain Python callables exposed to the agent as tools.
tools=[
list_tabs,
open_url,
get_page_text,
take_screenshot,
click_element,
run_javascript,
],
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment