Skip to content

Instantly share code, notes, and snippets.

@JacobFV
Created October 22, 2024 18:30
Show Gist options
  • Save JacobFV/2ed9912170c26d558cec0f13e034824c to your computer and use it in GitHub Desktop.
Save JacobFV/2ed9912170c26d558cec0f13e034824c to your computer and use it in GitHub Desktop.
async def __call__(
    self,
    *,
    action: Action,
    text: str | None = None,
    coordinate: tuple[int, int] | None = None,
    **kwargs,
):
    """Validate one requested action and execute it via xdotool.

    Supported actions, grouped by the arguments they accept:

    * ``mouse_move``, ``left_click_drag`` -- require ``coordinate`` (a
      two-element list or tuple of non-negative ints), reject ``text``.
    * ``key``, ``type`` -- require ``text``, reject ``coordinate``.
    * ``left_click``, ``right_click``, ``middle_click``, ``double_click``,
      ``screenshot``, ``cursor_position`` -- accept neither argument.

    Args:
        action: Name of the action to perform.
        text: Key names (for ``key``) or literal text (for ``type``).
        coordinate: Target position; passed through
            ``self.scale_coordinates`` with ``ScalingSource.API`` before use.
        **kwargs: Accepted and ignored.

    Returns:
        The result of ``self.shell`` / ``self.screenshot`` for the action.
        For ``type`` a ``ToolResult`` combining the output and error text of
        every typed chunk plus one final screenshot. For ``cursor_position``
        the shell result with its output replaced by ``X=<x>,Y=<y>`` after
        scaling with ``ScalingSource.COMPUTER``.

    Raises:
        ToolError: On an unknown action, a missing or disallowed argument,
            a malformed coordinate, or unparsable cursor-position output.
    """
    if action in ("mouse_move", "left_click_drag"):
        if coordinate is None:
            raise ToolError(f"coordinate is required for {action}")
        if text is not None:
            raise ToolError(f"text is not accepted for {action}")
        # The annotation promises a tuple while decoded JSON yields a list,
        # so both sequence types are accepted.
        if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
            raise ToolError(f"{coordinate} must be a tuple of length 2")
        if not all(isinstance(i, int) and i >= 0 for i in coordinate):
            raise ToolError(f"{coordinate} must be a tuple of non-negative ints")

        x, y = self.scale_coordinates(
            ScalingSource.API, coordinate[0], coordinate[1]
        )
        if action == "mouse_move":
            return await self.shell(f"{self.xdotool} mousemove --sync {x} {y}")
        # left_click_drag: press button 1, move to the target, release.
        return await self.shell(
            f"{self.xdotool} mousedown 1 mousemove --sync {x} {y} mouseup 1"
        )

    if action in ("key", "type"):
        if text is None:
            raise ToolError(f"text is required for {action}")
        if coordinate is not None:
            raise ToolError(f"coordinate is not accepted for {action}")
        if not isinstance(text, str):
            raise ToolError(f"{text} must be a string")

        if action == "key":
            # Quote each whitespace-separated keysym on its own: xdotool
            # still receives one argument per key, but shell metacharacters
            # in the caller-supplied text can no longer be interpreted.
            keys = " ".join(shlex.quote(key) for key in text.split())
            return await self.shell(f"{self.xdotool} key -- {keys}")

        # type: send the text in fixed-size chunks, screenshot once at the end.
        results: list[ToolResult] = []
        for chunk in chunks(text, TYPING_GROUP_SIZE):
            cmd = f"{self.xdotool} type --delay {TYPING_DELAY_MS} -- {shlex.quote(chunk)}"
            results.append(await self.shell(cmd, take_screenshot=False))
        screenshot_base64 = (await self.screenshot()).base64_image
        return ToolResult(
            output="".join(result.output or "" for result in results),
            error="".join(result.error or "" for result in results),
            base64_image=screenshot_base64,
        )

    if action in (
        "left_click",
        "right_click",
        "double_click",
        "middle_click",
        "screenshot",
        "cursor_position",
    ):
        if text is not None:
            raise ToolError(f"text is not accepted for {action}")
        if coordinate is not None:
            raise ToolError(f"coordinate is not accepted for {action}")

        if action == "screenshot":
            return await self.screenshot()

        if action == "cursor_position":
            result = await self.shell(
                f"{self.xdotool} getmouselocation --shell",
                take_screenshot=False,
            )
            output = result.output or ""
            # Parse the X=<n> / Y=<n> lines; empty or malformed output is
            # reported as a ToolError rather than a bare IndexError/ValueError.
            try:
                raw_x = int(output.split("X=")[1].split("\n")[0])
                raw_y = int(output.split("Y=")[1].split("\n")[0])
            except (IndexError, ValueError) as err:
                raise ToolError(
                    f"Failed to read cursor position: {result.error}"
                ) from err
            x, y = self.scale_coordinates(ScalingSource.COMPUTER, raw_x, raw_y)
            return result.replace(output=f"X={x},Y={y}")

        # Remaining actions are clicks; map each to its xdotool click arguments.
        click_arg = {
            "left_click": "1",
            "right_click": "3",
            "middle_click": "2",
            "double_click": "--repeat 2 --delay 500 1",
        }[action]
        return await self.shell(f"{self.xdotool} click {click_arg}")

    raise ToolError(f"Invalid action: {action}")
async def screenshot(self):
    """Take a screenshot of the current screen and return the base64 encoded image.

    The image is written to a uniquely named PNG under ``OUTPUT_DIR`` (created
    if missing) using ``gnome-screenshot`` when it is on ``PATH`` and ``scrot``
    otherwise. When ``self._scaling_enabled`` is set, the file is resized in
    place with ``convert`` to the dimensions returned by
    ``self.scale_coordinates`` for ``self.width`` x ``self.height``.

    Returns:
        The capture command's shell result with ``base64_image`` replaced by
        the base64-encoded contents of the PNG.

    Raises:
        ToolError: If the capture command did not produce the file.
    """
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)
    path = output_dir / f"screenshot_{uuid4().hex}.png"
    # Quote once so an OUTPUT_DIR containing spaces or shell metacharacters
    # cannot break (or alter) the commands below.
    quoted_path = shlex.quote(str(path))

    # Try gnome-screenshot first
    if shutil.which("gnome-screenshot"):
        screenshot_cmd = (
            f"{self._display_prefix}gnome-screenshot -f {quoted_path} -p"
        )
    else:
        # Fall back to scrot if gnome-screenshot isn't available
        screenshot_cmd = f"{self._display_prefix}scrot -p {quoted_path}"

    result = await self.shell(screenshot_cmd, take_screenshot=False)

    # Fail before resizing: there is nothing for convert to work on if the
    # capture did not write the file.
    if not path.exists():
        raise ToolError(f"Failed to take screenshot: {result.error}")

    if self._scaling_enabled:
        x, y = self.scale_coordinates(
            ScalingSource.COMPUTER, self.width, self.height
        )
        # The trailing "!" is passed to convert as part of the -resize geometry.
        await self.shell(
            f"convert {quoted_path} -resize {x}x{y}! {quoted_path}",
            take_screenshot=False,
        )

    return result.replace(
        base64_image=base64.b64encode(path.read_bytes()).decode()
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment