JacobFV · October 22, 2024 18:30
diff --git a/computer.py b/computer.py
    async def __call__(
        self,
        *,
        action: Action,
        text: str | None = None,
        coordinate: tuple[int, int] | None = None,
        **kwargs,
    ):
        """Validate the arguments for ``action`` and run it.

        Actions fall into three groups, each with its own argument rules:

        * ``mouse_move`` / ``left_click_drag``: need ``coordinate``, reject ``text``.
        * ``key`` / ``type``: need ``text``, reject ``coordinate``.
        * ``left_click`` / ``right_click`` / ``double_click`` / ``middle_click`` /
          ``screenshot`` / ``cursor_position``: reject both.

        Args:
            action: Name of the action to perform.
            text: Key name(s) for ``key`` or the literal text for ``type``.
            coordinate: Two non-negative ints (list or tuple); passed through
                ``self.scale_coordinates`` with ``ScalingSource.API`` before use.
            **kwargs: Accepted and ignored.

        Returns:
            The result of ``self.shell`` or ``self.screenshot`` for most actions.
            ``type`` returns a ``ToolResult`` whose output/error are the joined
            per-chunk values plus a fresh screenshot; ``cursor_position`` returns
            the shell result with its output replaced by ``"X=<x>,Y=<y>"``.

        Raises:
            ToolError: On a missing, unexpected or malformed argument, an
                unknown action, or unparseable cursor-position output.
        """
        if action in ("mouse_move", "left_click_drag"):
            if coordinate is None:
                raise ToolError(f"coordinate is required for {action}")
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            # Accept tuples as well as lists: the annotation and the error
            # message both promise a tuple, but only lists used to pass.
            if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
                raise ToolError(f"{coordinate} must be a tuple of length 2")
            if not all(isinstance(i, int) and i >= 0 for i in coordinate):
                raise ToolError(f"{coordinate} must be a tuple of non-negative ints")

            x, y = self.scale_coordinates(
                ScalingSource.API, coordinate[0], coordinate[1]
            )

            if action == "mouse_move":
                return await self.shell(f"{self.xdotool} mousemove --sync {x} {y}")
            return await self.shell(
                f"{self.xdotool} mousedown 1 mousemove --sync {x} {y} mouseup 1"
            )

        if action in ("key", "type"):
            if text is None:
                raise ToolError(f"text is required for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")
            if not isinstance(text, str):
                # Positional message, matching every other ToolError raised here.
                raise ToolError(f"{text} must be a string")

            if action == "key":
                # Quote each whitespace-separated key so shell metacharacters in
                # ``text`` cannot alter the command, while several keys still
                # reach xdotool as separate arguments.
                keys = " ".join(shlex.quote(key) for key in text.split())
                return await self.shell(f"{self.xdotool} key -- {keys}")

            # action == "type": send the text in groups, screenshot once at the end.
            results: list[ToolResult] = []
            for chunk in chunks(text, TYPING_GROUP_SIZE):
                cmd = f"{self.xdotool} type --delay {TYPING_DELAY_MS} -- {shlex.quote(chunk)}"
                results.append(await self.shell(cmd, take_screenshot=False))
            screenshot_base64 = (await self.screenshot()).base64_image
            return ToolResult(
                output="".join(result.output or "" for result in results),
                error="".join(result.error or "" for result in results),
                base64_image=screenshot_base64,
            )

        if action in (
            "left_click",
            "right_click",
            "double_click",
            "middle_click",
            "screenshot",
            "cursor_position",
        ):
            if text is not None:
                raise ToolError(f"text is not accepted for {action}")
            if coordinate is not None:
                raise ToolError(f"coordinate is not accepted for {action}")

            if action == "screenshot":
                return await self.screenshot()

            if action == "cursor_position":
                result = await self.shell(
                    f"{self.xdotool} getmouselocation --shell",
                    take_screenshot=False,
                )
                output = result.output or ""
                # The output is expected to contain "X=<int>" and "Y=<int>"
                # lines; report anything else as a ToolError rather than
                # leaking an IndexError/ValueError.
                try:
                    raw_x = int(output.split("X=")[1].split("\n")[0])
                    raw_y = int(output.split("Y=")[1].split("\n")[0])
                except (IndexError, ValueError) as err:
                    raise ToolError(
                        f"Failed to parse cursor position from: {output!r}"
                    ) from err
                x, y = self.scale_coordinates(ScalingSource.COMPUTER, raw_x, raw_y)
                return result.replace(output=f"X={x},Y={y}")

            # Remaining actions are clicks; map each to its xdotool arguments.
            click_arg = {
                "left_click": "1",
                "right_click": "3",
                "middle_click": "2",
                "double_click": "--repeat 2 --delay 500 1",
            }[action]
            return await self.shell(f"{self.xdotool} click {click_arg}")

        raise ToolError(f"Invalid action: {action}")

    async def screenshot(self):
        """Capture the screen to a PNG under ``OUTPUT_DIR`` and return it base64-encoded.

        Uses ``gnome-screenshot`` when it is on ``PATH`` and ``scrot`` otherwise.
        When ``self._scaling_enabled`` is set, the image is resized in place with
        ``convert`` to the size given by ``self.scale_coordinates``.

        Returns:
            The capture command's result with ``base64_image`` set to the
            base64-encoded file contents.

        Raises:
            ToolError: If no screenshot file exists after the commands have run.
        """
        output_dir = Path(OUTPUT_DIR)
        output_dir.mkdir(parents=True, exist_ok=True)
        path = output_dir / f"screenshot_{uuid4().hex}.png"
        # Quote the path so an OUTPUT_DIR containing spaces or shell
        # metacharacters does not break the commands below.
        quoted_path = shlex.quote(str(path))

        # Try gnome-screenshot first
        if shutil.which("gnome-screenshot"):
            screenshot_cmd = (
                f"{self._display_prefix}gnome-screenshot -f {quoted_path} -p"
            )
        else:
            # Fall back to scrot if gnome-screenshot isn't available
            screenshot_cmd = f"{self._display_prefix}scrot -p {quoted_path}"

        result = await self.shell(screenshot_cmd, take_screenshot=False)
        if self._scaling_enabled:
            x, y = self.scale_coordinates(
                ScalingSource.COMPUTER, self.width, self.height
            )
            # The "!" makes convert ignore the aspect ratio and use exactly x by y.
            await self.shell(
                f"convert {quoted_path} -resize {x}x{y}! {quoted_path}",
                take_screenshot=False,
            )

        if path.exists():
            return result.replace(
                base64_image=base64.b64encode(path.read_bytes()).decode()
            )
        raise ToolError(f"Failed to take screenshot: {result.error}")
	async def __call__(
		self,
		*,
		action: Action,
		text: str | None = None,
		coordinate: tuple[int, int] | None = None,
		**kwargs,
	):
		"""Validate the arguments for ``action`` and run it.

		Mouse-movement actions need ``coordinate`` and reject ``text``; ``key``
		and ``type`` need ``text`` and reject ``coordinate``; clicks,
		``screenshot`` and ``cursor_position`` reject both.

		Args:
			action: Name of the action to perform.
			text: Key name(s) for ``key`` or the literal text for ``type``.
			coordinate: Two non-negative ints (list or tuple); passed through
				``self.scale_coordinates`` with ``ScalingSource.API`` before use.
			**kwargs: Accepted and ignored.

		Returns:
			The result of ``self.shell`` or ``self.screenshot`` for most
			actions; a ``ToolResult`` with joined chunk output/error and a
			fresh screenshot for ``type``; the shell result with output
			``"X=<x>,Y=<y>"`` for ``cursor_position``.

		Raises:
			ToolError: On a missing, unexpected or malformed argument, an
				unknown action, or unparseable cursor-position output.
		"""
		if action in ("mouse_move", "left_click_drag"):
			if coordinate is None:
				raise ToolError(f"coordinate is required for {action}")
			if text is not None:
				raise ToolError(f"text is not accepted for {action}")
			# Accept tuples as well as lists, as the annotation and message promise.
			if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
				raise ToolError(f"{coordinate} must be a tuple of length 2")
			if not all(isinstance(i, int) and i >= 0 for i in coordinate):
				raise ToolError(f"{coordinate} must be a tuple of non-negative ints")

			x, y = self.scale_coordinates(
				ScalingSource.API, coordinate[0], coordinate[1]
			)

			if action == "mouse_move":
				return await self.shell(f"{self.xdotool} mousemove --sync {x} {y}")
			return await self.shell(
				f"{self.xdotool} mousedown 1 mousemove --sync {x} {y} mouseup 1"
			)

		if action in ("key", "type"):
			if text is None:
				raise ToolError(f"text is required for {action}")
			if coordinate is not None:
				raise ToolError(f"coordinate is not accepted for {action}")
			if not isinstance(text, str):
				# Positional message, matching every other ToolError raised here.
				raise ToolError(f"{text} must be a string")

			if action == "key":
				# Quote each whitespace-separated key so shell metacharacters
				# cannot alter the command; several keys stay separate arguments.
				keys = " ".join(shlex.quote(key) for key in text.split())
				return await self.shell(f"{self.xdotool} key -- {keys}")

			# action == "type": send the text in groups, screenshot once at the end.
			results: list[ToolResult] = []
			for chunk in chunks(text, TYPING_GROUP_SIZE):
				cmd = f"{self.xdotool} type --delay {TYPING_DELAY_MS} -- {shlex.quote(chunk)}"
				results.append(await self.shell(cmd, take_screenshot=False))
			screenshot_base64 = (await self.screenshot()).base64_image
			return ToolResult(
				output="".join(result.output or "" for result in results),
				error="".join(result.error or "" for result in results),
				base64_image=screenshot_base64,
			)

		if action in (
			"left_click",
			"right_click",
			"double_click",
			"middle_click",
			"screenshot",
			"cursor_position",
		):
			if text is not None:
				raise ToolError(f"text is not accepted for {action}")
			if coordinate is not None:
				raise ToolError(f"coordinate is not accepted for {action}")

			if action == "screenshot":
				return await self.screenshot()

			if action == "cursor_position":
				result = await self.shell(
					f"{self.xdotool} getmouselocation --shell",
					take_screenshot=False,
				)
				output = result.output or ""
				# Report unexpected output as a ToolError instead of leaking
				# an IndexError/ValueError from the parsing below.
				try:
					raw_x = int(output.split("X=")[1].split("\n")[0])
					raw_y = int(output.split("Y=")[1].split("\n")[0])
				except (IndexError, ValueError) as err:
					raise ToolError(
						f"Failed to parse cursor position from: {output!r}"
					) from err
				x, y = self.scale_coordinates(ScalingSource.COMPUTER, raw_x, raw_y)
				return result.replace(output=f"X={x},Y={y}")

			# Remaining actions are clicks; map each to its xdotool arguments.
			click_arg = {
				"left_click": "1",
				"right_click": "3",
				"middle_click": "2",
				"double_click": "--repeat 2 --delay 500 1",
			}[action]
			return await self.shell(f"{self.xdotool} click {click_arg}")

		raise ToolError(f"Invalid action: {action}")

	async def screenshot(self):
		"""Capture the screen to a PNG under ``OUTPUT_DIR`` and return it base64-encoded.

		Uses ``gnome-screenshot`` when it is on ``PATH`` and ``scrot`` otherwise.
		When ``self._scaling_enabled`` is set, the image is resized in place
		with ``convert`` to the size given by ``self.scale_coordinates``.

		Returns:
			The capture command's result with ``base64_image`` set to the
			base64-encoded file contents.

		Raises:
			ToolError: If no screenshot file exists after the commands have run.
		"""
		output_dir = Path(OUTPUT_DIR)
		output_dir.mkdir(parents=True, exist_ok=True)
		path = output_dir / f"screenshot_{uuid4().hex}.png"
		# Quote the path so an OUTPUT_DIR containing spaces or shell
		# metacharacters does not break the commands below.
		quoted_path = shlex.quote(str(path))

		# Try gnome-screenshot first
		if shutil.which("gnome-screenshot"):
			screenshot_cmd = (
				f"{self._display_prefix}gnome-screenshot -f {quoted_path} -p"
			)
		else:
			# Fall back to scrot if gnome-screenshot isn't available
			screenshot_cmd = f"{self._display_prefix}scrot -p {quoted_path}"

		result = await self.shell(screenshot_cmd, take_screenshot=False)
		if self._scaling_enabled:
			x, y = self.scale_coordinates(
				ScalingSource.COMPUTER, self.width, self.height
			)
			# The "!" makes convert ignore the aspect ratio and use exactly x by y.
			await self.shell(
				f"convert {quoted_path} -resize {x}x{y}! {quoted_path}",
				take_screenshot=False,
			)

		if path.exists():
			return result.replace(
				base64_image=base64.b64encode(path.read_bytes()).decode()
			)
		raise ToolError(f"Failed to take screenshot: {result.error}")