import io
import json
import re
from typing import Any, Dict, List

import streamlit as st
from google import genai
from google.genai import types
from PIL import Image, ImageDraw, ImageFont

MODEL_ID = "gemini-robotics-er-1.5-preview"
PROMPT = """
Point to no more than 10 items in the image. The label returned should be an identifying name for the object detected. The answer should follow the json format: [{"point": <point>, "label": <label>}]. The points are in [y, x] format normalized to 0-1000.
"""
MAX_MODEL_WIDTH = 1024

st.set_page_config(page_title="Gemini Vision Points", layout="wide")
st.title("Gemini Vision Pointing Demo")
st.caption("Visualize Gemini Robotics ER detections as labeled points over your image.")


@st.cache_resource(show_spinner=False)
def get_client() -> genai.Client:
    # No key is passed here: the client reads it from the environment
    # (GEMINI_API_KEY or GOOGLE_API_KEY).
    return genai.Client()

def load_image(image_source: io.BytesIO | str) -> Image.Image:
    image = Image.open(image_source).convert("RGB")
    # Downscale wide images before sending them to the model; the returned
    # points are normalized to 0-1000, so resizing does not skew coordinates.
    if image.width > MAX_MODEL_WIDTH:
        scale = MAX_MODEL_WIDTH / image.width
        new_size = (MAX_MODEL_WIDTH, int(image.height * scale))
        image = image.resize(new_size, Image.Resampling.LANCZOS)
    return image

def extract_json_payload(raw_text: str) -> str:
    text = raw_text.strip()
    if not text:
        raise ValueError("Model returned an empty response.")
    # Prefer a fenced ```json ... ``` block if the model wrapped its answer.
    code_block_match = re.search(r"```(?:json)?\s*(\[.*?\])\s*```", text, flags=re.DOTALL)
    if code_block_match:
        return code_block_match.group(1)
    # Otherwise fall back to the outermost [...] span in the reply.
    bracket_start = text.find("[")
    bracket_end = text.rfind("]")
    if bracket_start != -1 and bracket_end != -1 and bracket_end > bracket_start:
        return text[bracket_start : bracket_end + 1]
    raise ValueError("Model response did not contain a JSON array.")

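# Sketch of how extract_json_payload behaves on two hypothetical replies:
#   extract_json_payload('```json\n[{"point": [1, 2], "label": "cup"}]\n```')
#   extract_json_payload('Sure! [{"point": [1, 2], "label": "cup"}]')
# both return '[{"point": [1, 2], "label": "cup"}]'.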
def parse_detections(raw_text: str) -> List[Dict[str, Any]]:
    payload = extract_json_payload(raw_text)
    try:
        detections = json.loads(payload)
    except json.JSONDecodeError as exc:
        raise ValueError("Model response was not a valid JSON array.") from exc
    if not isinstance(detections, list):
        raise ValueError("Model response must be a list of detections.")
    cleaned: List[Dict[str, Any]] = []
    for detection in detections:
        # Skip malformed entries instead of failing the whole response.
        if not isinstance(detection, dict):
            continue
        point = detection.get("point")
        label = detection.get("label")
        if not isinstance(point, (list, tuple)) or len(point) != 2:
            continue
        if not isinstance(label, str) or not label.strip():
            continue
        try:
            y = float(point[0])
            x = float(point[1])
        except (TypeError, ValueError):
            continue
        cleaned.append({"point": [y, x], "label": label.strip()})
    if not cleaned:
        raise ValueError("No valid detections returned by the model.")
    return cleaned

def annotate_image(image: Image.Image, detections: List[Dict[str, Any]]) -> Image.Image:
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)
    font = ImageFont.load_default()
    width, height = annotated.size
    for detection in detections:
        # Map normalized [y, x] (0-1000) to pixel coordinates, clamped to the frame.
        y_norm, x_norm = detection["point"]
        px = max(0, min(width, (x_norm / 1000) * width))
        py = max(0, min(height, (y_norm / 1000) * height))
        # Marker dot scaled to the image size. Note the alpha suffix in the
        # fill color is dropped on RGB images, so the dot renders opaque.
        r = max(6, min(width, height) // 50)
        ellipse_bounds = [(px - r, py - r), (px + r, py + r)]
        draw.ellipse(ellipse_bounds, outline="#1D4ED8", width=3, fill="#93C5FD88")
        # Size the label bubble from the rendered text metrics.
        text = detection["label"]
        text_bbox = draw.textbbox((0, 0), text, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]
        pad_x = 10
        pad_y = 6
        bubble_width = text_width + pad_x * 2
        bubble_height = text_height + pad_y * 2
        # Center the bubble over the point, keep it inside the image, and
        # flip it below the marker when it would clip the top edge.
        box_x0 = px - bubble_width / 2
        if box_x0 < 4:
            box_x0 = 4
        if box_x0 + bubble_width > width - 4:
            box_x0 = width - bubble_width - 4
        box_y0 = py - r - bubble_height - 12
        if box_y0 < 4:
            box_y0 = min(height - bubble_height - 4, py + r + 12)
        box_x1 = box_x0 + bubble_width
        box_y1 = box_y0 + bubble_height
        radius = max(6, int(bubble_height * 0.4))
        draw.rounded_rectangle(
            [box_x0, box_y0, box_x1, box_y1],
            radius=radius,
            fill="#0F172ACC",
            outline="#60A5FA",
            width=2,
        )
        text_position = (box_x0 + pad_x, box_y0 + pad_y)
        draw.text(text_position, text, fill="#F8FAFC", font=font)
        # Short connector line between the bubble and the marker dot.
        connector_y = box_y1 + 4 if box_y0 < py else box_y0 - 4
        draw.line([(px, py), (px, connector_y)], fill="#1D4ED8", width=2)
    return annotated

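# Worked example of the coordinate mapping above (illustrative numbers):
# a point of [500, 250] on a 1024x768 image lands at
#   px = (250 / 1000) * 1024 = 256,  py = (500 / 1000) * 768 = 384.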
@st.cache_data(show_spinner=False)
def run_inference(image_bytes: bytes, temperature: float, thinking_budget: int) -> Dict[str, Any]:
    image = load_image(io.BytesIO(image_bytes))
    client = get_client()
    response = client.models.generate_content(
        model=MODEL_ID,
        contents=[image, PROMPT],
        config=types.GenerateContentConfig(
            temperature=temperature,
            thinking_config=types.ThinkingConfig(thinking_budget=thinking_budget),
        ),
    )
    # response.text can be None (e.g. a blocked response), so guard before stripping.
    raw_text = (response.text or "").strip()
    detections = parse_detections(raw_text)
    annotated = annotate_image(image, detections)
    return {"detections": detections, "annotated": annotated, "raw_text": raw_text}

with st.sidebar:
    st.header("Image & Model Settings")
    uploaded_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg", "webp"])
    temperature = st.slider("Temperature", 0.0, 1.0, 0.3, step=0.05)
    thinking_budget = st.slider("Thinking budget", 0, 30, 0, step=1)
    st.markdown("Using model: `gemini-robotics-er-1.5-preview`")

# Fall back to a local sample image when nothing has been uploaded.
default_image_path = "/Users/amosgyamfi/Documents/StreamDevRel/2026/AIPython/GeminiAPI/my-image.png"
if uploaded_file is not None:
    image_source = uploaded_file
    filename = uploaded_file.name
else:
    image_source = default_image_path
    filename = "my-image.png"

try:
    base_image = load_image(image_source)
except FileNotFoundError:
    st.error("Default image `my-image.png` was not found. Upload an image to continue.")
    st.stop()

st.subheader("Input Image")
st.image(base_image, caption=f"Original image: {filename}", use_container_width=True)

# Re-encode as PNG so the cached inference receives stable, hashable bytes.
image_bytes_io = io.BytesIO()
base_image.save(image_bytes_io, format="PNG")

with st.spinner("Calling Gemini Vision model..."):
    try:
        result = run_inference(image_bytes_io.getvalue(), temperature, thinking_budget)
    except ValueError as exc:
        st.error(f"Gemini Vision returned no usable detections: {exc}")
        st.stop()
    except Exception as exc:  # noqa: BLE001
        st.exception(exc)
        st.stop()

st.subheader("Detections")
annotated_image = result["annotated"]
st.image(annotated_image, caption="Annotated detections", use_container_width=True)

with st.expander("Detection details", expanded=True):
    st.json(result["detections"])
with st.expander("Raw model response"):
    st.code(result["raw_text"], language="json")

st.info("Run `streamlit run gemini_vision_ai.py` to launch the app.")
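# Setup sketch (assumptions: the `streamlit`, `google-genai`, and `pillow`
# packages are installed, and an API key is exported as GEMINI_API_KEY or
# GOOGLE_API_KEY so genai.Client() can find it):
#   pip install streamlit google-genai pillow
#   export GEMINI_API_KEY="your-key"
#   streamlit run gemini_vision_ai.py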