Skip to content

Instantly share code, notes, and snippets.

@amosgyamfi
Created September 30, 2025 11:40
Show Gist options
  • Save amosgyamfi/e0df5da5f98018f0f9d6bf81b8bb3451 to your computer and use it in GitHub Desktop.
import io
import json
import re
from typing import Any, Dict, List
import streamlit as st
from google import genai
from google.genai import types
from PIL import Image, ImageDraw, ImageFont
# Gemini Robotics ER model used for spatial "pointing" detections.
MODEL_ID = "gemini-robotics-er-1.5-preview"
# Prompt instructing the model to return up to 10 labeled points as a JSON
# array; points come back in [y, x] order, normalized to the 0-1000 range.
PROMPT = """
Point to no more than 10 items in the image. The label returned should be an identifying name for the object detected. The answer should follow the json format: [{"point": <point>, "label": <label>}]. The points are in [y, x] format normalized to 0-1000.
"""
# Images wider than this are downscaled before being sent to the model.
MAX_MODEL_WIDTH = 1024

# Streamlit page setup (must run before any other st.* call renders).
st.set_page_config(page_title="Gemini Vision Points", layout="wide")
st.title("Gemini Vision Pointing Demo")
st.caption("Visualize Gemini Robotics ER detections as labeled points over your image.")
@st.cache_resource(show_spinner=False)
def get_client() -> genai.Client:
    """Build the Gemini API client once and reuse it across reruns.

    ``st.cache_resource`` keeps a single shared instance for the session,
    so repeated Streamlit reruns do not reconstruct the client.
    """
    client = genai.Client()
    return client
def load_image(image_source: io.BytesIO | str) -> Image.Image:
    """Open *image_source* as an RGB image, capped at MAX_MODEL_WIDTH pixels wide.

    Wider images are proportionally downscaled with LANCZOS resampling so the
    payload sent to the model stays small; narrower images pass through as-is.
    """
    image = Image.open(image_source).convert("RGB")
    if image.width <= MAX_MODEL_WIDTH:
        return image
    ratio = MAX_MODEL_WIDTH / image.width
    target_size = (MAX_MODEL_WIDTH, int(image.height * ratio))
    return image.resize(target_size, Image.Resampling.LANCZOS)
def extract_json_payload(raw_text: str) -> str:
    """Return the JSON-array substring embedded in a raw model reply.

    Looks for a fenced ```json code block first; failing that, falls back to
    the outermost ``[...]`` span in the text. Raises ValueError when the reply
    is empty or contains no array at all.
    """
    stripped = raw_text.strip()
    if not stripped:
        raise ValueError("Model returned an empty response.")
    fenced = re.search(r"```(?:json)?\s*(\[.*?\])\s*```", stripped, flags=re.DOTALL)
    if fenced is not None:
        return fenced.group(1)
    start = stripped.find("[")
    end = stripped.rfind("]")
    # end > start implies both brackets were found and are properly ordered.
    if start != -1 and end > start:
        return stripped[start : end + 1]
    raise ValueError("Model response did not contain a JSON array.")
def parse_detections(raw_text: str) -> List[Dict[str, Any]]:
    """Parse the model reply into a validated list of detections.

    Each returned item has the shape ``{"point": [y, x], "label": str}`` with
    the coordinates coerced to float and the label stripped of whitespace.

    Raises:
        ValueError: if the reply is not a JSON list, or yields no valid items.
    """
    payload = extract_json_payload(raw_text)
    try:
        detections = json.loads(payload)
    except json.JSONDecodeError as exc:
        raise ValueError("Model response was not valid JSON array.") from exc
    if not isinstance(detections, list):
        raise ValueError("Model response must be a list of detections.")
    cleaned: List[Dict[str, Any]] = []
    for detection in detections:
        # Fix: a non-dict element (e.g. a bare string or number in the array)
        # previously raised AttributeError on .get(); skip it like any other
        # malformed entry instead of crashing the whole parse.
        if not isinstance(detection, dict):
            continue
        point = detection.get("point")
        label = detection.get("label")
        if not isinstance(point, (list, tuple)) or len(point) != 2:
            continue
        if not isinstance(label, str) or not label.strip():
            continue
        try:
            y = float(point[0])
            x = float(point[1])
        except (TypeError, ValueError):
            continue
        cleaned.append({"point": [y, x], "label": label.strip()})
    if not cleaned:
        raise ValueError("No valid detections returned by the model.")
    return cleaned
def annotate_image(image: Image.Image, detections: List[Dict[str, Any]]) -> Image.Image:
    """Draw each detection as a filled point plus a rounded label bubble.

    Args:
        image: Source RGB image (not modified; a copy is annotated).
        detections: Items of shape {"point": [y, x], "label": str}, with
            coordinates normalized to 0-1000 as returned by the model.

    Returns:
        A new PIL image with one marker, label bubble, and connector line
        per detection.
    """
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)
    font = ImageFont.load_default()
    width, height = annotated.size
    for detection in detections:
        # Model coordinates arrive [y, x] normalized to 0-1000; map to
        # pixels and clamp inside the image bounds.
        y_norm, x_norm = detection["point"]
        px = max(0, min(width, (x_norm / 1000) * width))
        py = max(0, min(height, (y_norm / 1000) * height))
        # Marker radius scales with image size but never drops below 6 px.
        r = max(6, min(width, height) // 50)
        ellipse_bounds = [(px - r, py - r), (px + r, py + r)]
        draw.ellipse(ellipse_bounds, outline="#1D4ED8", width=3, fill="#93C5FD88")
        text = detection["label"]
        # Measure the label so the bubble can be sized around it.
        text_bbox = draw.textbbox((0, 0), text, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]
        pad_x = 10
        pad_y = 6
        bubble_width = text_width + pad_x * 2
        bubble_height = text_height + pad_y * 2
        # Center the bubble on the point horizontally, then push it back
        # inside a 4 px margin from either edge.
        box_x0 = px - bubble_width / 2
        if box_x0 < 4:
            box_x0 = 4
        if box_x0 + bubble_width > width - 4:
            box_x0 = width - bubble_width - 4
        # Prefer placing the bubble above the marker; if it would overflow
        # the top, flip it below the marker instead.
        box_y0 = py - r - bubble_height - 12
        if box_y0 < 4:
            box_y0 = min(height - bubble_height - 4, py + r + 12)
        box_x1 = box_x0 + bubble_width
        box_y1 = box_y0 + bubble_height
        radius = max(6, int(bubble_height * 0.4))
        draw.rounded_rectangle(
            [box_x0, box_y0, box_x1, box_y1],
            radius=radius,
            fill="#0F172ACC",
            outline="#60A5FA",
            width=2,
        )
        text_position = (box_x0 + pad_x, box_y0 + pad_y)
        draw.text(text_position, text, fill="#F8FAFC", font=font)
        # Vertical connector from the marker toward the nearer bubble edge.
        connector_y = box_y1 + 4 if box_y0 < py else box_y0 - 4
        draw.line([(px, py), (px, connector_y)], fill="#1D4ED8", width=2)
    return annotated
@st.cache_data(show_spinner=False)
def run_inference(image_bytes: bytes, temperature: float, thinking_budget: int) -> Dict[str, Any]:
    """Send the image to Gemini and return parsed + annotated results.

    Cached by st.cache_data on the (image_bytes, temperature, thinking_budget)
    tuple, so identical requests skip the API call.

    Args:
        image_bytes: PNG-encoded image payload.
        temperature: Sampling temperature forwarded to the model.
        thinking_budget: Thinking-token budget forwarded to the model.

    Returns:
        Dict with "detections" (validated list), "annotated" (PIL image),
        and "raw_text" (the model's raw reply).

    Raises:
        ValueError: propagated from parsing when the reply is empty/invalid.
    """
    image = load_image(io.BytesIO(image_bytes))
    client = get_client()
    response = client.models.generate_content(
        model=MODEL_ID,
        contents=[image, PROMPT],
        config=types.GenerateContentConfig(
            temperature=temperature,
            thinking_config=types.ThinkingConfig(thinking_budget=thinking_budget),
        ),
    )
    # Fix: response.text can be None (e.g. no candidates / blocked output);
    # calling .strip() on it raised AttributeError, bypassing the caller's
    # ValueError handling. Coerce to "" so parsing raises a clean ValueError.
    raw_text = (response.text or "").strip()
    detections = parse_detections(raw_text)
    annotated = annotate_image(image, detections)
    return {"detections": detections, "annotated": annotated, "raw_text": raw_text}
# --- Sidebar: input selection and model parameters -------------------------
with st.sidebar:
    st.header("Image & Model Settings")
    uploaded_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg", "webp"])
    temperature = st.slider("Temperature", 0.0, 1.0, 0.3, step=0.05)
    thinking_budget = st.slider("Thinking budget", 0, 30, 0, step=1)
    st.markdown("Using model: `gemini-robotics-er-1.5-preview`")

# Fallback image used when nothing is uploaded.
# NOTE(review): absolute, machine-specific path — consider a relative path.
default_image_path = "/Users/amosgyamfi/Documents/StreamDevRel/2026/AIPython/GeminiAPI/my-image.png"

if uploaded_file is not None:
    image_source = uploaded_file
    filename = uploaded_file.name
else:
    image_source = default_image_path
    filename = "my-image.png"

try:
    base_image = load_image(image_source)
except FileNotFoundError:
    st.error("Default image `my-image.png` was not found. Upload an image to continue.")
    st.stop()

st.subheader("Input Image")
# Fix: the caption was a placeholder-less f-string and `filename` was never
# used; show the actual source filename in the caption.
st.image(base_image, caption=f"Original image: {filename}", use_container_width=True)

# Re-encode the (possibly downscaled) image so the cached inference keys on
# stable PNG bytes rather than on the upload object.
image_bytes_io = io.BytesIO()
base_image.save(image_bytes_io, format="PNG")

with st.spinner("Calling Gemini Vision model..."):
    try:
        result = run_inference(image_bytes_io.getvalue(), temperature, thinking_budget)
    except ValueError as exc:
        st.error(f"Gemini Vision returned no detections: {exc}")
        st.stop()
    except Exception as exc:  # noqa: BLE001
        st.exception(exc)
        st.stop()

st.subheader("Detections")
annotated_image = result["annotated"]
st.image(annotated_image, caption="Annotated detections", use_container_width=True)

with st.expander("Detection details", expanded=True):
    st.json(result["detections"])

with st.expander("Raw model response"):
    st.code(result["raw_text"], language="json")

st.info("Run `streamlit run gemini_vision_ai.py` to launch the app.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment