import io
import json
import re
from typing import Any, Dict, List

import streamlit as st
from google import genai
from google.genai import types
from PIL import Image, ImageDraw, ImageFont

MODEL_ID = "gemini-robotics-er-1.5-preview"
PROMPT = """
Point to no more than 10 items in the image. The label returned should be an identifying name for the object detected. The answer should follow the json format: [{"point": <point>, "label": <label>}]. The points are in [y, x] format normalized to 0-1000.
"""
MAX_MODEL_WIDTH = 1024

st.set_page_config(page_title="Gemini Vision Points", layout="wide")
st.title("Gemini Vision Pointing Demo")
st.caption("Visualize Gemini Robotics ER detections as labeled points over your image.")


@st.cache_resource(show_spinner=False)
def get_client() -> genai.Client:
    # No key is passed here: the client reads it from the environment
    # (GEMINI_API_KEY or GOOGLE_API_KEY).
    return genai.Client()

def load_image(image_source: io.BytesIO | str) -> Image.Image:
    image = Image.open(image_source).convert("RGB")
    # Downscale wide images before sending them to the model; the returned
    # points are normalized to 0-1000, so resizing does not skew coordinates.
    if image.width > MAX_MODEL_WIDTH:
        scale = MAX_MODEL_WIDTH / image.width
        new_size = (MAX_MODEL_WIDTH, int(image.height * scale))
        image = image.resize(new_size, Image.Resampling.LANCZOS)
    return image

def extract_json_payload(raw_text: str) -> str:
    text = raw_text.strip()
    if not text:
        raise ValueError("Model returned an empty response.")
    # Prefer a fenced ```json ... ``` block if the model wrapped its answer.
    code_block_match = re.search(r"```(?:json)?\s*(\[.*?\])\s*```", text, flags=re.DOTALL)
    if code_block_match:
        return code_block_match.group(1)
    # Otherwise fall back to the outermost [...] span in the reply.
    bracket_start = text.find("[")
    bracket_end = text.rfind("]")
    if bracket_start != -1 and bracket_end != -1 and bracket_end > bracket_start:
        return text[bracket_start : bracket_end + 1]
    raise ValueError("Model response did not contain a JSON array.")

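# Sketch of how extract_json_payload behaves on two hypothetical replies:
#   extract_json_payload('```json\n[{"point": [1, 2], "label": "cup"}]\n```')
#   extract_json_payload('Sure! [{"point": [1, 2], "label": "cup"}]')
# both return '[{"point": [1, 2], "label": "cup"}]'.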
def parse_detections(raw_text: str) -> List[Dict[str, Any]]:
    payload = extract_json_payload(raw_text)
    try:
        detections = json.loads(payload)
    except json.JSONDecodeError as exc:
        raise ValueError("Model response was not a valid JSON array.") from exc
    if not isinstance(detections, list):
        raise ValueError("Model response must be a list of detections.")
    cleaned: List[Dict[str, Any]] = []
    for detection in detections:
        # Skip malformed entries instead of failing the whole response.
        if not isinstance(detection, dict):
            continue
        point = detection.get("point")
        label = detection.get("label")
        if not isinstance(point, (list, tuple)) or len(point) != 2:
            continue
        if not isinstance(label, str) or not label.strip():
            continue
        try:
            y = float(point[0])
            x = float(point[1])
        except (TypeError, ValueError):
            continue
        cleaned.append({"point": [y, x], "label": label.strip()})
    if not cleaned:
        raise ValueError("No valid detections returned by the model.")
    return cleaned

def annotate_image(image: Image.Image, detections: List[Dict[str, Any]]) -> Image.Image:
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)
    font = ImageFont.load_default()
    width, height = annotated.size
    for detection in detections:
        # Map normalized [y, x] (0-1000) to pixel coordinates, clamped to the frame.
        y_norm, x_norm = detection["point"]
        px = max(0, min(width, (x_norm / 1000) * width))
        py = max(0, min(height, (y_norm / 1000) * height))
        # Marker dot scaled to the image size. Note the alpha suffix in the
        # fill color is dropped on RGB images, so the dot renders opaque.
        r = max(6, min(width, height) // 50)
        ellipse_bounds = [(px - r, py - r), (px + r, py + r)]
        draw.ellipse(ellipse_bounds, outline="#1D4ED8", width=3, fill="#93C5FD88")
        # Size the label bubble from the rendered text metrics.
        text = detection["label"]
        text_bbox = draw.textbbox((0, 0), text, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]
        pad_x = 10
        pad_y = 6
        bubble_width = text_width + pad_x * 2
        bubble_height = text_height + pad_y * 2
        # Center the bubble over the point, keep it inside the image, and
        # flip it below the marker when it would clip the top edge.
        box_x0 = px - bubble_width / 2
        if box_x0 < 4:
            box_x0 = 4
        if box_x0 + bubble_width > width - 4:
            box_x0 = width - bubble_width - 4
        box_y0 = py - r - bubble_height - 12
        if box_y0 < 4:
            box_y0 = min(height - bubble_height - 4, py + r + 12)
        box_x1 = box_x0 + bubble_width
        box_y1 = box_y0 + bubble_height
        radius = max(6, int(bubble_height * 0.4))
        draw.rounded_rectangle(
            [box_x0, box_y0, box_x1, box_y1],
            radius=radius,
            fill="#0F172ACC",
            outline="#60A5FA",
            width=2,
        )
        text_position = (box_x0 + pad_x, box_y0 + pad_y)
        draw.text(text_position, text, fill="#F8FAFC", font=font)
        # Short connector line between the bubble and the marker dot.
        connector_y = box_y1 + 4 if box_y0 < py else box_y0 - 4
        draw.line([(px, py), (px, connector_y)], fill="#1D4ED8", width=2)
    return annotated

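# Worked example of the coordinate mapping above (illustrative numbers):
# a point of [500, 250] on a 1024x768 image lands at
#   px = (250 / 1000) * 1024 = 256,  py = (500 / 1000) * 768 = 384.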
@st.cache_data(show_spinner=False)
def run_inference(image_bytes: bytes, temperature: float, thinking_budget: int) -> Dict[str, Any]:
    image = load_image(io.BytesIO(image_bytes))
    client = get_client()
    response = client.models.generate_content(
        model=MODEL_ID,
        contents=[image, PROMPT],
        config=types.GenerateContentConfig(
            temperature=temperature,
            thinking_config=types.ThinkingConfig(thinking_budget=thinking_budget),
        ),
    )
    # response.text can be None (e.g. a blocked response), so guard before stripping.
    raw_text = (response.text or "").strip()
    detections = parse_detections(raw_text)
    annotated = annotate_image(image, detections)
    return {"detections": detections, "annotated": annotated, "raw_text": raw_text}

with st.sidebar:
    st.header("Image & Model Settings")
    uploaded_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg", "webp"])
    temperature = st.slider("Temperature", 0.0, 1.0, 0.3, step=0.05)
    thinking_budget = st.slider("Thinking budget", 0, 30, 0, step=1)
    st.markdown("Using model: `gemini-robotics-er-1.5-preview`")

# Fall back to a local sample image when nothing has been uploaded.
default_image_path = "/Users/amosgyamfi/Documents/StreamDevRel/2026/AIPython/GeminiAPI/my-image.png"
if uploaded_file is not None:
    image_source = uploaded_file
    filename = uploaded_file.name
else:
    image_source = default_image_path
    filename = "my-image.png"

try:
    base_image = load_image(image_source)
except FileNotFoundError:
    st.error("Default image `my-image.png` was not found. Upload an image to continue.")
    st.stop()

st.subheader("Input Image")
st.image(base_image, caption=f"Original image: {filename}", use_container_width=True)

# Re-encode as PNG so the cached inference receives stable, hashable bytes.
image_bytes_io = io.BytesIO()
base_image.save(image_bytes_io, format="PNG")

with st.spinner("Calling Gemini Vision model..."):
    try:
        result = run_inference(image_bytes_io.getvalue(), temperature, thinking_budget)
    except ValueError as exc:
        st.error(f"Gemini Vision returned no usable detections: {exc}")
        st.stop()
    except Exception as exc:  # noqa: BLE001
        st.exception(exc)
        st.stop()

st.subheader("Detections")
annotated_image = result["annotated"]
st.image(annotated_image, caption="Annotated detections", use_container_width=True)

with st.expander("Detection details", expanded=True):
    st.json(result["detections"])
with st.expander("Raw model response"):
    st.code(result["raw_text"], language="json")

st.info("Run `streamlit run gemini_vision_ai.py` to launch the app.")
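# Setup sketch (assumptions: the `streamlit`, `google-genai`, and `pillow`
# packages are installed, and an API key is exported as GEMINI_API_KEY or
# GOOGLE_API_KEY so genai.Client() can find it):
#   pip install streamlit google-genai pillow
#   export GEMINI_API_KEY="your-key"
#   streamlit run gemini_vision_ai.py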