Skip to content

Instantly share code, notes, and snippets.

@insaneyilin
Created March 16, 2026 18:32
Show Gist options
  • Select an option

  • Save insaneyilin/82516f38e05c8c8ac392bef084a0d1ce to your computer and use it in GitHub Desktop.

Select an option

Save insaneyilin/82516f38e05c8c8ac392bef084a0d1ce to your computer and use it in GitHub Desktop.
ollama_detect_object_in_image
#!/usr/bin/env python3
"""
Object detection using Ollama vision models.
Usage: python ollama_object_detect.py <input_image_file_path> <object_text> <output_dir>
Arguments:
- input_image_file_path: Path to the input image file
- object_text: Object category description to detect (e.g., "dog", "cat")
- output_dir: Output directory where detection results will be saved
"""
import os
import sys
import json
import argparse
from typing import Optional, List
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import ollama
import supervision as sv
def detect_objects_with_ollama(image_path: str, object_text: str, model_name: str = "llava:7b") -> dict:
    """
    Detect objects in an image using an Ollama vision model.

    Args:
        image_path: Path to the input image.
        object_text: Description of objects to detect.
        model_name: Name of the Ollama vision model to use.

    Returns:
        Dictionary with a 'detections' list (possibly empty). On JSON parse
        failure the raw model output is kept under 'raw_response'; on API
        failure the message is kept under 'error'.
    """
    # Ask the model for a strict JSON payload so we can parse it back out.
    prompt = f"Detect and outline all instances of '{object_text}' in this image. " \
             f"Provide the bounding box coordinates in JSON format with the following structure: " \
             f"{{'detections': [{{'x': center_x, 'y': center_y, 'width': width, 'height': height, " \
             f"'label': '{object_text}', 'confidence': confidence_value}}]}}. " \
             f"Use pixel coordinates relative to the image dimensions. " \
             f"If no instances are found, return an empty detections array: {{'detections': []}}."
    # Single user turn carrying both the prompt and the image path.
    message = {
        'role': 'user',
        'content': prompt,
        'images': [image_path]
    }
    try:
        response = ollama.chat(
            model=model_name,
            messages=[message],
            options={'temperature': 0}  # deterministic output helps JSON parsing
        )
        response_content = response.message.content
        # Extract the outermost JSON object from the free-form reply.
        start = response_content.find('{')
        end = response_content.rfind('}')
        # BUGFIX: require end > start (not just end != -1) so a stray '}'
        # before the first '{' cannot produce an empty/reversed slice.
        if start != -1 and end > start:
            json_str = response_content[start:end+1]
            try:
                result = json.loads(json_str)
                if isinstance(result, dict):
                    # Normalize: callers always find a 'detections' list.
                    result.setdefault('detections', [])
                    return result
                print(f"Could not parse JSON from response: {json_str[:200]}...")
            except json.JSONDecodeError:
                print(f"Could not parse JSON from response: {json_str[:200]}...")
        else:
            print(f"No JSON found in response: {response_content[:500]}...")
        # Parsing failed: preserve the raw content so the caller can inspect it.
        return {
            'raw_response': response_content,
            'detections': []
        }
    except Exception as e:
        print(f"Error calling Ollama model: {e}")
        return {
            'error': str(e),
            'detections': []
        }
def draw_bounding_boxes(image_path: str, detections: List[dict], output_path: str):
    """
    Draw red bounding boxes (and labels, when available) on a copy of the image.

    Args:
        image_path: Path to the input image.
        detections: List of detection dicts with 'x', 'y' (box center),
            'width', 'height' in pixels; optional 'label' and 'confidence'.
        output_path: Path where the annotated image will be saved.
    """
    image = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(image)
    for detection in detections:
        # Skip malformed entries missing any of the geometry keys.
        if not all(key in detection for key in ['x', 'y', 'width', 'height']):
            continue
        # Convert center/size to corner coordinates (x1, y1, x2, y2).
        center_x = detection['x']
        center_y = detection['y']
        width = detection['width']
        height = detection['height']
        x1 = center_x - width / 2
        y1 = center_y - height / 2
        x2 = center_x + width / 2
        y2 = center_y + height / 2
        draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
        label = detection.get('label', 'object')
        confidence = detection.get('confidence', None)
        if confidence is not None:
            label = f"{label} ({confidence:.2f})"
        try:
            # BUGFIX: clamp the label's y so it stays on-canvas for boxes
            # touching the top edge.
            draw.text((x1, max(0, y1 - 10)), label, fill="red")
        except Exception:
            # BUGFIX: bare 'except:' also swallowed KeyboardInterrupt/SystemExit;
            # narrow to Exception. Text is best-effort (e.g. missing font).
            pass
    image.save(output_path)
def create_supervision_detections(detection_data: List[dict], image_size: tuple) -> sv.Detections:
    """
    Convert detection dicts to a supervision.Detections object.

    Args:
        detection_data: List of detection dicts with center coordinates
            ('x', or the alternative 'center_x' key some models emit),
            'y', 'width' and 'height' in pixels; optional 'confidence'.
        image_size: Tuple of (width, height) of the image. Currently unused;
            kept for interface stability.

    Returns:
        supervision.Detections (empty, with shape (0, 4) boxes, if no
        usable detections were found).
    """
    xyxy = []        # bounding boxes as [x1, y1, x2, y2]
    confidence = []
    class_id = []
    for i, detection in enumerate(detection_data):
        if not all(key in detection for key in ('y', 'width', 'height')):
            continue
        # Accept either 'x' or 'center_x' for the box center's x coordinate.
        if 'x' in detection:
            center_x = detection['x']
        elif 'center_x' in detection:
            center_x = detection['center_x']
        else:
            continue
        center_y = detection['y']
        width = detection['width']
        height = detection['height']
        # Convert center/size to corner coordinates.
        xyxy.append([center_x - width / 2, center_y - height / 2,
                     center_x + width / 2, center_y + height / 2])
        confidence.append(detection.get('confidence', 1.0))
        class_id.append(i)  # simple per-detection class assignment
    # BUGFIX: with no detections, np.array([]) has shape (0,), but
    # supervision requires an (N, 4) array; reshape keeps the empty case valid.
    # Also removed the redundant local 'import numpy as np' (shadowed the
    # module-level import).
    return sv.Detections(
        xyxy=np.array(xyxy, dtype=np.float32).reshape(-1, 4),
        confidence=np.array(confidence, dtype=np.float32),
        class_id=np.array(class_id, dtype=int)
    )
def main():
    """
    CLI entry point: detect an object category in an image with an Ollama
    vision model, save the raw JSON result, and write annotated images
    (one via PIL, one via supervision if cv2 is available).
    """
    parser = argparse.ArgumentParser(description='Object detection using Ollama vision models')
    parser.add_argument('input_image', help='Path to the input image file')
    parser.add_argument('object_text', help='Object category description to detect (e.g., dog, cat)')
    parser.add_argument('output_dir', help='Output directory for results')
    parser.add_argument('--model', default='qwen2.5vl:7b', help='Ollama vision model to use (default: qwen2.5vl:7b)')
    args = parser.parse_args()
    # Fail fast on a missing input file.
    if not os.path.isfile(args.input_image):
        print(f"Error: Input image file does not exist: {args.input_image}")
        sys.exit(1)
    os.makedirs(args.output_dir, exist_ok=True)
    print(f"Detecting '{args.object_text}' in image: {args.input_image}")
    print(f"Using model: {args.model}")
    result = detect_objects_with_ollama(args.input_image, args.object_text, args.model)
    # Always persist the raw model result for debugging/inspection.
    result_path = os.path.join(args.output_dir, "detection_result.json")
    with open(result_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    detections = result.get('detections', [])
    if detections:
        print(f"Found {len(detections)} instances of '{args.object_text}'")
        # Annotated image via PIL (always available).
        annotated_image_path = os.path.join(args.output_dir, "annotated_image.jpg")
        draw_bounding_boxes(args.input_image, detections, annotated_image_path)
        print(f"Annotated image saved to: {annotated_image_path}")
        # Second annotated image via supervision + cv2 (best-effort).
        original_image = Image.open(args.input_image)
        image_size = original_image.size  # (width, height)
        try:
            import cv2
            image_np = np.array(original_image)
            # Drop a possible alpha channel; supervision expects 3-channel RGB.
            if image_np.shape[-1] == 4:
                image_np = image_np[:, :, :3]
            sv_detections = create_supervision_detections(detections, image_size)
            box_annotator = sv.BoxAnnotator()
            annotated_image = box_annotator.annotate(scene=image_np.copy(), detections=sv_detections)
            # FIX: removed redundant 'if len(detections) > 0' — we are already
            # inside the 'if detections:' branch.
            labels = [f"{det.get('label', args.object_text)} {det.get('confidence', 1.0):.2f}"
                      for det in detections]
            label_annotator = sv.LabelAnnotator()
            annotated_image = label_annotator.annotate(
                scene=annotated_image,
                detections=sv_detections,
                labels=labels
            )
            supervision_annotated_path = os.path.join(args.output_dir, "annotated_image_supervision.jpg")
            cv2.imwrite(supervision_annotated_path, annotated_image[:, :, ::-1])  # RGB -> BGR for cv2
            print(f"Supervision annotated image saved to: {supervision_annotated_path}")
        except ImportError as e:
            print(f"Required library not available for supervision annotations: {e}")
        except Exception as e:
            print(f"Error creating supervision annotations: {e}")
    else:
        print(f"No instances of '{args.object_text}' found or error occurred")
        # Still emit an image so downstream consumers find a predictable file.
        annotated_image_path = os.path.join(args.output_dir, "annotated_image.jpg")
        original = Image.open(args.input_image)
        original.save(annotated_image_path)
        print(f"Original image saved to: {annotated_image_path} (no detections found)")
    print(f"Detection results saved to: {result_path}")
    print(f"Output directory: {args.output_dir}")
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment