Created
March 16, 2026 18:32
-
-
Save insaneyilin/82516f38e05c8c8ac392bef084a0d1ce to your computer and use it in GitHub Desktop.
ollama_detect_object_in_image
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Object detection using Ollama vision models. | |
| Usage: python ollama_object_detect.py <input_image_file_path> <object_text> <output_dir> | |
| Arguments: | |
| - input_image_file_path: Path to the input image file | |
| - object_text: Object category description to detect (e.g., "dog", "cat") | |
| - output_dir: Output directory where detection results will be saved | |
| """ | |
| import os | |
| import sys | |
| import json | |
| import argparse | |
| from typing import Optional, List | |
| import numpy as np | |
| from PIL import Image, ImageDraw, ImageFont | |
| import ollama | |
| import supervision as sv | |
def detect_objects_with_ollama(image_path: str, object_text: str, model_name: str = "llava:7b") -> dict:
    """
    Detect objects in an image using an Ollama vision model.

    Args:
        image_path: Path to the input image file.
        object_text: Description of the objects to detect (e.g., "dog", "cat").
        model_name: Name of the Ollama vision model to use.

    Returns:
        Dictionary that always contains a 'detections' list. On a parse
        failure the raw model output is kept under 'raw_response'; on a
        model-call failure the message is kept under 'error'.
    """
    # Ask the model for strictly structured JSON so the reply can be parsed.
    prompt = f"Detect and outline all instances of '{object_text}' in this image. " \
             f"Provide the bounding box coordinates in JSON format with the following structure: " \
             f"{{'detections': [{{'x': center_x, 'y': center_y, 'width': width, 'height': height, " \
             f"'label': '{object_text}', 'confidence': confidence_value}}]}}. " \
             f"Use pixel coordinates relative to the image dimensions. " \
             f"If no instances are found, return an empty detections array: {{'detections': []}}."
    message = {
        'role': 'user',
        'content': prompt,
        'images': [image_path]
    }
    try:
        # temperature=0 for deterministic, parse-friendly output.
        response = ollama.chat(
            model=model_name,
            messages=[message],
            options={'temperature': 0}
        )
        response_content = response.message.content
        result = _parse_detection_response(response_content)
        if result is not None:
            return result
        # JSON parsing failed: keep the raw content so the caller can inspect it.
        return {
            'raw_response': response_content,
            'detections': []
        }
    except Exception as e:
        print(f"Error calling Ollama model: {e}")
        return {
            'error': str(e),
            'detections': []
        }


def _parse_detection_response(response_content: str) -> Optional[dict]:
    """
    Extract and validate the detection JSON embedded in a model reply.

    Scans for the outermost '{' ... '}' span (tolerates surrounding prose or
    markdown code fences), parses it, and normalizes the result so a
    'detections' list is always present. Returns None when no usable JSON
    object is found.
    """
    start = response_content.find('{')
    end = response_content.rfind('}')
    # Bug fix: also reject end <= start (e.g. "} ... {" fragments).
    if start == -1 or end == -1 or end <= start:
        print(f"No JSON found in response: {response_content[:500]}...")
        return None
    json_str = response_content[start:end + 1]
    try:
        result = json.loads(json_str)
    except json.JSONDecodeError:
        print(f"Could not parse JSON from response: {json_str[:200]}...")
        return None
    # Bug fix: the original returned whatever parsed — even a non-dict, or a
    # dict without 'detections'. Normalize so callers can rely on the key.
    if not isinstance(result, dict):
        return None
    if not isinstance(result.get('detections'), list):
        result['detections'] = []
    return result
def draw_bounding_boxes(image_path: str, detections: List[dict], output_path: str):
    """
    Draw red bounding boxes (and labels) on an image and save the result.

    Args:
        image_path: Path to the input image file.
        detections: Detection dicts with center-point 'x', 'y' plus 'width'
            and 'height' in pixels; optional 'label' and 'confidence'.
            Entries missing any of the four box keys are skipped.
        output_path: Path where the annotated image will be saved.
    """
    image = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(image)
    for detection in detections:
        # Skip malformed entries rather than crashing on a partial model reply.
        if not all(key in detection for key in ('x', 'y', 'width', 'height')):
            continue
        # Convert center-point + size to the (x1, y1, x2, y2) corners PIL wants.
        half_w = detection['width'] / 2
        half_h = detection['height'] / 2
        x1 = detection['x'] - half_w
        y1 = detection['y'] - half_h
        x2 = detection['x'] + half_w
        y2 = detection['y'] + half_h
        draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
        label = detection.get('label', 'object')
        confidence = detection.get('confidence', None)
        if confidence is not None:
            label = f"{label} ({confidence:.2f})"
        try:
            # Bug fix: clamp the label y so it stays on-canvas when the box
            # touches the top edge (was drawn at y1 - 10 unconditionally).
            draw.text((x1, max(0, y1 - 10)), label, fill="red")
        except Exception:
            # Bug fix: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit. Text rendering is best-effort
            # (e.g. missing font support), so only Exception is ignored.
            pass
    image.save(output_path)
def create_supervision_detections(detection_data: List[dict], image_size: tuple) -> sv.Detections:
    """
    Convert raw detection dicts to a supervision.Detections object.

    Args:
        detection_data: Detection dicts with a center x-coordinate (either
            'x' or 'center_x' — models use both spellings), a center 'y',
            and 'width'/'height' in pixels; optional 'confidence'.
            Malformed entries are skipped.
        image_size: (width, height) of the image. Kept for interface
            compatibility; unused because coordinates are already in pixels.

    Returns:
        supervision.Detections with xyxy boxes, confidences, and sequential
        class ids. A valid empty Detections is returned when nothing is
        usable.
    """
    xyxy = []
    confidence = []
    class_id = []
    for i, detection in enumerate(detection_data):
        # Accept both 'x' and 'center_x' for the center x-coordinate
        # (merges the two near-duplicate branches of the original).
        center_x = detection.get('x', detection.get('center_x'))
        if center_x is None or not all(key in detection for key in ('y', 'width', 'height')):
            continue
        center_y = detection['y']
        half_w = detection['width'] / 2
        half_h = detection['height'] / 2
        xyxy.append([center_x - half_w, center_y - half_h,
                     center_x + half_w, center_y + half_h])
        confidence.append(detection.get('confidence', 1.0))
        class_id.append(i)  # one pseudo-class per detection
    # Bug fix: np.array([]) is 1-D, which sv.Detections rejects; give the
    # empty case explicit (0, 4) / (0,) shapes so it is always valid.
    if not xyxy:
        return sv.Detections(
            xyxy=np.empty((0, 4), dtype=np.float32),
            confidence=np.empty(0, dtype=np.float32),
            class_id=np.empty(0, dtype=int),
        )
    return sv.Detections(
        xyxy=np.array(xyxy, dtype=np.float32),
        confidence=np.array(confidence, dtype=np.float32),
        class_id=np.array(class_id, dtype=int)
    )
def _save_supervision_annotation(input_image: str, detections: List[dict],
                                 object_text: str, output_dir: str) -> None:
    """
    Render box + label annotations with supervision and save via OpenCV.

    Best-effort: a missing cv2/supervision installation or any annotation
    failure is reported but never aborts the main flow.
    """
    original_image = Image.open(input_image)
    image_size = original_image.size  # (width, height)
    try:
        import cv2
        image_np = np.array(original_image)
        # supervision expects RGB; drop an alpha channel if present.
        if image_np.shape[-1] == 4:
            image_np = image_np[:, :, :3]
        sv_detections = create_supervision_detections(detections, image_size)
        box_annotator = sv.BoxAnnotator()
        annotated_image = box_annotator.annotate(scene=image_np.copy(), detections=sv_detections)
        if len(detections) > 0:
            labels = [f"{det.get('label', object_text)} {det.get('confidence', 1.0):.2f}"
                      for det in detections]
            label_annotator = sv.LabelAnnotator()
            annotated_image = label_annotator.annotate(
                scene=annotated_image,
                detections=sv_detections,
                labels=labels
            )
        supervision_annotated_path = os.path.join(output_dir, "annotated_image_supervision.jpg")
        # cv2 writes BGR; the array is RGB, so reverse the channel axis.
        cv2.imwrite(supervision_annotated_path, annotated_image[:, :, ::-1])
        print(f"Supervision annotated image saved to: {supervision_annotated_path}")
    except ImportError as e:
        print(f"Required library not available for supervision annotations: {e}")
    except Exception as e:
        print(f"Error creating supervision annotations: {e}")


def main():
    """CLI entry point: detect objects, save JSON results and annotated images."""
    parser = argparse.ArgumentParser(description='Object detection using Ollama vision models')
    parser.add_argument('input_image', help='Path to the input image file')
    parser.add_argument('object_text', help='Object category description to detect (e.g., dog, cat)')
    parser.add_argument('output_dir', help='Output directory for results')
    parser.add_argument('--model', default='qwen2.5vl:7b', help='Ollama vision model to use (default: qwen2.5vl:7b)')
    args = parser.parse_args()

    # Fail fast on a bad input path; create the output directory up front.
    if not os.path.isfile(args.input_image):
        print(f"Error: Input image file does not exist: {args.input_image}")
        sys.exit(1)
    os.makedirs(args.output_dir, exist_ok=True)

    print(f"Detecting '{args.object_text}' in image: {args.input_image}")
    print(f"Using model: {args.model}")
    result = detect_objects_with_ollama(args.input_image, args.object_text, args.model)

    # Persist the raw model result for debugging/reproducibility.
    result_path = os.path.join(args.output_dir, "detection_result.json")
    with open(result_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    detections = result.get('detections', [])
    annotated_image_path = os.path.join(args.output_dir, "annotated_image.jpg")
    if detections:
        print(f"Found {len(detections)} instances of '{args.object_text}'")
        draw_bounding_boxes(args.input_image, detections, annotated_image_path)
        print(f"Annotated image saved to: {annotated_image_path}")
        _save_supervision_annotation(args.input_image, detections, args.object_text, args.output_dir)
    else:
        print(f"No instances of '{args.object_text}' found or error occurred")
        # Still emit an image so downstream consumers always find one.
        Image.open(args.input_image).save(annotated_image_path)
        print(f"Original image saved to: {annotated_image_path} (no detections found)")

    print(f"Detection results saved to: {result_path}")
    print(f"Output directory: {args.output_dir}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment