Skip to content

Instantly share code, notes, and snippets.

@insaneyilin
Created March 16, 2026 18:32
Show Gist options
  • Select an option

  • Save insaneyilin/82516f38e05c8c8ac392bef084a0d1ce to your computer and use it in GitHub Desktop.

Select an option

Save insaneyilin/82516f38e05c8c8ac392bef084a0d1ce to your computer and use it in GitHub Desktop.
ollama_detect_object_in_image
#!/usr/bin/env python3
"""
Object detection using Ollama vision models.
Usage: python ollama_object_detect.py <input_image_file_path> <object_text> <output_dir>
Arguments:
- input_image_file_path: Path to the input image file
- object_text: Object category description to detect (e.g., "dog", "cat")
- output_dir: Output directory where detection results will be saved
"""
import os
import sys
import json
import argparse
from typing import Optional, List
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import ollama
import supervision as sv
def detect_objects_with_ollama(image_path: str, object_text: str, model_name: str = "llava:7b") -> dict:
    """
    Detect objects in an image using an Ollama vision model.

    Args:
        image_path: Path to the input image.
        object_text: Description of objects to detect.
        model_name: Name of the Ollama vision model to use.

    Returns:
        Dictionary with a 'detections' list (possibly empty). On JSON parse
        failure the raw model output is kept under 'raw_response'; on API
        failure the message is kept under 'error'.
    """
    # Ask the model for a strict JSON payload so we can parse it back out.
    prompt = f"Detect and outline all instances of '{object_text}' in this image. " \
             f"Provide the bounding box coordinates in JSON format with the following structure: " \
             f"{{'detections': [{{'x': center_x, 'y': center_y, 'width': width, 'height': height, " \
             f"'label': '{object_text}', 'confidence': confidence_value}}]}}. " \
             f"Use pixel coordinates relative to the image dimensions. " \
             f"If no instances are found, return an empty detections array: {{'detections': []}}."
    # Single user turn carrying both the prompt and the image path.
    message = {
        'role': 'user',
        'content': prompt,
        'images': [image_path]
    }
    try:
        response = ollama.chat(
            model=model_name,
            messages=[message],
            options={'temperature': 0}  # deterministic output helps JSON parsing
        )
        response_content = response.message.content
        # Extract the outermost JSON object from the free-form reply.
        start = response_content.find('{')
        end = response_content.rfind('}')
        # BUGFIX: require end > start (not just end != -1) so a stray '}'
        # before the first '{' cannot produce an empty/reversed slice.
        if start != -1 and end > start:
            json_str = response_content[start:end+1]
            try:
                result = json.loads(json_str)
                if isinstance(result, dict):
                    # Normalize: callers always find a 'detections' list.
                    result.setdefault('detections', [])
                    return result
                print(f"Could not parse JSON from response: {json_str[:200]}...")
            except json.JSONDecodeError:
                print(f"Could not parse JSON from response: {json_str[:200]}...")
        else:
            print(f"No JSON found in response: {response_content[:500]}...")
        # Parsing failed: preserve the raw content so the caller can inspect it.
        return {
            'raw_response': response_content,
            'detections': []
        }
    except Exception as e:
        print(f"Error calling Ollama model: {e}")
        return {
            'error': str(e),
            'detections': []
        }
def draw_bounding_boxes(image_path: str, detections: List[dict], output_path: str):
    """
    Draw red bounding boxes (and labels, when available) on a copy of the image.

    Args:
        image_path: Path to the input image.
        detections: List of detection dicts with 'x', 'y' (box center),
            'width', 'height' in pixels; optional 'label' and 'confidence'.
        output_path: Path where the annotated image will be saved.
    """
    image = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(image)
    for detection in detections:
        # Skip malformed entries missing any of the geometry keys.
        if not all(key in detection for key in ['x', 'y', 'width', 'height']):
            continue
        # Convert center/size to corner coordinates (x1, y1, x2, y2).
        center_x = detection['x']
        center_y = detection['y']
        width = detection['width']
        height = detection['height']
        x1 = center_x - width / 2
        y1 = center_y - height / 2
        x2 = center_x + width / 2
        y2 = center_y + height / 2
        draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
        label = detection.get('label', 'object')
        confidence = detection.get('confidence', None)
        if confidence is not None:
            label = f"{label} ({confidence:.2f})"
        try:
            # BUGFIX: clamp the label's y so it stays on-canvas for boxes
            # touching the top edge.
            draw.text((x1, max(0, y1 - 10)), label, fill="red")
        except Exception:
            # BUGFIX: bare 'except:' also swallowed KeyboardInterrupt/SystemExit;
            # narrow to Exception. Text is best-effort (e.g. missing font).
            pass
    image.save(output_path)
def create_supervision_detections(detection_data: List[dict], image_size: tuple) -> sv.Detections:
    """
    Convert detection dicts to a supervision.Detections object.

    Args:
        detection_data: List of detection dicts with center coordinates
            ('x', or the alternative 'center_x' key some models emit),
            'y', 'width' and 'height' in pixels; optional 'confidence'.
        image_size: Tuple of (width, height) of the image. Currently unused;
            kept for interface stability.

    Returns:
        supervision.Detections (empty, with shape (0, 4) boxes, if no
        usable detections were found).
    """
    xyxy = []        # bounding boxes as [x1, y1, x2, y2]
    confidence = []
    class_id = []
    for i, detection in enumerate(detection_data):
        if not all(key in detection for key in ('y', 'width', 'height')):
            continue
        # Accept either 'x' or 'center_x' for the box center's x coordinate.
        if 'x' in detection:
            center_x = detection['x']
        elif 'center_x' in detection:
            center_x = detection['center_x']
        else:
            continue
        center_y = detection['y']
        width = detection['width']
        height = detection['height']
        # Convert center/size to corner coordinates.
        xyxy.append([center_x - width / 2, center_y - height / 2,
                     center_x + width / 2, center_y + height / 2])
        confidence.append(detection.get('confidence', 1.0))
        class_id.append(i)  # simple per-detection class assignment
    # BUGFIX: with no detections, np.array([]) has shape (0,), but
    # supervision requires an (N, 4) array; reshape keeps the empty case valid.
    # Also removed the redundant local 'import numpy as np' (shadowed the
    # module-level import).
    return sv.Detections(
        xyxy=np.array(xyxy, dtype=np.float32).reshape(-1, 4),
        confidence=np.array(confidence, dtype=np.float32),
        class_id=np.array(class_id, dtype=int)
    )
def main():
    """
    CLI entry point: detect an object category in an image with an Ollama
    vision model, save the raw JSON result, and write annotated images
    (one via PIL, one via supervision if cv2 is available).
    """
    parser = argparse.ArgumentParser(description='Object detection using Ollama vision models')
    parser.add_argument('input_image', help='Path to the input image file')
    parser.add_argument('object_text', help='Object category description to detect (e.g., dog, cat)')
    parser.add_argument('output_dir', help='Output directory for results')
    parser.add_argument('--model', default='qwen2.5vl:7b', help='Ollama vision model to use (default: qwen2.5vl:7b)')
    args = parser.parse_args()
    # Fail fast on a missing input file.
    if not os.path.isfile(args.input_image):
        print(f"Error: Input image file does not exist: {args.input_image}")
        sys.exit(1)
    os.makedirs(args.output_dir, exist_ok=True)
    print(f"Detecting '{args.object_text}' in image: {args.input_image}")
    print(f"Using model: {args.model}")
    result = detect_objects_with_ollama(args.input_image, args.object_text, args.model)
    # Always persist the raw model result for debugging/inspection.
    result_path = os.path.join(args.output_dir, "detection_result.json")
    with open(result_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    detections = result.get('detections', [])
    if detections:
        print(f"Found {len(detections)} instances of '{args.object_text}'")
        # Annotated image via PIL (always available).
        annotated_image_path = os.path.join(args.output_dir, "annotated_image.jpg")
        draw_bounding_boxes(args.input_image, detections, annotated_image_path)
        print(f"Annotated image saved to: {annotated_image_path}")
        # Second annotated image via supervision + cv2 (best-effort).
        original_image = Image.open(args.input_image)
        image_size = original_image.size  # (width, height)
        try:
            import cv2
            image_np = np.array(original_image)
            # Drop a possible alpha channel; supervision expects 3-channel RGB.
            if image_np.shape[-1] == 4:
                image_np = image_np[:, :, :3]
            sv_detections = create_supervision_detections(detections, image_size)
            box_annotator = sv.BoxAnnotator()
            annotated_image = box_annotator.annotate(scene=image_np.copy(), detections=sv_detections)
            # FIX: removed redundant 'if len(detections) > 0' — we are already
            # inside the 'if detections:' branch.
            labels = [f"{det.get('label', args.object_text)} {det.get('confidence', 1.0):.2f}"
                      for det in detections]
            label_annotator = sv.LabelAnnotator()
            annotated_image = label_annotator.annotate(
                scene=annotated_image,
                detections=sv_detections,
                labels=labels
            )
            supervision_annotated_path = os.path.join(args.output_dir, "annotated_image_supervision.jpg")
            cv2.imwrite(supervision_annotated_path, annotated_image[:, :, ::-1])  # RGB -> BGR for cv2
            print(f"Supervision annotated image saved to: {supervision_annotated_path}")
        except ImportError as e:
            print(f"Required library not available for supervision annotations: {e}")
        except Exception as e:
            print(f"Error creating supervision annotations: {e}")
    else:
        print(f"No instances of '{args.object_text}' found or error occurred")
        # Still emit an image so downstream consumers find a predictable file.
        annotated_image_path = os.path.join(args.output_dir, "annotated_image.jpg")
        original = Image.open(args.input_image)
        original.save(annotated_image_path)
        print(f"Original image saved to: {annotated_image_path} (no detections found)")
    print(f"Detection results saved to: {result_path}")
    print(f"Output directory: {args.output_dir}")
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment