Skip to content

Instantly share code, notes, and snippets.

@olooney
Created November 13, 2025 03:00
Show Gist options
  • Select an option

  • Save olooney/30fa7ca8aaa7ce720de20f525fc3db60 to your computer and use it in GitHub Desktop.

Select an option

Save olooney/30fa7ca8aaa7ce720de20f525fc3db60 to your computer and use it in GitHub Desktop.
Grid Alignment with OpenCV
#!/usr/bin/env python
# coding: utf-8
# In[1]:
# Notebook-only dependency install. `get_ipython` is neither defined nor
# imported in this file, so it is presumably injected by IPython/Jupyter.
# NOTE(review): when running this as a plain Python script, this line likely
# has to be removed and opencv-python installed beforehand -- confirm.
get_ipython().run_line_magic('pip', 'install opencv-python')
# In[56]:
import cv2
import numpy as np
from PIL import Image, ImageDraw
# In[57]:
# ---------- CONFIG ----------
# Input image: read with cv2.imread for detection and re-opened with PIL when
# the grid-aligned output is produced.
IMAGE_PATH = "beatles.jpg" # path to your original image
# Destination of the debug picture showing row bounds and bounding boxes.
OUTPUT_VIS_PATH = "beatles_detect_rows.png"
# Threshold handed to cv2.threshold (used with THRESH_BINARY_INV).
THRESH_VALUE = 240 # near white
# Connected components whose area is below this value are ignored.
MIN_AREA = 50
# Row clustering stops merging once clusters are at least
# ROW_GAP_FACTOR * (average object height) apart.
ROW_GAP_FACTOR = 0.5
# Passed as `iterations` to the cv2.MORPH_OPEN clean-up of the mask (note that
# the operation used is an opening, not a bare erosion).
EROSION_ITERATIONS = 5
# In[58]:
# ---------- STEP 1: LOAD IMAGE & THRESHOLD ----------
# OpenCV hands the pixels back in BGR order; a None result is treated as a
# failed read.
bgr = cv2.imread(IMAGE_PATH)
if bgr is None:
    raise ValueError(f"Could not read image at {IMAGE_PATH}")
h_img, w_img = bgr.shape[0], bgr.shape[1]
gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

# The background is near white, so the threshold is inverted
# (THRESH_BINARY_INV) to make the darker content the foreground of the mask.
_, th = cv2.threshold(gray, THRESH_VALUE, 255, cv2.THRESH_BINARY_INV)

# Optional clean-up: morphological opening of the mask with a 3x3 kernel.
kernel = np.ones((3, 3), dtype=np.uint8)
th_clean = cv2.morphologyEx(
    th,
    cv2.MORPH_OPEN,
    kernel,
    iterations=EROSION_ITERATIONS,
)
# In[59]:
# ---------- STEP 2: FIND OBJECTS (CONNECTED COMPONENTS OR CONTOURS) ----------
# Label the blobs of the cleaned mask (findContours would be an alternative).
num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(th_clean)

# Each stats row unpacks as (x, y, w, h, area). Label 0 is skipped as the
# background, and components whose area is below MIN_AREA are dropped.
# For every kept component we record its label, its (x1, y1, x2, y2) box,
# the vertical centre of that box and its height.
objects = [
    {
        "label": label,
        "bbox": (x, y, x + w, y + h),
        "cy": y + h / 2.0,
        "h": h,
    }
    for label, (x, y, w, h, area) in enumerate(stats[1:num_labels], start=1)
    if not area < MIN_AREA
]

if not objects:
    raise RuntimeError("No objects found after thresholding/connected components")
# In[60]:
# ---------- STEP 3: AGGLOMERATIVE 1D CLUSTERING (AVERAGE LINKAGE) ----------
def agglomerative_1d_average_linkage(values, dist_threshold):
    """
    Greedily merge 1D points into clusters, always joining the closest pair.

    values: list of floats (cy's); each one starts out as its own cluster.
    dist_threshold: merging stops as soon as the closest pair of clusters is
        at least this far apart.
    Distance between clusters = abs(mean(c1) - mean(c2)).
    Returns: list of clusters, each a list of indices into 'values'.
    """
    groups = [[idx] for idx in range(len(values))]

    while len(groups) > 1:
        # Cluster means are recomputed from scratch on every round.
        centers = [float(np.mean([values[idx] for idx in g])) for g in groups]

        def gap(pair):
            return abs(centers[pair[0]] - centers[pair[1]])

        candidate_pairs = (
            (a, b)
            for a in range(len(groups))
            for b in range(a + 1, len(groups))
        )
        # min() keeps the first of several equally close pairs, i.e. the one
        # with the lowest (a, b) in scan order.
        a, b = min(candidate_pairs, key=gap)
        if gap((a, b)) >= dist_threshold:
            break

        # The merged cluster takes the slot of the earlier cluster; the later
        # one is removed (b > a, so slot a is unaffected by the deletion).
        groups[a] = groups[a] + groups[b]
        del groups[b]

    return groups
# Prepare data for clustering: one vertical centre per object, and a merge
# threshold that scales with the average object height.
values_cy = [item["cy"] for item in objects]
heights = [item["h"] for item in objects]
avg_h = float(np.mean(heights))
dist_threshold = avg_h * ROW_GAP_FACTOR
clusters = agglomerative_1d_average_linkage(values_cy, dist_threshold)
# In[61]:
# ---------- STEP 4: TURN CLUSTERS INTO ORDERED ROWS ----------
def cluster_mean_cy(cluster_indices):
    """Return, as a float, the mean "cy" of the objects a cluster refers to.

    cluster_indices: indices into the module-level 'objects' list.
    """
    centers = [objects[idx]["cy"] for idx in cluster_indices]
    return float(np.mean(centers))
# Order the clusters top-to-bottom by their mean vertical centre.
clusters_sorted = sorted(clusters, key=cluster_mean_cy)

# rows: one list of (x1, y1, x2, y2) bboxes per cluster, ordered left-to-right
# by x1.
rows = []
for member_indices in clusters_sorted:
    boxes = [objects[idx]["bbox"] for idx in member_indices]
    boxes.sort(key=lambda box: box[0])
    rows.append(boxes)

# Row bounds: (smallest top, largest bottom) over the boxes of each row.
row_bounds = [
    (min(box[1] for box in row), max(box[3] for box in row))
    for row in rows
]
# In[62]:
# ---------- STEP 5: VISUALIZATION WITH PIL (BLUE ROWS, GREEN BOXES) ----------
# The BGR array is converted to RGB, then to an RGBA PIL image for alpha drawing.
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
pil_img = Image.fromarray(rgb).convert("RGBA")
draw = ImageDraw.Draw(pil_img, "RGBA")

row_stroke = (0, 0, 255, 128)  # blue, alpha 128 of 255
box_stroke = (0, 255, 0, 128)  # green, alpha 128 of 255

# A full-width horizontal line at the top and at the bottom of every row.
for top, bottom in row_bounds:
    for y_line in (top, bottom):
        draw.line([(0, y_line), (w_img, y_line)], fill=row_stroke, width=1)

# The outline of every bounding box, drawn one edge at a time.
for row in rows:
    for (x1, y1, x2, y2) in row:
        outline = (
            ((x1, y1), (x2, y1)),  # top
            ((x1, y2), (x2, y2)),  # bottom
            ((x1, y1), (x1, y2)),  # left
            ((x2, y1), (x2, y2)),  # right
        )
        for start, end in outline:
            draw.line([start, end], fill=box_stroke, width=1)

pil_img.save(OUTPUT_VIS_PATH)
print(f"\nVisualization saved to {OUTPUT_VIS_PATH}")
# In[65]:
def align_to_grid(img, rows, background=(255, 255, 255)):
    """
    Build a copy of the image in which every object is centred in its column.

    img : original PIL image
    rows : list of rows; each row is a list of (x1, y1, x2, y2) bboxes. An
        object's index within its row is used as its column number.
    background : fill color for the new image (an RGB tuple by default)

    Returns a new image with the same mode and size as 'img'. Column c is as
    wide as the widest object found at index c of any row. The horizontal
    space left over is split as evenly as possible into num_cols + 1 gaps
    (before the first column, between columns, after the last column). Every
    object keeps its original vertical position.

    If 'rows' holds no objects at all, the result is a plain background image.

    Raises ValueError if the columns together are wider than the image.
    """
    w_img, h_img = img.size

    # --- 1. Column widths: max width in each column across all rows ---
    num_cols = max((len(row) for row in rows), default=0)
    if num_cols == 0:
        # Nothing to place (no rows, or only empty rows).
        return Image.new(img.mode, (w_img, h_img), background)

    col_widths = [0] * num_cols
    for row in rows:
        for c, (x1, _y1, x2, _y2) in enumerate(row):
            col_widths[c] = max(col_widths[c], x2 - x1)

    extra = w_img - sum(col_widths)
    if extra < 0:
        raise ValueError("Columns wider than image width; cannot fit grid.")

    # --- 2. Distribute extra padding into C+1 gaps as fairly as possible ---
    # The first 'remainder' gaps get one extra pixel so the gaps sum to 'extra'.
    base_gap, remainder = divmod(extra, num_cols + 1)
    gaps = [base_gap + (1 if i < remainder else 0) for i in range(num_cols + 1)]

    # --- 3. Compute left x for each column ---
    col_lefts = []
    x = gaps[0]
    for col_w, gap_after in zip(col_widths, gaps[1:]):
        col_lefts.append(x)
        x += col_w + gap_after

    # --- 4. Create new image and paste each object into its aligned slot ---
    new_img = Image.new(img.mode, (w_img, h_img), background)
    for row in rows:
        for c, (x1, y1, x2, y2) in enumerate(row):
            src_w = x2 - x1
            src_h = y2 - y1

            # center the object within its column width
            new_left = int(round(col_lefts[c] + (col_widths[c] - src_w) / 2.0))
            new_right = new_left + src_w

            # keep original vertical placement (same top/bottom)
            new_top = y1
            new_bottom = new_top + src_h

            # crop from original and paste into new image
            crop = img.crop((x1, y1, x2, y2))
            new_img.paste(crop, (new_left, new_top, new_right, new_bottom))

    return new_img
# In[68]:
# Re-open the original with PIL and write out the grid-aligned version. The
# context manager closes the source file as soon as the aligned copy has been
# built, instead of leaving the handle to be cleaned up implicitly.
with Image.open(IMAGE_PATH) as img:
    aligned = align_to_grid(img, rows)
aligned.save("beatles_grid.png")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment