Created
November 13, 2025 03:00
-
-
Save olooney/30fa7ca8aaa7ce720de20f525fc3db60 to your computer and use it in GitHub Desktop.
Grid Alignment with OpenCV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| # In[1]: | |
# Notebook-only convenience: install OpenCV from inside the running kernel.
# ``get_ipython`` is injected by IPython/Jupyter and does not exist when this
# file is executed as a plain script (which the shebang invites), so probe for
# it instead of calling it unconditionally and dying with a NameError.
# Outside a notebook, install ``opencv-python`` manually beforehand.
try:
    _shell = get_ipython()
except NameError:
    _shell = None
if _shell is not None:
    _shell.run_line_magic('pip', 'install opencv-python')
| # In[56]: | |
| import cv2 | |
| import numpy as np | |
| from PIL import Image, ImageDraw | |
| # In[57]: | |
# ---------- CONFIG ----------
# Input / output files.
IMAGE_PATH = "beatles.jpg"  # source image to analyse
OUTPUT_VIS_PATH = "beatles_detect_rows.png"  # annotated overlay is saved here

# Segmentation parameters.
THRESH_VALUE = 240  # gray levels above this are treated as near-white background
MIN_AREA = 50  # components whose area is below this are ignored
EROSION_ITERATIONS = 5  # iteration count passed to the morphological opening

# Row clustering: merge threshold expressed as a fraction of the mean object height.
ROW_GAP_FACTOR = 0.5
| # In[58]: | |
# ---------- STEP 1: LOAD IMAGE & THRESHOLD ----------
# cv2.imread hands back a BGR array, or None when the file cannot be read.
bgr = cv2.imread(IMAGE_PATH)
if bgr is None:
    raise ValueError(f"Could not read image at {IMAGE_PATH}")

h_img = bgr.shape[0]
w_img = bgr.shape[1]
gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

# Inverted binary threshold: the near-white background becomes 0 and
# everything darker becomes 255, so the objects end up as foreground.
_, th = cv2.threshold(gray, THRESH_VALUE, 255, cv2.THRESH_BINARY_INV)

# Optional clean-up: morphological opening with a 3x3 square element.
kernel = np.ones((3, 3), dtype=np.uint8)
th_clean = cv2.morphologyEx(
    th,
    cv2.MORPH_OPEN,
    kernel,
    iterations=EROSION_ITERATIONS,
)
| # In[59]: | |
# ---------- STEP 2: FIND OBJECTS (CONNECTED COMPONENTS OR CONTOURS) ----------
# Connected components are used here (findContours would be an alternative).
# Each row of `stats` unpacks as (x, y, w, h, area) for one label.
num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(th_clean)

objects = []
# Label 0 is the background component, so numbering starts at 1.
for label, (x, y, w, h, area) in enumerate(stats[1:], start=1):
    if area >= MIN_AREA:
        objects.append(
            {
                "label": label,
                "bbox": (x, y, x + w, y + h),  # (x1, y1, x2, y2)
                "cy": y + h / 2.0,  # vertical centre of the bbox
                "h": h,
            }
        )

if not objects:
    raise RuntimeError("No objects found after thresholding/connected components")
| # In[60]: | |
| # ---------- STEP 3: AGGLOMERATIVE 1D CLUSTERING (AVERAGE LINKAGE) ---------- | |
def agglomerative_1d_average_linkage(values, dist_threshold):
    """Greedily merge 1-D points into clusters, nearest pair first.

    Every value starts out as its own cluster.  On each pass the two clusters
    whose means are closest (``abs(mean_a - mean_b)``) are fused; merging
    stops as soon as the smallest such distance is no longer strictly below
    ``dist_threshold``, or when a single cluster remains.  When several pairs
    are equally close, the pair that comes first in cluster order wins.

    values: sequence of floats (the objects' cy positions).
    dist_threshold: clusters are merged only while their distance is < this.
    Returns: list of clusters, each a list of indices into ``values``.
    """
    groups = [[idx] for idx in range(len(values))]

    while len(groups) > 1:
        centers = [float(np.mean([values[idx] for idx in g])) for g in groups]

        # Pairs are generated as (a, b) with a < b in lexicographic order and
        # min() keeps the first of equally-near candidates.
        candidate_pairs = (
            (a, b)
            for a in range(len(groups))
            for b in range(a + 1, len(groups))
        )
        a, b = min(
            candidate_pairs,
            key=lambda pair: abs(centers[pair[0]] - centers[pair[1]]),
        )
        if abs(centers[a] - centers[b]) >= dist_threshold:
            break

        # The merged cluster takes over slot a; slot b is dropped.
        merged = groups[a] + groups[b]
        groups = groups[:a] + [merged] + groups[a + 1:b] + groups[b + 1:]

    return groups
# Prepare data for clustering
# The merge threshold is ROW_GAP_FACTOR times the mean object height, and the
# objects are clustered on their vertical centres.
heights = [o["h"] for o in objects]
values_cy = [o["cy"] for o in objects]
avg_h = float(np.mean(heights))
dist_threshold = avg_h * ROW_GAP_FACTOR
clusters = agglomerative_1d_average_linkage(values_cy, dist_threshold)
| # In[61]: | |
| # ---------- STEP 4: TURN CLUSTERS INTO ORDERED ROWS ---------- | |
def cluster_mean_cy(cluster_indices):
    """Return the mean "cy" of the objects whose indices form the cluster.

    cluster_indices: indices into the module-level ``objects`` list.
    """
    centres = [objects[idx]["cy"] for idx in cluster_indices]
    return float(np.mean(centres))
# Order the clusters top-to-bottom by their mean vertical centre.
clusters_sorted = sorted(clusters, key=cluster_mean_cy)

# rows: one list of bboxes (x1, y1, x2, y2) per cluster, ordered left to
# right by x1.
rows = [
    [
        member["bbox"]
        for member in sorted(
            (objects[i] for i in cluster),
            key=lambda member: member["bbox"][0],
        )
    ]
    for cluster in clusters_sorted
]

# Row bounds: (smallest y1, largest y2) over the bboxes of each row.
row_bounds = [
    (min(box[1] for box in row), max(box[3] for box in row))
    for row in rows
]
| # In[62]: | |
# ---------- STEP 5: VISUALIZATION WITH PIL (BLUE ROWS, GREEN BOXES) ----------
# Swap BGR to RGB for PIL, then go to RGBA so the overlay can be drawn with
# an alpha component.
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
pil_img = Image.fromarray(rgb).convert("RGBA")
draw = ImageDraw.Draw(pil_img, "RGBA")

row_rgba = (0, 0, 255, 128)  # blue at 50% opacity
box_rgba = (0, 255, 0, 128)  # green at 50% opacity

# Full-width horizontal rule at the top and at the bottom of every row.
for bounds in row_bounds:
    for y_edge in bounds:
        draw.line([(0, y_edge), (w_img, y_edge)], fill=row_rgba, width=1)

# Outline every bounding box one edge at a time.
for row in rows:
    for x1, y1, x2, y2 in row:
        edges = (
            ((x1, y1), (x2, y1)),  # top
            ((x1, y2), (x2, y2)),  # bottom
            ((x1, y1), (x1, y2)),  # left
            ((x2, y1), (x2, y2)),  # right
        )
        for start, end in edges:
            draw.line([start, end], fill=box_rgba, width=1)

pil_img.save(OUTPUT_VIS_PATH)
print(f"\nVisualization saved to {OUTPUT_VIS_PATH}")
| # In[65]: | |
def align_to_grid(img, rows, background=(255, 255, 255)):
    """Re-lay the detected objects onto a regular column grid.

    Column ``c`` is made as wide as the widest c-th object of any row.  The
    leftover horizontal space is split into ``num_cols + 1`` gaps whose sizes
    differ by at most one pixel, each object is centred horizontally inside
    its column, and its vertical position is kept exactly as in the original.

    img        : original PIL image (its ``size``, ``mode`` and ``crop`` are used)
    rows       : list of rows; each row is a list of (x1, y1, x2, y2) bboxes
                 in left-to-right order.  An empty list, or empty rows, are
                 accepted and simply contribute nothing to the output.
    background : fill colour for the new image; it is handed to ``Image.new``
                 together with ``img.mode``.
                 NOTE(review): the RGB default presumably assumes an RGB
                 source image; confirm for other modes.

    Returns a new image with the same size and mode as ``img``.

    Raises ValueError if the summed column widths exceed the image width.
    """
    w_img, h_img = img.size

    # --- 1. Column widths: max width in each column across all rows ---
    # default=0 keeps an empty `rows` from crashing inside max().
    num_cols = max((len(row) for row in rows), default=0)
    col_widths = [0] * num_cols
    for row in rows:
        for c, (x1, _y1, x2, _y2) in enumerate(row):
            col_widths[c] = max(col_widths[c], x2 - x1)

    extra = w_img - sum(col_widths)
    if extra < 0:
        raise ValueError("Columns wider than image width; cannot fit grid.")

    # --- 2. Distribute extra padding into C+1 gaps as fairly as possible ---
    # gaps[i] precedes column i; the final entry follows the last column.
    base_gap, remainder = divmod(extra, num_cols + 1)
    gaps = [base_gap + (1 if i < remainder else 0) for i in range(num_cols + 1)]

    # --- 3. Compute left x for each column ---
    col_lefts = []
    x = gaps[0]
    for width, gap_after in zip(col_widths, gaps[1:]):
        col_lefts.append(x)
        x += width + gap_after

    # --- 4. Create new image and paste each object into its aligned slot ---
    new_img = Image.new(img.mode, (w_img, h_img), background)
    for row in rows:
        for c, (x1, y1, x2, y2) in enumerate(row):
            src_w = x2 - x1
            # Centre the object within its column width.
            new_left = int(round(col_lefts[c] + (col_widths[c] - src_w) / 2.0))
            # The crop already has the bbox's size, so only the upper-left
            # corner is needed; the original top edge is kept as-is.
            crop = img.crop((x1, y1, x2, y2))
            new_img.paste(crop, (new_left, int(y1)))

    return new_img
| # In[68]: | |
# Open the source through a context manager so its file handle is released
# deterministically, even if the alignment step raises.
with Image.open(IMAGE_PATH) as img:
    aligned = align_to_grid(img, rows)
aligned.save("beatles_grid.png")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment