Skip to content

Instantly share code, notes, and snippets.

@iTrooz
Created March 29, 2026 02:26
Show Gist options
  • Select an option

  • Save iTrooz/b5b02e7bde8509bcbbbabdfac8c1bd1a to your computer and use it in GitHub Desktop.

Select an option

Save iTrooz/b5b02e7bde8509bcbbbabdfac8c1bd1a to your computer and use it in GitHub Desktop.
Opinionated diffing tool for ccache input-text files
#!/usr/bin/env python3
"""
https://gist.github.com/iTrooz/b5b02e7bde8509bcbbbabdfac8c1bd1a
Opinionated diffing tool for ccache input-text files, focused on identifying cache miss reasons between two builds of the same files.
Usage: main.py [-t THRESHOLD] [-c CONTEXT] [CCACHE_DEBUG_DIR]
Enable ccache debug mode first:
debug = true
debug_dir = /home/itrooz/.cache/ccache-debug
"""
from __future__ import annotations
import argparse
import difflib
import re
import sys
from dataclasses import dataclass
from pathlib import Path
# Sentinel value representing no differences between files
NO_DIFF = "<<NO_DIFF>>"
# ANSI color codes
RESET = "\033[0m"  # restore default terminal attributes
RED = "\033[31m"  # removed ('-') diff lines
GREEN = "\033[32m"  # added ('+') diff lines
CYAN = "\033[36m"  # hunk headers ('@@')
YELLOW = "\033[33m"  # summary notice about skipped large diffs
@dataclass
class DiffInfo:
    """Holds information about a unique diff pattern."""
    # Number of NAME groups that produced this exact normalized diff.
    count: int
    # One representative NAME whose comparison produced this diff.
    sample_name: str
    # Diff text as produced (with context lines), used for display.
    original_text: str
def parse_args() -> argparse.Namespace:
    """Parse and validate command-line arguments.

    Returns a namespace with `large_diff_threshold`, `context`, and
    `directory` attributes. Exits via parser.error() if the directory
    argument does not name an existing directory.
    """
    parser = argparse.ArgumentParser(
        prog="main.py",
        usage="%(prog)s [-t THRESHOLD] [-c CONTEXT] [DIR]",
    )
    parser.add_argument(
        "-t",
        "--large-diff-threshold",
        type=int,
        default=20,
        # %(default)s keeps the help text in sync with the actual default.
        help="Maximum diff line count before counting as too large (default: %(default)s)",
    )
    parser.add_argument(
        "-c",
        "--context",
        type=int,
        default=1,
        # Bug fix: the help text previously claimed the default was 0,
        # but the actual default is 1. %(default)s prevents future drift.
        help="Number of context lines to show around changes (default: %(default)s)",
    )
    parser.add_argument("directory", nargs="?", default=".")
    args = parser.parse_args()
    # Fail early with a clear message rather than later during file collection.
    if not Path(args.directory).is_dir():
        parser.error(f"directory does not exist: {args.directory}")
    return args
def colorize_diff(diff_text: str, use_color: bool) -> str:
    """Return diff_text with ANSI colors applied per diff marker.

    Hunk headers ('@@') are rendered cyan, additions ('+') green and
    removals ('-') red; all other lines pass through untouched. When
    use_color is false the input is returned unmodified.
    """
    if not use_color:
        return diff_text

    def paint(line: str) -> str:
        # Check the hunk header first so '@@' is not mistaken for context.
        if line.startswith("@@"):
            return f"{CYAN}{line}{RESET}"
        if line.startswith("+"):
            return f"{GREEN}{line}{RESET}"
        if line.startswith("-"):
            return f"{RED}{line}{RESET}"
        return line

    return "".join(paint(ln) for ln in diff_text.splitlines(keepends=True))
def collect_records(root: Path) -> dict[str, list[tuple[str, Path]]]:
    """Walk root recursively and bucket ccache input-text files by NAME.

    Only filenames shaped like NAME.o.YYYYMMDD_HHMMSS_UUUUUU.ccache-input-text
    are considered; everything else is ignored. Returns a mapping
    NAME -> list of (timestamp, path) pairs.
    """
    filename_re = re.compile(
        r"^(.+)\.o\.([0-9]{8}_[0-9]{6}_[0-9]{6})\.ccache-input-text$"
    )
    records: dict[str, list[tuple[str, Path]]] = {}
    for candidate in root.rglob("*.ccache-input-text"):
        parsed = filename_re.match(candidate.name)
        if parsed is None:
            continue
        # Group by NAME (group 1); keep the timestamp (group 2) for sorting later.
        records.setdefault(parsed.group(1), []).append((parsed.group(2), candidate))
    return records
def read_file_filtered(path: Path, name: str) -> list[str] | None:
    """Read path's lines up to (excluding) the '### cpp' key, dropping NAME lines.

    Lines whose content (newline stripped) ends with name are filtered out.
    Returns None when the file cannot be read; a warning is printed to stderr.
    """
    try:
        with path.open("r", encoding="utf-8", errors="replace") as fh:
            raw_lines = fh.readlines()
    except OSError as err:
        print(f"Warning: failed to read file: {path} ({err})", file=sys.stderr)
        return None
    kept: list[str] = []
    for raw in raw_lines:
        # The 'cpp' key embeds the whole preprocessed file. It *should* be the
        # last key and is far too long to diff usefully, so stop right there.
        if "### cpp" in raw:
            break
        if raw.rstrip("\n").endswith(name):
            continue
        kept.append(raw)
    return kept
def get_diff(old_lines: list[str], new_lines: list[str], context: int = 0) -> tuple[str, str]:
    """Diff two line lists with difflib.Differ.

    Returns (diff_no_context, diff_with_context): the first string holds
    only the changed ('+'/'-') lines, the second additionally pulls in up
    to `context` surrounding lines around each change for display.
    """
    raw = list(difflib.Differ().compare(old_lines, new_lines))
    changed = [i for i, entry in enumerate(raw) if entry[:1] in ("+", "-")]
    bare = "".join(raw[i] for i in changed)
    # Widen each change into a clamped window of neighbouring lines.
    last = len(raw) - 1
    shown: set[int] = set()
    for i in changed:
        shown.update(range(max(0, i - context), min(last, i + context) + 1))
    contextual = "".join(raw[i] for i in sorted(shown))
    return bare, contextual
def process_diff_groups(
    grouped: dict[str, list[tuple[str, Path]]],
    threshold: int,
    context: int = 0,
) -> tuple[dict[str, DiffInfo], int]:
    """Compare the two newest files of every NAME group and dedupe the diffs.

    Diff text is normalized by sorting its (context-free) lines so that
    identical change sets map to the same key regardless of line order.
    Diffs longer than `threshold` lines are skipped and only counted.
    Returns (normalized-diff -> DiffInfo, number of oversized diffs skipped).
    """
    deduped: dict[str, DiffInfo] = {}
    oversized = 0
    for name in sorted(grouped):
        # Newest first; a group needs at least two snapshots to compare.
        by_time = sorted(grouped[name], key=lambda rec: rec[0], reverse=True)
        if len(by_time) < 2:
            continue
        newer_path = by_time[0][1]
        older_path = by_time[1][1]
        old_lines = read_file_filtered(older_path, name)
        new_lines = read_file_filtered(newer_path, name)
        if old_lines is None or new_lines is None:
            continue
        bare, contextual = get_diff(old_lines, new_lines, context)
        if not bare:
            key = NO_DIFF
        else:
            # Sorting the context-free diff lines makes the dedup key
            # insensitive to where in the file the changes occurred.
            key = "".join(sorted(bare.splitlines(keepends=True)))
            # Oversized diffs are tallied but never stored for display.
            if len(key.splitlines()) > threshold:
                oversized += 1
                continue
        entry = deduped.get(key)
        if entry is None:
            # Keep the context version of the first occurrence for printing.
            entry = DiffInfo(count=0, sample_name=name, original_text=contextual)
            deduped[key] = entry
        entry.count += 1
    return deduped, oversized
def print_results(
    diffs: dict[str, DiffInfo],
    large_diff_count: int,
    threshold: int,
    use_color: bool,
) -> None:
    """Report deduplicated diffs, most frequent first, plus summary counts."""
    print(f"Compared NAME groups: {sum(info.count for info in diffs.values())}")
    print(f"Unique diffs: {len(diffs)}")
    print()
    # Frequency descending, then normalized text ascending, for determinism.
    for key, info in sorted(diffs.items(), key=lambda kv: (-kv[1].count, kv[0])):
        print(f"=== Diff that occurred {info.count} time(s), SAMPLE NAME: {info.sample_name} ===")
        if key == NO_DIFF:
            print("(no differences)")
        else:
            body = colorize_diff(info.original_text, use_color)
            # Suppress print's newline when the diff already ends with one.
            print(body, end="" if info.original_text.endswith("\n") else "\n")
        print()
    # Note any diffs that were skipped for exceeding the line threshold.
    if large_diff_count > 0:
        message = (
            f"{large_diff_count} diff(s) were too large to show "
            f"(threshold: {threshold} lines)."
        )
        print(f"{YELLOW}{message}{RESET}" if use_color else message)
def main() -> int:
    """Entry point: gather files, diff the newest pairs, dedupe, and report.

    Returns 0 in all non-error cases (argument errors exit via argparse).
    """
    args = parse_args()
    # Only emit ANSI colors when stdout is an interactive terminal.
    colored = sys.stdout.isatty()
    root = Path(args.directory)
    grouped = collect_records(root)
    if not grouped:
        print(f"No files matching NAME.o.TIMESTAMP.ccache-input-text were found in: {root}")
        return 0
    diffs, skipped = process_diff_groups(
        grouped, args.large_diff_threshold, args.context
    )
    if not diffs:
        print("No comparable NAME groups were found (need at least two matching files per NAME).")
        return 0
    print_results(diffs, skipped, args.large_diff_threshold, colored)
    return 0
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment