Created
March 29, 2026 02:26
-
-
Save iTrooz/b5b02e7bde8509bcbbbabdfac8c1bd1a to your computer and use it in GitHub Desktop.
Opinionated diffing tool for ccache input-text files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| https://gist.github.com/iTrooz/b5b02e7bde8509bcbbbabdfac8c1bd1a | |
| Opinionated diffing tool for ccache input-text files, focused on identifying cache miss reasons between two builds of the same files. | |
| Usage: main.py [-t THRESHOLD] [-c CONTEXT] [CCACHE_DEBUG_DIR] | |
| Enable ccache debug mode first: | |
| debug = true | |
| debug_dir = /home/itrooz/.cache/ccache-debug | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import difflib | |
| import re | |
| import sys | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
# Sentinel value representing no differences between files; used as a dict key,
# so it must never collide with a real (normalized) diff string.
NO_DIFF = "<<NO_DIFF>>"
# ANSI SGR color codes for terminal output
RESET = "\033[0m"  # restore default attributes
RED = "\033[31m"  # removed lines ("-")
GREEN = "\033[32m"  # added lines ("+")
CYAN = "\033[36m"  # hunk headers ("@@")
YELLOW = "\033[33m"  # summary warnings
@dataclass
class DiffInfo:
    """Holds information about a unique diff pattern."""
    # How many NAME groups produced this exact (normalized) diff.
    count: int
    # One representative NAME whose comparison yielded this diff.
    sample_name: str
    # The diff text (with context lines) kept for display.
    original_text: str
def parse_args() -> argparse.Namespace:
    """Parse and validate command-line arguments.

    Returns a namespace with ``large_diff_threshold``, ``context`` and
    ``directory`` attributes.  Exits via ``parser.error`` when the given
    directory does not exist.
    """
    parser = argparse.ArgumentParser(
        prog="main.py",
        usage="%(prog)s [-t THRESHOLD] [-c CONTEXT] [DIR]",
    )
    parser.add_argument(
        "-t",
        "--large-diff-threshold",
        type=int,
        default=20,
        help="Maximum diff line count before counting as too large (default: 20)",
    )
    parser.add_argument(
        "-c",
        "--context",
        type=int,
        default=1,
        # Fix: help text previously claimed the default was 0, but it is 1.
        help="Number of context lines to show around changes (default: 1)",
    )
    parser.add_argument(
        "directory",
        nargs="?",
        default=".",
        help="ccache debug directory to scan recursively (default: current directory)",
    )
    args = parser.parse_args()
    if not Path(args.directory).is_dir():
        parser.error(f"directory does not exist: {args.directory}")
    return args
def colorize_diff(diff_text: str, use_color: bool) -> str:
    """Return *diff_text* with ANSI colors keyed on each line's diff marker.

    Hunk headers ("@@") turn cyan, additions green, deletions red; all other
    lines pass through untouched.  When *use_color* is false the input is
    returned unchanged.
    """
    if not use_color:
        return diff_text
    # Checked in order: "@@" must win before the bare "+"/"-" prefixes.
    palette = (("@@", CYAN), ("+", GREEN), ("-", RED))
    pieces: list[str] = []
    for raw in diff_text.splitlines(keepends=True):
        for marker, code in palette:
            if raw.startswith(marker):
                pieces.append(f"{code}{raw}{RESET}")
                break
        else:
            pieces.append(raw)
    return "".join(pieces)
def collect_records(root: Path) -> dict[str, list[tuple[str, Path]]]:
    """Recursively find ccache input-text files under *root*, grouped by NAME.

    Only files matching ``NAME.o.YYYYMMDD_HHMMSS_UUUUUU.ccache-input-text``
    are collected.  Returns a mapping NAME -> list of (timestamp, path).
    """
    # NAME.o.<date>_<time>_<microseconds>.ccache-input-text
    pattern = re.compile(r"^(.+)\.o\.([0-9]{8}_[0-9]{6}_[0-9]{6})\.ccache-input-text$")
    grouped: dict[str, list[tuple[str, Path]]] = {}
    for filepath in root.rglob("*.ccache-input-text"):
        match = pattern.match(filepath.name)
        if not match:
            continue
        name, timestamp = match.groups()
        # setdefault replaces the manual "key missing" check-and-insert dance.
        grouped.setdefault(name, []).append((timestamp, filepath))
    return grouped
def read_file_filtered(path: Path, name: str) -> list[str] | None:
    """Read *path*, dropping lines that end with *name*, stopping at '### cpp'.

    Returns the kept lines (with newlines), or None when the file cannot be
    read (a warning is printed to stderr in that case).
    """
    kept: list[str] = []
    try:
        with path.open("r", encoding="utf-8", errors="replace") as handle:
            for raw in handle:
                # 'cpp' indicates the whole preprocessed file is pasted there.
                # It *should* be the last key and is very long to diff, so stop.
                if "### cpp" in raw:
                    break
                if raw.rstrip("\n").endswith(name):
                    continue
                kept.append(raw)
    except OSError as exc:
        print(f"Warning: failed to read file: {path} ({exc})", file=sys.stderr)
        return None
    return kept
def get_diff(old_lines: list[str], new_lines: list[str], context: int = 0) -> tuple[str, str]:
    """Diff two line lists.

    Returns ``(diff_no_context, diff_with_context)``: the first string holds
    only the added/removed lines, the second additionally pulls in up to
    *context* surrounding unchanged lines for display.
    """
    compared = list(difflib.Differ().compare(old_lines, new_lines))
    changed = [pos for pos, entry in enumerate(compared) if entry.startswith(("+", "-"))]
    bare = "".join(compared[pos] for pos in changed)
    # Expand each changed index into a clamped window of neighbours.
    last = len(compared) - 1
    wanted: set[int] = set()
    for pos in changed:
        for neighbour in range(pos - context, pos + context + 1):
            wanted.add(max(0, min(last, neighbour)))
    contextual = "".join(compared[pos] for pos in sorted(wanted))
    return bare, contextual
def process_diff_groups(
    grouped: dict[str, list[tuple[str, Path]]],
    threshold: int,
    context: int = 0,
) -> tuple[dict[str, DiffInfo], int]:
    """Compute and deduplicate diffs for every NAME group.

    For each group the two most recent files (by timestamp) are compared.
    Diffs are normalized by sorting their no-context lines so identical
    changes across groups collapse into a single entry.  Returns the mapping
    of normalized diff -> DiffInfo plus the number of diffs skipped for
    exceeding *threshold* lines.
    """
    deduped: dict[str, DiffInfo] = {}
    skipped_large = 0
    for name in sorted(grouped):
        # Newest first; a group needs at least two snapshots to compare.
        by_time = sorted(grouped[name], key=lambda rec: rec[0], reverse=True)
        if len(by_time) < 2:
            continue
        newest_path = by_time[0][1]
        previous_path = by_time[1][1]
        older = read_file_filtered(previous_path, name)
        newer = read_file_filtered(newest_path, name)
        if older is None or newer is None:
            continue
        plain, contextual = get_diff(older, newer, context)
        if plain:
            # Sort lines of the no-context diff for order-independent dedup.
            key = "".join(sorted(plain.splitlines(keepends=True)))
            # Skip diffs that exceed the line threshold.
            if len(key.splitlines()) > threshold:
                skipped_large += 1
                continue
        else:
            key = NO_DIFF
        # Store the context version for later printing.
        entry = deduped.get(key)
        if entry is None:
            entry = DiffInfo(count=0, sample_name=name, original_text=contextual)
            deduped[key] = entry
        entry.count += 1
    return deduped, skipped_large
def print_results(
    diffs: dict[str, DiffInfo],
    large_diff_count: int,
    threshold: int,
    use_color: bool,
) -> None:
    """Report deduplicated diffs (most frequent first) and summary counts."""
    total_compared = sum(entry.count for entry in diffs.values())
    print(f"Compared NAME groups: {total_compared}")
    print(f"Unique diffs: {len(diffs)}")
    print()
    # Most frequent first; ties broken by diff text so output is deterministic.
    for key, entry in sorted(diffs.items(), key=lambda kv: (-kv[1].count, kv[0])):
        print(f"=== Diff that occurred {entry.count} time(s), SAMPLE NAME: {entry.sample_name} ===")
        if key == NO_DIFF:
            print("(no differences)")
        else:
            # Avoid a double newline when the diff already ends with one.
            terminator = "" if entry.original_text.endswith("\n") else "\n"
            print(colorize_diff(entry.original_text, use_color), end=terminator)
        print()
    # Mention diffs that were dropped for being over the size threshold.
    if large_diff_count > 0:
        message = (
            f"{large_diff_count} diff(s) were too large to show "
            f"(threshold: {threshold} lines)."
        )
        print(f"{YELLOW}{message}{RESET}" if use_color else message)
def main() -> int:
    """Entry point: collect input-text files, diff them, and report.

    Returns the process exit status (always 0; argparse exits on bad input).
    """
    args = parse_args()
    # Only colorize when stdout is an interactive terminal.
    colorize = sys.stdout.isatty()
    search_root = Path(args.directory)
    records_by_name = collect_records(search_root)
    if not records_by_name:
        print(f"No files matching NAME.o.TIMESTAMP.ccache-input-text were found in: {search_root}")
        return 0
    unique_diffs, skipped = process_diff_groups(
        records_by_name, args.large_diff_threshold, args.context
    )
    if not unique_diffs:
        print("No comparable NAME groups were found (need at least two matching files per NAME).")
        return 0
    print_results(unique_diffs, skipped, args.large_diff_threshold, colorize)
    return 0
if __name__ == "__main__":
    # sys.exit raises SystemExit with main()'s return value, same as before.
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment