Skip to content

Instantly share code, notes, and snippets.

@iTrooz
Created March 29, 2026 02:26
Show Gist options
  • Select an option

  • Save iTrooz/b5b02e7bde8509bcbbbabdfac8c1bd1a to your computer and use it in GitHub Desktop.

Select an option

Save iTrooz/b5b02e7bde8509bcbbbabdfac8c1bd1a to your computer and use it in GitHub Desktop.
Opinionated diffing tool for ccache input-text files
#!/usr/bin/env python3
"""
https://gist.github.com/iTrooz/b5b02e7bde8509bcbbbabdfac8c1bd1a
Opinionated diffing tool for ccache input-text files, focused on identifying cache miss reasons between two builds of the same files.
Usage: main.py [-t THRESHOLD] [-c CONTEXT] [CCACHE_DEBUG_DIR]
Enable ccache debug mode first:
debug = true
debug_dir = /home/itrooz/.cache/ccache-debug
"""
from __future__ import annotations
import argparse
import difflib
import re
import sys
from dataclasses import dataclass
from pathlib import Path
# Sentinel value representing no differences between files
NO_DIFF = "<<NO_DIFF>>"
# ANSI color codes
RESET = "\033[0m"  # restore default terminal attributes
RED = "\033[31m"  # removed ('-') diff lines
GREEN = "\033[32m"  # added ('+') diff lines
CYAN = "\033[36m"  # hunk headers ('@@')
YELLOW = "\033[33m"  # summary notice about skipped large diffs
@dataclass
class DiffInfo:
    """Holds information about a unique diff pattern."""
    # Number of NAME groups that produced this exact normalized diff.
    count: int
    # One representative NAME whose comparison produced this diff.
    sample_name: str
    # Diff text as produced (with context lines), used for display.
    original_text: str
def parse_args() -> argparse.Namespace:
    """Parse and validate command-line arguments.

    Returns a namespace with `large_diff_threshold`, `context`, and
    `directory` attributes. Exits via parser.error() if the directory
    argument does not name an existing directory.
    """
    parser = argparse.ArgumentParser(
        prog="main.py",
        usage="%(prog)s [-t THRESHOLD] [-c CONTEXT] [DIR]",
    )
    parser.add_argument(
        "-t",
        "--large-diff-threshold",
        type=int,
        default=20,
        # %(default)s keeps the help text in sync with the actual default.
        help="Maximum diff line count before counting as too large (default: %(default)s)",
    )
    parser.add_argument(
        "-c",
        "--context",
        type=int,
        default=1,
        # Bug fix: the help text previously claimed the default was 0,
        # but the actual default is 1. %(default)s prevents future drift.
        help="Number of context lines to show around changes (default: %(default)s)",
    )
    parser.add_argument("directory", nargs="?", default=".")
    args = parser.parse_args()
    # Fail early with a clear message rather than later during file collection.
    if not Path(args.directory).is_dir():
        parser.error(f"directory does not exist: {args.directory}")
    return args
def colorize_diff(diff_text: str, use_color: bool) -> str:
    """Return diff_text with ANSI colors applied per diff marker.

    Hunk headers ('@@') are rendered cyan, additions ('+') green and
    removals ('-') red; all other lines pass through untouched. When
    use_color is false the input is returned unmodified.
    """
    if not use_color:
        return diff_text

    def paint(line: str) -> str:
        # Check the hunk header first so '@@' is not mistaken for context.
        if line.startswith("@@"):
            return f"{CYAN}{line}{RESET}"
        if line.startswith("+"):
            return f"{GREEN}{line}{RESET}"
        if line.startswith("-"):
            return f"{RED}{line}{RESET}"
        return line

    return "".join(paint(ln) for ln in diff_text.splitlines(keepends=True))
def collect_records(root: Path) -> dict[str, list[tuple[str, Path]]]:
    """Walk root recursively and bucket ccache input-text files by NAME.

    Only filenames shaped like NAME.o.YYYYMMDD_HHMMSS_UUUUUU.ccache-input-text
    are considered; everything else is ignored. Returns a mapping
    NAME -> list of (timestamp, path) pairs.
    """
    filename_re = re.compile(
        r"^(.+)\.o\.([0-9]{8}_[0-9]{6}_[0-9]{6})\.ccache-input-text$"
    )
    records: dict[str, list[tuple[str, Path]]] = {}
    for candidate in root.rglob("*.ccache-input-text"):
        parsed = filename_re.match(candidate.name)
        if parsed is None:
            continue
        # Group by NAME (group 1); keep the timestamp (group 2) for sorting later.
        records.setdefault(parsed.group(1), []).append((parsed.group(2), candidate))
    return records
def read_file_filtered(path: Path, name: str) -> list[str] | None:
    """Read path's lines up to (excluding) the '### cpp' key, dropping NAME lines.

    Lines whose content (newline stripped) ends with name are filtered out.
    Returns None when the file cannot be read; a warning is printed to stderr.
    """
    try:
        with path.open("r", encoding="utf-8", errors="replace") as fh:
            raw_lines = fh.readlines()
    except OSError as err:
        print(f"Warning: failed to read file: {path} ({err})", file=sys.stderr)
        return None
    kept: list[str] = []
    for raw in raw_lines:
        # The 'cpp' key embeds the whole preprocessed file. It *should* be the
        # last key and is far too long to diff usefully, so stop right there.
        if "### cpp" in raw:
            break
        if raw.rstrip("\n").endswith(name):
            continue
        kept.append(raw)
    return kept
def get_diff(old_lines: list[str], new_lines: list[str], context: int = 0) -> tuple[str, str]:
    """Diff two line lists with difflib.Differ.

    Returns (diff_no_context, diff_with_context): the first string holds
    only the changed ('+'/'-') lines, the second additionally pulls in up
    to `context` surrounding lines around each change for display.
    """
    raw = list(difflib.Differ().compare(old_lines, new_lines))
    changed = [i for i, entry in enumerate(raw) if entry[:1] in ("+", "-")]
    bare = "".join(raw[i] for i in changed)
    # Widen each change into a clamped window of neighbouring lines.
    last = len(raw) - 1
    shown: set[int] = set()
    for i in changed:
        shown.update(range(max(0, i - context), min(last, i + context) + 1))
    contextual = "".join(raw[i] for i in sorted(shown))
    return bare, contextual
def process_diff_groups(
    grouped: dict[str, list[tuple[str, Path]]],
    threshold: int,
    context: int = 0,
) -> tuple[dict[str, DiffInfo], int]:
    """Compare the two newest files of every NAME group and dedupe the diffs.

    Diff text is normalized by sorting its (context-free) lines so that
    identical change sets map to the same key regardless of line order.
    Diffs longer than `threshold` lines are skipped and only counted.
    Returns (normalized-diff -> DiffInfo, number of oversized diffs skipped).
    """
    deduped: dict[str, DiffInfo] = {}
    oversized = 0
    for name in sorted(grouped):
        # Newest first; a group needs at least two snapshots to compare.
        by_time = sorted(grouped[name], key=lambda rec: rec[0], reverse=True)
        if len(by_time) < 2:
            continue
        newer_path = by_time[0][1]
        older_path = by_time[1][1]
        old_lines = read_file_filtered(older_path, name)
        new_lines = read_file_filtered(newer_path, name)
        if old_lines is None or new_lines is None:
            continue
        bare, contextual = get_diff(old_lines, new_lines, context)
        if not bare:
            key = NO_DIFF
        else:
            # Sorting the context-free diff lines makes the dedup key
            # insensitive to where in the file the changes occurred.
            key = "".join(sorted(bare.splitlines(keepends=True)))
            # Oversized diffs are tallied but never stored for display.
            if len(key.splitlines()) > threshold:
                oversized += 1
                continue
        entry = deduped.get(key)
        if entry is None:
            # Keep the context version of the first occurrence for printing.
            entry = DiffInfo(count=0, sample_name=name, original_text=contextual)
            deduped[key] = entry
        entry.count += 1
    return deduped, oversized
def print_results(
    diffs: dict[str, DiffInfo],
    large_diff_count: int,
    threshold: int,
    use_color: bool,
) -> None:
    """Report deduplicated diffs, most frequent first, plus summary counts."""
    print(f"Compared NAME groups: {sum(info.count for info in diffs.values())}")
    print(f"Unique diffs: {len(diffs)}")
    print()
    # Frequency descending, then normalized text ascending, for determinism.
    for key, info in sorted(diffs.items(), key=lambda kv: (-kv[1].count, kv[0])):
        print(f"=== Diff that occurred {info.count} time(s), SAMPLE NAME: {info.sample_name} ===")
        if key == NO_DIFF:
            print("(no differences)")
        else:
            body = colorize_diff(info.original_text, use_color)
            # Suppress print's newline when the diff already ends with one.
            print(body, end="" if info.original_text.endswith("\n") else "\n")
        print()
    # Note any diffs that were skipped for exceeding the line threshold.
    if large_diff_count > 0:
        message = (
            f"{large_diff_count} diff(s) were too large to show "
            f"(threshold: {threshold} lines)."
        )
        print(f"{YELLOW}{message}{RESET}" if use_color else message)
def main() -> int:
    """Entry point: gather files, diff the newest pairs, dedupe, and report.

    Returns 0 in all non-error cases (argument errors exit via argparse).
    """
    args = parse_args()
    # Only emit ANSI colors when stdout is an interactive terminal.
    colored = sys.stdout.isatty()
    root = Path(args.directory)
    grouped = collect_records(root)
    if not grouped:
        print(f"No files matching NAME.o.TIMESTAMP.ccache-input-text were found in: {root}")
        return 0
    diffs, skipped = process_diff_groups(
        grouped, args.large_diff_threshold, args.context
    )
    if not diffs:
        print("No comparable NAME groups were found (need at least two matching files per NAME).")
        return 0
    print_results(diffs, skipped, args.large_diff_threshold, colored)
    return 0
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment