Created
September 12, 2024 23:01
-
-
Save dmyersturnbull/f0d167b5018dbf1961ede6a78b359bdb to your computer and use it in GitHub Desktop.
Order-independent diff tool
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: Copyright 2020-2024, Contributors | |
# SPDX-PackageHomePage: https://gist.github.com/dmyersturnbull | |
# SPDX-License-Identifier: Apache-2.0 | |
""" | |
Tool to diff two order-independent sets of lines. | |
""" | |
import math | |
import sys | |
from argparse import ArgumentParser, Namespace | |
from dataclasses import dataclass, KW_ONLY | |
from pathlib import Path | |
from typing import Iterator, Self | |
try: | |
import colorama | |
except ImportError: | |
colorama = None | |
@dataclass(frozen=True, slots=True) | |
class DiffResult: | |
"""The difference between two sets of lines.""" | |
added: list[str] | |
removed: list[str] | |
retained: list[str] | |
def summary(self: Self) -> Iterator[str]: | |
title_label = "SUMMARY" | |
added_label = "Added: " | |
removed_label = "Removed: " | |
retained_label = "Retained: " | |
max_value = max(self.n_added, self.n_removed, self.n_retained) | |
title_width = len(title_label) | |
label_width = max(len(added_label), len(removed_label), len(retained_label)) | |
value_width = len(str(max_value)) | |
half_fill = "=" * int(math.ceil(title_width / 2 + value_width / 2)) | |
title_fill = "=" * title_width | |
yield from [ | |
f"{half_fill}{title_label}{half_fill}", | |
f"{added_label:<{label_width}}{self.n_added:>{value_width}}", | |
f"{removed_label:<{label_width}}{self.n_removed:>{value_width}}", | |
f"{retained_label:<{label_width}}{self.n_retained:>{value_width}}", | |
f"{half_fill*2}{title_fill}", | |
] | |
def formatted_output( | |
self: Self, | |
*, | |
sort: bool = False, | |
show_retained: bool = False, | |
color: bool = False, | |
) -> Iterator[str]: | |
"""Generates lines that describe the changes.""" | |
# we'll move the [+-=] to the front after optionally sorting | |
# that's faster than a bespoke sorted(combined, key=...) | |
combined = ( | |
*(line + "+" for line in self.added), | |
*(line + "-" for line in self.removed), | |
*(line + "." for line in self.retained if show_retained), | |
) | |
colorize = { | |
"+": f"{colorama.Style.BRIGHT}{colorama.Fore.GREEN}+ " if color else "+ ", | |
"-": f"{colorama.Style.BRIGHT}{colorama.Fore.RED}+ " if color else "- ", | |
".": f"{colorama.Style.NORMAL}{colorama.Fore.BLACK}. " if color else ". ", | |
} | |
if sort: | |
combined = sorted(combined) | |
yield from (colorize[line[-1]] + line[:-1] for line in combined) | |
@property | |
def n_total(self: Self) -> int: | |
return len(self.added) + len(self.removed) + len(self.retained) | |
@property | |
def n_modified(self: Self) -> int: | |
return len(self.added) + len(self.removed) | |
@property | |
def n_added(self: Self) -> int: | |
return len(self.added) | |
@property | |
def n_removed(self: Self) -> int: | |
return len(self.removed) | |
@property | |
def n_retained(self: Self) -> int: | |
return len(self.retained) | |
@dataclass(frozen=True, slots=True) | |
class Reader: | |
"""""" | |
strip_whitespace: bool = False | |
def __call__(self: Self, file: Path) -> list[str]: | |
"""Read a file and return its lines as a set.""" | |
lines = file.read_text(encoding="utf-8").splitlines() | |
if self.strip_whitespace: | |
return [line.strip() for line in lines] | |
return [line for line in lines if line] # always remove blank lines | |
@dataclass(frozen=True, slots=True)
class Differ:
    """Class to compare two files as unordered sets of lines."""

    def __call__(self: Self, lines1: list[str], lines2: list[str]) -> DiffResult:
        """Compute the unordered difference between two sets of lines.

        Args:
            lines1: Lines of the first ("old") input.
            lines2: Lines of the second ("new") input.

        Returns:
            A `DiffResult` where `added` holds lines only in `lines2` (in
            `lines2` order), while `removed` and `retained` hold lines only
            in `lines1` / in both (in `lines1` order). Each distinct line is
            reported once, even if it is repeated in an input.
        """
        # dict.fromkeys de-duplicates while keeping first-occurrence order,
        # and still gives constant-time membership tests like a set.
        unique1 = dict.fromkeys(lines1)
        unique2 = dict.fromkeys(lines2)
        added = [line for line in unique2 if line not in unique1]
        removed = [line for line in unique1 if line not in unique2]
        retained = [line for line in unique1 if line in unique2]
        return DiffResult(added, removed, retained)
@dataclass(frozen=True, slots=True, order=True) | |
class Args: | |
"""""" | |
file1: Path | |
file2: Path | |
_: KW_ONLY | |
pretty: bool | |
retained: bool | |
sort: bool | |
error: bool | |
@dataclass(frozen=True, slots=True)
class Parser:
    """Turns a raw argument vector into an `Args` instance using argparse."""

    def __call__(self: Self, args: list[str]) -> Args:
        """Parse the command line.

        Args:
            args: The full argument vector; the first element is skipped
                before parsing (pass `sys.argv` as-is).

        Returns:
            The parsed options. argparse itself exits the process on invalid
            arguments or `--help`.
        """
        # argparse's default formatter re-wraps the description, so the
        # line breaks and indentation inside this literal are not significant.
        parser = ArgumentParser(
            description="""
            Compares two files as unordered sets of lines.
            The diff is asymmetric: Lines in `file2` but not `file1` are considered added.
            Although the diff is order-independent,
            lines are printed in the order they appear in `file1` (or `file2` for added lines).
            Empty lines are ignored.
            Use `--pretty` for colored output.
            """
        )
        parser.add_argument("file1", type=Path, help="Path to the first file.")
        parser.add_argument("file2", type=Path, help="Path to the second file.")
        parser.add_argument("-p", "--pretty", action="store_true", help="Color output and write a summary.")
        parser.add_argument("-r", "--retained", action="store_true", help="Output lines that are retained.")
        parser.add_argument("-s", "--sort", action="store_true", help="Sort the lines lexicographically.")
        parser.add_argument("-e", "--error", action="store_true", help="Exit 1 if any lines were added or removed.")
        ns = parser.parse_args(args[1:])
        return Args(ns.file1, ns.file2, pretty=ns.pretty, retained=ns.retained, sort=ns.sort, error=ns.error)
@dataclass(frozen=True, slots=True, kw_only=True) | |
class Printer: | |
"""""" | |
sort: bool | |
pretty: bool | |
show_retained: bool | |
def __call__(self: Self, diff: DiffResult) -> Iterator[str]: | |
yield from ( | |
*self._summary(diff), | |
*self._header(diff), | |
*self._data(diff), | |
*self._footer(diff), | |
) | |
def _summary(self: Self, diff: DiffResult) -> Iterator[str]: | |
if self.pretty: | |
yield "" | |
yield from diff.summary() | |
def _header(self: Self, diff: DiffResult) -> Iterator[str]: | |
if self.pretty: | |
yield "" | |
if self.pretty and diff.n_total == 0: | |
yield "[[ no data to diff ]]" | |
elif self.pretty and diff.n_modified == 0 and not self.show_retained: | |
yield "[[ no changes in diff ]]" | |
elif self.pretty: | |
yield "[[ start of diff ]]" | |
def _footer(self: Self, diff: DiffResult) -> Iterator[str]: | |
if self.pretty and (diff.n_total == 0 or diff.n_modified == 0 and self.show_retained): | |
yield "[[ end of diff ]]" | |
if self.pretty: | |
yield "" | |
def _data(self: Self, diff: DiffResult) -> Iterator[str]: | |
yield from diff.formatted_output(sort=self.sort, show_retained=self.show_retained, color=self.pretty) | |
@dataclass(frozen=True, slots=True)
class Program:
    """
    The diffset program: parse arguments, diff the two files, print the result.
    """

    def run(self: Self, cli_args: list[str]) -> int:
        """Execute the tool and return the process exit code.

        Args:
            cli_args: The full argument vector, as in `sys.argv`.

        Returns:
            1 if `--pretty` was requested but colorama is unavailable, or if
            `--error` was given and any line was added or removed; otherwise 0.
        """
        args = Parser()(cli_args)
        # Pretty output needs colorama, which is an optional import.
        if args.pretty and not colorama:
            print("Cannot import colorama. Install with `pip install colorama`.", file=sys.stderr)
            return 1
        read = Reader()
        old_lines = read(args.file1)
        new_lines = read(args.file2)
        diff = Differ()(old_lines, new_lines)
        printer = Printer(sort=args.sort, pretty=args.pretty, show_retained=args.retained)
        for line in printer(diff):
            print(line)
        return 1 if args.error and diff.n_modified != 0 else 0
# Script entry point: the value returned by `run` becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(Program().run(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment