Last active
December 26, 2024 12:06
-
-
Save mbollmann/827a079023ebdd18b4d06c28566fac0d to your computer and use it in GitHub Desktop.
Script for acl-org/acl-anthology, to help discover differences in generated YAML files between the old and new build pipeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# | |
# Copyright 2024 Marcel Bollmann <[email protected]> | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
"""Usage: diff_hugo_yaml.py BUILD_OLD BUILD_NEW [options] | |
This script diffs the auto-generated YAML files between two build directories. | |
The script defines a number of expected differences that will not be considered | |
for the diff. Read the code in this file to see all the changes that are being | |
ignored, and run this file on two build directories to see any remaining | |
differences. | |
Arguments: | |
BUILD_OLD Old build data directory, created on branch 'master' | |
BUILD_NEW New build data directory, created on branch 'build-pipeline-with-new-library' | |
Options: | |
-c, --ignore-citations Ignore differences in citation keys. | |
-e, --ignore-events Ignore _added_ events on papers. | |
-n, --ignore-name-split Ignore differences in first/last name splits. | |
-o, --ignore-order Ignore differences in the order of certain lists. | |
-w, --ignore-whitespace Ignore differences in whitespace. | |
-A, --skip-authors Don't diff authors (people). | |
-P, --skip-papers Don't diff papers. | |
-h, --help Display this helpful text. | |
""" | |
from deepdiff import DeepDiff | |
from deepdiff.operator import BaseOperator | |
from docopt import docopt | |
import logging as log | |
from pathlib import Path | |
import re | |
from rich import print | |
from rich.console import Console | |
from rich.logging import RichHandler | |
import types | |
import yaml | |
try: | |
from yaml import CSafeLoader as Loader | |
except ImportError: | |
from yaml import SafeLoader as Loader | |
# Matches a Markdown link "[text](target)" whose target does not already end
# in a slash. Group 1 captures everything except the closing parenthesis, so a
# substitution can append "/" before re-closing the link.
MARKDOWN_LINK = re.compile(r"(\[.*\]\([^)]*[^\/])\)")

# Same idea for the opening of an HTML anchor: group 1 is '<a href="...'
# without the closing quote, and only matches when the URL lacks a final slash.
HTML_LINK = re.compile(r'(<a href="[^"]*[^\/])">')

# Runtime switches; all are off by default and are turned on from the
# command-line options in the script's entry point.
IGNORE_CITATIONS: bool = False  # -c / --ignore-citations
IGNORE_EVENTS: bool = False  # -e / --ignore-events
IGNORE_NAME_SPLIT: bool = False  # -n / --ignore-name-split
IGNORE_ORDER: bool = False  # -o / --ignore-order
IGNORE_WHITESPACE: bool = False  # -w / --ignore-whitespace
def print_pretty(diff, filename, key):
    """Print a diff report line by line, each prefixed with its origin.

    Args:
        diff: An object exposing ``pretty()`` that returns the textual report
            (in this file, always a ``DeepDiff`` result).
        filename: Name of the YAML file the diff belongs to; shown in purple.
        key: Top-level key inside that YAML file whose values were compared.
    """
    # Imported here so the function carries its own dependency. The module's
    # ``print`` is Rich's, which interprets square-bracket tags as markup.
    from rich.markup import escape

    for line in diff.pretty().splitlines():
        # The report text comes from arbitrary YAML values (e.g. Markdown
        # citation links), so escape it to keep bracketed text from being
        # parsed as Rich markup; only our own [purple] tag stays active.
        print(f"[purple]{filename}[/]['{key}']: {escape(line)}")
class WhitespaceOperator(BaseOperator):
    """DeepDiff operator that treats strings differing only in whitespace as equal.

    Ignores leading/trailing whitespace, as well as double spaces (which can
    appear due to a change in how single-author names are rendered).
    """

    def __init__(self):
        # Restrict the operator to string values.
        super().__init__(types=(str,))

    @staticmethod
    def _normalize(text):
        """Strip surrounding whitespace and turn each double space into one."""
        return text.strip().replace("  ", " ")

    def give_up_diffing(self, level, diff_instance):
        """Return True when both sides are equal after whitespace normalization.

        Args:
            level: The diff level; ``level.t1`` and ``level.t2`` are the old
                and new string values.
            diff_instance: The running DeepDiff instance (unused).
        """
        return self._normalize(level.t1) == self._normalize(level.t2)
class NoneVsEmptyStringOperator(BaseOperator):
    """DeepDiff operator under which ``None`` and ``''`` count as the same value."""

    def match(self, level):
        """Apply only when each side is either ``None`` or a string."""
        sides_ok = all(
            side is None or isinstance(side, str) for side in (level.t1, level.t2)
        )
        if sides_ok:
            return True

    def give_up_diffing(self, level, diff_instance):
        """Suppress the difference when both sides are falsy (``None`` or ``''``)."""
        if not (level.t1 or level.t2):
            return True
class NameSplitOperator(BaseOperator):
    """DeepDiff operator that disregards how a name is split into first/last.

    Two name dicts are treated as equal when their "full" value and their
    (optional) "id" value agree, regardless of any other keys.
    """

    def match(self, level):
        """Apply only to pairs of dicts that both carry a "full" key."""
        old, new = level.t1, level.t2
        both_dicts = isinstance(old, dict) and isinstance(new, dict)
        if both_dicts and "full" in old and "full" in new:
            return True

    def give_up_diffing(self, level, diff_instance):
        """Suppress the difference when full name and id are identical."""
        old, new = level.t1, level.t2
        same_full_name = old["full"] == new["full"]
        if same_full_name and old.get("id") == new.get("id"):
            return True
# Keyword arguments shared by every DeepDiff call in this file. The operator
# list is extended at startup depending on the command-line options.
DEEPDIFF_ARGS = dict(
    verbose_level=2,
    custom_operators=[NoneVsEmptyStringOperator()],
)
def filter_diff(diff):
    """Drop whitespace-only changes from a diff report, if enabled.

    When ``IGNORE_WHITESPACE`` is set, every entry under ``"values_changed"``
    whose new value equals the stripped old value is removed in place.
    Changes involving a non-string old value are never whitespace-only and
    are left untouched.

    Args:
        diff: Dict-like diff report, mapping ``"values_changed"`` to a dict of
            ``{path: {"new_value": ..., "old_value": ...}}``.

    Returns:
        The same ``diff`` object, possibly with entries removed.
    """
    # [diff] ignore changes in whitespace
    if not IGNORE_WHITESPACE:
        return diff

    whitespace_only = [
        root
        for root, changes in diff.get("values_changed", {}).items()
        # Guard the type first: calling .strip() on ints/None would raise.
        if isinstance(changes["old_value"], str)
        and changes["new_value"] == changes["old_value"].strip()
    ]
    # Deletion happens in a second pass so the dict is not mutated while
    # it is being iterated.
    for root in whitespace_only:
        del diff["values_changed"][root]
    return diff
def diff_papers_yaml(build_old, build_new):
    """Diff every ``papers/*.yaml`` file of the old build against the new one.

    Before comparing, each old entry is rewritten in place to account for the
    expected differences between the pipelines (marked ``[improvement]``,
    ``[change]`` or ``[diff]`` below), so that only unexpected differences
    are printed.

    Args:
        build_old: ``pathlib.Path`` of the old build data directory.
        build_new: ``pathlib.Path`` of the new build data directory.

    Raises:
        AssertionError: If a file's top-level keys differ between the builds.
    """

    # [diff] We ignore the order in which attachments are listed
    def ignore_order_func(level):
        return IGNORE_ORDER and "'attachment'" in level.path()

    for yamlfile in build_old.glob("./papers/*.yaml"):
        log.info(f"Diffing papers/{yamlfile.name}")
        with open(yamlfile, "r") as f:
            a = yaml.load(f, Loader=Loader)
        with open(build_new / "papers" / yamlfile.name, "r") as f:
            b = yaml.load(f, Loader=Loader)
        assert list(a.keys()) == list(
            b.keys()
        ), f"{yamlfile.name} should have identical keys"

        for key, value_old in a.items():
            value_new = b[key]

            # [improvement] URLs don't have the final slash in the old version
            if "url" in value_old and "aclanthology.org" in value_old["url"]:
                value_old["url"] += "/"

            if IGNORE_CITATIONS:
                # The citation keys are optional (see the else-branch), so
                # remove them without failing when one is absent.
                for entry in (value_old, value_new):
                    entry.pop("citation", None)
                    entry.pop("citation_acl", None)
            else:
                if "citation" in value_old:
                    # Add the trailing slash to the linked URL and flatten
                    # newlines to spaces.
                    value_old["citation"] = MARKDOWN_LINK.sub(
                        r"\1/)", value_old["citation"]
                    )
                    value_old["citation"] = value_old["citation"].replace("\n", " ")
                if "citation_acl" in value_old:
                    value_old["citation_acl"] = HTML_LINK.sub(
                        r'\1/">', value_old["citation_acl"]
                    )
                    value_old["citation_acl"] = value_old["citation_acl"].replace(
                        ", edition.", "."
                    )

            # [improvement] some empty tags are no longer serialized
            for tag in ("address", "language", "publisher"):
                if tag in value_old and not value_old[tag]:
                    del value_old[tag]

            # [change] "mrf" is currently not handled
            value_old.pop("mrf", None)

            # [improvement] if "pdf" tag is missing, "thumbnail" tag is no longer serialized
            if "pdf" not in value_old and "thumbnail" in value_old:
                del value_old["thumbnail"]

            # [improvement] attachment types are now capitalized
            for attachment in value_old.get("attachment", []):
                attachment["type"] = attachment["type"].capitalize()

            # [improvement] superfluous "journal-title" key no longer exists
            value_old.pop("journal-title", None)

            # [improvement] superfluous "booktitle_html" key no longer exists
            value_old.pop("booktitle_html", None)

            # [improvement] single-page articles did not previously serialize page_first + page_last
            # [improvement/change] single-pages articles output "42" instead of "42-42" in pages
            # (the differing defaults 0 and 1 keep the test false when both
            # keys are missing from the new entry)
            if value_new.get("page_first", 0) == value_new.get("page_last", 1):
                value_old["page_first"] = value_new["page_first"]
                value_old["page_last"] = value_new["page_last"]
                value_old["pages"] = value_old["page_first"]

            # [improvement] frontmatter of journals now has bibkey "book" instead of "proceedings"
            if value_old.get("bibtype", "") == "proceedings" and key.endswith("0"):
                # we can't directly _see_ if it's a journal here, so we peek at the next item...
                if a.get(key[:-1] + "1", {}).get("bibtype", "") == "article":
                    value_old["bibtype"] = "book"

            # [diff] more events are explicitly connected with papers
            if IGNORE_EVENTS and (events := value_new.get("events", [])):
                if "events" not in value_old:
                    value_old["events"] = value_new["events"]
                else:
                    for event in events:
                        if event not in value_old["events"]:
                            value_old["events"].append(event)

            if diff := DeepDiff(
                value_old, value_new, ignore_order_func=ignore_order_func, **DEEPDIFF_ARGS
            ):
                print_pretty(diff, yamlfile.name, key)
def diff_people_yaml(build_old, build_new):
    """Diff every ``people/*.yaml`` file of the old build against the new one.

    Keys present in only one of the two files are reported as removed or
    added; keys present in both are compared with DeepDiff.

    Args:
        build_old: ``pathlib.Path`` of the old build data directory.
        build_new: ``pathlib.Path`` of the new build data directory.
    """
    # [diff] We ignore the order of co-authors, venues, and paper; while they
    # should of course be ordered based on their counts or years, the order
    # within those groups often differs
    unordered_fields = (
        "'coauthors'",
        "'venues'",
        "'papers'",
        "'variant_entries'",
        "'similar'",
    )

    def ignore_order_func(level):
        return IGNORE_ORDER and any(
            field in level.path() for field in unordered_fields
        )

    for old_path in build_old.glob("./people/*.yaml"):
        filename = old_path.name
        log.info(f"Diffing people/{filename}")
        with open(old_path, "r") as stream:
            people_old = yaml.load(stream, Loader=Loader)
        with open(build_new / "people" / filename, "r") as stream:
            people_new = yaml.load(stream, Loader=Loader)

        if list(people_old.keys()) != list(people_new.keys()):
            only_old = set(people_old.keys()) - set(people_new.keys())
            for key in only_old:
                print(f"[purple]{filename}[/]: Key '{key}' removed from the file.")
            only_new = set(people_new.keys()) - set(people_old.keys())
            for key in only_new:
                print(f"[purple]{filename}[/]: Key '{key}' added to the file.")

        for key, value_old in people_old.items():
            # Keys that vanished were already reported above.
            if key in people_new:
                value_new = people_new[key]
                if diff := DeepDiff(
                    value_old,
                    value_new,
                    ignore_order_func=ignore_order_func,
                    **DEEPDIFF_ARGS,
                ):
                    print_pretty(diff, filename, key)

        # Drop the references to the parsed files before loading the next pair.
        del people_old, people_new
def diff_volumes_yaml(build_old, build_new):
    """Diff ``volumes.yaml`` between the old and the new build directory.

    Each old entry is first adjusted in place for the expected pipeline
    differences (see the tagged comments), then compared with DeepDiff.

    Args:
        build_old: ``pathlib.Path`` of the old build data directory.
        build_new: ``pathlib.Path`` of the new build data directory.

    Raises:
        AssertionError: If the top-level keys differ between the two files.
    """
    with open(build_old / "volumes.yaml", "r") as stream:
        volumes_old = yaml.load(stream, Loader=Loader)
    with open(build_new / "volumes.yaml", "r") as stream:
        volumes_new = yaml.load(stream, Loader=Loader)
    assert list(volumes_old.keys()) == list(
        volumes_new.keys()
    ), "volumes.yaml should have identical keys"

    # [diff] We ignore the order of SIGs
    def ignore_order_func(level):
        return IGNORE_ORDER and "'sigs'" in level.path()

    for key, value_old in volumes_old.items():
        value_new = volumes_new[key]

        # [improvement] URLs don't have the final slash in the old version
        if "url" in value_old and "aclanthology.org" in value_old["url"]:
            value_old["url"] += "/"

        # [improvement] Single-value names had a space prepended in the old version
        for editor in value_old.get("editor", []):
            if editor["first"] == "":
                editor["first"] = None
                editor["full"] = editor["full"][1:]

        # [improvement] superfluous "venue" and "journal-title" keys no longer exist
        for obsolete in ("venue", "journal-title"):
            value_old.pop(obsolete, None)

        # [improvement] empty "address" and "publisher" tags are no longer serialized
        for optional in ("address", "publisher"):
            if optional in value_old and not value_old[optional]:
                del value_old[optional]

        # [improvement] The new pipeline lists more events than before
        for event in value_new["events"]:
            if event not in value_old["events"]:
                value_old["events"].append(event)

        # [improvement] The new pipeline lists meta_issue and meta_volume
        for meta_key in ("meta_issue", "meta_volume"):
            if meta_key in value_new:
                value_old[meta_key] = value_new[meta_key]

        if diff := DeepDiff(
            value_old, value_new, ignore_order_func=ignore_order_func, **DEEPDIFF_ARGS
        ):
            print_pretty(diff, "volumes.yaml", key)
def diff_venues_yaml(build_old, build_new):
    """Diff ``venues.yaml`` between the old and the new build directory.

    Args:
        build_old: ``pathlib.Path`` of the old build data directory.
        build_new: ``pathlib.Path`` of the new build data directory.

    Raises:
        AssertionError: If the top-level keys differ between the two files.
    """

    # [diff] We ignore the order of volumes within the years
    # (but there should only be minor differences)
    def ignore_order_func(level):
        if not IGNORE_ORDER:
            return IGNORE_ORDER
        return "volumes_by_year']['" in level.path()

    with open(build_old / "venues.yaml", "r") as stream:
        venues_old = yaml.load(stream, Loader=Loader)
    with open(build_new / "venues.yaml", "r") as stream:
        venues_new = yaml.load(stream, Loader=Loader)

    keys_old = list(venues_old.keys())
    keys_new = list(venues_new.keys())
    assert keys_old == keys_new, "venues.yaml should have identical keys"

    for key, value_old in venues_old.items():
        diff = DeepDiff(
            value_old,
            venues_new[key],
            ignore_order_func=ignore_order_func,
            **DEEPDIFF_ARGS,
        )
        if diff:
            print_pretty(diff, "venues.yaml", key)
def diff_events_yaml(build_old, build_new):
    """Diff ``events.yaml`` between the old and the new build directory.

    Args:
        build_old: ``pathlib.Path`` of the old build data directory.
        build_new: ``pathlib.Path`` of the new build data directory.

    Raises:
        AssertionError: If the top-level keys differ between the two files.
    """

    # [diff] We ignore the order of volumes
    # (but there should only be minor differences)
    def ignore_order_func(level):
        return IGNORE_ORDER and "'volumes'" in level.path()

    with open(build_old / "events.yaml", "r") as stream:
        events_old = yaml.load(stream, Loader=Loader)
    with open(build_new / "events.yaml", "r") as stream:
        events_new = yaml.load(stream, Loader=Loader)

    same_keys = list(events_old.keys()) == list(events_new.keys())
    assert same_keys, "events.yaml should have identical keys"

    for key, value_old in events_old.items():
        value_new = events_new[key]
        diff = DeepDiff(
            value_old, value_new, ignore_order_func=ignore_order_func, **DEEPDIFF_ARGS
        )
        if diff:
            print_pretty(diff, "events.yaml", key)
def diff_sigs_yaml(build_old, build_new):
    """Diff ``sigs.yaml`` between the old and the new build directory.

    Each old entry is adjusted in place for the expected differences (year
    keys becoming strings, empty ``url`` values being omitted) before it is
    compared with DeepDiff.

    Args:
        build_old: ``pathlib.Path`` of the old build data directory.
        build_new: ``pathlib.Path`` of the new build data directory.

    Raises:
        AssertionError: If the top-level keys differ between the two files.
    """
    with open(build_old / "sigs.yaml", "r") as f:
        a = yaml.load(f, Loader=Loader)
    with open(build_new / "sigs.yaml", "r") as f:
        b = yaml.load(f, Loader=Loader)
    assert list(a.keys()) == list(b.keys()), "sigs.yaml should have identical keys"

    for key, value_old in a.items():
        value_new = b[key]

        # [diff] Year keys are serialized as strings now
        # Convert the old side's own keys rather than looking up the new
        # side's years in the old mapping: a year present in only one file is
        # then reported as a difference instead of raising KeyError.
        if "volumes_by_year" in value_old:
            value_old["volumes_by_year"] = {
                str(year): volumes
                for year, volumes in value_old["volumes_by_year"].items()
            }

        # [diff] url: None is omitted
        if "url" in value_old and not value_old["url"]:
            del value_old["url"]
        for volumes in value_old.get("volumes_by_year", {}).values():
            for volume in volumes:
                if isinstance(volume, dict) and "url" in volume and not volume["url"]:
                    del volume["url"]

        if diff := DeepDiff(value_old, value_new, **DEEPDIFF_ARGS):
            print_pretty(diff, filename="sigs.yaml", key=key)
if __name__ == "__main__":
    # Parse the command line against the usage text in the module docstring.
    args = docopt(__doc__)

    # Log to stderr so that the diff output on stdout stays clean.
    log.basicConfig(
        format="%(message)s",
        level=log.INFO,
        handlers=[RichHandler(console=Console(stderr=True))],
    )

    # Translate the option flags into the module-level switches that the
    # diff functions read.
    IGNORE_CITATIONS = bool(args["--ignore-citations"])
    IGNORE_EVENTS = bool(args["--ignore-events"])
    IGNORE_ORDER = bool(args["--ignore-order"])
    IGNORE_WHITESPACE = bool(args["--ignore-whitespace"])
    IGNORE_NAME_SPLIT = bool(args["--ignore-name-split"])

    # Optional DeepDiff operators are registered in this order: whitespace
    # first, then name split.
    if IGNORE_WHITESPACE:
        DEEPDIFF_ARGS["custom_operators"].append(WhitespaceOperator())
    if IGNORE_NAME_SPLIT:
        DEEPDIFF_ARGS["custom_operators"].append(NameSplitOperator())

    build_old = Path(args["BUILD_OLD"])
    build_new = Path(args["BUILD_NEW"])
    # Validate user input explicitly: a bare assert is stripped under -O and
    # gives no hint about which argument is wrong.
    for label, directory in (("BUILD_OLD", build_old), ("BUILD_NEW", build_new)):
        if not directory.is_dir():
            raise SystemExit(f"{label} is not a directory: {directory}")

    if not args["--skip-papers"]:
        diff_papers_yaml(build_old, build_new)
    if not args["--skip-authors"]:
        diff_people_yaml(build_old, build_new)

    log.info("Diffing events.yaml")
    diff_events_yaml(build_old, build_new)
    log.info("Diffing sigs.yaml")
    diff_sigs_yaml(build_old, build_new)
    log.info("Diffing venues.yaml")
    diff_venues_yaml(build_old, build_new)
    log.info("Diffing volumes.yaml")
    diff_volumes_yaml(build_old, build_new)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment