Skip to content

Instantly share code, notes, and snippets.

@mbollmann
Last active December 26, 2024 12:06
Show Gist options
  • Save mbollmann/827a079023ebdd18b4d06c28566fac0d to your computer and use it in GitHub Desktop.
Save mbollmann/827a079023ebdd18b4d06c28566fac0d to your computer and use it in GitHub Desktop.
Script for acl-org/acl-anthology, to help discover differences in generated YAML files between the old and new build pipeline
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2024 Marcel Bollmann <[email protected]>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Usage: diff_hugo_yaml.py BUILD_OLD BUILD_NEW [options]

This script diffs the auto-generated YAML files between two build directories.
The script defines a number of expected differences that will not be considered
for the diff. Read the code in this file to see all the changes that are being
ignored, and run this file on two build directories to see any remaining
differences.

Arguments:
  BUILD_OLD                Old build data directory, created on branch 'master'
  BUILD_NEW                New build data directory, created on branch 'build-pipeline-with-new-library'

Options:
  -c, --ignore-citations   Ignore differences in citation keys.
  -e, --ignore-events      Ignore _added_ events on papers.
  -n, --ignore-name-split  Ignore differences in first/last name splits.
  -o, --ignore-order       Ignore differences in the order of certain lists.
  -w, --ignore-whitespace  Ignore differences in whitespace.
  -A, --skip-authors       Don't diff authors (people).
  -P, --skip-papers        Don't diff papers.
  -h, --help               Display this helpful text.
"""
from deepdiff import DeepDiff
from deepdiff.operator import BaseOperator
from docopt import docopt
import logging as log
from pathlib import Path
import re
from rich import print
from rich.console import Console
from rich.logging import RichHandler
import types
import yaml
try:
from yaml import CSafeLoader as Loader
except ImportError:
from yaml import SafeLoader as Loader
# Matches a Markdown link "[text](url)" whose URL does not end in a slash. The
# capture group holds everything up to (but excluding) the closing parenthesis,
# so a replacement can re-emit it followed by "/)".
MARKDOWN_LINK = re.compile(r"(\[.*\]\([^)]*[^\/])\)")
# Same idea for an HTML anchor: captures '<a href="...' up to (but excluding)
# the closing '">' when the URL does not end in a slash.
HTML_LINK = re.compile(r'(<a href="[^"]*[^\/])">')
# Feature toggles. All default to off; the script entry point switches them on
# according to the command-line flags of the same name.
IGNORE_CITATIONS = False
IGNORE_EVENTS = False
IGNORE_NAME_SPLIT = False
IGNORE_ORDER = False
IGNORE_WHITESPACE = False
def print_pretty(diff, filename, key):
    """Print a diff report line by line, each prefixed with where it came from.

    Args:
        diff: Diff result exposing a ``pretty()`` method that returns the
            human-readable report as a string.
        filename: Name of the YAML file the diff belongs to.
        key: Top-level key within that file whose values were compared.
    """
    # Imported locally so this function carries its own dependency; rich is
    # already required by this script.
    from rich.markup import escape

    for line in diff.pretty().splitlines():
        # The report text is data, not markup. Escaping it makes bracketed
        # content inside values (e.g. "[in French]") print literally instead
        # of being consumed as a Rich style tag or raising a markup error.
        print(f"[purple]{filename}[/]['{key}']: {escape(line)}")
class WhitespaceOperator(BaseOperator):
    """DeepDiff operator treating strings that differ only in spacing as equal.

    Leading/trailing whitespace is ignored, and runs of two or more spaces are
    collapsed to a single space (double spaces can happen due to a change in
    how single-author names are rendered).
    """

    def __init__(self):
        # Restrict this operator to comparisons between two strings.
        super().__init__(types=(str,))

    @staticmethod
    def _normalize(text):
        """Return *text* stripped, with runs of spaces collapsed to one."""
        return re.sub(r" {2,}", " ", text.strip())

    def give_up_diffing(self, level, diff_instance):
        """Return True when both strings are equal after normalization.

        A true return value tells DeepDiff to stop diffing this pair, i.e. to
        report no difference for it.
        """
        return self._normalize(level.t1) == self._normalize(level.t2)
class NoneVsEmptyStringOperator(BaseOperator):
    """DeepDiff operator that considers ``None`` and ``''`` to be equivalent."""

    def match(self, level):
        """Engage only when each side is either ``None`` or a string."""
        for side in (level.t1, level.t2):
            if side is not None and not isinstance(side, str):
                return None
        return True

    def give_up_diffing(self, level, diff_instance):
        """Return True (no difference) when both sides are empty or ``None``."""
        if level.t1 or level.t2:
            # At least one side carries real content; let DeepDiff compare.
            return None
        return True
class NameSplitOperator(BaseOperator):
    """DeepDiff operator ignoring differences in the first/last name split.

    Two name dictionaries count as equivalent when their "full" values and
    their (optional) "id" values agree, whatever their other keys contain.
    """

    def match(self, level):
        """Engage only for two dictionaries that both carry a "full" key."""
        old_name, new_name = level.t1, level.t2
        if not (isinstance(old_name, dict) and isinstance(new_name, dict)):
            return None
        if "full" in old_name and "full" in new_name:
            return True
        return None

    def give_up_diffing(self, level, diff_instance):
        """Return True (no difference) when full name and id both match."""
        old_name, new_name = level.t1, level.t2
        if old_name["full"] != new_name["full"]:
            return None
        if old_name.get("id", None) != new_name.get("id", None):
            return None
        return True
# Keyword arguments passed to every DeepDiff call in this script. The script
# entry point appends further custom operators to the list depending on the
# command-line flags.
DEEPDIFF_ARGS = {
"verbose_level": 2,
"custom_operators": [NoneVsEmptyStringOperator()],
}
def filter_diff(diff):
    """Remove whitespace-only changes from a diff result, in place.

    Only acts when the module-level ``IGNORE_WHITESPACE`` switch is on. An
    entry of ``diff["values_changed"]`` is dropped when its new value equals
    its old value with leading/trailing whitespace stripped.

    Args:
        diff: Mapping shaped like a DeepDiff result, i.e. optionally holding a
            "values_changed" mapping from path to a dict with "old_value" and
            "new_value".

    Returns:
        The same *diff* object, possibly with entries removed.
    """
    # [diff] ignore changes in whitespace
    if not IGNORE_WHITESPACE:
        return diff

    changed = diff.get("values_changed")
    if not changed:
        return diff

    whitespace_only = [
        root
        for root, change in changed.items()
        # Non-string values (numbers, None, ...) cannot differ "only in
        # whitespace"; skipping them also avoids calling .strip() on them.
        if isinstance(change["old_value"], str)
        and change["new_value"] == change["old_value"].strip()
    ]
    for root in whitespace_only:
        del changed[root]

    # Don't leave an empty section behind, otherwise the diff stays truthy
    # although nothing reportable remains in it.
    if not changed:
        del diff["values_changed"]
    return diff
def _harmonize_paper(key, value_old, value_new, papers_old):
    """Rewrite one old/new paper entry in place so expected differences vanish.

    Args:
        key: Top-level key of the entry within its YAML file.
        value_old: Entry from the old build; mutated in place.
        value_new: Entry from the new build; only mutated when citations are
            being ignored.
        papers_old: Complete contents of the old file, used to peek at the
            entry following a frontmatter entry.
    """
    # [improvement] URLs don't have the final slash in the old version
    if "url" in value_old and "aclanthology.org" in value_old["url"]:
        value_old["url"] += "/"

    if IGNORE_CITATIONS:
        # pop() rather than del: an entry without citation keys must not
        # abort the whole run with a KeyError.
        for entry in (value_old, value_new):
            entry.pop("citation", None)
            entry.pop("citation_acl", None)
    else:
        if "citation" in value_old:
            # Add the trailing slash to URLs inside Markdown links, and
            # turn newlines into spaces.
            value_old["citation"] = MARKDOWN_LINK.sub(r"\1/)", value_old["citation"])
            value_old["citation"] = value_old["citation"].replace("\n", " ")
        if "citation_acl" in value_old:
            # Add the trailing slash to URLs inside HTML anchors.
            value_old["citation_acl"] = HTML_LINK.sub(
                r'\1/">', value_old["citation_acl"]
            )
            # NOTE(review): presumably the old pipeline emitted a stray
            # ", edition." that the new one drops -- confirm.
            value_old["citation_acl"] = value_old["citation_acl"].replace(
                ", edition.", "."
            )

    # [improvement] some empty tags are no longer serialized
    for tag in ("address", "language", "publisher"):
        if tag in value_old and not value_old[tag]:
            del value_old[tag]

    # [change] "mrf" is currently not handled
    value_old.pop("mrf", None)

    # [improvement] if "pdf" tag is missing, "thumbnail" tag is no longer serialized
    if "pdf" not in value_old:
        value_old.pop("thumbnail", None)

    # [improvement] attachment types are now capitalized
    for attachment in value_old.get("attachment", []):
        attachment["type"] = attachment["type"].capitalize()

    # [improvement] superfluous "journal-title" and "booktitle_html" keys no longer exist
    value_old.pop("journal-title", None)
    value_old.pop("booktitle_html", None)

    # [improvement] single-page articles did not previously serialize page_first + page_last
    # [improvement/change] single-pages articles output "42" instead of "42-42" in pages
    # (the differing defaults 0 and 1 make the test fail when both are absent)
    if value_new.get("page_first", 0) == value_new.get("page_last", 1):
        value_old["page_first"] = value_new["page_first"]
        value_old["page_last"] = value_new["page_last"]
        value_old["pages"] = value_old["page_first"]

    # [improvement] frontmatter of journals now has bibkey "book" instead of "proceedings"
    if value_old.get("bibtype", "") == "proceedings" and key.endswith("0"):
        # we can't directly _see_ if it's a journal here, so we peek at the next item...
        if papers_old.get(key[:-1] + "1", {}).get("bibtype", "") == "article":
            value_old["bibtype"] = "book"

    # [diff] more events are explicitly connected with papers
    if IGNORE_EVENTS and (events := value_new.get("events", [])):
        if "events" not in value_old:
            value_old["events"] = value_new["events"]
        else:
            for event in events:
                if event not in value_old["events"]:
                    value_old["events"].append(event)


def diff_papers_yaml(build_old, build_new):
    """Diff every ``papers/*.yaml`` file of the old build against the new build.

    Each entry of the old file is first harmonized to remove expected
    differences; whatever still differs is printed.

    Args:
        build_old: Path of the old build data directory.
        build_new: Path of the new build data directory.

    Raises:
        AssertionError: If a file's top-level keys differ between the builds.
    """

    # [diff] We ignore the order in which attachments are listed
    def ignore_order_func(level):
        return IGNORE_ORDER and "'attachment'" in level.path()

    for yamlfile in build_old.glob("./papers/*.yaml"):
        log.info(f"Diffing papers/{yamlfile.name}")
        with open(yamlfile, "r") as f:
            a = yaml.load(f, Loader=Loader)
        with open(build_new / "papers" / yamlfile.name, "r") as f:
            b = yaml.load(f, Loader=Loader)

        # The message needs the f-prefix, otherwise the placeholder is
        # printed verbatim instead of the file name.
        assert list(a.keys()) == list(
            b.keys()
        ), f"{yamlfile.name} should have identical keys"

        for key, value_old in a.items():
            value_new = b[key]
            _harmonize_paper(key, value_old, value_new, a)
            if diff := DeepDiff(
                value_old, value_new, ignore_order_func=ignore_order_func, **DEEPDIFF_ARGS
            ):
                print_pretty(diff, yamlfile.name, key)
def diff_people_yaml(build_old, build_new):
    """Diff every ``people/*.yaml`` file of the old build against the new build.

    Keys present in only one of the two files are reported as removed/added;
    keys present in both have their values diffed and differences printed.

    Args:
        build_old: Path of the old build data directory.
        build_new: Path of the new build data directory.
    """
    # [diff] We ignore the order of co-authors, venues, and paper; while they
    # should of course be ordered based on their counts or years, the order
    # within those groups often differs
    unordered_markers = (
        "'coauthors'",
        "'venues'",
        "'papers'",
        "'variant_entries'",
        "'similar'",
    )

    def ignore_order_func(level):
        return IGNORE_ORDER and any(
            marker in level.path() for marker in unordered_markers
        )

    for old_path in build_old.glob("./people/*.yaml"):
        log.info(f"Diffing people/{old_path.name}")
        with open(old_path, "r") as old_handle:
            people_old = yaml.load(old_handle, Loader=Loader)
        with open(build_new / "people" / old_path.name, "r") as new_handle:
            people_new = yaml.load(new_handle, Loader=Loader)

        if list(people_old) != list(people_new):
            for key in set(people_old) - set(people_new):
                print(f"[purple]{old_path.name}[/]: Key '{key}' removed from the file.")
            for key in set(people_new) - set(people_old):
                print(f"[purple]{old_path.name}[/]: Key '{key}' added to the file.")

        for key, value_old in people_old.items():
            if key in people_new:
                report = DeepDiff(
                    value_old,
                    people_new[key],
                    ignore_order_func=ignore_order_func,
                    **DEEPDIFF_ARGS,
                )
                if report:
                    print_pretty(report, old_path.name, key)

        # Drop the references to the loaded data before the next file is read.
        del people_old, people_new
def diff_volumes_yaml(build_old, build_new):
    """Diff ``volumes.yaml`` of the old build against the new build.

    Each old entry is first harmonized to remove expected differences;
    whatever still differs is printed.

    Args:
        build_old: Path of the old build data directory.
        build_new: Path of the new build data directory.

    Raises:
        AssertionError: If the top-level keys differ between the two files.
    """

    # [diff] We ignore the order of SIGs
    # (defined once up front; it does not depend on the entry being diffed)
    def ignore_order_func(level):
        return IGNORE_ORDER and "'sigs'" in level.path()

    with open(build_old / "volumes.yaml", "r") as f:
        a = yaml.load(f, Loader=Loader)
    with open(build_new / "volumes.yaml", "r") as f:
        b = yaml.load(f, Loader=Loader)

    assert list(a.keys()) == list(b.keys()), "volumes.yaml should have identical keys"

    for key, value_old in a.items():
        value_new = b[key]

        # [improvement] URLs don't have the final slash in the old version
        if "url" in value_old and "aclanthology.org" in value_old["url"]:
            value_old["url"] += "/"

        # [improvement] Single-value names had a space prepended in the old version
        for name in value_old.get("editor", []):
            if name["first"] == "":
                name["first"] = None
                # Only strip what is really a leading space, so a name
                # without one does not lose its first character.
                if name["full"].startswith(" "):
                    name["full"] = name["full"][1:]

        # [improvement] superfluous "venue" and "journal-title" keys no longer exist
        value_old.pop("venue", None)
        value_old.pop("journal-title", None)

        # [improvement] empty "address" and "publisher" tags are no longer serialized
        for tag in ("address", "publisher"):
            if tag in value_old and not value_old[tag]:
                del value_old[tag]

        # [improvement] The new pipeline lists more events than before
        # .get()/.setdefault() keep an entry without an "events" key on
        # either side from aborting the run with a KeyError.
        new_events = value_new.get("events", [])
        if new_events:
            old_events = value_old.setdefault("events", [])
            for event in new_events:
                if event not in old_events:
                    old_events.append(event)

        # [improvement] The new pipeline lists meta_issue and meta_volume
        for meta_key in ("meta_issue", "meta_volume"):
            if meta_key in value_new:
                value_old[meta_key] = value_new[meta_key]

        if diff := DeepDiff(
            value_old, value_new, ignore_order_func=ignore_order_func, **DEEPDIFF_ARGS
        ):
            print_pretty(diff, "volumes.yaml", key)
def diff_venues_yaml(build_old, build_new):
    """Diff ``venues.yaml`` of the old build against the new build.

    Args:
        build_old: Path of the old build data directory.
        build_new: Path of the new build data directory.

    Raises:
        AssertionError: If the top-level keys differ between the two files.
    """

    # [diff] We ignore the order of volumes within the years
    # (but there should only be minor differences)
    def ignore_order_func(level):
        if not IGNORE_ORDER:
            return IGNORE_ORDER
        return "volumes_by_year']['" in level.path()

    loaded = []
    for build_dir in (build_old, build_new):
        with open(build_dir / "venues.yaml", "r") as handle:
            loaded.append(yaml.load(handle, Loader=Loader))
    venues_old, venues_new = loaded

    assert list(venues_old.keys()) == list(
        venues_new.keys()
    ), "venues.yaml should have identical keys"

    for key, value_old in venues_old.items():
        report = DeepDiff(
            value_old,
            venues_new[key],
            ignore_order_func=ignore_order_func,
            **DEEPDIFF_ARGS,
        )
        if report:
            print_pretty(report, "venues.yaml", key)
def diff_events_yaml(build_old, build_new):
    """Diff ``events.yaml`` of the old build against the new build.

    Args:
        build_old: Path of the old build data directory.
        build_new: Path of the new build data directory.

    Raises:
        AssertionError: If the top-level keys differ between the two files.
    """
    with open(build_old / "events.yaml", "r") as old_handle:
        events_old = yaml.load(old_handle, Loader=Loader)
    with open(build_new / "events.yaml", "r") as new_handle:
        events_new = yaml.load(new_handle, Loader=Loader)

    keys_match = list(events_old.keys()) == list(events_new.keys())
    assert keys_match, "events.yaml should have identical keys"

    # [diff] We ignore the order of volumes
    # (but there should only be minor differences)
    def ignore_order_func(level):
        return IGNORE_ORDER and "'volumes'" in level.path()

    compare_options = dict(DEEPDIFF_ARGS, ignore_order_func=ignore_order_func)
    for key in events_old:
        value_old = events_old[key]
        value_new = events_new[key]
        report = DeepDiff(value_old, value_new, **compare_options)
        if report:
            print_pretty(report, "events.yaml", key)
def diff_sigs_yaml(build_old, build_new):
    """Diff ``sigs.yaml`` of the old build against the new build.

    Each old entry is first harmonized to remove expected differences;
    whatever still differs is printed.

    Args:
        build_old: Path of the old build data directory.
        build_new: Path of the new build data directory.

    Raises:
        AssertionError: If the top-level keys differ between the two files.
    """
    with open(build_old / "sigs.yaml", "r") as f:
        a = yaml.load(f, Loader=Loader)
    with open(build_new / "sigs.yaml", "r") as f:
        b = yaml.load(f, Loader=Loader)

    assert list(a.keys()) == list(b.keys()), "sigs.yaml should have identical keys"

    for key, value_old in a.items():
        value_new = b[key]

        # [diff] Year keys are serialized as strings now
        # The old entry's own keys are converted, not looked up via the new
        # entry's keys: a year missing on one side then shows up as a
        # reported difference instead of a KeyError, and a key that is
        # already of the right type is never deleted by accident.
        if "volumes_by_year" in value_old:
            value_old["volumes_by_year"] = {
                str(year): volumes
                for year, volumes in value_old["volumes_by_year"].items()
            }

        # [diff] url: None is omitted
        if "url" in value_old and not value_old["url"]:
            del value_old["url"]
        for volumes in value_old.get("volumes_by_year", {}).values():
            for volume in volumes:
                if isinstance(volume, dict) and "url" in volume and not volume["url"]:
                    del volume["url"]

        if diff := DeepDiff(value_old, value_new, **DEEPDIFF_ARGS):
            print_pretty(diff, filename="sigs.yaml", key=key)
if __name__ == "__main__":
    # Script entry point: parse the command line, configure the module-level
    # switches, then run the individual diffs.
    args = docopt(__doc__)
    log.basicConfig(
        format="%(message)s",
        level=log.INFO,
        # Log to stderr so that the diff output on stdout stays clean.
        handlers=[RichHandler(console=Console(stderr=True))],
    )

    # These assignments happen at module level, so they rebind the globals
    # that the diff functions read.
    if args["--ignore-citations"]:
        IGNORE_CITATIONS = True
    if args["--ignore-events"]:
        IGNORE_EVENTS = True
    if args["--ignore-order"]:
        IGNORE_ORDER = True
    if args["--ignore-whitespace"]:
        IGNORE_WHITESPACE = True
        DEEPDIFF_ARGS["custom_operators"].append(WhitespaceOperator())
    if args["--ignore-name-split"]:
        IGNORE_NAME_SPLIT = True
        DEEPDIFF_ARGS["custom_operators"].append(NameSplitOperator())

    build_old = Path(args["BUILD_OLD"])
    build_new = Path(args["BUILD_NEW"])
    # Validate user input explicitly: assert statements are stripped under
    # "python -O" and would not tell the user which argument is wrong.
    for label, build_dir in (("BUILD_OLD", build_old), ("BUILD_NEW", build_new)):
        if not build_dir.is_dir():
            raise SystemExit(f"{label} is not a directory: {build_dir}")

    if not args["--skip-papers"]:
        diff_papers_yaml(build_old, build_new)
    if not args["--skip-authors"]:
        diff_people_yaml(build_old, build_new)

    log.info("Diffing events.yaml")
    diff_events_yaml(build_old, build_new)
    log.info("Diffing sigs.yaml")
    diff_sigs_yaml(build_old, build_new)
    log.info("Diffing venues.yaml")
    diff_venues_yaml(build_old, build_new)
    log.info("Diffing volumes.yaml")
    diff_volumes_yaml(build_old, build_new)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment