#!/usr/bin/env python3
# Author: Alexandre Défossez, 2020
# This is free and unencumbered software released into the public domain.
# For more information, please refer to <http://unlicense.org/>
"""
Merge multiple bibfiles, remove duplicates and unused references, matching bibtex entries
based on the 'title' field. Rewrite all the .tex files in the current directory
to reflect the elimination of duplicates.
Finally, this will rewrite all the arXiv references to use the @unpublished category.

To use this, go to the main() function where everything is hardcoded and update it
to suit your case :D

IMPORTANT: Make a copy of your files before running this, as this script will
overwrite all the .tex files and I can't guarantee it is bug proof.
"""

import glob
import re
import sys


def colorize(text, color):
    """
    Return `text` wrapped in an ANSI color code sequence, with the given color.
    `color` should be a string, see the following link for a reference:
    https://stackoverflow.com/questions/4842424/list-of-ansi-color-escape-sequences
    """
    code = f"\033[{color}m"
    restore = "\033[0m"
    return "".join([code, text, restore])


def bold(text):
    """
    Wrap text so as to display it in bold in the terminal.
    """
    return colorize(text, "1")


def fatal(in_bold, *args):
    print(bold(in_bold), *args, file=sys.stderr)
    sys.exit(1)


def consumer(buf):
    """
    Returns a `consume` function over the given string `buf`.
    The returned function `consume` can be repeatedly called with
    a single regex pattern as argument. It will match the pattern
    against the current value of `buf` and, if it gets a match, will remove
    the matched prefix from `buf` and return the match. Otherwise, it returns
    `None` and does not change `buf`.
    """
    def consume(pattern):
        nonlocal buf
        match = re.match(pattern, buf, re.MULTILINE | re.DOTALL)
        if match is None:
            return None
        buf = buf[match.end():]
        return match
    return consume
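
# A rough illustration of how `consume` behaves (made-up input, not from any real bib file):
#   consume = consumer("@article{smith2020,")
#   consume(r"\s*@(\w+)").group(1)   # -> "article"; "@article" is stripped from the buffer
#   consume(r"\s*@(\w+)")            # -> None; the buffer now starts with "{smith2020,"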


def consume_block(consume):
    """
    Consume a block of the form `content}` from a bibtex consumer.
    Inner blocks like `a {inner} b}` are also matched and added
    verbatim to the output.
    Note that this function will not match the opening `{`.
    """
    content = ""
    while True:
        match = consume("([^{}]*)([{}])")
        if not match:
            fatal("Missing }", consume(r".{,50}").group(0))
        content += match.group(1)
        if match.group(2) == "{":
            sub = consume_block(consume)
            content += "{" + sub + "}"
        else:
            return content
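
# For instance (made-up input), assuming the opening "{" has already been consumed:
#   consume = consumer("Deep {Learning} rocks} and some trailing text")
#   consume_block(consume)   # -> "Deep {Learning} rocks", leaving " and some trailing text" behind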


def parse_bib(bib):
    """
    Parse bibtex entries from the string `bib`.
    Return a list of dicts, with the following keys:
    - kind: the lowercased entry type, i.e. what comes after the @.
    - alias: the name of the bibtex entry.
    - title, author, year, etc.: any field in the bibtex entry is also added.
    Comments are only supported at the beginning of a line and are
    added as a special entry of kind "comment", with a key "content"
    containing the value of the comment without the initial '%'.
    Comments can also start with '#'.
    """
    content = []
    consume = consumer(bib)
    while bib:
        match = consume(r"\s*^\s*[%#]([^\n]*)$")
        if match:
            content.append({"kind": "comment", "content": match.group(1)})
            continue
        match = consume(r"\s*@(\w+)\{\s*([^\s,]+)")
        if not match:
            break
        entry = {
            "alias": match.group(2),
            "kind": match.group(1).lower(),
        }
        content.append(entry)
        while True:
            match = consume(r"\s*(?:,?\s*(})|,\s*(\w+)\s*=\s*({?))")
            if not match:
                fatal("Parsing error, next chars in buffer:",
                      consume(r".{,50}").group(0))
                sys.exit(1)
            if match.group(1) == "}":
                break
            key = match.group(2)
            if match.group(3) == "{":
                value = "{" + consume_block(consume) + "}"
            else:
                # The value is not surrounded by {}.
                match = consume(r"\s*(\w+)")
                if not match:
                    fatal("Missing value for", entry['alias'], key)
                value = match.group(1)
            entry[key.lower()] = value
    consume(r"\s*")
    remaining = consume(r".{,50}").group(0)
    if remaining:
        fatal("Parsing error, unparsed leftovers:", remaining)
    return content
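
# A small made-up example of the structure parse_bib returns:
#   parse_bib("@article{smith2020,\n  title={A Title},\n  year=2020,\n}")
#   # -> [{'alias': 'smith2020', 'kind': 'article', 'title': '{A Title}', 'year': '2020'}]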


def find_used(root):
    """
    Search all .tex files under `root` for citations made with cite, citet and citep.
    Multiple comma separated citations are supported.
    """
    texs = glob.glob(root + '/**/*.tex', recursive=True)
    out = set()
    for tex in texs:
        buf = open(tex).read()
        for match in re.finditer(r'\\cite[tp]?{([^}]+)}', buf):
            cites = [i.strip() for i in match.group(1).split(",")]
            out |= set(cites)
    return out
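
# For example, with a single made-up .tex file containing
#   We build on \citet{smith2020} and \cite{doe2019, lee2021}.
# find_used(".") would return the set {"smith2020", "doe2019", "lee2021"}.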


def replace(root, replacements):
    """
    Parse all the .tex files under the `root` folder, search for cite/citet/citep
    commands and apply the replacement rules from `replacements`.
    `replacements` should be a dict `old_name: new_name`.
    The .tex files are modified in place.
    """
    texs = glob.glob(root + '/**/*.tex', recursive=True)
    for tex in texs:
        buf = open(tex).read()
        processed = []
        while buf:
            match = re.search(r'\\(cite[tp]?){([^}]+)}', buf)
            if not match:
                break
            processed.append(buf[:match.start()])
            buf = buf[match.end():]
            command = match.group(1)
            cites = match.group(2).split(",")
            ncites = []
            for cite in cites:
                # Tolerate whitespace around aliases, e.g. `\cite{a, b}`.
                if cite.strip() in replacements:
                    cite = replacements[cite.strip()]
                ncites.append(cite)
            o = "\\" + command + "{" + ",".join(ncites) + "}"
            processed.append(o)
        processed.append(buf)
        open(tex, "w").write("".join(processed))


def remove_unused(entries, used):
    removed = []
    for entry in list(entries):
        # Only remove entries with a title, the rest could be comments
        # or other weird bibtex.
        if 'title' in entry and entry['alias'] not in used:
            entries.remove(entry)
            removed.append(entry['alias'])
    return removed


def normalize(title):
    return title.strip("{}").strip().lower()


def is_same(a, b):
    return normalize(a['title']) == normalize(b['title'])


def find_duplicates(entries, used):
    replacements = {}
    for entry in list(entries):
        if 'title' not in entry:
            continue
        if entry['alias'] not in used:
            continue
        for candidate in entries:
            if 'title' not in candidate:
                continue
            if is_same(candidate, entry):
                break
        if candidate is not entry:
            replacements[entry['alias']] = candidate['alias']
            entries.remove(entry)
            used.add(candidate['alias'])
    return replacements


def dumps_bib(entries):
    """
    Format the given entries to bibtex.
    """
    out = []
    for entry in entries:
        if entry['kind'] == 'comment':
            out.append('%' + entry['content'] + '\n\n')
            continue
        entry = dict(entry)
        kind = entry.pop('kind')
        alias = entry.pop('alias')
        o = f'@{kind}{{{alias},\n'
        for k, v in entry.items():
            o += f'\t{k}={v},\n'
        o += '}\n\n'
        out.append(o)
    return "".join(out)


def replace_arxiv(entries):
    for entry in entries:
        if 'title' not in entry:
            continue
        number = None
        if entry['kind'] == 'techreport' and entry.get('institution') == '{arXiv}':
            number = entry['number'].strip('{}')
            del entry['institution']
        elif entry['kind'] == 'article' and 'journal' in entry:
            match = re.match(r'{*(?:arXiv )?preprint arXiv:([\d.]+)}*$', entry['journal'])
            if match is not None:
                del entry['journal']
                number = match.group(1)
        if number is not None:
            entry['kind'] = 'unpublished'
            entry['note'] = '{Preprint on arXiv:' + number + '}'
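
# For instance, a made-up entry parsed as an 'article' with
#   journal = '{arXiv preprint arXiv:1912.01652}'
# loses its 'journal' field and becomes kind 'unpublished' with
#   note = '{Preprint on arXiv:1912.01652}'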


def main():
    # Add all the bib files you ever used.
    # Order is important: in case of duplicates, the first alias whose title matches
    # will be kept as the final one.
    bibfiles = ["ref_optim.bib", "adam/references.bib", "adabatch/references.bib",
                "ref_audio.bib", "sing/references.bib", "demucs/references.bib"]
    refs = []
    for bibfile in bibfiles:
        entries = parse_bib(open(bibfile).read())
        entries.insert(0, {"kind": "comment", "content": "%" * 79})
        entries.insert(0, {
            "kind": "comment",
            "content": " The following references were extracted from " + bibfile})
        entries.insert(0, {"kind": "comment", "content": "%" * 79})
        for entry in entries:
            entry['_source'] = bibfile
        refs += entries

    # Find all used references in the current folder and its sub directories.
    used = find_used(".")
    print(bold("Used references:"), list(used))

    # Set up replacement rules for duplicates, also updating the used aliases.
    replacements = find_duplicates(refs, used)
    print(bold("Duplicates found:"))
    for old, new in replacements.items():
        print(old, "->", new)

    # Remove all unused entries.
    removed = remove_unused(refs, used)
    print(bold("Removed references:"), removed)

    # Apply replacement rules to all the cite/citet/citep in the current folder.
    # This will overwrite the .tex files, so please save a copy before use.
    replace(".", replacements)

    # Replace all arXiv references to use the @unpublished kind, with a note.
    replace_arxiv(refs)

    # Split back into multiple files based on whatever rule you hardcode.
    refs_optim = []
    refs_audio = []
    for ref in refs:
        source = ref.pop('_source').split('.')[0].split('/')[0]
        if source in ["ref_optim", "adam", "adabatch"]:
            refs_optim.append(ref)
        else:
            refs_audio.append(ref)
    open("clean_audio.bib", "w").write(dumps_bib(refs_audio))
    open("clean_optim.bib", "w").write(dumps_bib(refs_optim))


if __name__ == "__main__":
    main()