Skip to content

Instantly share code, notes, and snippets.

@AnirudhDagar
Last active November 8, 2021 19:21
Show Gist options
  • Save AnirudhDagar/9c6e236af5db02d5ba07b38d260d3d06 to your computer and use it in GitHub Desktop.
Save AnirudhDagar/9c6e236af5db02d5ba07b38d260d3d06 to your computer and use it in GitHub Desktop.
##########################################
# Requires the following modules:
# d2lbook, bibtexparser, pybtex
##########################################
import re
from d2lbook import slides, notebook, markdown
from collections import defaultdict
import nbformat
import os
import glob
import bibtexparser
import logging
import subprocess
class PrepareNotebooks():
    """Discover chapter notebooks and derive their book order and numbering.

    Every path is resolved relative to the current working directory, which
    must contain the root ``index.ipynb`` and one directory per chapter.
    """

    def __init__(self):
        # Chapters listed here are numbered "0" instead of taking the next
        # chapter number.
        self.unnumbered_chapters = ("chapter_preface", "chapter_installation",
                                    "chapter_notation", "chapter_references")

    def find_files(self):
        """
        glob and collect all ".ipynb" files that live directly in a chapter
        directory.

        Only paths of the form ``<chapter_dir>/<notebook>.ipynb`` whose path
        contains the text "chapter" are collected. Notebooks in the root
        directory are filtered out; paths with a different depth are skipped
        with a warning instead of aborting the scan.

        Returns
        -------
        chapter_fnames : list
            relative paths (as returned by glob) of the collected notebooks.
        chapter_files_dict : dict
            default dictionary containing keys as chapter names and
            values as lists of notebook file names (in glob order).
            Example: {
                "chapter_linear_networks": ["image-classification-dataset.ipynb", "index.ipynb", ...],
                "chapter_preliminaries": [...],
                ...
            }
        """
        chapter_fnames = []
        # defaultdict creates the list the first time a chapter is seen.
        chapter_files_dict = defaultdict(list)
        for full_name in glob.glob('**/*.ipynb', recursive=True):
            if not os.path.isfile(full_name) or "chapter" not in full_name:
                # Filters out getting_started.ipynb, the root index.ipynb etc.
                continue
            # glob returns OS-native separators; normalize before splitting
            # so the chapter/file split also works where os.sep is not "/".
            parts = full_name.replace(os.sep, "/").split("/")
            if len(parts) != 2:
                logging.warning(
                    "Skipping %s: expected <chapter>/<notebook>.ipynb",
                    full_name)
                continue
            chapter_name, file_name = parts
            chapter_fnames.append(full_name)
            chapter_files_dict[chapter_name].append(file_name)
        return chapter_fnames, chapter_files_dict

    @staticmethod
    def _chapter_from_toc_line(line):
        """Return the chapter directory named in a ``[chapter/...]`` link.

        Returns ``None`` when the text before the first "]" holds no "[",
        i.e. the line mentions "chapter" but is not a TOC link.
        """
        link_text = line.split("]")[0]
        if "[" not in link_text:
            return None
        return link_text.split("[")[1].split("/")[0]

    def get_per_chapter_order(self):
        """
        Order chapters and their notebooks the way the book lists them.

        Chapter order is taken from the first cell of the root
        ``index.ipynb``. Notebook order inside a chapter is taken from the
        links found between ":begin_tab:toc" and ":end_tab:" in the first
        cell of that chapter's ``index.ipynb``.

        Returns
        -------
        ordered_ch_dict : dict
            default dictionary containing keys as chapter names in ORDER
            OF THEIR SEQUENCE in the book and the values as lists of
            notebook names also in the ORDER OF THEIR SEQUENCE in the book.
            Example: {
                "chapter_preface": ['index.ipynb'],
                "chapter_installation": ['index.ipynb'],
                "chapter_preliminaries": ['index.ipynb', 'ndarray.ipynb', 'pandas.ipynb', ...],
                "chapter_linear-networks": ['index.ipynb', 'linear-regression.ipynb', ...],
                ...
            }
        chapter_files_dict : dict
            the unordered mapping returned by ``find_files``.
        """
        ordered_ch_dict = defaultdict(list)
        # Get all notebooks
        _, chapter_files_dict = self.find_files()
        # Parse the root 'index.ipynb' notebook for chapter ordering.
        root_nb = notebook.read("index.ipynb")
        for line in root_nb.cells[0].source.split('\n'):
            if "chapter" not in line:
                continue
            ch_name = self._chapter_from_toc_line(line)
            if ch_name is None:
                # Prose that merely mentions "chapter"; not a TOC entry.
                continue
            if len(chapter_files_dict[ch_name]) < 2:
                # Only a single notebook present and no other sections in
                # this chapter, so there is no order to look up.
                # Eg: 'chapter_preface', 'chapter_introduction',
                #     'chapter_installation', 'chapter_references' etc.
                ordered_ch_dict[ch_name] = chapter_files_dict[ch_name]
                continue
            ch_index_name = ch_name + "/" + "index.ipynb"
            if not os.path.isfile(ch_index_name):
                # Without a chapter index there is no TOC to read; fall back
                # to a deterministic order rather than failing on the read.
                logging.warning(
                    "%s not found; using sorted file names for %s",
                    ch_index_name, ch_name)
                ordered_ch_dict[ch_name] = sorted(chapter_files_dict[ch_name])
                continue
            ch_index_nb = notebook.read(ch_index_name)
            ordered_ch_dict[ch_name] = ["index.ipynb"]
            # Use ":begin_tab:toc" and ":end_tab:" as markers to
            # start and end the search for section ordering.
            toc = False
            for toc_line in ch_index_nb.cells[0].source.split('\n'):
                if ":begin_tab:toc" in toc_line:
                    toc = True
                    continue
                if ":end_tab:" in toc_line:
                    break
                # Blank or non-link lines inside the TOC carry no file name.
                if toc and "(" in toc_line:
                    file_name = toc_line.split("(")[1].replace(")", "").strip()
                    ordered_ch_dict[ch_name].append(file_name)
        return ordered_ch_dict, chapter_files_dict

    def validate_and_generate_numbering(self):
        """
        Validate that all notebooks are covered.
        Also add chapter numbers and subsection numbers to ordered_ch_dict.

        Chapters in ``self.unnumbered_chapters`` get number 0 and do not
        consume a chapter number, wherever they appear in the sequence.

        Returns
        -------
        ordered_ch_dict_with_num : dict
            maps each chapter name (in book order) to a tuple
            ``(chapter_number, [(section_number, notebook_name), ...])``,
            where both numbers are strings and ``section_number`` is
            ``"<chapter>.<position>"`` with position starting at 0.

        Raises
        ------
        AssertionError
            if an ordered, non-empty chapter lists a different number of
            notebooks than were found on disk.
        """
        ordered_ch_dict, chapter_files_dict = self.get_per_chapter_order()
        ordered_ch_dict_with_num = {}
        next_number = 1
        for key, value in ordered_ch_dict.items():
            if key in self.unnumbered_chapters:
                idx = 0  # Assign zero numbering to unnumbered chapters
            else:
                idx = next_number
                next_number += 1
            print(idx, key, value)
            ordered_ch_dict_with_num[key] = (
                str(idx),
                [(str(idx) + "." + str(sub_idx), nb_name)
                 for sub_idx, nb_name in enumerate(value)],
            )
            # Raised explicitly so the check survives `python -O`.
            if value and len(chapter_files_dict[key]) != len(value):
                raise AssertionError(
                    f"{key}: {len(value)} notebooks listed in the table of "
                    f"contents but {len(chapter_files_dict[key])} found on disk")
        return ordered_ch_dict_with_num
# Stage 1: discover the notebooks and number them in book order. The
# resulting `ordered_ch_dict_with_num` is read by the later stages.
print("*" * 40, "Preparing Notebooks...\n", sep="\n")
get_notebooks = PrepareNotebooks()
ordered_ch_dict_with_num = get_notebooks.validate_and_generate_numbering()
print(f"\n {'*' * 40} \n")
print("Done ")
######################################################################################################
################################# PARSE AND REPLACE CONTENT ##########################################
######################################################################################################
print("\n","*"*40,"\n")
print("Starting with Reading Notebooks...")
print("\n","*"*40,"\n")
# Our special mark in markdown, e.g. :label:`chapter_intro`
# Group 1 is the mark name (label, eqlabel, width, ...); optional group 2 is
# the backtick-quoted argument, backticks included.
# Written as a raw string so the regex escapes are not parsed as (invalid)
# string escapes; the compiled pattern is unchanged.
md_mark_pattern = re.compile(r':([-\/\._\w]+):(`[\ \*-\/\\._\w]+`)?')
# Chapters whose headings are left without a section-number prefix.
skip_num_chapters = ["chapter_preface", "chapter_notation", "chapter_installation"]
class TableUpdater():
    """Build numbered markdown links and HTML anchors for one notebook.

    Each ``eq``/``fig``/``table``/``header``/``sec`` method records the
    replacement text for one label in the lookup table passed in (the table
    is mutated and also returned) and returns the id or number to use for
    the element inside the notebook.

    Numbers are ``"<subsec_num>.<count>"``. A ".0." inside a number (an
    element that lives in a chapter's index notebook, whose ``subsec_num``
    ends in ".0") is collapsed to ".", e.g. "1.0.3" -> "1.3".
    """

    def __init__(self, subsec_num, ch_name, subsec_name):
        self.ch_name = ch_name
        self.subsec_name = subsec_name
        self.subsec_num = subsec_num
        # Link target of this notebook as seen from a sibling chapter dir.
        self.path = "../" + ch_name + "/" + subsec_name

    def eq(self, eq_table, eq_key, per_subsec_eq_count):
        """Register a labeled equation; return ``(eq_table, eq_id)``.

        The returned ``eq_id`` is the anchor the link in ``eq_table`` points
        at, so it is normalized the same way as the link itself.
        """
        eq_id_num = (str(self.subsec_num) + "." + str(per_subsec_eq_count)).replace(".0.", ".")
        eq_id = "eq" + eq_id_num
        eq_table[eq_key] = "[(" + eq_id_num + f")]({self.path}#{eq_id})"
        return eq_table, eq_id

    def fig(self, fig_table, fig_key, per_subsec_fig_count):
        """Register a labeled figure; return ``(fig_table, fig_id)``."""
        raw_num = str(self.subsec_num) + "." + str(per_subsec_fig_count)
        # Special handle some sections: every preface figure is shown as
        # "Fig. 1" and keeps its un-normalized id.
        if self.ch_name == "chapter_preface":
            fig_id = "fig" + raw_num
            fig_table[fig_key] = "[Fig. 1" + f"]({self.path}#{fig_id})"
            return fig_table, fig_id
        fig_id_num = raw_num.replace(".0.", ".")
        fig_id = "fig" + fig_id_num
        fig_table[fig_key] = "[Fig. " + fig_id_num + f"]({self.path}#{fig_id})"
        return fig_table, fig_id

    def table(self, table_table, table_key, per_subsec_table_count):
        """Register a labeled table; return ``(table_table, table_id)``."""
        table_id_num = (str(self.subsec_num) + "." + str(per_subsec_table_count)).replace(".0.", ".")
        table_id = "table" + table_id_num
        # Special handle some sections: every preface table is "Table. 1".
        if self.ch_name == "chapter_preface":
            table_table[table_key] = "[Table. 1" + f"]({self.path}#{table_id})"
            return table_table, table_id
        table_table[table_key] = "[Table. " + table_id_num + f"]({self.path}#{table_id})"
        return table_table, table_id

    def header(self, head_table, sec_key, ref_name):
        """Register a notebook's top-level heading label.

        Stores ``(numbered_link, titled_link)`` under ``sec_key``. The
        numbered link reads "Chapter N" for ``index.ipynb`` (N being the
        part of ``subsec_num`` before the first ".") and "Section
        <subsec_num>" for any other notebook.

        Returns ``(head_table, sec_id)``.
        """
        if "index.ipynb" == self.subsec_name:
            sec_id = self.subsec_num.split(".")[0]
            replacement_name = "[Chapter " + sec_id + f"]({self.path})"
        else:
            sec_id = self.subsec_num
            replacement_name = "[Section " + sec_id + f"]({self.path})"
        ref_name = "[" + ref_name + f"]({self.path})"
        head_table[sec_key] = (replacement_name, ref_name)
        return head_table, sec_id

    def sec(self, sec_table, sec_key, two_down, three_down, line_sec_name):
        """Register a labeled ``##``/``###`` heading.

        ``line_sec_name`` is the full heading line; its title is the text
        after the first "# ". The anchor is the title with spaces replaced
        by "-", prefixed with the section number unless the chapter is in
        the module-level ``skip_num_chapters``.

        Stores ``(numbered_link, titled_link)`` under ``sec_key`` and
        returns ``(sec_table, sec_id)``.
        """
        sec_id = self.get_sec_id(two_down, three_down, 0)
        title = line_sec_name.split("# ")[1]
        if self.ch_name in skip_num_chapters:
            link_id = '-'.join(title.split(" "))
        else:
            link_id = sec_id + "-" + '-'.join(title.split(" "))
        replacement_sec_name = "[Section " + sec_id + f"]({self.path}#{link_id})"
        ref_name = "[" + title + f"]({self.path}#{link_id})"
        sec_table[sec_key] = replacement_sec_name, ref_name
        return sec_table, sec_id

    def get_sec_id(self, two_down, three_down, four_down):
        """Join ``subsec_num`` with the heading counters, dropping ".0" parts."""
        sec_id = ".".join([self.subsec_num, str(two_down), str(three_down), str(four_down)])
        sec_id = sec_id.replace(".0", "")
        return sec_id

    def get_table_caption(self, lines, curr_line_num):
        """Find the nearest caption line at or above ``curr_line_num``.

        A caption line starts with ":" but not with ":label:". Returns
        ``(caption_without_leading_colon, distance_above_curr_line)``, or
        ``None`` (implicitly) when no such line exists. ``lines`` is not
        modified.
        """
        lines_of_interest = lines[:curr_line_num+1]
        lines_of_interest.reverse()
        for idx, line in enumerate(lines_of_interest):
            if line.startswith(":label:"):
                continue
            if line.startswith(":"):
                return line[1:], idx

    def element_center_formatting(self, id, num, caption):
        """Return a centered, italic HTML caption carrying anchor ``id``."""
        return f"<center id=\"{id}\"><i>" + num + " " + caption + "</i></center>"

    def element_eq_formatting(self, per_subsec_count, eq_id=None):
        """Return the right-floated HTML equation number.

        With ``eq_id`` the element also carries that id as an anchor and
        shows the id minus its "eq" prefix; otherwise the number is built
        from ``subsec_num`` and ``per_subsec_count``.
        """
        if eq_id:
            shown = eq_id[len("eq"):] if eq_id.startswith("eq") else eq_id
            return f"<i id=\"{eq_id}\" style=\"float: right\">" + "(" + shown + ")" + "</i><br/><br/>"
        # Same ".0." collapsing as labeled equations so numbering in an
        # index notebook is uniform.
        shown = (str(self.subsec_num) + "." + str(per_subsec_count)).replace(".0.", ".")
        return f"<i style=\"float: right\">" + "(" + shown + ")" + "</i><br/><br/>"
def generate_tables_and_replace_unlabeled(ordered_ch_dict_with_num):
    """Number every notebook in place and collect the label lookup tables.

    For each notebook listed in ``ordered_ch_dict_with_num`` (chapter name ->
    ``(chapter_number, [(section_number, notebook_name), ...])``) the
    markdown cells are scanned line by line to:

    * append a number to unlabeled ``$$...$$`` equations,
    * prefix unlabeled headings with their section number,
    * replace ``:eqlabel:`` / ``:label:`` marks with numbered anchors or
      captions and record the replacement link in the matching table,
    * blank out ``:width:`` marks.

    Non-markdown cells are kept as they are. Each notebook is overwritten
    at ``<cwd>/<chapter>/<notebook>``. Relies on the module-level
    ``md_mark_pattern``, ``skip_num_chapters`` and ``TableUpdater``.

    Returns
    -------
    tuple
        ``(fig_table, eq_table, sec_table, head_table, table_table)``, each
        keyed by the backtick-quoted label.
    """
    eq_match = re.compile(r"\$\$(.*?)\$\$")
    sec_match = re.compile(r"\#\s(\D*?)$")
    fig_table, table_table, eq_table, head_table, sec_table = {}, {}, {}, {}, {}
    for ch_name, value in ordered_ch_dict_with_num.items():
        for subsec_num, subsec_name in value[1]:
            per_subsec_fig_count, per_subsec_table_count, per_subsec_eq_count = 0, 0, 0
            two_down, three_down, four_down = 0, 0, 0
            dollar = 0
            new_cells = []
            sec_path = os.path.join(os.getcwd(), ch_name, subsec_name)
            nb = notebook.read(sec_path)
            updater = TableUpdater(subsec_num, ch_name, subsec_name)
            for cell in nb.cells:
                if cell.cell_type != 'markdown':
                    new_cells.append(cell)
                    continue
                lines = cell.source.split('\n')
                for j, line in enumerate(lines):
                    eq_unlabeled = eq_match.search(line)
                    sec_unlabeled = sec_match.search(line)
                    # An equation on the last line of a cell has no next
                    # line and therefore no label.
                    next_has_eqlabel = j + 1 < len(lines) and "eqlabel" in lines[j + 1]
                    if eq_unlabeled is not None and not next_has_eqlabel:
                        # Handle equations which are not labeled but
                        # during the numbering they are used
                        per_subsec_eq_count += 1
                        eq_num = updater.element_eq_formatting(per_subsec_eq_count)
                        lines[j] = lines[j] + eq_num
                    elif lines[j] == "$$":
                        dollar += 1
                        if dollar == 2:
                            # Handle equations which have $$ syntax
                            # in lines above and below
                            # NOTE(review): a closing "$$" followed by an
                            # :eqlabel: line is numbered here and again by
                            # the label - confirm whether that is intended.
                            per_subsec_eq_count += 1
                            dollar = 0
                            eq_num = updater.element_eq_formatting(per_subsec_eq_count)
                            lines[j] = "$$\n" + eq_num

                    # A heading is unlabeled when neither of the next two
                    # lines carries a :label:. Near the end of the cell the
                    # look-ahead shrinks; on the very last line the line
                    # itself is checked (some notebooks end on a heading,
                    # e.g. in the computational performance chapter).
                    if sec_unlabeled is None:
                        is_unlabeled_heading = False
                    elif len(lines) < 2:
                        is_unlabeled_heading = True
                    else:
                        lookahead = lines[j + 1:j + 3] or [lines[-1]]
                        is_unlabeled_heading = all(":label:" not in nxt for nxt in lookahead)
                    if is_unlabeled_heading and ch_name not in skip_num_chapters:
                        num_pounds = line.count("#")
                        if num_pounds == 2:
                            two_down += 1
                            three_down = 0
                            four_down = 0
                        if num_pounds == 3:
                            three_down += 1
                            four_down = 0
                        if num_pounds == 4:
                            four_down += 1
                        sec_id = updater.get_sec_id(two_down, three_down, four_down)
                        lines[j] = lines[j].replace("# ", f"# {sec_id} ")

                    m = md_mark_pattern.search(line)
                    if (m is None
                            or m[1] in ('ref', 'numref', 'eqref')
                            or m.end() != len(line)):
                        continue
                    # Remove width
                    if m[1] == 'width':
                        lines[j] = ''
                    # Save equation labels
                    if m[1] == 'eqlabel':
                        per_subsec_eq_count += 1
                        eq_table, eq_id = updater.eq(eq_table, m[2], per_subsec_eq_count)
                        eq_label_num_right = updater.element_eq_formatting(per_subsec_eq_count, eq_id=eq_id)
                        lines[j] = lines[j].replace(m[0], eq_label_num_right)
                    # Save figure, sec, subsec, chap, tab labels
                    if m[1] != 'label':
                        continue

                    # The labeled element sits one or two lines above the
                    # label. Lines before the start of the cell count as
                    # empty so that j - 1 / j - 2 never wrap around to the
                    # end of the cell.
                    prev1 = lines[j - 1] if j >= 1 else ""
                    prev2 = lines[j - 2] if j >= 2 else ""
                    if "# " in prev1:
                        heading_idx = j - 1
                    elif "# " in prev2:
                        heading_idx = j - 2
                    else:
                        heading_idx = None

                    if heading_idx is not None:
                        # Found chapter, section or subsection
                        heading = lines[heading_idx]
                        num_pounds = prev1.count("#") or prev2.count("#")
                        if num_pounds == 1:
                            # Handling top Headers
                            ref_name = heading.split("# ")[1]
                            head_table, number = updater.header(head_table, m[2], ref_name)
                        else:
                            if num_pounds == 2:
                                two_down += 1
                                three_down = 0
                                # Reset like the unlabeled-heading path so a
                                # later "####" starts counting from 1 again.
                                four_down = 0
                            if num_pounds == 3:
                                three_down += 1
                                four_down = 0
                            sec_table, number = updater.sec(sec_table, m[2], two_down, three_down, heading)
                        lines[j] = ''
                        if ch_name not in skip_num_chapters:
                            lines[heading_idx] = heading.replace("# ", f"# {number} ")
                    # Can't use the label text to find figures since
                    # `lstm0`, `lstm1`... are also figures; check for
                    # markdown img syntax in the two lines above instead.
                    elif "![" in prev1 or "![" in prev2:
                        fig_caption_raw = prev1 if "![" in prev1 else prev2
                        fig_caption = fig_caption_raw.split("]")[0].strip("![")
                        per_subsec_fig_count += 1
                        # Add figure to fig_table dictionary
                        fig_table, fig_id = updater.fig(fig_table, m[2], per_subsec_fig_count)
                        # Replace labels of the figure
                        fig_label_num = fig_table[m[2]].split("]")[0].replace("[", "")
                        fig_label_num_centered = updater.element_center_formatting(fig_id, fig_label_num, fig_caption)
                        lines[j] = lines[j].replace(m[0], fig_label_num_centered)
                    elif prev1.startswith("|"):
                        table_caption, caption_line_num = updater.get_table_caption(lines, j)
                        per_subsec_table_count += 1
                        # Add table to table_table dictionary
                        table_table, table_id = updater.table(table_table, m[2], per_subsec_table_count)
                        # Replace labels of the table
                        table_label_num = table_table[m[2]].split("]")[0].replace("[", "")
                        table_label_num_centered = updater.element_center_formatting(table_id, table_label_num, table_caption)
                        lines[j - caption_line_num] = "\n" + table_label_num_centered
                        lines[j] = "\n\n"
                    else:
                        # A label that matched nothing above must not look
                        # like a figure/section/chapter label.
                        if ch_name not in skip_num_chapters:
                            assert "fig" not in m[2]
                            assert "img" not in m[2]
                            assert "sec" not in m[2]
                            assert "chap" not in m[2]
                        # NOTE(review): reports every unhandled label -
                        # confirm it was not meant to be limited to
                        # numbered chapters.
                        print("0:", m[2])
                md_cell = '\n'.join(lines)
                new_cells.append(nbformat.v4.new_markdown_cell(md_cell))
            new_nb = notebook.create_new_notebook(nb, new_cells)
            # Notebook JSON is UTF-8; do not depend on the platform default.
            with open(sec_path, 'w', encoding='utf-8') as f:
                f.write(nbformat.writes(new_nb))
    return fig_table, eq_table, sec_table, head_table, table_table
def shorten_replacement(replace_old, subsec_name, ch_name):
    """Turn a cross-notebook link into an in-page link when possible.

    ``replace_old`` is a markdown link of the form
    ``[text](../<chapter>/<notebook>#<anchor>)``. When it points into the
    notebook identified by ``ch_name`` / ``subsec_name`` the result is
    ``[text](#<anchor>)``; otherwise ``replace_old`` is returned unchanged.
    Links without a ``(..`` target or without an anchor are also returned
    unchanged.
    """
    name, sep, target = replace_old.partition("(..")
    if not sep:
        return replace_old
    # target is "/<chapter>/<notebook>#<anchor>)". Split at most twice so a
    # "/" inside the anchor (e.g. a heading "Input/Output") stays intact.
    parts = target.split("/", 2)
    if len(parts) < 3:
        return replace_old
    ch_name_link = parts[1]
    subsec_name_link, hash_sep, anchor = parts[2].partition("#")
    if hash_sep and subsec_name == subsec_name_link and ch_name == ch_name_link:
        return name + "(#" + anchor
    return replace_old
def get_citations():
    """
    Get Citations!

    Formats ``d2l.bib`` with the ``pybtex-format`` command line tool,
    rewrites ``chapter_references/zreferences.ipynb`` so that it contains a
    single markdown cell with one ``<p id=KEY>`` paragraph per formatted
    line, and returns the citation lookup table.

    The process exits with status -1 if ``pybtex-format`` is missing or
    fails.

    Returns
    -------
    cite_table : dict
        maps each BibTeX entry ID to a markdown link (the formatted line up
        to and including its first "]") that targets that entry's paragraph
        in the references notebook.
    """
    import sys

    # Generate temporary file out.txt from d2l.bib
    # containing formatted references output.
    # The command is fixed, so pass an argument list instead of going
    # through a shell.
    cmd = ["pybtex-format", "--label-style", "apa", "--abbreviate-names",
           "d2l.bib", "out.txt"]
    try:
        process = subprocess.run(cmd, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
    except FileNotFoundError as err:
        # Without a shell a missing executable raises instead of yielding
        # a non-zero return code; report it the same way.
        logging.error('%s', err)
        sys.exit(-1)
    if process.returncode != 0:
        logging.error('%s', process.stderr.decode())
        sys.exit(-1)

    with open("out.txt", "r") as ref_file:
        new_lines = ref_file.readlines()
    with open('d2l.bib', 'r') as bibtex_file:
        d2l_bib_db = bibtexparser.load(bibtex_file)

    # Create cite_table
    # NOTE(review): pairs the i-th formatted line with the i-th parsed
    # entry; this assumes pybtex-format emits exactly one line per entry in
    # the same order as d2l.bib - confirm. zip() silently stops at the
    # shorter of the two.
    cite_table = {}
    for i, (line, entries) in enumerate(zip(new_lines, d2l_bib_db.entries)):
        key = entries["ID"]
        cite_name = line.split("]")[0] + "]"
        link = f"(../chapter_references/zreferences.ipynb#{key})"
        cite_table[key] = cite_name + link
        new_lines[i] = f"<p id={key}>{line}</p>"
    print("Citations Table Created!")
    os.remove("out.txt")
    print("Citations temporary file removed!")

    sec_references_path = os.path.join(os.getcwd(), "chapter_references", "zreferences.ipynb")
    nb = notebook.read(sec_references_path)
    md_cell = '\n'.join(new_lines)
    new_cells = [nbformat.v4.new_markdown_cell(md_cell)]
    new_nb = notebook.create_new_notebook(nb, new_cells)
    # Notebook JSON is UTF-8; do not depend on the platform default.
    with open(sec_references_path, 'w', encoding='utf-8') as f:
        f.write(nbformat.writes(new_nb))
    return cite_table
def replace_references(fig_table, eq_table, sec_table, head_table, table_table, cite_table):
    """Resolve ``:numref:``, ``:eqref:``, ``:ref:`` and ``:cite:`` marks.

    Walks every notebook listed in the module-level
    ``ordered_ch_dict_with_num`` and replaces each mark in its markdown
    cells with the link stored in the matching table. Links into the
    notebook being processed are shortened with ``shorten_replacement``.
    Marks whose label is in none of the tables are left as they are;
    ``:cite:`` marks with an unknown key are left as they are and logged.
    Each notebook is overwritten in place.

    Parameters
    ----------
    fig_table, eq_table, table_table : dict
        label -> markdown link.
    sec_table, head_table : dict
        label -> ``(numbered_link, titled_link)``; ``:numref:`` uses the
        first element, ``:ref:`` the second.
    cite_table : dict
        BibTeX key (without backticks) -> markdown link.
    """
    # Raw strings keep the regex escapes out of Python's string-escape
    # handling; the compiled patterns are unchanged.
    md_mark_pattern_ref = re.compile(r':ref:(`[\ \*-\/\\._\w]+`)?')
    md_mark_pattern_numref = re.compile(r':numref:(`[\ \*-\/\\._\w]+`)?')
    md_mark_pattern_eqref = re.compile(r':eqref:(`[\ \*-\/\\._\w]+`)?')
    md_mark_pattern_cite = re.compile(r':cite:(`[\ \*-\/\\._\w]+`)?')
    for ch_name, value in ordered_ch_dict_with_num.items():
        for _, subsec_name in value[1]:
            sec_path = os.path.join(os.getcwd(), ch_name, subsec_name)
            new_cells = []
            nb = notebook.read(sec_path)
            for cell in nb.cells:
                if cell.cell_type != 'markdown':
                    new_cells.append(cell)
                    continue
                lines = cell.source.split('\n')
                for j, line in enumerate(lines):
                    # All marks are located on the original line; the
                    # replacements accumulate in lines[j].
                    for m in md_mark_pattern_numref.findall(line):
                        if m in fig_table:
                            replacement = shorten_replacement(fig_table[m], subsec_name, ch_name)
                            lines[j] = lines[j].replace(":numref:" + m, replacement)
                        if m in sec_table:
                            replacement = shorten_replacement(sec_table[m][0], subsec_name, ch_name)
                            lines[j] = lines[j].replace(":numref:" + m, replacement)
                        if m in head_table:
                            replacement = shorten_replacement(head_table[m][0], subsec_name, ch_name)
                            lines[j] = lines[j].replace(":numref:" + m, replacement)
                        if m in table_table:
                            replacement = shorten_replacement(table_table[m], subsec_name, ch_name)
                            lines[j] = lines[j].replace(":numref:" + m, replacement)
                    for m in md_mark_pattern_eqref.findall(line):
                        if m in eq_table:
                            replacement = shorten_replacement(eq_table[m], subsec_name, ch_name)
                            lines[j] = lines[j].replace(":eqref:" + m, replacement)
                    for m in md_mark_pattern_ref.findall(line):
                        if m in sec_table:
                            replacement = shorten_replacement(sec_table[m][1], subsec_name, ch_name)
                            lines[j] = lines[j].replace(":ref:" + m, replacement)
                        if m in head_table:
                            replacement = shorten_replacement(head_table[m][1], subsec_name, ch_name)
                            lines[j] = lines[j].replace(":ref:" + m, replacement)
                    for m in md_mark_pattern_cite.findall(line):
                        if not m:
                            # ":cite:" without a backtick-quoted argument.
                            continue
                        # One mark may cite several comma-separated keys,
                        # optionally followed by spaces.
                        cited_keys = [key.strip() for key in m.strip("`").split(",")]
                        unknown = [key for key in cited_keys if key not in cite_table]
                        if unknown:
                            logging.warning(
                                "%s: unknown citation key(s) %s; leaving :cite:%s unresolved",
                                sec_path, unknown, m)
                            continue
                        replacement = ','.join(cite_table[key] for key in cited_keys)
                        lines[j] = lines[j].replace(":cite:" + m, replacement)
                md_cell = '\n'.join(lines)
                new_cells.append(nbformat.v4.new_markdown_cell(md_cell))
            new_nb = notebook.create_new_notebook(nb, new_cells)
            # Notebook JSON is UTF-8; do not depend on the platform default.
            with open(sec_path, 'w', encoding='utf-8') as f:
                f.write(nbformat.writes(new_nb))
# Stage 2: number the notebooks and collect the label tables, build the
# citation table, then resolve every reference mark against those tables.
(fig_table, eq_table, sec_table,
 head_table, table_table) = generate_tables_and_replace_unlabeled(ordered_ch_dict_with_num)
cite_table = get_citations()
replace_references(
    fig_table,
    eq_table,
    sec_table,
    head_table,
    table_table,
    cite_table,
)
print("Done!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment