Skip to content

Instantly share code, notes, and snippets.

@BexTuychiev
Created February 18, 2026 12:33
Show Gist options
  • Select an option

  • Save BexTuychiev/1c7b944a906ea0ea81935e88ea2a1159 to your computer and use it in GitHub Desktop.

Select an option

Save BexTuychiev/1c7b944a906ea0ea81935e88ea2a1159 to your computer and use it in GitHub Desktop.
Notebook to DOCX converter - converts Jupyter notebooks to Word documents with proper formatting
#!/usr/bin/env python3
"""
Notebook to DOCX Converter
Converts Jupyter notebooks to Word documents with proper formatting:
- Markdown formatting preserved as Word styles
- Backticks preserved around inline code
- Code blocks with triple backticks visible, Courier New font
- Non-code text in Poppins font
- Images with alt text built-in
- Clickable hyperlinks
"""
import sys
import re
import os
from pathlib import Path
import nbformat
from docx import Document
from docx.shared import Pt, Inches, Twips
from docx.enum.style import WD_STYLE_TYPE
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
from PIL import Image
def add_hyperlink(paragraph, text, url, bold=False, italic=False):
    """Append a clickable external hyperlink to a paragraph.

    The ``w:hyperlink`` element is assembled by hand as raw XML and appended
    to the paragraph element. The link run is blue (0563C1), single
    underlined and set in Poppins.

    Args:
        paragraph: python-docx paragraph that receives the link.
        text: Visible link text.
        url: Link target, registered as an external relationship on the
            paragraph's part.
        bold: Render the link text in bold.
        italic: Render the link text in italics.

    Returns:
        The ``w:hyperlink`` element that was appended to the paragraph.
    """
    r_id = paragraph.part.relate_to(
        url,
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        is_external=True,
    )

    hyperlink = OxmlElement('w:hyperlink')
    hyperlink.set(qn('r:id'), r_id)

    new_run = OxmlElement('w:r')
    run_props = OxmlElement('w:rPr')

    # Blue color for links
    color = OxmlElement('w:color')
    color.set(qn('w:val'), '0563C1')
    run_props.append(color)

    # Underline
    underline = OxmlElement('w:u')
    underline.set(qn('w:val'), 'single')
    run_props.append(underline)

    # Font
    fonts = OxmlElement('w:rFonts')
    fonts.set(qn('w:ascii'), 'Poppins')
    fonts.set(qn('w:hAnsi'), 'Poppins')
    run_props.append(fonts)

    # Optional emphasis
    if bold:
        run_props.append(OxmlElement('w:b'))
    if italic:
        run_props.append(OxmlElement('w:i'))

    new_run.append(run_props)

    text_elem = OxmlElement('w:t')
    # Keep leading/trailing whitespace of the link text significant.
    text_elem.set(qn('xml:space'), 'preserve')
    text_elem.text = text
    new_run.append(text_elem)

    hyperlink.append(new_run)
    paragraph._p.append(hyperlink)
    return hyperlink
def set_image_alt_text(inline, alt_text):
    """Set alt text on an inline image by modifying its XML.

    Walks ``inline`` and its descendants, and on the first element whose
    local tag name is exactly ``docPr`` sets both the ``descr`` and ``title``
    attributes to ``alt_text``. Does nothing if no such element exists.

    Args:
        inline: XML element of the inline picture (anything exposing
            ``iter()``, ``tag`` and ``set()``).
        alt_text: Text stored as the image description and title.
    """
    for child in inline.iter():
        tag = child.tag
        # Comments and processing instructions have a non-string tag.
        if not isinstance(tag, str):
            continue
        # Compare the local name only, ignoring the "{namespace}" prefix.
        if tag.rpartition('}')[2] == 'docPr':
            child.set('descr', alt_text)
            child.set('title', alt_text)
            break
def create_styles(doc):
    """Register the custom paragraph styles used by the converter.

    Adds a ``CodeBlock`` paragraph style (Courier New 10pt, no spacing
    before/after, single line spacing) unless a style with that name is
    already present in ``doc.styles``.
    """
    styles = doc.styles
    if any(existing.name == 'CodeBlock' for existing in styles):
        return

    # Code block style with tight line spacing
    code_style = styles.add_style('CodeBlock', WD_STYLE_TYPE.PARAGRAPH)

    font = code_style.font
    font.name = 'Courier New'
    font.size = Pt(10)

    spacing = code_style.paragraph_format
    spacing.space_before = Pt(0)
    spacing.space_after = Pt(0)
    spacing.line_spacing = 1.0
# Inline markdown patterns, compiled once. Order matters: when two patterns
# match at the same position, the one listed first wins.
_INLINE_PATTERNS = (
    # Bold links: **[text](url)**
    (re.compile(r'\*\*\[([^\]]+)\]\(([^)]+)\)\*\*'), 'bold_link'),
    # Italic links: *[text](url)*
    (re.compile(r'\*\[([^\]]+)\]\(([^)]+)\)\*'), 'italic_link'),
    # Bold text (may contain nested formatting)
    (re.compile(r'\*\*(.+?)\*\*'), 'bold'),
    # Italic text
    (re.compile(r'\*([^*]+?)\*'), 'italic'),
    # Inline code with backticks
    (re.compile(r'`([^`]+)`'), 'code'),
    # Regular links
    (re.compile(r'\[([^\]]+)\]\(([^)]+)\)'), 'link'),
)


def _add_text_run(paragraph, text, font_name, font_size, bold, italic):
    """Append one run with the given font and optional bold/italic."""
    run = paragraph.add_run(text)
    run.font.name = font_name
    run.font.size = Pt(font_size)
    if bold:
        run.bold = True
    if italic:
        run.italic = True
    return run


def _add_styled_hyperlink(paragraph, text, url, bold, italic):
    """Add a hyperlink through add_hyperlink() and italicise it on request."""
    hyperlink = add_hyperlink(paragraph, text, url, bold=bold)
    if italic:
        # The italic flag is set on the returned element directly, so this
        # does not depend on add_hyperlink() accepting an italic argument.
        run_props = next(hyperlink.iter(qn('w:rPr')), None)
        if run_props is not None and run_props.find(qn('w:i')) is None:
            run_props.append(OxmlElement('w:i'))
    return hyperlink


def process_inline_formatting(paragraph, text, base_font='Poppins', inherited_bold=False, inherited_italic=False):
    """Render inline markdown from ``text`` into ``paragraph``.

    Recognises bold (``**x**``), italic (``*x*``), inline code (backticks are
    kept in the output), links, and bold/italic links. Bold and italic
    content is processed recursively so nested formatting such as
    ``**see [docs](url) now**`` keeps every part of the text.

    Args:
        paragraph: python-docx paragraph that receives the runs/hyperlinks.
        text: Markdown source for a single line; empty or None is a no-op.
        base_font: Font name for non-code runs (11pt).
        inherited_bold: Whether the text sits inside an enclosing bold span.
        inherited_italic: Whether the text sits inside an enclosing italic
            span.
    """
    if not text:
        return

    remaining = text
    while remaining:
        # Pick the match that starts earliest; ties go to the earlier pattern.
        best_match = None
        best_type = None
        for pattern, pattern_type in _INLINE_PATTERNS:
            found = pattern.search(remaining)
            if found and (best_match is None or found.start() < best_match.start()):
                best_match = found
                best_type = pattern_type

        if best_match is None:
            # No more markup: the rest is plain text.
            _add_text_run(paragraph, remaining, base_font, 11,
                          inherited_bold, inherited_italic)
            break

        # Plain text in front of the match
        if best_match.start() > 0:
            _add_text_run(paragraph, remaining[:best_match.start()], base_font, 11,
                          inherited_bold, inherited_italic)

        if best_type == 'bold_link':
            _add_styled_hyperlink(paragraph, best_match.group(1), best_match.group(2),
                                  bold=True, italic=inherited_italic)
        elif best_type == 'italic_link':
            _add_styled_hyperlink(paragraph, best_match.group(1), best_match.group(2),
                                  bold=inherited_bold, italic=True)
        elif best_type == 'bold':
            # Recursing handles links too (they pick up inherited_bold), and
            # unlike a link-only shortcut it keeps text that follows a link.
            process_inline_formatting(paragraph, best_match.group(1), base_font,
                                      inherited_bold=True,
                                      inherited_italic=inherited_italic)
        elif best_type == 'italic':
            process_inline_formatting(paragraph, best_match.group(1), base_font,
                                      inherited_bold=inherited_bold,
                                      inherited_italic=True)
        elif best_type == 'code':
            # Preserve backticks around inline code
            _add_text_run(paragraph, f'`{best_match.group(1)}`', 'Courier New', 10,
                          inherited_bold, inherited_italic)
        elif best_type == 'link':
            _add_styled_hyperlink(paragraph, best_match.group(1), best_match.group(2),
                                  bold=inherited_bold, italic=inherited_italic)

        remaining = remaining[best_match.end():]
def add_image_with_alt(doc, image_path, alt_text, base_path=None):
    """Add an image paragraph with alt text to the document.

    Path resolution: a relative ``image_path`` is joined onto ``base_path``;
    if that file does not exist, ``base_path`` is walked for a file with the
    same basename. When nothing is found an italic
    ``[Image not found: ...]`` placeholder paragraph is added instead.

    Sizing: the pixel width is converted at 100 pixels per inch and capped
    at 6 inches; if the image size cannot be read, 5 inches is used.

    Args:
        doc: python-docx document to append to.
        image_path: Path of the image as written in the markdown.
        alt_text: Alt text for the picture; falls back to ``'Image'`` when
            empty.
        base_path: Directory used to resolve relative image paths.
    """
    # Resolve image path
    if base_path and not os.path.isabs(image_path):
        full_path = os.path.join(base_path, image_path)
    else:
        full_path = image_path

    # Fall back to searching base_path for the bare filename
    if base_path and not os.path.exists(full_path):
        filename = os.path.basename(image_path)
        for root, _dirs, files in os.walk(base_path):
            if filename in files:
                full_path = os.path.join(root, filename)
                break

    if not os.path.exists(full_path):
        placeholder = doc.add_paragraph()
        run = placeholder.add_run(f'[Image not found: {image_path}]')
        run.font.name = 'Poppins'
        run.italic = True
        return

    # Get image dimensions and scale appropriately
    max_width = Inches(6)
    try:
        with Image.open(full_path) as img:
            pixel_width = img.size[0]
        doc_width = min(Inches(pixel_width / 100), max_width)
    except Exception:
        # Deliberately best-effort: an unreadable size must not abort the
        # conversion, so fall back to a fixed width.
        doc_width = Inches(5)

    # Add image
    paragraph = doc.add_paragraph()
    run = paragraph.add_run()
    try:
        inline = run.add_picture(full_path, width=doc_width)
        # Set alt text via XML manipulation
        set_image_alt_text(inline._inline, alt_text or 'Image')
    except Exception as e:
        # Report the failure inside the document instead of stopping.
        run = paragraph.add_run(f'[Error loading image: {e}]')
        run.font.name = 'Poppins'
        run.italic = True
def add_code_block(doc, code_lines):
    """Write ``code_lines`` to the document, one tightly spaced paragraph each.

    Every paragraph gets zero spacing before/after, single line spacing and
    a Courier New 10pt run. An empty line is written as a single space.
    """
    no_gap = Pt(0)
    for text in code_lines:
        para = doc.add_paragraph()

        # Set tight line spacing
        spacing = para.paragraph_format
        spacing.space_before = no_gap
        spacing.space_after = no_gap
        spacing.line_spacing = 1.0

        # Empty lines need a space
        code_run = para.add_run(text or ' ')
        code_run.font.name = 'Courier New'
        code_run.font.size = Pt(10)
# Delimiter row such as |---|:-:|: every cell must contain at least one dash,
# so a data row made of empty cells is not mistaken for a separator.
_TABLE_SEPARATOR = re.compile(r'^\|?\s*:?-+:?\s*(?:\|\s*:?-+:?\s*)*\|?$')
# A pipe that is not escaped with a backslash.
_UNESCAPED_PIPE = re.compile(r'(?<!\\)\|')


def _split_table_row(line):
    """Split one stripped table row into trimmed cell strings."""
    body = line
    # Remove at most ONE outer pipe per side so empty edge cells survive.
    if body.startswith('|'):
        body = body[1:]
    if body.endswith('|') and not body.endswith('\\|'):
        body = body[:-1]
    return [cell.strip().replace('\\|', '|') for cell in _UNESCAPED_PIPE.split(body)]


def parse_markdown_table(lines, start_index):
    """Parse a markdown table starting at the given index.

    Consecutive lines that (after stripping) start or end with ``|`` are
    treated as part of the table. Delimiter rows (``|---|---|``) are skipped.
    Escaped pipes (``\\|``) stay inside their cell and are unescaped.

    Args:
        lines: All lines of the markdown cell.
        start_index: Index of the first table line.

    Returns:
        ``(table_data, end_index)`` where ``table_data`` is a list of rows,
        each row a list of cell strings, and ``end_index`` is the index of
        the last line consumed (``start_index - 1`` if nothing was consumed).
    """
    table_rows = []
    i = start_index
    while i < len(lines):
        line = lines[i].strip()
        # Stop at the first line that is not part of the table
        if not (line.startswith('|') or line.endswith('|')):
            break
        if not _TABLE_SEPARATOR.match(line):
            table_rows.append(_split_table_row(line))
        i += 1
    return table_rows, i - 1
def add_table_to_doc(doc, table_data):
    """Add a 'Table Grid' table built from parsed markdown table data.

    The first row defines the column count and is rendered bold. Cells
    beyond that count in later rows are ignored; missing cells stay empty.
    Cell text goes through inline markdown formatting. An empty paragraph
    is appended after the table as spacing.

    Args:
        doc: python-docx document to append to.
        table_data: List of rows, each a list of cell strings. Nothing is
            added when it is empty or its first row is empty.
    """
    if not table_data or not table_data[0]:
        return

    # Create table
    table = doc.add_table(rows=len(table_data), cols=len(table_data[0]))
    table.style = 'Table Grid'

    # Fill in cells
    for row_idx, row_data in enumerate(table_data):
        is_header = row_idx == 0
        # Look the cells up once per row; zip() drops surplus cell texts.
        cells = table.rows[row_idx].cells
        for cell, cell_text in zip(cells, row_data):
            # Clear existing content and add formatted text
            cell.text = ''
            paragraph = cell.paragraphs[0]
            # inherited_bold also reaches hyperlinks, which are not runs of
            # the paragraph and would be missed by the loop below.
            process_inline_formatting(paragraph, cell_text, inherited_bold=is_header)
            if is_header:
                for run in paragraph.runs:
                    run.bold = True

    # Add some space after the table
    doc.add_paragraph()
# A line that consists solely of one markdown image: ![alt](path)
_IMAGE_LINE = re.compile(r'^!\[([^\]]*)\]\(([^)]+)\)$')
# Ordered list marker such as "12. "
_ORDERED_ITEM = re.compile(r'^\d+\.\s')


def process_markdown_cell(doc, content, base_path=None):
    """Convert the markdown source of one cell into document content.

    Works line by line. Recognised constructs, checked in this order:
    headings (``#`` to ``######``), fenced code blocks, blockquotes,
    unordered and ordered list items, standalone images, horizontal rules,
    blank lines (skipped), tables, and finally plain paragraphs.

    Args:
        doc: python-docx document to append to.
        content: Markdown source of the cell.
        base_path: Directory used to resolve relative image paths.
    """
    lines = content.split('\n')
    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        image_match = _IMAGE_LINE.match(stripped)

        # Headers: level is the number of leading '#', capped at 6
        if line.startswith('#'):
            level = min(len(line) - len(line.lstrip('#')), 6)
            heading = doc.add_heading(line[level:].strip(), level=level)
            for run in heading.runs:
                run.font.name = 'Poppins'
        # Code blocks: the fences are kept visible in the output
        elif line.startswith('```'):
            lang = line[3:].strip()
            code_lines = [f'```{lang}']
            i += 1
            while i < len(lines) and not lines[i].startswith('```'):
                code_lines.append(lines[i])
                i += 1
            code_lines.append('```')
            # i now sits on the closing fence (or past the end if unclosed)
            add_code_block(doc, code_lines)
        # Blockquotes
        elif line.startswith('>'):
            p = doc.add_paragraph()
            p.paragraph_format.left_indent = Inches(0.5)
            process_inline_formatting(p, line[1:].strip())
        # Unordered lists
        elif stripped.startswith(('- ', '* ')):
            p = doc.add_paragraph(style='List Bullet')
            process_inline_formatting(p, stripped[2:])
        # Ordered lists
        elif _ORDERED_ITEM.match(stripped):
            p = doc.add_paragraph(style='List Number')
            process_inline_formatting(p, _ORDERED_ITEM.sub('', stripped))
        # Images (standalone)
        elif image_match:
            add_image_with_alt(doc, image_match.group(2), image_match.group(1), base_path)
        # Horizontal rule
        elif stripped in ('---', '***', '___'):
            p = doc.add_paragraph()
            p.add_run('─' * 50)
        # Empty line
        elif not stripped:
            pass  # Skip empty lines
        # Tables (lines starting with |)
        elif stripped.startswith('|'):
            table_data, end_index = parse_markdown_table(lines, i)
            if table_data:
                add_table_to_doc(doc, table_data)
            # Always move past the consumed lines, and never backwards.
            i = max(i, end_index)
        # Regular paragraph
        else:
            p = doc.add_paragraph()
            process_inline_formatting(p, line)

        i += 1
def process_code_cell(doc, source, outputs=None):
    """Render a code cell as a fenced ```python block.

    ``outputs`` is accepted but not used.
    """
    fenced = ['```python']
    fenced.extend(source.split('\n'))
    fenced.append('```')
    add_code_block(doc, fenced)
def convert_notebook_to_docx(notebook_path, output_path=None):
    """Convert a Jupyter notebook file into a Word document.

    Args:
        notebook_path: Path to the notebook to read.
        output_path: Destination file; defaults to the notebook path with a
            ``.docx`` suffix.

    Returns:
        The output path as a ``Path``.
    """
    notebook_path = Path(notebook_path)
    output_path = (
        notebook_path.with_suffix('.docx') if output_path is None else Path(output_path)
    )

    # Read notebook
    with open(notebook_path, 'r', encoding='utf-8') as handle:
        nb = nbformat.read(handle, as_version=4)

    # Create document and set the default font on the Normal style
    doc = Document()
    create_styles(doc)
    normal_font = doc.styles['Normal'].font
    normal_font.name = 'Poppins'
    normal_font.size = Pt(11)

    # Relative image paths are resolved against the notebook's directory
    base_path = notebook_path.parent

    # Process cells; cell types other than markdown/code are ignored
    for cell in nb.cells:
        kind = cell.cell_type
        if kind == 'markdown':
            process_markdown_cell(doc, cell.source, base_path)
        elif kind == 'code':
            process_code_cell(doc, cell.source, cell.get('outputs', []))

    # Save document
    doc.save(output_path)
    print(f'Converted: {notebook_path} -> {output_path}')
    return output_path
def main():
    """Command-line entry point.

    Reads the notebook path and an optional output path from ``sys.argv``
    and runs the conversion. With no notebook path, prints a usage message
    to stderr and exits with status 1.
    """
    if len(sys.argv) < 2:
        # Usage errors belong on stderr so they do not mix with normal output.
        print('Usage: python notebook_to_docx.py <notebook_path> [output_path]',
              file=sys.stderr)
        sys.exit(1)

    notebook_path = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else None
    convert_notebook_to_docx(notebook_path, output_path)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment