Created
February 18, 2026 12:33
-
-
Save BexTuychiev/1c7b944a906ea0ea81935e88ea2a1159 to your computer and use it in GitHub Desktop.
Notebook to DOCX converter - converts Jupyter notebooks to Word documents with proper formatting
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Notebook to DOCX Converter | |
| Converts Jupyter notebooks to Word documents with proper formatting: | |
| - Markdown formatting preserved as Word styles | |
| - Backticks preserved around inline code | |
| - Code blocks with triple backticks visible, Courier New font | |
| - Non-code text in Poppins font | |
| - Images with alt text built-in | |
| - Clickable hyperlinks | |
| """ | |
| import sys | |
| import re | |
| import os | |
| from pathlib import Path | |
| import nbformat | |
| from docx import Document | |
| from docx.shared import Pt, Inches, Twips | |
| from docx.enum.style import WD_STYLE_TYPE | |
| from docx.oxml.ns import qn | |
| from docx.oxml import OxmlElement | |
| from PIL import Image | |
def add_hyperlink(paragraph, text, url, bold=False, italic=False):
    """Append a clickable external hyperlink to a paragraph.

    The ``w:hyperlink`` element is assembled directly from XML elements and
    appended to the paragraph, styled blue, underlined and in Poppins.

    Args:
        paragraph: python-docx paragraph that receives the link.
        text: Visible link text.
        url: Link target; registered as an external relationship on the
            paragraph's part.
        bold: Render the link text in bold.
        italic: Render the link text in italics.

    Returns:
        The ``w:hyperlink`` element that was appended.
    """
    r_id = paragraph.part.relate_to(
        url,
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        is_external=True,
    )

    hyperlink = OxmlElement('w:hyperlink')
    hyperlink.set(qn('r:id'), r_id)

    # Run properties are appended in the order the WordprocessingML schema
    # declares them (rFonts, b, i, color, u) so the output stays schema-valid.
    run_props = OxmlElement('w:rPr')

    fonts = OxmlElement('w:rFonts')
    fonts.set(qn('w:ascii'), 'Poppins')
    fonts.set(qn('w:hAnsi'), 'Poppins')
    run_props.append(fonts)

    if bold:
        run_props.append(OxmlElement('w:b'))
    if italic:
        run_props.append(OxmlElement('w:i'))

    # Blue color for links
    color = OxmlElement('w:color')
    color.set(qn('w:val'), '0563C1')
    run_props.append(color)

    underline = OxmlElement('w:u')
    underline.set(qn('w:val'), 'single')
    run_props.append(underline)

    text_elem = OxmlElement('w:t')
    text_elem.text = text
    # Without xml:space="preserve" leading/trailing spaces in the link text
    # are not significant and get dropped when the document is rendered.
    text_elem.set(qn('xml:space'), 'preserve')

    new_run = OxmlElement('w:r')
    new_run.append(run_props)
    new_run.append(text_elem)

    hyperlink.append(new_run)
    paragraph._p.append(hyperlink)
    return hyperlink
def set_image_alt_text(inline, alt_text):
    """Write alt text onto the first ``docPr`` element found under *inline*.

    The element tree is walked in document order; the first node whose tag
    contains ``docPr`` receives ``descr`` and ``title`` attributes set to
    *alt_text*. Nothing happens when no such node exists.
    """
    doc_pr = next((node for node in inline.iter() if 'docPr' in node.tag), None)
    # Compare against None explicitly: an element without children is falsy.
    if doc_pr is not None:
        for attribute in ('descr', 'title'):
            doc_pr.set(attribute, alt_text)
def create_styles(doc):
    """Register the document's custom styles.

    Adds a ``CodeBlock`` paragraph style (Courier New 10pt, no spacing
    before/after, single line spacing) unless a style with that name is
    already present in ``doc.styles``.
    """
    styles = doc.styles
    known_names = [style.name for style in styles]
    if 'CodeBlock' in known_names:
        return

    # Code block style with tight line spacing
    code_style = styles.add_style('CodeBlock', WD_STYLE_TYPE.PARAGRAPH)

    font = code_style.font
    font.name = 'Courier New'
    font.size = Pt(10)

    spacing = code_style.paragraph_format
    spacing.space_before = Pt(0)
    spacing.space_after = Pt(0)
    spacing.line_spacing = 1.0
# Inline markdown patterns. Order matters: when two patterns match at the
# same position, the one listed first wins.
_INLINE_PATTERNS = (
    # Bold links: **[text](url)**
    (re.compile(r'\*\*\[([^\]]+)\]\(([^)]+)\)\*\*'), 'bold_link'),
    # Italic links: *[text](url)*
    (re.compile(r'\*\[([^\]]+)\]\(([^)]+)\)\*'), 'italic_link'),
    # Bold text (may contain nested formatting)
    (re.compile(r'\*\*(.+?)\*\*'), 'bold'),
    # Italic text
    (re.compile(r'\*([^*]+?)\*'), 'italic'),
    # Inline code with backticks - preserve backticks
    (re.compile(r'`([^`]+)`'), 'code'),
    # Regular links
    (re.compile(r'\[([^\]]+)\]\(([^)]+)\)'), 'link'),
)


def _add_text_run(paragraph, text, font_name, size, bold, italic):
    """Add one run of text with the given font, size and emphasis."""
    run = paragraph.add_run(text)
    run.font.name = font_name
    run.font.size = size
    if bold:
        run.bold = True
    if italic:
        run.italic = True
    return run


def _add_styled_hyperlink(paragraph, text, url, bold, italic):
    """Add a hyperlink via add_hyperlink and italicize it when requested."""
    hyperlink = add_hyperlink(paragraph, text, url, bold=bold)
    if not italic:
        return hyperlink
    # add_hyperlink is called without an italic flag here, so the w:i
    # element is added to the run properties of the element it returns.
    for run_props in list(hyperlink.iter(qn('w:rPr'))):
        if run_props.find(qn('w:i')) is not None:
            continue
        italic_elem = OxmlElement('w:i')
        # Place w:i right after w:b (or w:rFonts) where possible.
        anchor = run_props.find(qn('w:b'))
        if anchor is None:
            anchor = run_props.find(qn('w:rFonts'))
        if anchor is None:
            run_props.insert(0, italic_elem)
        else:
            anchor.addnext(italic_elem)
    return hyperlink


def process_inline_formatting(paragraph, text, base_font='Poppins', inherited_bold=False, inherited_italic=False):
    """Render inline markdown from *text* into runs on *paragraph*.

    Recognizes bold, italic, inline code (backticks are kept in the output)
    and links, including links wrapped in bold or italic markers. Bold and
    italic content is processed recursively so nested formatting such as
    ``**see [docs](url) here**`` keeps both the link and the surrounding text.

    Args:
        paragraph: python-docx paragraph that receives the runs/hyperlinks.
        text: Markdown source for a single line or table cell.
        base_font: Font name applied to non-code text runs.
        inherited_bold: True when an enclosing construct is bold.
        inherited_italic: True when an enclosing construct is italic.
    """
    if not text:
        return

    remaining = text
    while remaining:
        # Pick the match that starts earliest; ties go to the earlier pattern.
        best = None
        best_kind = None
        for regex, kind in _INLINE_PATTERNS:
            found = regex.search(remaining)
            if found and (best is None or found.start() < best.start()):
                best, best_kind = found, kind

        if best is None:
            # No more patterns, add remaining text
            _add_text_run(paragraph, remaining, base_font, Pt(11),
                          inherited_bold, inherited_italic)
            break

        # Add text before the match
        if best.start() > 0:
            _add_text_run(paragraph, remaining[:best.start()], base_font, Pt(11),
                          inherited_bold, inherited_italic)

        if best_kind == 'bold_link':
            _add_styled_hyperlink(paragraph, best.group(1), best.group(2),
                                  bold=True, italic=inherited_italic)
        elif best_kind == 'italic_link':
            _add_styled_hyperlink(paragraph, best.group(1), best.group(2),
                                  bold=inherited_bold, italic=True)
        elif best_kind == 'bold':
            # Always recurse: a link at the start of the bold span is handled
            # by the 'link' branch with inherited_bold=True, and any text
            # following it inside the span is kept instead of being dropped.
            process_inline_formatting(paragraph, best.group(1), base_font,
                                      inherited_bold=True,
                                      inherited_italic=inherited_italic)
        elif best_kind == 'italic':
            process_inline_formatting(paragraph, best.group(1), base_font,
                                      inherited_bold=inherited_bold,
                                      inherited_italic=True)
        elif best_kind == 'code':
            # Preserve backticks around inline code
            _add_text_run(paragraph, f'`{best.group(1)}`', 'Courier New', Pt(10),
                          inherited_bold, inherited_italic)
        elif best_kind == 'link':
            _add_styled_hyperlink(paragraph, best.group(1), best.group(2),
                                  bold=inherited_bold, italic=inherited_italic)

        remaining = remaining[best.end():]
def add_image_with_alt(doc, image_path, alt_text, base_path=None):
    """Insert an image paragraph into *doc* and attach alt text to it.

    Relative paths are resolved against *base_path*; if the file is still
    missing, *base_path* is searched recursively for a file with the same
    name. A placeholder paragraph is written when the image cannot be found
    or cannot be inserted.

    Args:
        doc: python-docx document to append to.
        image_path: Path to the image as written in the markdown.
        alt_text: Alternative text; 'Image' is used when it is empty.
        base_path: Directory used to resolve relative image paths.
    """
    # Resolve image path
    resolved = image_path
    if base_path and not os.path.isabs(image_path):
        resolved = os.path.join(base_path, image_path)

    # Fall back to a filename search below base_path
    if base_path and not os.path.exists(resolved):
        wanted = os.path.basename(image_path)
        for folder, _subdirs, names in os.walk(base_path):
            if wanted in names:
                resolved = os.path.join(folder, wanted)
                break

    if not os.path.exists(resolved):
        notice = doc.add_paragraph().add_run(f'[Image not found: {image_path}]')
        notice.font.name = 'Poppins'
        notice.italic = True
        return

    # Scale at 100 pixels per inch, capped at 6 inches wide; fall back to
    # 5 inches when the image size cannot be read.
    try:
        with Image.open(resolved) as picture:
            pixel_width, _pixel_height = picture.size
        doc_width = min(Inches(pixel_width / 100), Inches(6))
    except Exception:
        doc_width = Inches(5)

    # Add image
    paragraph = doc.add_paragraph()
    run = paragraph.add_run()
    try:
        shape = run.add_picture(resolved, width=doc_width)
        # Set alt text via XML manipulation
        set_image_alt_text(shape._inline, alt_text or 'Image')
    except Exception as e:
        run = paragraph.add_run(f'[Error loading image: {e}]')
        run.font.name = 'Poppins'
        run.italic = True
def add_code_block(doc, code_lines):
    """Append each entry of *code_lines* as its own tightly spaced paragraph.

    Every paragraph gets zero spacing before/after, single line spacing and
    a Courier New 10pt run. Empty lines are written as a single space.
    """
    for text in code_lines:
        para = doc.add_paragraph()

        # Set tight line spacing
        spacing = para.paragraph_format
        spacing.space_before = Pt(0)
        spacing.space_after = Pt(0)
        spacing.line_spacing = 1.0

        # Empty lines need a space
        code_run = para.add_run(text or ' ')
        font = code_run.font
        font.name = 'Courier New'
        font.size = Pt(10)
# A delimiter row: every cell is dashes with optional alignment colons,
# e.g. |---|:--:|. At least one dash per cell is required so that rows made
# only of empty cells are not mistaken for a delimiter.
_TABLE_SEPARATOR_RE = re.compile(r'^\|?\s*:?-+:?\s*(?:\|\s*:?-+:?\s*)*\|?$')
# A pipe that is not escaped with a backslash.
_UNESCAPED_PIPE_RE = re.compile(r'(?<!\\)\|')


def parse_markdown_table(lines, start_index):
    """Parse a markdown table starting at the given index.

    Consecutive lines that start or end with ``|`` are consumed. Delimiter
    rows (``|---|---|``) are skipped. Cells are split on unescaped pipes,
    stripped of surrounding whitespace, and ``\\|`` is unescaped to ``|``.

    Args:
        lines: All lines of the markdown cell.
        start_index: Index of the first table line in *lines*.

    Returns:
        (table_data, end_index) where table_data is a list of rows, each row
        being a list of cell strings, and end_index is the index of the last
        line consumed (``start_index - 1`` if no line was consumed).
    """
    table_rows = []
    i = start_index
    while i < len(lines):
        line = lines[i].strip()

        # Check if this line is part of the table
        if not line.startswith('|') and not line.endswith('|'):
            break

        # Skip separator lines (|---|---|)
        if _TABLE_SEPARATOR_RE.match(line):
            i += 1
            continue

        # Remove exactly one bounding pipe per side; str.strip('|') would
        # also swallow the pipes that delimit empty first/last cells.
        inner = line
        if inner.startswith('|'):
            inner = inner[1:]
        if inner.endswith('|') and not inner.endswith('\\|'):
            inner = inner[:-1]

        cells = [
            cell.strip().replace('\\|', '|')
            for cell in _UNESCAPED_PIPE_RE.split(inner)
        ]
        table_rows.append(cells)
        i += 1

    return table_rows, i - 1
def add_table_to_doc(doc, table_data):
    """Add a table to the document from parsed markdown table data.

    The table is as wide as the widest row, so rows that carry more cells
    than the header are not truncated; shorter rows leave their trailing
    cells empty. The first row is treated as the header and rendered bold.
    An empty paragraph is appended after the table as spacing.

    Args:
        doc: python-docx document to append to.
        table_data: List of rows, each a list of markdown cell strings.
    """
    if not table_data or not table_data[0]:
        return

    num_cols = max(len(row_data) for row_data in table_data)

    # Create table
    table = doc.add_table(rows=len(table_data), cols=num_cols)
    table.style = 'Table Grid'

    # Fill in cells
    for row_idx, (row, row_data) in enumerate(zip(table.rows, table_data)):
        is_header = row_idx == 0
        # Read row.cells once per row rather than once per cell access.
        cells = row.cells
        for cell, cell_text in zip(cells, row_data):
            # Clear existing content and add formatted text
            cell.text = ''
            paragraph = cell.paragraphs[0]
            # inherited_bold also reaches hyperlinks, which are not part of
            # paragraph.runs and would otherwise stay non-bold in the header.
            process_inline_formatting(paragraph, cell_text, inherited_bold=is_header)
            # Make header row bold
            if is_header:
                for run in paragraph.runs:
                    run.bold = True

    # Add some space after the table
    doc.add_paragraph()
# ATX heading: 1-6 '#' at the start of the line, followed by whitespace or
# the end of the line. '#hashtag' or seven hashes are not headings.
_ATX_HEADING_RE = re.compile(r'^(#{1,6})(?:\s+(.*))?$')
_ORDERED_ITEM_RE = re.compile(r'^\d+\.\s')
_STANDALONE_IMAGE_RE = re.compile(r'^!\[([^\]]*)\]\(([^)]+)\)$')


def process_markdown_cell(doc, content, base_path=None):
    """Process a markdown cell and add to document.

    The cell is handled line by line. Recognized constructs, in order of
    precedence: headings, fenced code blocks, blockquotes, unordered and
    ordered list items, standalone images, horizontal rules, blank lines
    (skipped), pipe tables, and regular paragraphs.

    Args:
        doc: python-docx document to append to.
        content: Markdown source of the cell.
        base_path: Directory used to resolve relative image paths.
    """
    lines = content.split('\n')
    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        heading = _ATX_HEADING_RE.match(line)
        image = _STANDALONE_IMAGE_RE.match(stripped)

        # Headers
        if heading:
            level = len(heading.group(1))
            title = (heading.group(2) or '').strip()
            p = doc.add_heading(title, level=level)
            for run in p.runs:
                run.font.name = 'Poppins'
        # Code blocks
        elif line.startswith('```'):
            lang = line[3:].strip()
            code_lines = [f'```{lang}']
            i += 1
            # Collect until the closing fence (or the end of the cell)
            while i < len(lines) and not lines[i].startswith('```'):
                code_lines.append(lines[i])
                i += 1
            code_lines.append('```')
            # Add code block with tight spacing
            add_code_block(doc, code_lines)
        # Blockquotes
        elif line.startswith('>'):
            p = doc.add_paragraph()
            p.paragraph_format.left_indent = Inches(0.5)
            process_inline_formatting(p, line[1:].strip())
        # Unordered lists
        elif stripped.startswith(('- ', '* ')):
            p = doc.add_paragraph(style='List Bullet')
            process_inline_formatting(p, stripped[2:])
        # Ordered lists
        elif _ORDERED_ITEM_RE.match(stripped):
            p = doc.add_paragraph(style='List Number')
            process_inline_formatting(p, _ORDERED_ITEM_RE.sub('', stripped))
        # Images (standalone)
        elif image:
            add_image_with_alt(doc, image.group(2), image.group(1), base_path)
        # Horizontal rule
        elif stripped in ('---', '***', '___'):
            p = doc.add_paragraph()
            p.add_run('─' * 50)
        # Empty line
        elif not stripped:
            pass  # Skip empty lines
        # Tables (lines starting with |)
        elif stripped.startswith('|'):
            table_data, end_index = parse_markdown_table(lines, i)
            if table_data:
                add_table_to_doc(doc, table_data)
            # Never move backwards, whatever index the parser reports.
            i = max(i, end_index)
        # Regular paragraph
        else:
            p = doc.add_paragraph()
            process_inline_formatting(p, line)

        i += 1
def process_code_cell(doc, source, outputs=None):
    """Write a code cell's source as a fenced ``python`` code block.

    The source is split on newlines and wrapped in triple-backtick fence
    lines before being handed to add_code_block. *outputs* is accepted but
    not used.
    """
    fenced = ['```python']
    fenced.extend(source.split('\n'))
    fenced.append('```')
    add_code_block(doc, fenced)
def convert_notebook_to_docx(notebook_path, output_path=None):
    """Convert a Jupyter notebook file into a Word document.

    Args:
        notebook_path: Path to the ``.ipynb`` file.
        output_path: Destination path; defaults to the notebook path with a
            ``.docx`` suffix.

    Returns:
        The path the document was saved to, as a ``Path``.
    """
    notebook_path = Path(notebook_path)
    output_path = (
        notebook_path.with_suffix('.docx') if output_path is None else Path(output_path)
    )

    # Read notebook
    with open(notebook_path, 'r', encoding='utf-8') as handle:
        nb = nbformat.read(handle, as_version=4)

    # Create document
    doc = Document()
    create_styles(doc)

    # Set default font for Normal style
    normal_font = doc.styles['Normal'].font
    normal_font.name = 'Poppins'
    normal_font.size = Pt(11)

    # Relative image paths are resolved against the notebook's directory
    base_path = notebook_path.parent

    # Markdown and code cells are converted; other cell types are ignored
    for cell in nb.cells:
        kind = cell.cell_type
        if kind == 'markdown':
            process_markdown_cell(doc, cell.source, base_path)
        elif kind == 'code':
            process_code_cell(doc, cell.source, cell.get('outputs', []))

    # Save document
    doc.save(output_path)
    print(f'Converted: {notebook_path} -> {output_path}')
    return output_path
def main():
    """Command-line entry point: ``<notebook_path> [output_path]``.

    Prints a usage message and exits with status 1 when no notebook path is
    given; otherwise converts the notebook.
    """
    args = sys.argv[1:]
    if not args:
        print('Usage: python notebook_to_docx.py <notebook_path> [output_path]')
        sys.exit(1)

    notebook_path = args[0]
    output_path = args[1] if len(args) > 1 else None
    convert_notebook_to_docx(notebook_path, output_path)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment