Created
April 12, 2026 21:38
-
-
Save chiperific/d1cba1ea64b7367164fc9ace14f71599 to your computer and use it in GitHub Desktop.
Python script to take Blogger files and generate PDFs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """Convert a Google Takeout Blogger export (feed.atom) to individual PDFs.""" | |
| # Put this in the directory that contains the exported files from Google Takeout | |
| # You can change `theme-classic.html` to whatever your theme file is called. | |
| # Outputs all files to the `/pdfs` directory. | |
| # Relies on playwright and chromium: run `python3 -m playwright install chromium` | |
| # Command: `python3 blogger_to_pdf.py`. Use `--test` flag to only generate the first PDF | |
| import html | |
| import re | |
| import xml.etree.ElementTree as ET | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
# Input/output locations, all resolved relative to this script's directory.
FEED = Path(__file__).parent / "feed.atom"            # Google Takeout Blogger export
THEME = Path(__file__).parent / "theme-classic.html"  # Blogger theme file (CSS source)
OUT_DIR = Path(__file__).parent / "pdfs"              # generated PDFs are written here

# XML namespace map for ElementTree lookups: standard Atom plus Blogger's
# export extensions (blogger:type, blogger:status, blogger:filename, ...).
NS = {
    "atom": "http://www.w3.org/2005/Atom",
    "blogger": "http://schemas.google.com/blogger/2018",
}
def prepare_css(theme_path: Path) -> str:
    """Extract usable CSS from a Blogger theme file and append PDF print overrides.

    Pulls the contents of the theme's ``<style>`` block, strips Blogger's
    template conditionals/tags and dead blogblog.com background images,
    then appends layout rules tuned for A4 PDF output.
    """
    raw = theme_path.read_text(encoding="utf-8")

    # Grab the inside of the first <style>...</style> block; fall back to
    # an empty stylesheet if the theme has none.
    style_match = re.search(r"<style[^>]*>(.*?)</style>", raw, re.DOTALL)
    extracted = style_match.group(1) if style_match else ""

    # Cleanup passes: drop <ItemPage> conditional blocks wholesale, strip any
    # remaining Blogger template tags, and neutralize dead decorative GIFs.
    extracted = re.sub(r"<ItemPage>.*?</ItemPage>", "", extracted, flags=re.DOTALL)
    extracted = re.sub(r"<[^>]+>", "", extracted)
    extracted = re.sub(r"url\(http://www\.blogblog\.com/[^)]+\)", "none", extracted)

    overrides = """
/* --- PDF layout overrides --- */
@page { margin: 15mm; size: A4; }
body {
  max-width: 680px;
  margin: 0 auto;
  padding: 20px;
  font-size: 11pt;
  line-height: 1.6;
  text-align: left;
  background: #fff;
}
h1.post-title {
  font-family: "Lucida Grande", "Trebuchet MS", sans-serif;
  font-size: 22pt;
  color: #f63;
  letter-spacing: -0.5px;
  margin: 0 0 6px 0;
}
.post-meta {
  font-size: 9pt;
  color: #999;
  margin-bottom: 18px;
  padding-bottom: 10px;
  border-bottom: 1px solid #ddd;
}
.post-body {
  line-height: 1.6;
}
img {
  max-width: 100%;
  height: auto;
}
.separator {
  clear: both;
  margin: 8px 0;
}
.tr-caption-container {
  display: block;
  margin: 0 auto;
  text-align: center;
}
.tr-caption {
  font-size: 9pt;
  color: #999;
}
a {
  word-break: break-all;
}
"""
    return extracted + overrides
def parse_posts(feed_path: Path) -> list[dict]:
    """Parse a Blogger feed.atom export into a list of post dicts.

    Only entries that are blog POSTs with LIVE status are kept; the export
    also contains pages, comments, drafts, and settings entries, which are
    skipped. Each returned dict has keys: title, content_html (unescaped
    HTML string), dt (timezone-aware datetime), author, labels (list of
    category terms), and slug. Posts are sorted in ascending publish order.
    """
    tree = ET.parse(feed_path)
    root = tree.getroot()
    posts = []
    for entry in root.findall("atom:entry", NS):
        if entry.findtext("blogger:type", namespaces=NS) != "POST":
            continue
        if entry.findtext("blogger:status", namespaces=NS) != "LIVE":
            continue
        title = entry.findtext("atom:title", namespaces=NS) or "Untitled"
        content_el = entry.find("atom:content", NS)
        # Post bodies are stored HTML-escaped inside the Atom element.
        content_html = html.unescape(content_el.text or "") if content_el is not None else ""
        published = entry.findtext("atom:published", namespaces=NS) or "1970-01-01T00:00:00Z"
        # fromisoformat (before 3.11) rejects a trailing "Z"; map it to an offset.
        dt = datetime.fromisoformat(published.replace("Z", "+00:00"))
        author = entry.findtext("atom:author/atom:name", namespaces=NS) or ""
        labels = [
            cat.get("term", "")
            for cat in entry.findall("atom:category", NS)
            if cat.get("term")
        ]
        blogger_path = entry.findtext("blogger:filename", namespaces=NS) or ""
        if blogger_path:
            slug = blogger_path.rstrip("/").split("/")[-1].replace(".html", "")
        else:
            slug = re.sub(r"[^a-z0-9]+", "-", title.lower()).strip("-")[:60]
        if not slug:
            # Titles with no ASCII alphanumerics (all punctuation, or fully
            # non-Latin) would otherwise yield an empty slug and colliding
            # output names like "2020-01-01_.pdf".
            slug = "post"
        posts.append({
            "title": title,
            "content_html": content_html,
            "dt": dt,
            "author": author,
            "labels": labels,
            "slug": slug,
        })
    posts.sort(key=lambda p: p["dt"])
    return posts
def make_output_path(post: dict, out_dir: Path) -> Path:
    """Return the target PDF path: ``<out_dir>/YYYY-MM-DD_<slug>.pdf``."""
    stamp = post["dt"].strftime("%Y-%m-%d")
    filename = f"{stamp}_{post['slug']}.pdf"
    return out_dir / filename
def build_html(post: dict, css: str) -> str:
    """Render a post dict into a standalone HTML document string.

    The title, author, and labels are HTML-escaped; ``content_html`` is
    trusted Blogger output and is inserted verbatim. ``css`` is embedded
    in a <style> tag in the document head.
    """
    title_escaped = html.escape(post["title"])
    author_escaped = html.escape(post["author"])
    dt = post["dt"]
    # strftime("%-d") (day without zero padding) is a glibc extension that
    # is invalid on Windows ("%#d" there); build the day portably instead.
    # Output is identical to the old "%B %-d, %Y" on platforms where it worked.
    date_str = f"{dt.strftime('%B')} {dt.day}, {dt.year}"
    labels_html = ""
    if post["labels"]:
        labels_escaped = ", ".join(html.escape(l) for l in post["labels"])
        labels_html = f' · <span class="post-labels">{labels_escaped}</span>'
    return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{title_escaped}</title>
<style>{css}</style>
</head>
<body>
<h1 class="post-title">{title_escaped}</h1>
<div class="post-meta">
{date_str} · {author_escaped}{labels_html}
</div>
<div class="post-body">
{post["content_html"]}
</div>
</body>
</html>
"""
def main():
    """CLI entry point: parse the Blogger export and render one PDF per live post.

    Skips posts whose PDF already exists, so the script can be re-run to
    resume after an interruption. Failures are collected and written to
    ``conversion_errors.log`` instead of aborting the whole run.
    """
    import argparse

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--test", action="store_true", help="Render only the first post")
    options = arg_parser.parse_args()

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    print("Preparing CSS...")
    css = prepare_css(THEME)

    print("Parsing feed.atom...")
    posts = parse_posts(FEED)
    print(f"Found {len(posts)} live posts.")

    if options.test:
        posts = posts[:1]
        print("Test mode: rendering first post only.")

    # Partition posts into already-rendered (skipped) and still-pending work.
    pending = []
    already_done = 0
    for post in posts:
        target = make_output_path(post, OUT_DIR)
        if target.exists():
            already_done += 1
        else:
            pending.append((post, target))

    if already_done:
        print(f"Skipping {already_done} already-rendered posts.")
    print(f"Rendering {len(pending)} posts...")
    if not pending:
        print("Nothing to do.")
        return

    # Imported lazily so parsing and CSS prep work even without playwright.
    from playwright.sync_api import sync_playwright

    failures = []
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        total = len(pending)
        for index, (post, target) in enumerate(pending, 1):
            percent = index / total * 100
            print(f"[{index}/{total}] {percent:.0f}% — {post['title'][:70]}")
            try:
                # networkidle lets remote images finish loading before printing.
                page.set_content(build_html(post, css), wait_until="networkidle")
                page.pdf(path=str(target), format="A4")
            except Exception as exc:
                print(f" ERROR: {exc}")
                failures.append((post["title"], str(exc)))
        browser.close()

    print(f"\nDone. PDFs written to: {OUT_DIR}")
    if failures:
        error_log = OUT_DIR / "conversion_errors.log"
        error_log.write_text(
            "\n".join(f"{title}\n {err}\n" for title, err in failures),
            encoding="utf-8",
        )
        print(f"{len(failures)} errors logged to {error_log}")
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment