Skip to content

Instantly share code, notes, and snippets.

@chiperific
Created April 12, 2026 21:38
Show Gist options
  • Select an option

  • Save chiperific/d1cba1ea64b7367164fc9ace14f71599 to your computer and use it in GitHub Desktop.

Select an option

Save chiperific/d1cba1ea64b7367164fc9ace14f71599 to your computer and use it in GitHub Desktop.
Python script to take Blogger files and generate PDFs
#!/usr/bin/env python3
"""Convert a Google Takeout Blogger export (feed.atom) to individual PDFs."""
# Put this in the directory that contains the exported files from Google Takeout
# You can change `theme-classic.html` to whatever your theme file is called.
# Outputs all files to the `pdfs/` directory next to this script.
# Relies on playwright and chromium: run `python3 -m playwright install chromium`
# Command: `python3 blogger_to_pdf.py`. Use `--test` flag to only generate the first PDF
import html
import re
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from pathlib import Path
# Every input and output lives in the same directory as this script.
_SCRIPT_DIR = Path(__file__).parent

# Atom feed to convert.
FEED = _SCRIPT_DIR / "feed.atom"
# Blogger theme file whose <style> block provides the base CSS.
THEME = _SCRIPT_DIR / "theme-classic.html"
# Directory the generated PDFs are written to.
OUT_DIR = _SCRIPT_DIR / "pdfs"

# Prefix -> namespace URI map passed to the ElementTree find* calls.
NS = dict(
    atom="http://www.w3.org/2005/Atom",
    blogger="http://schemas.google.com/blogger/2018",
)
def prepare_css(theme_path: Path) -> str:
    """Build the stylesheet embedded in every rendered post.

    The first ``<style>`` block of the Blogger theme is extracted, Blogger
    template markup is stripped from it, blogblog.com background images are
    replaced with ``none``, and a fixed set of PDF layout overrides is
    appended.

    Args:
        theme_path: Path to the Blogger theme HTML file (read as UTF-8).

    Returns:
        The cleaned theme CSS followed by the PDF layout overrides. When the
        theme contains no ``<style>`` block, only the overrides are returned.

    Raises:
        OSError: If the theme file cannot be read.
    """
    source = theme_path.read_text(encoding="utf-8")

    # Extract the first <style> block.
    match = re.search(r"<style[^>]*>(.*?)</style>", source, re.DOTALL)
    css = match.group(1) if match else ""

    # Drop <ItemPage>...</ItemPage> conditional blocks together with their
    # contents.
    css = re.sub(r"<ItemPage>.*?</ItemPage>", "", css, flags=re.DOTALL)

    # Strip any remaining Blogger template tags.
    css = re.sub(r"<[^>]+>", "", css)

    # Remove blogblog.com background image references (dead decorative GIFs).
    # Quoted URLs, leading whitespace and https are accepted as well, so that
    # url("..."), url('...') and https:// variants are neutralised too.
    css = re.sub(
        r"""url\(\s*["']?https?://www\.blogblog\.com/[^)]+\)""",
        "none",
        css,
    )

    # Append print/layout overrides.
    css += """
/* --- PDF layout overrides --- */
@page { margin: 15mm; size: A4; }
body {
max-width: 680px;
margin: 0 auto;
padding: 20px;
font-size: 11pt;
line-height: 1.6;
text-align: left;
background: #fff;
}
h1.post-title {
font-family: "Lucida Grande", "Trebuchet MS", sans-serif;
font-size: 22pt;
color: #f63;
letter-spacing: -0.5px;
margin: 0 0 6px 0;
}
.post-meta {
font-size: 9pt;
color: #999;
margin-bottom: 18px;
padding-bottom: 10px;
border-bottom: 1px solid #ddd;
}
.post-body {
line-height: 1.6;
}
img {
max-width: 100%;
height: auto;
}
.separator {
clear: both;
margin: 8px 0;
}
.tr-caption-container {
display: block;
margin: 0 auto;
text-align: center;
}
.tr-caption {
font-size: 9pt;
color: #999;
}
a {
word-break: break-all;
}
"""
    return css
def parse_posts(feed_path: Path) -> list[dict]:
    """Parse the Atom feed and return its live posts, oldest first.

    Only entries whose ``blogger:type`` is ``POST`` and whose
    ``blogger:status`` is ``LIVE`` are kept.

    Args:
        feed_path: Path to the Atom feed file.

    Returns:
        A list of dicts sorted by publication time, each with the keys
        ``title``, ``content_html``, ``dt`` (timezone-aware ``datetime``),
        ``author``, ``labels`` (list of category terms) and ``slug``.

    Raises:
        xml.etree.ElementTree.ParseError: If the feed is not well-formed XML.
        ValueError: If a ``published`` value is not an ISO 8601 timestamp.
    """
    root = ET.parse(feed_path).getroot()
    posts = []

    for entry in root.findall("atom:entry", NS):
        if entry.findtext("blogger:type", namespaces=NS) != "POST":
            continue
        if entry.findtext("blogger:status", namespaces=NS) != "LIVE":
            continue

        title = entry.findtext("atom:title", namespaces=NS) or "Untitled"

        content_el = entry.find("atom:content", NS)
        # NOTE(review): ElementTree has already decoded XML entities here, so
        # this is a second decoding pass - confirm the export needs it.
        content_html = html.unescape(content_el.text or "") if content_el is not None else ""

        published = entry.findtext("atom:published", namespaces=NS) or "1970-01-01T00:00:00Z"
        # fromisoformat() on older Pythons rejects a trailing "Z".
        dt = datetime.fromisoformat(published.replace("Z", "+00:00"))
        if dt.tzinfo is None:
            # Naive and aware datetimes cannot be compared, so a single
            # offset-less timestamp would make the sort below raise TypeError.
            dt = dt.replace(tzinfo=timezone.utc)

        author = entry.findtext("atom:author/atom:name", namespaces=NS) or ""
        labels = [
            cat.get("term", "")
            for cat in entry.findall("atom:category", NS)
            if cat.get("term")
        ]

        blogger_path = entry.findtext("blogger:filename", namespaces=NS) or ""
        if blogger_path:
            # Last path segment, minus a trailing ".html" extension only.
            slug = blogger_path.rstrip("/").split("/")[-1].removesuffix(".html")
        else:
            slug = re.sub(r"[^a-z0-9]+", "-", title.lower()).strip("-")[:60]
        # A title without ASCII letters or digits would otherwise give an
        # empty slug and a filename consisting of the date alone.
        slug = slug or "untitled"

        posts.append({
            "title": title,
            "content_html": content_html,
            "dt": dt,
            "author": author,
            "labels": labels,
            "slug": slug,
        })

    posts.sort(key=lambda p: p["dt"])
    return posts
def make_output_path(post: dict, out_dir: Path) -> Path:
    """Return the PDF path for *post* inside *out_dir*.

    The filename is the post's date formatted as ``YYYY-MM-DD``, an
    underscore, the post's slug, and the ``.pdf`` extension.
    """
    filename = "{:%Y-%m-%d}_{}.pdf".format(post["dt"], post["slug"])
    return out_dir.joinpath(filename)
def build_html(post: dict, css: str) -> str:
    """Render one post as a standalone HTML document.

    Args:
        post: Dict with the keys ``title``, ``author``, ``dt`` (a
            ``datetime``), ``labels`` (list of strings) and ``content_html``.
        css: Stylesheet text placed verbatim inside a ``<style>`` element.

    Returns:
        A complete HTML5 document. Title, author and labels are
        HTML-escaped; ``content_html`` is inserted unescaped because it is
        already markup.
    """
    title_escaped = html.escape(post["title"])
    author_escaped = html.escape(post["author"])

    # "%-d" (day without zero padding) is a platform-specific strftime
    # extension that raises ValueError on Windows, so the day and year are
    # taken from the datetime attributes instead.
    dt = post["dt"]
    date_str = f"{dt.strftime('%B')} {dt.day}, {dt.year}"

    labels_html = ""
    if post["labels"]:
        labels_escaped = ", ".join(html.escape(label) for label in post["labels"])
        labels_html = f' &middot; <span class="post-labels">{labels_escaped}</span>'

    body_html = post["content_html"]
    return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{title_escaped}</title>
<style>{css}</style>
</head>
<body>
<h1 class="post-title">{title_escaped}</h1>
<div class="post-meta">
{date_str} &middot; {author_escaped}{labels_html}
</div>
<div class="post-body">
{body_html}
</div>
</body>
</html>
"""
def _claim_output_path(path: Path, claimed: set) -> Path:
    """Return *path*, or a numbered variant if it was already claimed this run."""
    candidate = path
    counter = 2
    while candidate in claimed:
        candidate = path.with_name(f"{path.stem}-{counter}{path.suffix}")
        counter += 1
    claimed.add(candidate)
    return candidate


def main():
    """Convert every live post in the feed to a PDF in ``OUT_DIR``.

    Posts whose PDF already exists are skipped, so the script can be re-run
    after a partial conversion. With ``--test`` only the first post is
    considered. Per-post rendering failures are printed, collected, and
    written to ``conversion_errors.log`` in ``OUT_DIR``.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--test", action="store_true", help="Render only the first post")
    args = parser.parse_args()

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    print("Preparing CSS...")
    css = prepare_css(THEME)
    print("Parsing feed.atom...")
    posts = parse_posts(FEED)
    print(f"Found {len(posts)} live posts.")

    if args.test:
        posts = posts[:1]
        print("Test mode: rendering first post only.")

    # Determine which posts need rendering. Two posts that map to the same
    # filename would otherwise overwrite each other, so each path is claimed
    # once per run and later duplicates get a "-2", "-3", ... suffix.
    to_render = []
    skipped = 0
    claimed = set()
    for post in posts:
        out_path = _claim_output_path(make_output_path(post, OUT_DIR), claimed)
        if out_path.exists():
            skipped += 1
        else:
            to_render.append((post, out_path))

    if skipped:
        print(f"Skipping {skipped} already-rendered posts.")
    print(f"Rendering {len(to_render)} posts...")
    if not to_render:
        print("Nothing to do.")
        return

    # Imported late so that a run with nothing to render never needs
    # playwright.
    from playwright.sync_api import sync_playwright

    errors = []
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            for i, (post, out_path) in enumerate(to_render, 1):
                pct = i / len(to_render) * 100
                print(f"[{i}/{len(to_render)}] {pct:.0f}% — {post['title'][:70]}")
                try:
                    html_doc = build_html(post, css)
                    page.set_content(html_doc, wait_until="networkidle")
                    page.pdf(path=str(out_path), format="A4")
                except Exception as e:
                    # Best effort: record the failure and continue with the
                    # remaining posts.
                    print(f" ERROR: {e}")
                    errors.append((post["title"], str(e)))
        finally:
            # Close the browser even when the loop is interrupted.
            browser.close()

    print(f"\nDone. PDFs written to: {OUT_DIR}")
    if errors:
        error_log = OUT_DIR / "conversion_errors.log"
        error_log.write_text(
            "\n".join(f"{title}\n {err}\n" for title, err in errors),
            encoding="utf-8",
        )
        print(f"{len(errors)} errors logged to {error_log}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment