Created
April 12, 2026 21:38
-
-
Save chiperific/d1cba1ea64b7367164fc9ace14f71599 to your computer and use it in GitHub Desktop.
Python script to take Blogger files and generate PDFs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """Convert a Google Takeout Blogger export (feed.atom) to individual PDFs.""" | |
| # Put this in the directory that contains the exported files from Google Takeout | |
| # You can change `theme-classic.html` to whatever your theme file is called. | |
| # Outputs all files to the `/pdfs` directory. | |
| # Relies on playwright and chromium: run `python3 -m playwright install chromium` | |
| # Command: `python3 blogger_to_pdf.py`. Use `--test` flag to only generate the first PDF | |
| import html | |
| import re | |
| import xml.etree.ElementTree as ET | |
| from datetime import datetime, timezone | |
| from pathlib import Path | |
# Input/output locations, all resolved relative to this script's directory.
FEED = Path(__file__).parent / "feed.atom"            # Google Takeout Blogger export
THEME = Path(__file__).parent / "theme-classic.html"  # Blogger theme file (CSS source)
OUT_DIR = Path(__file__).parent / "pdfs"              # generated PDFs are written here

# XML namespace map for ElementTree lookups: standard Atom plus Blogger's
# export extensions (blogger:type, blogger:status, blogger:filename, ...).
NS = {
    "atom": "http://www.w3.org/2005/Atom",
    "blogger": "http://schemas.google.com/blogger/2018",
}
def prepare_css(theme_path: Path) -> str:
    """Extract usable CSS from a Blogger theme file and append PDF print overrides.

    Pulls the contents of the theme's ``<style>`` block, strips Blogger's
    template conditionals/tags and dead blogblog.com background images,
    then appends layout rules tuned for A4 PDF output.
    """
    raw = theme_path.read_text(encoding="utf-8")

    # Grab the inside of the first <style>...</style> block; fall back to
    # an empty stylesheet if the theme has none.
    style_match = re.search(r"<style[^>]*>(.*?)</style>", raw, re.DOTALL)
    extracted = style_match.group(1) if style_match else ""

    # Cleanup passes: drop <ItemPage> conditional blocks wholesale, strip any
    # remaining Blogger template tags, and neutralize dead decorative GIFs.
    extracted = re.sub(r"<ItemPage>.*?</ItemPage>", "", extracted, flags=re.DOTALL)
    extracted = re.sub(r"<[^>]+>", "", extracted)
    extracted = re.sub(r"url\(http://www\.blogblog\.com/[^)]+\)", "none", extracted)

    overrides = """
/* --- PDF layout overrides --- */
@page { margin: 15mm; size: A4; }
body {
  max-width: 680px;
  margin: 0 auto;
  padding: 20px;
  font-size: 11pt;
  line-height: 1.6;
  text-align: left;
  background: #fff;
}
h1.post-title {
  font-family: "Lucida Grande", "Trebuchet MS", sans-serif;
  font-size: 22pt;
  color: #f63;
  letter-spacing: -0.5px;
  margin: 0 0 6px 0;
}
.post-meta {
  font-size: 9pt;
  color: #999;
  margin-bottom: 18px;
  padding-bottom: 10px;
  border-bottom: 1px solid #ddd;
}
.post-body {
  line-height: 1.6;
}
img {
  max-width: 100%;
  height: auto;
}
.separator {
  clear: both;
  margin: 8px 0;
}
.tr-caption-container {
  display: block;
  margin: 0 auto;
  text-align: center;
}
.tr-caption {
  font-size: 9pt;
  color: #999;
}
a {
  word-break: break-all;
}
"""
    return extracted + overrides
def parse_posts(feed_path: Path) -> list[dict]:
    """Parse a Blogger feed.atom export into a list of post dicts.

    Only entries that are blog POSTs with LIVE status are kept; the export
    also contains pages, comments, drafts, and settings entries, which are
    skipped. Each returned dict has keys: title, content_html (unescaped
    HTML string), dt (timezone-aware datetime), author, labels (list of
    category terms), and slug. Posts are sorted in ascending publish order.
    """
    tree = ET.parse(feed_path)
    root = tree.getroot()
    posts = []
    for entry in root.findall("atom:entry", NS):
        if entry.findtext("blogger:type", namespaces=NS) != "POST":
            continue
        if entry.findtext("blogger:status", namespaces=NS) != "LIVE":
            continue
        title = entry.findtext("atom:title", namespaces=NS) or "Untitled"
        content_el = entry.find("atom:content", NS)
        # Post bodies are stored HTML-escaped inside the Atom element.
        content_html = html.unescape(content_el.text or "") if content_el is not None else ""
        published = entry.findtext("atom:published", namespaces=NS) or "1970-01-01T00:00:00Z"
        # fromisoformat (before 3.11) rejects a trailing "Z"; map it to an offset.
        dt = datetime.fromisoformat(published.replace("Z", "+00:00"))
        author = entry.findtext("atom:author/atom:name", namespaces=NS) or ""
        labels = [
            cat.get("term", "")
            for cat in entry.findall("atom:category", NS)
            if cat.get("term")
        ]
        blogger_path = entry.findtext("blogger:filename", namespaces=NS) or ""
        if blogger_path:
            slug = blogger_path.rstrip("/").split("/")[-1].replace(".html", "")
        else:
            slug = re.sub(r"[^a-z0-9]+", "-", title.lower()).strip("-")[:60]
        if not slug:
            # Titles with no ASCII alphanumerics (all punctuation, or fully
            # non-Latin) would otherwise yield an empty slug and colliding
            # output names like "2020-01-01_.pdf".
            slug = "post"
        posts.append({
            "title": title,
            "content_html": content_html,
            "dt": dt,
            "author": author,
            "labels": labels,
            "slug": slug,
        })
    posts.sort(key=lambda p: p["dt"])
    return posts
def make_output_path(post: dict, out_dir: Path) -> Path:
    """Return the target PDF path: ``<out_dir>/YYYY-MM-DD_<slug>.pdf``."""
    stamp = post["dt"].strftime("%Y-%m-%d")
    filename = f"{stamp}_{post['slug']}.pdf"
    return out_dir / filename
def build_html(post: dict, css: str) -> str:
    """Render a post dict into a standalone HTML document string.

    The title, author, and labels are HTML-escaped; ``content_html`` is
    trusted Blogger output and is inserted verbatim. ``css`` is embedded
    in a <style> tag in the document head.
    """
    title_escaped = html.escape(post["title"])
    author_escaped = html.escape(post["author"])
    dt = post["dt"]
    # strftime("%-d") (day without zero padding) is a glibc extension that
    # is invalid on Windows ("%#d" there); build the day portably instead.
    # Output is identical to the old "%B %-d, %Y" on platforms where it worked.
    date_str = f"{dt.strftime('%B')} {dt.day}, {dt.year}"
    labels_html = ""
    if post["labels"]:
        labels_escaped = ", ".join(html.escape(l) for l in post["labels"])
        labels_html = f' · <span class="post-labels">{labels_escaped}</span>'
    return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{title_escaped}</title>
<style>{css}</style>
</head>
<body>
<h1 class="post-title">{title_escaped}</h1>
<div class="post-meta">
{date_str} · {author_escaped}{labels_html}
</div>
<div class="post-body">
{post["content_html"]}
</div>
</body>
</html>
"""
def main():
    """CLI entry point: parse the Blogger export and render one PDF per live post.

    Skips posts whose PDF already exists, so the script can be re-run to
    resume after an interruption. Failures are collected and written to
    ``conversion_errors.log`` instead of aborting the whole run.
    """
    import argparse

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--test", action="store_true", help="Render only the first post")
    options = arg_parser.parse_args()

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    print("Preparing CSS...")
    css = prepare_css(THEME)

    print("Parsing feed.atom...")
    posts = parse_posts(FEED)
    print(f"Found {len(posts)} live posts.")

    if options.test:
        posts = posts[:1]
        print("Test mode: rendering first post only.")

    # Partition posts into already-rendered (skipped) and still-pending work.
    pending = []
    already_done = 0
    for post in posts:
        target = make_output_path(post, OUT_DIR)
        if target.exists():
            already_done += 1
        else:
            pending.append((post, target))

    if already_done:
        print(f"Skipping {already_done} already-rendered posts.")
    print(f"Rendering {len(pending)} posts...")
    if not pending:
        print("Nothing to do.")
        return

    # Imported lazily so parsing and CSS prep work even without playwright.
    from playwright.sync_api import sync_playwright

    failures = []
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        total = len(pending)
        for index, (post, target) in enumerate(pending, 1):
            percent = index / total * 100
            print(f"[{index}/{total}] {percent:.0f}% — {post['title'][:70]}")
            try:
                # networkidle lets remote images finish loading before printing.
                page.set_content(build_html(post, css), wait_until="networkidle")
                page.pdf(path=str(target), format="A4")
            except Exception as exc:
                print(f" ERROR: {exc}")
                failures.append((post["title"], str(exc)))
        browser.close()

    print(f"\nDone. PDFs written to: {OUT_DIR}")
    if failures:
        error_log = OUT_DIR / "conversion_errors.log"
        error_log.write_text(
            "\n".join(f"{title}\n {err}\n" for title, err in failures),
            encoding="utf-8",
        )
        print(f"{len(failures)} errors logged to {error_log}")
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment