Skip to content

Instantly share code, notes, and snippets.

@abhishekmishra
Last active January 11, 2022 07:05
Show Gist options
  • Save abhishekmishra/4b3707a05de728f4052e84c965112695 to your computer and use it in GitHub Desktop.
Save abhishekmishra/4b3707a05de728f4052e84c965112695 to your computer and use it in GitHub Desktop.
Import Substack Newsletter/Blog Posts into Pelican Posts Folder
#!/usr/bin/env python3
# UNLICENSE
#
# This is free and unencumbered software released into the public domain.
# Anyone is free to copy, modify, publish, use, compile, sell, or
# distribute this software, either in source code form or as a compiled
# binary, for any purpose, commercial or non-commercial, and by any
# means.
# In jurisdictions that recognize copyright laws, the author or authors
# of this software dedicate any and all copyright interest in the
# software to the public domain. We make this dedication for the benefit
# of the public at large and to the detriment of our heirs and
# successors. We intend this dedication to be an overt act of
# relinquishment in perpetuity of all present and future rights to this
# software under copyright law.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# For more information, please refer to <http://unlicense.org/>
## Author: Abhishek Mishra <[email protected]>
## Date: 29th Dec 2021
##
## Program to process newsletter export downloaded from substack
## and import them as markdown files in pelican content folder.
## It should also work with other static-site generators.
##
## Download the exported posts zip from Substack, and unzip it into a
## folder on disk.
##
## For the purposes of this script, I extracted the posts zip
## file at ./ext_work/<newsletter>_import/.
## If this folder is different for you, change the definition
## of NEWSLETTER_IMPORT_DIR
import csv
import os
from pathlib import Path
# --- User configuration: review and edit these constants before running ---

# The <newsletter> part of your newsletter's url,
# https://<newsletter>.substack.com
NEWSLETTER_NAME = "artnoob"

# Base directory that both the import folder and the target folder
# below are joined onto.
HOMEPAGE_PROJ_DIR = "."

# Folder where the substack export is extracted, and the posts.csv
# file inside it.
NEWSLETTER_IMPORT_DIR = os.path.join(
    HOMEPAGE_PROJ_DIR,
    "ext_work/{}_import".format(NEWSLETTER_NAME),
)
NEWSLETTER_POSTS_CSV = os.path.join(NEWSLETTER_IMPORT_DIR, "posts.csv")

# Folder where the pelican posts need to be written.
NEWSLETTER_POSTS_TARGET_DIR = os.path.join(
    HOMEPAGE_PROJ_DIR,
    "content/posts/{}".format(NEWSLETTER_NAME),
)
# Load the post metadata from posts.csv, keeping only published posts.
# csv.DictReader keys every row by the header row and skips blank
# lines. A row shorter than the header gets None for its missing
# columns, so a truncated row that lacks is_published is filtered out
# here instead of raising KeyError. A file whose header has no
# is_published column at all still fails loudly with KeyError.
with open(NEWSLETTER_POSTS_CSV, newline="", encoding="utf-8") as csvfile:
    substack_posts = [
        row
        for row in csv.DictReader(csvfile)
        if row["is_published"] == "true"
    ]

# Record where each post's exported html file lives:
# <import dir>/posts/<post_id>.html
for post_item in substack_posts:
    post_item["post_html"] = os.path.join(
        NEWSLETTER_IMPORT_DIR, "posts", post_item["post_id"] + ".html"
    )

print("Found {} published posts".format(len(substack_posts)))
# Convert every published post into a markdown file in the target folder.
# The target file is named after the source html file, with a ".md"
# suffix. It holds a frontmatter section built from the post's csv
# metadata, followed by the content of the html file. Existing target
# files are never overwritten.
for post in substack_posts:
    src_path = Path(post["post_html"])
    target_path = Path(NEWSLETTER_POSTS_TARGET_DIR, src_path.name)
    target_path = target_path.with_suffix(".md")

    # Guard clause: leave posts written by an earlier run untouched.
    if target_path.exists():
        print("WARN: {} already exists. Will not overwrite!".format(target_path))
        continue

    # Read the html before creating the target, so a missing or unreadable
    # export file cannot leave behind a frontmatter-only post that later
    # runs would refuse to overwrite.
    post_body = src_path.read_text(encoding="utf8")

    # Frontmatter: one "key: value" line per metadata entry, fenced by
    # "---" lines.
    frontmatter = ["---\n"]
    for key, val in post.items():
        frontmatter.append("{}: {}\n".format(key, val))
        if key == "post_date":
            # Also emit the post date under the "date" key.
            frontmatter.append("{}: {}\n".format("date", val))
    frontmatter.append("---\n\n")

    # The target folder may not exist yet on a first run.
    target_path.parent.mkdir(parents=True, exist_ok=True)
    with open(target_path, "w", encoding="utf8") as outf:
        outf.writelines(frontmatter)
        outf.write(post_body)
    print("wrote {}".format(target_path))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment