abhishekmishra · January 11, 2022 07:05
diff --git a/substack_posts_import.py b/substack_posts_import.py
 #!/usr/bin/env python3

 # UNLICENSE
 #
 # This is free and unencumbered software released into the public domain.

 # Anyone is free to copy, modify, publish, use, compile, sell, or
 # distribute this software, either in source code form or as a compiled
 # binary, for any purpose, commercial or non-commercial, and by any
 # means.

 # In jurisdictions that recognize copyright laws, the author or authors
 # of this software dedicate any and all copyright interest in the
 # software to the public domain. We make this dedication for the benefit
 # of the public at large and to the detriment of our heirs and
 # successors. We intend this dedication to be an overt act of
 # relinquishment in perpetuity of all present and future rights to this
 # software under copyright law.

 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 # IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 # OTHER DEALINGS IN THE SOFTWARE.

 # For more information, please refer to <http://unlicense.org/>

 ## Author: Abhishek Mishra <[email protected]>
 ## Date: 29th Dec 2021
 ##
 ## Program to process newsletter export downloaded from substack
 ## and import them as markdown files in pelican content folder.
 ## It should also work with other static-site generators.
 ##
 ## Download the exported posts zip from substack, and unzip it into a
 ## folder on disk.
 ##
 ## For the purposes of this script, I extracted the posts zip
 ## file at ./ext_work/<newsletter>_import/.
 ## If this folder is different for you, change the definition
 ## of NEWSLETTER_IMPORT_DIR

 import csv
 import os
 from pathlib import Path

 # USER CONFIGURATION -- review and adjust before running

 # Name of the newsletter, i.e. the <newsletter> part of the
 # https://<newsletter>.substack.com url.
 NEWSLETTER_NAME = "artnoob"

 # Root of the site project; the folders below are resolved against it.
 HOMEPAGE_PROJ_DIR = "."

 # Destination folder for the generated posts.
 NEWSLETTER_POSTS_TARGET_DIR = os.path.join(
    HOMEPAGE_PROJ_DIR,
    "content/posts/{}".format(NEWSLETTER_NAME),
 )

 # Folder into which the substack export zip was extracted ...
 NEWSLETTER_IMPORT_DIR = os.path.join(
    HOMEPAGE_PROJ_DIR,
    "ext_work/{}_import".format(NEWSLETTER_NAME),
 )
 # ... and the posts.csv file inside it.
 NEWSLETTER_POSTS_CSV = os.path.join(NEWSLETTER_IMPORT_DIR, "posts.csv")

 # Load the metadata of all published posts from the posts.csv file.
 #
 # csv.DictReader takes the column names from the first row and skips
 # blank lines, so a stray empty line in the export cannot produce an
 # empty record that would break the "is_published" lookup below.
 with open(NEWSLETTER_POSTS_CSV, newline="", encoding="utf-8") as csvfile:
    # Keep only the rows whose is_published column is exactly "true".
    substack_posts = [
        row
        for row in csv.DictReader(csvfile)
        if row["is_published"] == "true"
    ]

 # Remember, per post, the path of its exported html file:
 # <NEWSLETTER_IMPORT_DIR>/posts/<post_id>.html
 for v in substack_posts:
    v["post_html"] = os.path.join(
        NEWSLETTER_IMPORT_DIR, "posts", v["post_id"] + ".html"
    )
 print("Found {} published posts".format(len(substack_posts)))

 # Write one markdown file per loaded post into the target folder.
 #
 # The target file takes the name of the exported html file with an .md
 # suffix.  It starts with a front matter block listing every field of
 # the post (plus a "date" entry repeating "post_date"), followed by the
 # content of the html file.  Existing target files are never overwritten.

 # open() does not create missing folders, so make sure the target exists.
 os.makedirs(NEWSLETTER_POSTS_TARGET_DIR, exist_ok=True)

 for post in substack_posts:
    src_path = Path(post["post_html"])
    target_path = Path(NEWSLETTER_POSTS_TARGET_DIR, src_path.name)
    target_path = target_path.with_suffix(".md")

    if target_path.exists():
        print("WARN: {} already exists. Will not overwrite!".format(target_path))
        continue

    # Read the body BEFORE creating the target file: if the html file is
    # missing or unreadable we fail here, instead of leaving behind a
    # front-matter-only file that later runs would refuse to overwrite.
    with open(src_path, "r", encoding="utf8") as inf:
        body = inf.read()

    front_matter = ["---\n"]
    for key, val in post.items():
        front_matter.append("{}: {}\n".format(key, val))
        if key == "post_date":
            # Repeat the post date under the "date" key as well
            # (presumably the key the site generator reads -- confirm).
            front_matter.append("{}: {}\n".format("date", val))
    front_matter.append("---\n\n")

    with open(target_path, "w", encoding="utf8") as outf:
        outf.writelines(front_matter)
        outf.write(body)
    print("wrote {}".format(target_path))
	#!/usr/bin/env python3

	# UNLICENSE
	#
	# This is free and unencumbered software released into the public domain.

	# Anyone is free to copy, modify, publish, use, compile, sell, or
	# distribute this software, either in source code form or as a compiled
	# binary, for any purpose, commercial or non-commercial, and by any
	# means.

	# In jurisdictions that recognize copyright laws, the author or authors
	# of this software dedicate any and all copyright interest in the
	# software to the public domain. We make this dedication for the benefit
	# of the public at large and to the detriment of our heirs and
	# successors. We intend this dedication to be an overt act of
	# relinquishment in perpetuity of all present and future rights to this
	# software under copyright law.

	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
	# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
	# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
	# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
	# OTHER DEALINGS IN THE SOFTWARE.

	# For more information, please refer to <http://unlicense.org/>

	## Author: Abhishek Mishra <[email protected]>
	## Date: 29th Dec 2021
	##
	## Program to process newsletter export downloaded from substack
	## and import them as markdown files in pelican content folder.
	## It should also work with other static-site generators.
	##
	## Download the exported posts zip from substack, and unzip it into a
	## folder on disk.
	##
	## For the purposes of this script, I extracted the posts zip
	## file at ./ext_work/<newsletter>_import/.
	## If this folder is different for you, change the definition
	## of NEWSLETTER_IMPORT_DIR

	import csv
	import os
	from pathlib import Path

	# USER CONFIGURATION -- review and adjust before running

	# Name of the newsletter, i.e. the <newsletter> part of the
	# https://<newsletter>.substack.com url.
	NEWSLETTER_NAME = "artnoob"

	# Root of the site project; the folders below are resolved against it.
	HOMEPAGE_PROJ_DIR = "."

	# Destination folder for the generated posts.
	NEWSLETTER_POSTS_TARGET_DIR = os.path.join(
	    HOMEPAGE_PROJ_DIR,
	    "content/posts/{}".format(NEWSLETTER_NAME),
	)

	# Folder into which the substack export zip was extracted ...
	NEWSLETTER_IMPORT_DIR = os.path.join(
	    HOMEPAGE_PROJ_DIR,
	    "ext_work/{}_import".format(NEWSLETTER_NAME),
	)
	# ... and the posts.csv file inside it.
	NEWSLETTER_POSTS_CSV = os.path.join(NEWSLETTER_IMPORT_DIR, "posts.csv")

	# Load the metadata of all published posts from the posts.csv file.
	#
	# csv.DictReader takes the column names from the first row and skips
	# blank lines, so a stray empty line in the export cannot produce an
	# empty record that would break the "is_published" lookup below.
	with open(NEWSLETTER_POSTS_CSV, newline="", encoding="utf-8") as csvfile:
	    # Keep only the rows whose is_published column is exactly "true".
	    substack_posts = [
	        row
	        for row in csv.DictReader(csvfile)
	        if row["is_published"] == "true"
	    ]

	# Remember, per post, the path of its exported html file:
	# <NEWSLETTER_IMPORT_DIR>/posts/<post_id>.html
	for v in substack_posts:
	    v["post_html"] = os.path.join(
	        NEWSLETTER_IMPORT_DIR, "posts", v["post_id"] + ".html"
	    )
	print("Found {} published posts".format(len(substack_posts)))

	# Write one markdown file per loaded post into the target folder.
	#
	# The target file takes the name of the exported html file with an .md
	# suffix.  It starts with a front matter block listing every field of
	# the post (plus a "date" entry repeating "post_date"), followed by the
	# content of the html file.  Existing target files are never overwritten.

	# open() does not create missing folders, so make sure the target exists.
	os.makedirs(NEWSLETTER_POSTS_TARGET_DIR, exist_ok=True)

	for post in substack_posts:
	    src_path = Path(post["post_html"])
	    target_path = Path(NEWSLETTER_POSTS_TARGET_DIR, src_path.name)
	    target_path = target_path.with_suffix(".md")

	    if target_path.exists():
	        print("WARN: {} already exists. Will not overwrite!".format(target_path))
	        continue

	    # Read the body BEFORE creating the target file: if the html file is
	    # missing or unreadable we fail here, instead of leaving behind a
	    # front-matter-only file that later runs would refuse to overwrite.
	    with open(src_path, "r", encoding="utf8") as inf:
	        body = inf.read()

	    front_matter = ["---\n"]
	    for key, val in post.items():
	        front_matter.append("{}: {}\n".format(key, val))
	        if key == "post_date":
	            # Repeat the post date under the "date" key as well
	            # (presumably the key the site generator reads -- confirm).
	            front_matter.append("{}: {}\n".format("date", val))
	    front_matter.append("---\n\n")

	    with open(target_path, "w", encoding="utf8") as outf:
	        outf.writelines(front_matter)
	        outf.write(body)
	    print("wrote {}".format(target_path))