Skip to content

Instantly share code, notes, and snippets.

@peterhys
Last active June 7, 2021 21:22
Show Gist options
  • Save peterhys/be91a8d8cabc7b8ef1e47ee31a6f564c to your computer and use it in GitHub Desktop.
Save peterhys/be91a8d8cabc7b8ef1e47ee31a6f564c to your computer and use it in GitHub Desktop.
Convert classic google sites to markdown files
"""
Peter Sun
Jun 7, 2020
https://github.com/peterhs73
Custom script that converts Google Sites pages to markdown files.
The output is for github wiki specifically.
How to:
Extract all google sites page html using google takeout feature
run python gsites2md.py path_to_home_directory
the markdown output is stored in the same directory as the html file
requirements
beautifulsoup4==4.9.1
markdownify==0.9.0
lxml==4.5.2
"""
import glob
import sys
import os
import re
import lxml
from bs4 import BeautifulSoup
from markdownify import markdownify
def refine_toc(md_content):
    """Replace spaces with dashes inside in-page anchor link targets.

    Only the ``(#...)`` target of each link is rewritten; the text around
    it is left untouched, even when several anchor links (or unrelated
    parentheses) share one line.

    md_content str: content of the site body
    returns str: the content with every ``(#some anchor)`` target
    rewritten as ``(#some-anchor)``
    """
    # Match a single "(#...)" target. The target may contain one level of
    # nested parentheses but never spans lines, so the match stops at the
    # closing parenthesis of this link rather than the last one on the line.
    anchor_target = re.compile(r"\(#(?:[^()\n]|\([^()\n]*\))*\)")
    return anchor_target.sub(
        lambda match: match.group(0).replace(" ", "-"), md_content
    )
def html_to_markdown(html_path):
    """Convert a single Google Sites html file to a markdown file.

    The markdown file is written next to the html file. Its name is the
    html file's base name in title case with a ``.md`` extension.

    html_path str: the path of the google sites html file
    """
    # Build "<same dir>/<Title-Cased Name>.md" from the html path.
    stem = os.path.splitext(html_path)[0]
    parent_dir, base_name = os.path.split(stem)
    md_path = os.path.join(parent_dir, base_name.title() + ".md")

    # Read raw bytes and decode explicitly as UTF-8.
    with open(html_path, "br") as html_file:
        raw_html = html_file.read()
    soup = BeautifulSoup(raw_html.decode("UTF-8"), features="lxml")

    # The page title is available as soup.title.text if it is ever needed.
    # Only the first child of the first <tbody> is converted.
    body_html = str(soup.tbody.contents[0])
    md_body = markdownify(body_html, heading_style="ATX")

    # Repair the TOC anchors, then drop the first and last two characters:
    # per the original author, the converted body starts and ends with '|'.
    md_trimmed = refine_toc(md_body)[2:-2]

    with open(md_path, "bw+") as md_file:
        # A "# title" heading line could be written here first if wanted.
        md_file.write(md_trimmed.encode("UTF-8"))
if __name__ == "__main__":
    # The only argument is the site home directory (e.g. the extracted
    # Google Takeout folder). Fail with a usage line instead of a traceback
    # when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: python {sys.argv[0]} path_to_home_directory")
    home_dir = sys.argv[1]

    # Grab all .html files under the home directory, nested ones included.
    # The directory is escaped so characters such as '[' or '*' in its name
    # are taken literally rather than as glob wildcards.
    html_pattern = os.path.join(glob.escape(home_dir), "**", "*.html")
    html_files = glob.glob(html_pattern, recursive=True)

    for html_path in html_files:
        # markdown files are stored in the same directory as the html file
        try:
            html_to_markdown(html_path)
        except (AttributeError, IndexError, UnicodeDecodeError, OSError) as e:
            # A page without the expected <tbody> content, with non-UTF-8
            # bytes, or that cannot be read/written is reported and skipped
            # so one bad file does not abort the whole batch.
            print(f"cannot convert file: {html_path} with error: {e}")
        else:
            print(f"converted: {html_path}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment