Last active
June 7, 2021 21:22
-
-
Save peterhys/be91a8d8cabc7b8ef1e47ee31a6f564c to your computer and use it in GitHub Desktop.
Convert classic google sites to markdown files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Peter Sun
Jun 7, 2020
https://github.com/peterhs73
Custom script that converts Google Sites pages to markdown files.
The output is intended specifically for GitHub wiki.
How to:
Extract the html of all Google Sites pages using the Google Takeout feature
Run: python gsites2md.py path_to_home_directory
The markdown output is stored in the same directory as the html file
Requirements:
beautifulsoup4==4.9.1
markdownify==0.9.0
lxml==4.5.2
""" | |
import glob | |
import sys | |
import os | |
import re | |
import lxml | |
from bs4 import BeautifulSoup | |
from markdownify import markdownify | |
def refine_toc(md_content):
    """Convert the spaces in Google Sites TOC anchors to dashes.

    Every in-page link target of the form ``(#Some Heading)`` is rewritten
    to ``(#Some-Heading)``. Text outside of those targets is left untouched.

    md_content str: content of the site body
    returns str: the content with dash-separated anchor targets
    """
    # ``[^)\n]*`` ends the match at the first closing parenthesis on the
    # line. A greedy ``.*`` would run on to the LAST ``)`` of the line and
    # dash-join any ordinary text that happens to follow an anchor.
    return re.sub(
        r"\(#[^)\n]*\)",
        lambda anchor: anchor.group(0).replace(" ", "-"),
        md_content,
    )
def html_to_markdown(html_path):
    """Convert one Google Sites html file into a markdown file.

    The markdown file is written next to the html file; its name is the
    html file's base name in title case with a ``.md`` extension.

    html_path str: the path of the google sites html file
    """
    # Drop the extension first, then separate the directory from the name.
    path_without_ext = os.path.splitext(html_path)[0]
    target_dir, base_name = os.path.split(path_without_ext)
    md_path = os.path.join(target_dir, base_name.title() + ".md")

    with open(html_path, "br") as html_file:
        html_text = html_file.read().decode("UTF-8")

    soup = BeautifulSoup(html_text, features="lxml")
    # The page title is available as soup.title.text if it is ever needed.
    # Only the first child of the page's <tbody> element is converted.
    body_html = str(soup.tbody.contents[0])
    markdown_text = markdownify(body_html, heading_style="ATX")

    # Fix the TOC anchors, then strip the first and last two characters:
    # the google sites body starts and ends with '|'.
    trimmed = refine_toc(markdown_text)[2:-2]

    with open(md_path, "bw+") as md_file:
        # A '# <title>' heading line could be written here first if needed.
        md_file.write(trimmed.encode("UTF-8"))
if __name__ == "__main__":
    # Usage: python gsites2md.py path_to_home_directory
    # Exit with a readable message instead of an IndexError traceback when
    # the home directory argument is missing.
    if len(sys.argv) < 2:
        script_name = os.path.basename(sys.argv[0])
        sys.exit(f"usage: python {script_name} path_to_home_directory")

    # grab the site home directory and grab all .html files
    # this includes nested files
    home_dir = sys.argv[1]
    html_pattern = os.path.join(home_dir, "**", "*.html")
    html_files = glob.glob(html_pattern, recursive=True)
    for html_path in html_files:
        # markdown files are stored in the same directory as the html file
        try:
            html_to_markdown(html_path)
        except (AttributeError, IndexError, UnicodeDecodeError) as e:
            # AttributeError: the page has no <tbody>; IndexError: the
            # <tbody> is empty; UnicodeDecodeError: the file is not UTF-8.
            # Report the file and keep converting the remaining ones.
            print(f"cannot convert file: {html_path} with error: {e}")
        else:
            print(f"converted: {html_path}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment