peterhys · June 7, 2021 21:22
diff --git a/gsites2md.py b/gsites2md.py
 """
 Peter Sun
 Jun 7, 2020
 https://github.com/peterhs73

 Custom script that converts Google Sites pages to markdown files.
 The output is for github wiki specifically.

 How to:
 Extract all google sites page html using google takeout feature

 run python gsites2md.py path_to_home_directory
 the markdown output is stored in the same directory as the html file

 requirements
 beautifulsoup4==4.9.1
 markdownify==0.9.0
 lxml==4.5.2
 """

 import glob
 import sys
 import os
 import re

 import lxml
 from bs4 import BeautifulSoup
 from markdownify import markdownify


 def refine_toc(md_content):
     """Convert the anchors of google's toc from space to dash.

     Every link target of the form ``(#some heading)`` has its spaces
     replaced with dashes; all text outside such targets is left unchanged.

     md_content str: content of the site body
     returns str: the content with dashed anchor targets
     """

     def dash_anchor(match):
         # Only the matched "(#...)" target is rewritten.
         return match.group(0).replace(" ", "-")

     # The anchor ends at its own closing parenthesis (one level of nested,
     # balanced parentheses is allowed inside it) and never crosses a line
     # break. A greedy ``.*`` would instead run to the LAST ")" on the line
     # and dash the text between and after several anchors.
     anchor_pattern = r"\(#(?:[^()\n]|\([^()\n]*\))*\)"
     return re.sub(anchor_pattern, dash_anchor, md_content)


 def html_to_markdown(html_path):
     """Convert a single google sites html file into a markdown file.

     The markdown file is written next to the html file; its name is the
     title-cased base name of the html file with a ``.md`` extension.

     html_path str: the path of the google sites html file
     """
     stem = os.path.splitext(html_path)[0]
     parent_dir, base_name = os.path.split(stem)
     # titlize the output markdown
     md_path = os.path.join(parent_dir, f"{base_name.title()}.md")

     with open(html_path, "br") as source:
         html_text = source.read().decode("UTF-8")

     page = BeautifulSoup(html_text, features="lxml")
     # the page title (page.title.text) can be extracted here if needed

     # only the first child of the first <tbody> is converted
     first_child = page.tbody.contents[0]
     raw_md = markdownify(str(first_child), heading_style="ATX")

     # Fix the TOC anchors, then drop the first and last two characters:
     # the google sites body is expected to start and end with '|'.
     trimmed_md = refine_toc(raw_md)[2:-2]

     with open(md_path, "bw+") as target:
         # a '# title' line can be written first if needed
         target.write(trimmed_md.encode("UTF-8"))


 if __name__ == "__main__":

     # Exit with a usage hint instead of an IndexError traceback when the
     # site home directory argument is missing.
     if len(sys.argv) < 2:
         sys.exit("usage: python gsites2md.py path_to_home_directory")

     # grab the site home directory and collect all .html files below it;
     # "**" combined with recursive=True also matches nested directories
     home_dir = sys.argv[1]
     html_pattern = os.path.join(home_dir, "**", "*.html")
     html_files = glob.glob(html_pattern, recursive=True)

     for html_path in html_files:
         # each markdown file is stored in the same directory as its html file
         try:
             html_to_markdown(html_path)
         except (AttributeError, IndexError, UnicodeDecodeError) as e:
             # AttributeError: the page has no <tbody>;
             # IndexError: the <tbody> has no children;
             # UnicodeDecodeError: the file is not valid UTF-8.
             # Report the file and carry on with the remaining ones.
             print(f"cannot convert file: {html_path} with error: {e}")
         else:
             print(f"converted: {html_path}")
	"""
	Peter Sun
	Jun 7, 2020
	https://github.com/peterhs73

	Custom script that converts Google Sites pages to markdown files.
	The output is for github wiki specifically.

	How to:
	Extract all google sites page html using google takeout feature

	run python gsites2md.py path_to_home_directory
	the markdown output is stored in the same directory as the html file

	requirements
	beautifulsoup4==4.9.1
	markdownify==0.9.0
	lxml==4.5.2
	"""

	import glob
	import sys
	import os
	import re

	import lxml
	from bs4 import BeautifulSoup
	from markdownify import markdownify


	def refine_toc(md_content):
	    """Convert the anchors of google's toc from space to dash.

	    Every link target of the form ``(#some heading)`` has its spaces
	    replaced with dashes; all text outside such targets is left unchanged.

	    md_content str: content of the site body
	    returns str: the content with dashed anchor targets
	    """

	    def dash_anchor(match):
	        # Only the matched "(#...)" target is rewritten.
	        return match.group(0).replace(" ", "-")

	    # The anchor ends at its own closing parenthesis (one level of nested,
	    # balanced parentheses is allowed inside it) and never crosses a line
	    # break. A greedy ``.*`` would instead run to the LAST ")" on the line
	    # and dash the text between and after several anchors.
	    anchor_pattern = r"\(#(?:[^()\n]|\([^()\n]*\))*\)"
	    return re.sub(anchor_pattern, dash_anchor, md_content)


	def html_to_markdown(html_path):
	    """The main function to convert html.

	    Reads the html file as UTF-8, converts the first child of its first
	    <tbody> to ATX-style markdown and writes the result, UTF-8 encoded,
	    next to the html file under the title-cased base name plus ``.md``.

	    html_path str: the path of the google sites html file
	    """
	    output_dir, file_name = os.path.split(os.path.splitext(html_path)[0])
	    # titlize the output markdown
	    output_path = os.path.join(output_dir, file_name.title() + '.md')

	    with open(html_path, "br") as content:
	        soup = BeautifulSoup(content.read().decode("UTF-8"), features="lxml")
	        # title can be added if needed
	        # title = soup.title.text

	        md = markdownify(str(soup.tbody.contents[0]), heading_style="ATX")

	        # Fix the TOC from google sites
	        # The google sites body starts and ends with '|', they are stripped
	        md_refined = refine_toc(md)[2:-2]

	        with open(output_path, "bw+") as output:
	            # add title here if needed
	            # output.write(f'# {title}\n'.encode('UTF-8'))
	            output.write(md_refined.encode("UTF-8"))


	if __name__ == "__main__":

	    # Exit with a usage hint instead of an IndexError traceback when the
	    # site home directory argument is missing.
	    if len(sys.argv) < 2:
	        sys.exit("usage: python gsites2md.py path_to_home_directory")

	    # grab the site home directory and collect all .html files below it;
	    # "**" combined with recursive=True also matches nested directories
	    # (a pattern of "*", ".html" would only match files literally named
	    # ".html" one directory down)
	    home_dir = sys.argv[1]
	    html_pattern = os.path.join(home_dir, "**", "*.html")
	    html_files = glob.glob(html_pattern, recursive=True)

	    for html_path in html_files:
	        # each markdown file is stored in the same directory as its html file
	        try:
	            html_to_markdown(html_path)
	        except (AttributeError, IndexError, UnicodeDecodeError) as e:
	            # AttributeError: the page has no <tbody>;
	            # IndexError: the <tbody> has no children;
	            # UnicodeDecodeError: the file is not valid UTF-8.
	            # Report the file and carry on with the remaining ones.
	            print(f"cannot convert file: {html_path} with error: {e}")
	        else:
	            print(f"converted: {html_path}")