Last active
September 30, 2021 07:30
-
-
Save kism/4f78cab89563b2c456b94a51c81e8176 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import textwrap | |
import os | |
import math | |
from os import listdir, path | |
from html.parser import HTMLParser | |
# ---------------------------------------------------------------------------
# Globals (script configuration)
# ---------------------------------------------------------------------------

# When True, print_debug() writes its messages to stdout
debug = True

# Directory that is scanned and that receives the generated index.
# Make sure there is no forward slash at the end.
rootDir = '/var/www/html'

# File name of the generated index page, written inside rootDir
outHTML = 'sitemap.html'

# Whitelist checked against every directory path found below rootDir,
# e.g. ["siteone", "sitetwo", "sitethree"]
dirtoinclude = [
    "siteone",
    "sitethree",
]
# Opening half of the generated page: doctype, <head> (title, web font and
# the inline stylesheet) and the page heading. The <style> element lives
# inside <head>; placing it between </head> and <body> is a parse error in
# HTML even though browsers recover from it.
htmlintro = textwrap.dedent("""
<!DOCTYPE html>
<html>
<head>
<title>Site Index</title>
<meta http-equiv="X-Clacks-Overhead" content="GNU Terry Pratchett" />
<link
rel="stylesheet"
href="https://fonts.googleapis.com/css?family=Fira+Code"
/>
<style>
body {
font-family: "Fira Code", "Consolas", "Lucida Console", monospace;
font-size: 12px;
margin-left: 4px;
background-color: rgb(26, 26, 26);
color: rgb(200, 200, 200);
}
h1 {
font-size: 20px;
color: rgb(220, 220, 220);
}
h2 {
font-size: 16px;
color: rgb(220, 220, 220);
}
a:link {
color: rgb(0, 128, 128);
}
a:visited {
color: rgb(128, 0, 64);
}
</style>
</head>
<body>
<h2>Website Index</h2>
""")
# Closing half of the generated page; main() appends it after the last entry.
# The leading and trailing empty items produce the surrounding newlines.
htmloutro = "\n".join((
    "",
    "</body>",
    "</html>",
    "",
))
# Debug print function
def print_debug(inText=''):
    """Print ``inText`` as a coloured "DEBUG: " line when debugging is on.

    Does nothing unless the module-level ``debug`` flag is truthy. The
    message is wrapped in the ANSI escape codes ``\\033[93m`` ... ``\\033[0m``.

    Args:
        inText: Value to show; it is converted with ``str()``. Defaults to
            an empty string, which prints a bare "DEBUG: " separator line.
    """
    if not debug:
        return
    message = "".join(("\033[93m", "DEBUG: ", str(inText), "\033[0m"))
    print(message)
# HTMLParser object to grab the 'title' of the html page, in this case the
# text found inside heading elements (h1 ... h6)
class TitleParser(HTMLParser):
    """Collect the text that appears inside heading elements.

    After feeding a document, ``data`` holds one whitespace-normalised string
    per non-blank text fragment found inside an ``<h1>``-``<h6>`` element, in
    document order.

    Attributes are documented next to their assignment in ``__init__``.
    """

    # Only real headings start a recording. Matching "starts with h and has
    # two characters" would also match <hr>, turning the text that follows a
    # horizontal rule into a bogus title.
    HEADING_TAGS = frozenset(("h1", "h2", "h3", "h4", "h5", "h6"))

    def __init__(self):
        super().__init__()
        # True while the parser is positioned inside a heading element
        self.recording = False
        # Heading text fragments collected so far
        self.data = []

    def handle_starttag(self, tag, attributes):
        """Start recording when a heading element opens.

        Other start tags leave the flag untouched so that text wrapped in
        nested markup, e.g. ``<h1><a href="/">Title</a></h1>``, is still
        collected.
        """
        # lower() keeps direct callers that pass 'H1' working
        if tag.lower() in self.HEADING_TAGS:
            self.recording = True

    def handle_endtag(self, tag):
        """Stop recording when a heading element closes."""
        if tag.lower() in self.HEADING_TAGS:
            self.recording = False

    def handle_data(self, data):
        """Store ``data`` if it is non-blank text inside a heading.

        Returns:
            The whitespace-normalised text that was stored, or None when
            nothing was stored.
        """
        # Blank fragments (indentation between tags) would otherwise be
        # stored as empty titles.
        if not self.recording or not data.strip():
            return None
        data = remove_white_space(data)
        print_debug("Heading tag data: " + data)
        self.data.append(data)
        return data
# Normalise whitespace, useful for funky headings
def remove_white_space(inText):
    """Trim ``inText`` and collapse inner whitespace runs to single spaces.

    Leading and trailing whitespace is removed with ``str.strip()``. Inside
    the text every newline and tab becomes a space, and any run of spaces is
    reduced to one, so words separated by a line break stay separated.

    Args:
        inText: The string to clean up.

    Returns:
        The cleaned string (empty if ``inText`` was blank).
    """
    # Newlines and tabs are turned into spaces first, never deleted, so that
    # "foo\nbar" becomes "foo bar" rather than "foobar".
    spaced = inText.strip().replace('\n', ' ').replace('\t', ' ')
    # Splitting on a single space yields empty chunks for every extra space
    # in a run; dropping them collapses the run without a replace-until-
    # stable loop (which never terminates if its search string is one space).
    return ' '.join(chunk for chunk in spaced.split(' ') if chunk)
# Open file, send it to html parser
def get_page_title(inDir, inFileName):
    """Return the link title to use for one HTML file.

    The file is read and fed to ``TitleParser``; the first non-empty heading
    text it collected is used. If the page has none, the file name is used
    instead. A file called ``index.html`` always gets the title "Index".

    Args:
        inDir: Directory containing the file.
        inFileName: Name of the file inside ``inDir``.

    Returns:
        The title string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # os.path.join uses the platform's separator and avoids shadowing the
    # imported os.path module with a local called "path".
    filePath = os.path.join(inDir, inFileName)
    print_debug("Opening: " + filePath)

    # errors='replace': one page containing bytes that are invalid in the
    # default encoding must not abort the whole sitemap run.
    with open(filePath, 'r', errors='replace') as pageFile:
        pageSource = pageFile.read()

    parser = TitleParser()
    parser.feed(pageSource)
    # close() flushes text still buffered at end of input, e.g. a heading
    # that is the very last thing in the file.
    parser.close()

    # First non-empty result, falling back to the file name
    out = next((title for title in parser.data if title), inFileName)

    # This is a hack, default name for index.html
    if inFileName == 'index.html':
        out = 'Index'

    print_debug("Using link title: " + out)
    return out
def get_depth(inPathDepth):
    """Return the indentation for an entry at the given path depth.

    The value grows logarithmically with depth, so each additional level
    adds less indentation than the one before. Callers in this file use the
    result as a ``margin-left`` percentage.

    Args:
        inPathDepth: Depth of the entry (a number).

    Returns:
        The indentation rounded to two decimals, never below 0.
    """
    scaled = (inPathDepth - 0.3) * 0.5
    # math.log() raises ValueError for arguments <= 0; treat such shallow
    # depths like every other result that would come out negative.
    if scaled <= 0:
        return 0
    # Using logarithms for the indent, diminishing indentation
    result = math.log(scaled, 1.15) - 1.4
    # Just in case, helpful when playing with the constants
    return round(max(result, 0), 2)
# Get the page title, create a paragraph with an a href ;)
def process_file(inDirName, inFileName, inPathDepth):
    """Build the HTML line that links to one page.

    Args:
        inDirName: Directory containing the page.
        inFileName: File name of the page.
        inPathDepth: Depth passed to ``get_depth()`` to compute the left
            margin.

    Returns:
        A tab-indented ``<p>`` element containing the link, ending in a
        newline.
    """
    # Local import: the file only imports html.parser at module level
    from html import escape

    pagetitle = get_page_title(inDirName, inFileName)

    # Strip the root directory so the link is relative to the web root.
    # Only the leading occurrence is removed; a later repetition of the same
    # text inside the path is part of the real location.
    if inDirName.startswith(rootDir):
        inDirName = inDirName[len(rootDir):]
    href = inDirName + "/" + inFileName

    # The title was entity-decoded by the HTML parser and file names may
    # contain &, < or quotes, so both are escaped before being embedded.
    line = (
        '\t<p' + ' style=" margin-left:+' + str(get_depth(inPathDepth)) + '%"'
        + '><a href="' + escape(href, quote=True) + '">'
        + escape(pagetitle) + '</a></p>\n'
    )
    print_debug('Adding HTML line: ' + remove_white_space(line))
    print_debug()
    return line
# Add an entry for folder name
def processFolder(inDirName, inPathDepth):
    """Build the HTML heading line for one directory.

    Args:
        inDirName: Text to show for the directory.
        inPathDepth: Depth of the directory; 2 is added to it before it is
            used for the heading level and for ``get_depth()``.

    Returns:
        A tab-indented heading element ending in a newline.
    """
    # Local import: the file only imports html.parser at module level
    from html import escape

    inPathDepth += 2
    # The heading level corresponds to the depth of the directory, but HTML
    # only defines h1 to h6, so deeper directories share the lowest level.
    # The indentation still uses the real depth.
    headingTag = 'h' + str(min(max(inPathDepth, 1), 6))

    print_debug("Adding directory: " + inDirName)
    line = (
        '\t<' + headingTag
        + ' style=" margin-left:+' + str(get_depth(inPathDepth)) + '%"'
        + '>' + escape(inDirName) + '</' + headingTag + '>\n'
    )
    print_debug('Adding HTML line: ' + remove_white_space(line))
    print_debug()
    return line
# Main, would you believe
def main():
    """Generate the sitemap page and write it into ``rootDir``.

    HTML files directly inside ``rootDir`` are listed first. Then every
    directory below ``rootDir`` whose path contains one of the
    ``dirtoinclude`` entries gets a heading followed by links to its HTML
    files. The result is written to ``rootDir``/``outHTML``.

    Raises:
        OSError: If a directory or file cannot be read, or the output file
            cannot be written.
    """
    print_debug("Main")
    print_debug()

    outPath = os.path.join(rootDir, outHTML)
    # Pieces are collected in a list and joined once at the end
    pieces = [htmlintro]  # Start the html sandwich

    # Scan html files in the root directory. Sorted so repeated runs give
    # the same order; the generated page itself is skipped so that it does
    # not start listing itself from the second run onwards.
    for fileName in sorted(listdir(rootDir)):
        if fileName.endswith(".html") and fileName != outHTML:
            pieces.append(process_file(rootDir, fileName, 3))

    # Recursive scan of html files for paths that have entries in the whitelist
    rootDepth = len(rootDir.split(os.sep))
    for dirName, subDirs, fileList in os.walk(rootDir):
        subDirs.sort()  # in-place sort makes os.walk descend in a stable order
        pathDepth = len(dirName.split(os.sep)) - rootDepth

        # Stop at the first whitelist hit: a directory matching several
        # entries must still be emitted only once.
        # NOTE(review): this is a substring test on the full path, so
        # "siteone" also matches ".../notsiteone"; kept as-is for
        # compatibility.
        matched = None
        for entry in dirtoinclude:
            print_debug("Checking whitelist: " + entry + " > " + dirName)
            if entry in dirName:
                matched = entry
                break

        if matched is None:
            print_debug("Nope, directory not on whitelist")
            print_debug()
            continue

        print_debug("Found result: " + matched + " in " + dirName)
        # Grab the name of the deepest folder in the path, capitalise it
        # and add its heading. basename() cannot raise on a path without a
        # separator.
        pieces.append(processFolder(os.path.basename(dirName).upper(), pathDepth))

        # Every html file in the folder is sent for processing
        for fileName in sorted(fileList):
            if not fileName.endswith(".html"):
                continue
            if os.path.join(dirName, fileName) == outPath:
                continue  # never index the generated page itself
            pieces.append(process_file(dirName, fileName, pathDepth + 3))
        print_debug()

    pieces.append(htmloutro)  # Finish the html sandwich

    # Write the sitemap html file; the with-block closes it
    print_debug('Writing file: ' + outPath)
    with open(outPath, 'w') as outFile:
        outFile.write("".join(pieces))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment