Last active
January 23, 2024 19:53
-
-
Save So-Cool/cdf7b693f7cfdd5f8a65 to your computer and use it in GitHub Desktop.
Prettify ugly MS Word HTML
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys

from bs4 import BeautifulSoup

# Split the marked-up sections of an MS-Word-exported HTML file into
# separate per-section HTML files.
#
# Usage: python split_extracts.py <input.html> <output_dir>
if len(sys.argv) != 3:
    sys.exit("First argument is HTML second is an output directory.")

# Create the output directory on demand so the per-div writes below cannot
# fail with a missing-directory IOError.
if not os.path.isdir(sys.argv[2]):
    os.makedirs(sys.argv[2])

with open(sys.argv[1], "r") as html:
    soup = BeautifulSoup(html.read(), "html.parser")

# Each extracted block is tagged class="extract <kind>", where <kind> is one
# of: swish, figure, exercise, infobox.
# (find_all is the non-deprecated spelling of findAll.)
mydivs = soup.find_all("div", {"class": "extract"})
for d in mydivs:
    # Output name: "<kind>_<id>.html" -- assumes every extract div carries a
    # second class and an id (holds for the simply-logical sources).
    name = d["class"][1] + "_" + d["id"] + ".html"
    with open(os.path.join(sys.argv[2], name), 'w') as of:
        of.write(str(d))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup, Comment, NavigableString | |
# from pprint import pprint | |
# import sys | |
# Printf-style template for generated CSS class names (AutoStyle00, AutoStyle01, ...).
style_name = "AutoStyle%02d"
# Next free AutoStyle index; bumped whenever a previously unseen inline style is met.
style_counter = 0
# Maps inline-style string -> list of "tag.class" selectors that carried it.
stylesCSS = {}
# Maps inline-style string -> the AutoStyle index assigned to it.
stylesCSSno = {}
def writeCSStoFile(CSS, CSSno):
    """Dump the collected AutoStyle rules to ``common_style.css``.

    CSS   -- dict: inline-style string -> list of "tag.class" selectors.
    CSSno -- dict: inline-style string -> AutoStyle index (fills the
             module-level ``style_name`` template).

    Each style becomes one rule whose selector list is every recorded
    "tag.class" suffixed with its generated AutoStyle class.
    """
    chunks = []
    for style in CSS:
        selectors = [t.encode("utf-8") + '.' + style_name % CSSno[style]
                     for t in CSS[style]]
        chunks.append(','.join(selectors) + '{\n')
        # Split the declaration block and keep only non-blank declarations.
        for rule in [r for r in style.split(';') if r.strip()]:
            chunks.append(' ' + rule.encode("utf-8") + ';\n')
        chunks.append('}\n')
    with open('common_style.css', 'w') as sheet:
        sheet.write(''.join(chunks))
# Accumulators for the tag.class / tag.style summary printed at the end.
tag_list = []
style_list = []
# Word-exported HTML sources; the ".dat" suffix is stripped to form each
# output file name.
soupNames = ["../simply-logical/Part I.htm.dat",\
             "../simply-logical/Part II.htm.dat",\
             "../simply-logical/Part III.htm.dat",\
             "../simply-logical/Appendix.htm.dat"]
# Main cleanup pipeline, run once per Word-exported part.
# NOTE(review): Python 2 code throughout (print statements, unicode()).
# Indentation reconstructed from control flow -- the pasted source lost it.
for soupName in soupNames:
    soup = BeautifulSoup(open(soupName), "html.parser")
    # Normalise attribute values: Word wraps attributes across lines, so
    # strip embedded newlines.  <pre> tags (SWISH code blocks) are skipped.
    for tag in soup.findAll(True):
        if tag.name == 'pre':
            continue
        for attr in tag.attrs:
            # An attribute value is either a single string/unicode or a
            # list (e.g. class); handle both shapes.
            if type(tag[attr]) == str or type(tag[attr]) == unicode:
                x = tag[attr].encode("utf-8").split('\n')
                x = [xx.strip() for xx in x]
                x = ''.join(x)
                tag[attr] = unicode(x, 'utf-8')
            else:
                for i in range(len(tag[attr])):
                    x = tag[attr][i].encode("utf-8").split('\n')
                    x = [xx.strip() for xx in x]
                    x = ''.join(x)
                    tag[attr][i] = unicode(x, 'utf-8')
    # Remove all HTML comments (Word's conditional markup lives in them).
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()
    # Repeatedly remove empty tags until a pass removes nothing -- removing
    # a child can empty its parent.  <img> and <br>-like tags are kept.
    et = [1]
    while et:
        empty_tags = soup.findAll(lambda tag: not tag.name == 'img' and not 'br' in tag.name and not tag.contents and (tag.string is None or not tag.string.strip()))
        et = [empty_tag.extract() for empty_tag in empty_tags]
    # Remove attributes that carry no information outside Word.
    for tag in soup.findAll(True):
        try:
            if tag['style'] == 'mso-bidi-font-weight:normal' or tag['style'] == 'mso-bidi-font-style:normal':
                del tag['style']
        except:
            pass
        try:
            if tag['lang'] == 'EN-US':
                del tag['lang']
        except:
            pass
    # <span style='font-family:Courier'> (and nothing else) becomes <tt>.
    for a in soup.findAll('span'):
        try:
            if a['style'] == 'font-family:Courier' and len(a.attrs) == 1:
                del a['style']
                a.name = 'tt'
        except:
            pass
    # Remove tags hidden via display:none.
    for tag in soup.findAll():
        try:
            style = tag['style']
            style = style.split(';')
            for s in style:
                if 'display:none' in s:
                    tag.extract()
                    break
        except:
            pass
    # Strip Word-specific style declarations; drop the style attribute
    # entirely when nothing survives the filter.
    for tag in soup.findAll():
        try:
            style = tag['style']
            style = style.split(';')
            style = [s for s in style if 'mso' not in s and\
                     'tab-stops' not in s and\
                     'page-break' not in s and\
                     'font-family:YuTimes' not in s and\
                     'font-family:Extra' not in s and\
                     'Avant Garde' not in s]
            style = ';'.join(style)
            if style:
                tag['style'] = style
            else:
                del tag['style']
        except:
            pass
    # Unwrap attribute-less <span>s and Word's <o:p> markers.
    for tag in soup.findAll('span'):
        if not tag.attrs:
            tag.replaceWithChildren()
    for tag in soup.findAll('o:p'):
        tag.replaceWithChildren()
    # Unwrap tags whose ONLY attribute marks a Word bookmark
    # (a/name=bk0..bk9, span/style=mso-bookmark:bkN) or a spacer run.
    bks = ['bk'+str(i) for i in range(10)]
    bks_style = ['mso-bookmark:'+i for i in bks]
    invalid_tags = {
        'a':{'name':bks},
        'span':{'style':bks_style+["mso-spacerun:yes"] }\
        }
    for tag in invalid_tags:
        for match in soup.findAll(tag):
            # Each entry has exactly one attr key; keys()[-1] fetches it
            # (Python 2: keys() is a list).
            attr = invalid_tags[match.name].keys()[-1]
            try:
                if len(match.attrs) == 1 and match[attr] in invalid_tags[match.name][attr]:
                    match.replaceWithChildren()
            except:
                pass
    # Unwrap attribute-less <span>s again -- earlier unwrapping may have
    # exposed new ones.
    for tag in ['span']:
        for match in soup.findAll(tag):
            if len(match.attrs) == 0:
                match.replaceWithChildren()
    # Another fixed-point pass of empty-tag removal.
    et = [1]
    while et:
        empty_tags = soup.findAll(lambda tag: not tag.name == 'img' and not 'br' in tag.name and not tag.contents and (tag.string is None or not tag.string.strip()))
        et = [empty_tag.extract() for empty_tag in empty_tags]
    # Replace newlines inside text nodes with spaces and merge adjacent
    # text nodes, walking each tag's sibling chain manually (<pre> kept).
    for tag in soup.findAll(True):
        if tag.name == 'pre' or not tag.contents:
            continue
        i = tag.contents[0]
        while i:
            j = i.next_sibling
            if isinstance(i, NavigableString) and i.parent.name!='pre':
                x = i.encode("utf-8").strip().split('\n')
                x = ' '.join(x)
                if x:
                    if isinstance(i.previous_sibling, NavigableString) and i.parent.name!='pre':
                        # Merge this node into the preceding text node.
                        prev = i.previous_sibling.encode("utf-8")
                        this = soup.new_string(unicode(prev.strip()+x.strip(), 'utf-8'))
                        i.previous_sibling.replace_with( this )
                        i.extract()
                        i = j
                        continue
                    this = soup.new_string(unicode(x, 'utf-8'))
                    i.replace_with( this )
                else:
                    # Whitespace-only node: drop it.
                    i.extract()
            i = j
    # Final empty-tag sweep; this one also reports removed tags that still
    # carried attributes.  (sic: "Removeing" is the original output text.)
    eto = [1]
    while eto:
        eto = []
        empty_tags = soup.findAll(lambda tag: not tag.contents and (tag.string is None or not tag.string.strip()))
        for et in empty_tags:
            if not et.attrs and et.name != 'br' and et.name != 'img':
                eto.append(et.extract())
            elif et.name != 'br' and et.name != 'img':
                print "Removeing: ", et
                eto.append(et.extract())
    # --- footnote handling ------------------------------------------------
    def getNameClass(node):
        # "tag.class" selectors for a node ("tag" alone when it has no class).
        n = [node.name]
        try:
            n = [node.name + '.' + c for c in node['class']]
        except:
            pass
        return n
    def ancestralSet(node):
        # Selectors of every ancestor of node, innermost first.
        try:
            return getNameClass(node.parent) + ancestralSet(node.parent)
        except:
            return []
    # Unwrap footnote references nested inside footnote references.
    for s in soup.findAll('span', { "class" : "MsoFootnoteReference" }):
        if 'span.MsoFootnoteReference' in ancestralSet(s):
            s.replaceWithChildren()
    # <abar> inside footnote text is unwrapped; elsewhere it becomes
    # <span class="CustomFootnote">.
    for s in soup.findAll('abar'):
        if 'p.MsoFootnoteText' in ancestralSet(s):
            s.replaceWithChildren()
        else:
            s.name = 'span'
            s['class'] = "CustomFootnote"
    # Replace every remaining inline style with a generated AutoStyleNN
    # class, recording selectors in the module-level stylesCSS maps.
    for tag in soup.findAll():
        try:
            if tag['style'] in stylesCSS:
                # Style seen before: reuse its class number.
                try:
                    tn = [tag.name + '.' + c for c in tag['class']]
                    tag['class'].append(style_name % stylesCSSno[tag['style']])
                except:
                    tn = [tag.name]
                    tag['class'] = [ style_name % stylesCSSno[tag['style']] ]
                for t in tn:
                    if t not in stylesCSS[tag['style']]:
                        stylesCSS[tag['style']] += [t]
                del tag['style']
            else:
                # First occurrence: allocate a new class number.
                stylesCSSno[tag['style']] = style_counter
                try:
                    stylesCSS[tag['style']] = [tag.name + '.' + c for c in tag['class']]
                    tag['class'].append(style_name % stylesCSSno[tag['style']])
                except:
                    stylesCSS[tag['style']] = [tag.name]
                    tag['class'] = [ style_name % stylesCSSno[tag['style']] ]
                del tag['style']
                style_counter += 1
        except:
            pass
    # Write the cleaned tree out (output name drops the ".dat" suffix) ...
    with open(soupName[:-4], 'w') as wfile:
        wfile.write(soup.prettify(encoding='utf-8', formatter='html'))
    # ... then re-read it for the text-level cleanup passes below.
    with open(soupName[:-4], 'r') as wfile:
        clean = wfile.read()
    def cleanUp(txt, ls):
        # Remove every marker in ls, stripping and re-joining the pieces
        # line by line.
        for i in ls:
            txt = txt.split(i)
            txt = [tt.strip() for tt in txt]
            txt = filter(None, txt)
            txt = '\n'.join(txt)
        return txt
    # Strip Word conditional-comment markers left over after comment removal.
    clean = cleanUp(clean, ['if !vml', 'if !supportFootnotes', 'endif'])
    soup = BeautifulSoup(clean, "html.parser")
    puretxt = soup.prettify(encoding='utf-8', formatter='html')
    puretxt = cleanUp(puretxt, ['</br>'])
    # Normalise <br> and keep each continuation line at the indent of the
    # line it was broken from.
    puretxt = puretxt.replace('<br/>', '<br>')
    brs = puretxt.split('<br>')
    brsprs = brs[0].rstrip()
    for br_i in range(1, len(brs)):
        # Leading-space count of the last line accumulated so far.
        space_count = brsprs.rfind('\n') + 1
        space_count = brsprs[space_count:]
        space_count = len(space_count) - len(space_count.lstrip(' '))
        brsprs += '<br>\n' + space_count*' ' + brs[br_i].strip()
    puretxt = brsprs
    def cleanUp(txt, ls):
        # Second variant (intentionally shadows the one above): splice
        # inline tags such as <tt>/<i>/<b> back onto the surrounding text,
        # deciding at every junction whether whitespace is kept or dropped.
        for i in ls:
            txt = txt.split(i)
            txt = [tt for tt in txt]
            txt = filter(None, txt)
            # manual join with junction-specific spacing
            if txt == []:
                txt = ""
            else:
                s = txt[0]
                for j in txt[1:]:
                    if '/' in i:
                        # Closing tag.
                        # Punctuation follows: attach tightly.
                        if j.strip()[0] in ['.', ',', '?', '!', ':', ';', ')', ']', '\'', '"', '/', '’', '>']:
                            s = s.rstrip() + i + j.lstrip()
                        # NOTE(review): this literal is compared against a
                        # 6-char slice -- it was probably '&nbsp;' before the
                        # paste garbled it; confirm against the original gist.
                        elif j.strip()[0:6] == ' ':
                            s = s.rstrip() + i + j.lstrip()
                        # Same tag reopened immediately afterwards
                        # (<i>a</i><i>b</i>): merge the two runs.
                        elif j.strip()[:len(i.replace('/', ''))] == i.replace('/', ''):
                            s = s.rstrip() + j.strip()[len(i.replace('/', '')):].lstrip()
                        # A different tag (not in ls) follows: keep the newline.
                        elif j.strip()[0] == '<' and (j.strip()[:j.strip().find(' ')]+'>') not in ls:
                            # ... unless the preceding symbol is a foreign tag too.
                            if s.strip()[-1] == '>' and s.rstrip()[s.strip().rfind('<'):] not in ls:
                                s += i + j
                            else:
                                s = s.rstrip() + i + j
                        else: # e.g. '(', '[', '-'
                            s = s.rstrip() + i + ' ' + j.lstrip()
                    else:
                        # Opening tag: attach tightly after brackets/quotes ...
                        if s.rstrip()[-1] in ['(', '[', '\'', '"', '‘', '<']:
                            s = s.rstrip() + i + j.lstrip()
                        # ... and after a non-breaking space.
                        # NOTE(review): 6-char literal, probably '&nbsp;' -- see above.
                        elif s.strip()[-6:] == ' ':
                            s = s.rstrip() + i + j.lstrip()
                        # A different (foreign) tag precedes: keep the indent.
                        elif s.strip()[-1] == '>' and s.rstrip()[s.strip().rfind('<'):] not in ls:
                            if j.strip()[0] == '<' and (j.strip()[:j.strip().find(' ')]+'>') not in ls:
                                s += i + j
                            else:
                                s += i + j.lstrip()
                        else:
                            s = s.rstrip() + ' ' + i + j.lstrip()
                txt = s
        return txt
    puretxt = cleanUp(puretxt, ['<tt>', '</tt>', '<i>', '</i>', '<b>', '</b>'])
    with open(soupName[:-4], 'w') as wfile:
        wfile.write(puretxt)
    # Collect every tag.class (and tag.style) seen, for the summary below.
    tagSoup = BeautifulSoup(puretxt, "html.parser")
    for tag in tagSoup.findAll():
        try:
            classes = tag['class']
            if classes:
                for c in classes:
                    # Generated AutoStyle classes are not interesting here.
                    if 'AutoStyle' not in c:
                        tc = tag.name + '.' + c
                    else:
                        tc = tag.name
                    if tc not in tag_list:
                        tag_list.append(tc)
            else:
                if tag.name not in tag_list:
                    tag_list.append(tag.name)
        except:
            if tag.name not in tag_list:
                tag_list.append(tag.name)
        try:
            tc = tag.name + '.' + tag['style']
            if tc not in style_list:
                style_list.append(tc)
        except:
            if tag.name not in style_list:
                style_list.append(tag.name)
# --- summary + stylesheet output (after all parts are processed) ----------
tag_list = [t.encode("utf-8") for t in tag_list]; tag_list.sort()
style_list = [t.encode("utf-8") for t in style_list]; style_list.sort()
print "Tag.Class"
print tag_list
# Report which font families survive in the generated styles.
font_styles = []
for f in stylesCSS.keys():
    ff = f.split(';')
    for ffi in ff:
        if 'font-family' in ffi:
            if ffi not in font_styles:
                font_styles.append(ffi.encode('utf-8'))
print font_styles
writeCSStoFile(stylesCSS, stylesCSSno)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pyparsing, sys | |
def getFontFace(styles):
    """Extract the bare font name from a CSS rule body.

    styles -- rule-body text, e.g. " font-family:'Times New Roman';\n ...".
    Returns the font-family value with the property name, colon, quotes and
    surrounding whitespace removed.  Aborts the whole script when the body
    contains no font-family declaration.
    """
    for declaration in styles.split(';'):
        if 'font-family' not in declaration:
            continue
        face = declaration.strip().replace('font-family', '').strip()
        face = face.strip(':').strip()
        return face.strip('\'').strip('\"').strip()
    sys.exit('Font-family not found')
def setOverlap(styleStr, listStyleStr):
    """Test whether a selector group is disjoint from those already seen.

    styleStr     -- comma-separated selector group, e.g. "p.tekst, p.caption".
    listStyleStr -- iterable of previously collected selector groups.

    Returns True when there is NO overlap (note: the name reads backwards --
    True means the group can be added safely), False as soon as any selector
    from styleStr already appears in listStyleStr.
    """
    candidates = {part.strip() for part in styleStr.split(',')}
    known = set()
    for group in listStyleStr:
        known.update(part.strip() for part in group.split(','))
    return candidates.isdisjoint(known)
# Merge the per-part CSS files into one Word-free stylesheet (mso.css),
# keeping only selectors, @page rules and fonts actually used by the
# converted HTML.
# NOTE(review): Python 2 code -- dict.keys() returns a list, which is what
# makes the "del X[i]" filter loops below safe while iterating.
STYLES = {}
FONT_FACE = {}
PAGE = {}
# Selector whitelist: everything the converted documents actually use.
used_styles = ['a', 'b', 'br', 'div', 'div.WordSection1', 'div.WordSection2', 'div.WordSection3', 'div.WordSection4', 'h1', 'h2', 'h3', 'hr', 'i', 'img', 'p', 'p.Caption1', 'p.MsoFootnoteText', 'p.MsoNormal', 'p.answer', 'p.caption', 'p.cijfer', 'p.citaat', 'p.exercise', 'p.figure', 'p.formule', 'p.grammar', 'p.inter-title', 'p.intermezzo', 'p.med-caption', 'p.med-figure', 'p.oms', 'p.oms-eerst', 'p.opsomming', 'p.p-eerst', 'p.p-el', 'p.p-laatst', 'p.pi', 'p.pi-eerst', 'p.pi-el', 'p.pi-laatst', 'p.programma', 'p.query', 'p.referenties', 'p.romeinscijfer', 'p.sektie', 'p.sektie1', 'p.small-caption', 'p.small-figure', 'p.tekst', 'pre', 'pre.inherit', 'pre.query', 'pre.source', 'pre.swish', 'pre.temp', 'script', 'span', 'span.CustomFootnote', 'span.MsoFootnoteReference', 'span.query', 'span.swish', 'table', 'tbody', 'td', 'tr', 'tt']
used_fonts = ['font-family:Symbol', 'font-family:Times', 'font-family:Helvetica', 'font-family:Courier']
out_file = "mso.css"
CSS_files = ["../simply-logical/bootstrap/css/" + i for i in ["Part_I.css", "Part_II.css", "Part_III.css", "Appendix.css"]]
CSSs = []
for i in CSS_files:
    with open(i, 'r') as i_file:
        CSSs.append(i_file.read())
# Strip /* ... */ comments (nestedExpr also handles nesting) before parsing.
comment = pyparsing.nestedExpr("/*", "*/").suppress()
CSSs = [comment.transformString(i) for i in CSSs]
for css in CSSs:
    # Split "sel { body } sel { body }" into an alternating flat list:
    # even indices are selector groups, odd indices are rule bodies.
    css_split = []
    for c in css.split('{'):
        css_split += [i.strip() for i in c.split('}') if i.strip()]
    names = []
    for n in css_split[0:][::2]:  # selector groups
        name = n.split(',')
        name = [nam.strip() for nam in name if nam.strip()]
        name = ','.join(name)
        names.append(name)
    tags = []
    for n in css_split[1:][::2]:  # rule bodies
        tag = n.split(';')
        tag = [t.strip() for t in tag if t.strip()]
        # Drop Word-only declarations.
        tag = [t for t in tag if 'mso-' not in t and 'tab-stop' not in t]
        tag_formatted = ""
        for t in tag:
            tag_formatted += ' ' + t + ';\n'
        tags.append(tag_formatted)
    if len(tags) != len(names):
        sys.exit("Tags and names do not agree")
    # Drop rules whose body became empty after filtering; iterate backwards
    # so the pops do not shift pending indices.
    for i in range(len(tags))[::-1]:
        if not tags[i]:
            tags.pop(i)
            names.pop(i)
    # Route every rule into FONT_FACE / PAGE / STYLES, checking agreement
    # with whatever an earlier file already contributed.
    for i in range(len(tags)):
        if '@font-face' in names[i]:
            if getFontFace(tags[i]) in FONT_FACE:
                # Same font declared twice with different bodies: warn only.
                if FONT_FACE[getFontFace(tags[i])] != tags[i]:
                    print('Font face does not agree\n' + tags[i] +'\n dic:\n'+FONT_FACE[getFontFace(tags[i])])
            else:
                FONT_FACE[getFontFace(tags[i])] = tags[i]
        elif '@page' in names[i]:
            if names[i] in PAGE:
                # Conflicting @page bodies are fatal.
                if PAGE[names[i]] != tags[i]:
                    sys.exit('@Page does not agree\n ' + tags[i])
            else:
                PAGE[names[i]] = tags[i]
        else:
            if names[i] in STYLES:
                if STYLES[names[i]] != tags[i]:
                    # Keep the first body; stash the conflict in a CSS comment.
                    print('Styles do not agree:'+names[i]+'\n' + tags[i] +'\nand\n' + STYLES[names[i]])
                    STYLES[names[i]] += '/*' + tags[i] + '*/'
            else:
                # setOverlap returns True when no selector is shared.
                if setOverlap(names[i], STYLES.keys()):
                    STYLES[names[i]] = tags[i]
                else:
                    sys.exit("Keys set is overlapping")
# Keep only @page rules whose (suffix) name occurs in used_styles; a bare
# "@page" (empty after stripping) is always kept.
delMe = True
for i in PAGE.keys():
    delMe = True
    # NOTE(review): strip('@page') strips *characters*, not the prefix
    # string -- works for these inputs but is fragile.
    ni = i.strip().strip('@page').strip()
    if not ni:
        continue
    else:
        for j in used_styles:
            if ni in j:
                delMe = False
                break
    if delMe:
        del PAGE[i]
# Keep only selector groups that overlap used_styles (setOverlap True
# means NO overlap, i.e. the group is unused and can be dropped).
delMe = True
for i in STYLES.keys():
    delMe = True
    if setOverlap(i, used_styles):
        del STYLES[i]
# Keep only fonts listed in used_fonts.
delMe = True
for i in FONT_FACE.keys():
    delMe = True
    if 'font-family:'+i in used_fonts:
        delMe = False
    if delMe:
        del FONT_FACE[i]
# Emit the merged stylesheet: fonts first, then @page, then ordinary rules.
with open(out_file, 'w') as of:
    s = ""
    for i in FONT_FACE:
        s += '@font-face' + '{\n' + FONT_FACE[i] + '}\n'
    for i in PAGE:
        s += i + '{\n' + PAGE[i] + '}\n'
    for i in STYLES:
        s += i + '{\n' + STYLES[i] + '}\n'
    of.write(s)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment