Last active
September 11, 2019 20:28
-
-
Save leonoverweel/6500c304d018b4805af0a1d880408c5c to your computer and use it in GitHub Desktop.
Convert a Revue newsletter issue to markdown; currently supported content types: title, headers, paragraphs, lists, blockquotes, and images.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
import sys
from urllib import request

from bs4 import BeautifulSoup
import html2text
# CSS class names used to pick content elements out of a Revue issue page.

# Classes looked up directly in the downloaded markup.
CLS_H2 = 'header-text'
CLS_P = 'revue-p'
CLS_BLOCKQUOTE = 'revue-blockquote'

# Classes that revue_to_md() assigns itself to <img> and <ul> tags, so that
# every kind of content can be selected by class alone.
CLS_IMG = 'img'
CLS_UL = 'ul'
def transform_element(html_element):
    """Transform an HTML element from Revue into markdown text and return it.

    Args:
        html_element: A BeautifulSoup tag whose ``class`` attribute contains one
            of the ``CLS_*`` class names.

    Returns:
        The markdown for the element followed by a blank line, or ``''`` when a
        non-image element contains no text.

    Raises:
        ValueError: If none of the element's classes is a supported one.
    """
    supported = (CLS_BLOCKQUOTE, CLS_H2, CLS_IMG, CLS_P, CLS_UL)
    classes = html_element['class']
    # Dispatch on the first *supported* class instead of blindly on classes[0]:
    # elements are selected when any of their classes matches, so the relevant
    # class is not guaranteed to come first.
    cls = next((name for name in classes if name in supported), classes[0])

    # Skip empty (or whitespace-only) elements; images legitimately have no text
    if cls != CLS_IMG and not html_element.text.strip():
        return ''

    converter = html2text.HTML2Text()
    converter.body_width = 0  # 0 disables html2text's hard line wrapping

    # Blockquotes
    if cls == CLS_BLOCKQUOTE:
        text = converter.handle(str(html_element)).strip()

        # Make text one sentence per line, keeping every line inside the quote
        for mark in ('.', '!', '?'):
            text = text.replace(f'{mark} ', f'{mark}\n> ')

    # Headers
    elif cls == CLS_H2:
        text = f'## {html_element.text.strip()}'

    # Images
    elif cls == CLS_IMG:
        url = html_element.attrs['src']
        alt = html_element.attrs.get('alt', '')
        # Emit the image itself; the alt text doubles as an italic caption.
        text = f'![{alt}]({url})'
        if alt:
            text += f'\n_{alt}_'

    # Paragraphs
    elif cls == CLS_P:
        text = converter.handle(str(html_element))

        # Make text one sentence per line; sentences that end inside bold
        # markup are handled first so the closing ** stays on the same line.
        for mark in ('.', '!', '?'):
            text = text.replace(f'{mark}** ', f'{mark}**\n')
        for mark in ('.', '!', '?'):
            text = text.replace(f'{mark} ', f'{mark}\n')

    # Lists
    elif cls == CLS_UL:
        text = converter.handle(str(html_element))

        # Remove one level of indent and use -s instead of *s. Only bullets at
        # the start of a line are rewritten, so inline emphasis such as
        # "**bold** text" is left untouched.
        text = re.sub(r'(?m)^ {0,2}( *)\* ', r'\1- ', text)

    else:
        raise ValueError('Unimplemented class')

    return f'{text.strip()}\n\n'
def load_issue(issue_id, base_url='https://dynamicallytyped.com'):
    """Download an issue and return its HTML contents.

    Args:
        issue_id: ID of the issue; it is appended to ``{base_url}/issues/0-``.
        base_url: Root URL of the newsletter site, without a trailing slash.

    Returns:
        The response body decoded as UTF-8.

    Any exception raised by ``urlopen`` or by decoding propagates to the caller.
    """
    url = f'{base_url}/issues/0-{issue_id}'
    # A custom User-Agent header is sent with the request.
    req = request.Request(url, headers={'User-Agent': 'Totally a real browser and not a bot, yep'})
    # Use the response as a context manager so the connection is always closed.
    with request.urlopen(req) as response:
        return response.read().decode('utf-8')
def revue_to_md(issue_id):
    """Download a Revue issue and return its content converted to markdown.

    Args:
        issue_id: ID of the issue, passed on to ``load_issue``.

    Returns:
        A markdown string: the page title (the part before the first ``|``) as
        a top-level heading, followed by the converted content elements in
        document order.
    """
    soup = BeautifulSoup(load_issue(issue_id), 'html.parser')

    # Clean content to make it ready for the transformer.
    # html2text needs semantic HTML for blockquotes.
    for tag in soup.find_all(class_=CLS_BLOCKQUOTE):
        tag.name = 'blockquote'

    # Content is selected on classes only, so add class="ul" to every <ul>.
    for tag in soup.find_all(name='ul'):
        tag.attrs['class'] = [CLS_UL]

    # Likewise mark the 600px-wide images, except the one whose alt text is
    # 'Dynamically Typed' (presumably the newsletter's own banner).
    # .get() keeps an <img> without an alt attribute from raising here.
    for tag in soup.find_all('img', width='600'):
        if tag.attrs.get('alt') != 'Dynamically Typed':
            tag.attrs['class'] = [CLS_IMG]

    # Extract relevant content
    wanted = (CLS_BLOCKQUOTE, CLS_H2, CLS_IMG, CLS_P, CLS_UL)
    content = soup.find_all(class_=lambda cls: cls in wanted)

    # Transform content. The title is stripped so the whitespace in front of
    # the '|' separator does not end up trailing the heading.
    title = soup.title.text.split('|')[0].strip()
    body = ''.join(transform_element(tag) for tag in content).strip()
    return f'# {title}\n\n{body}'
if __name__ == '__main__':
    # Exactly one command-line argument (the issue ID) is expected.
    if len(sys.argv) != 2:
        # sys.exit(message) writes the message to stderr and exits with
        # status 1, so a usage error is reported as a failure. The bare
        # exit() builtin is installed by the site module and returned 0 here.
        sys.exit('Usage: `python3 revue_to_md.py issue_id`, where the latter is the 6-digit ID in the URL.')

    print(revue_to_md(sys.argv[1]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment