python port of Arc90 readability.js (circa 2009)
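Each URL passed on the command line is fetched with urllib2, the main article content is extracted using Arc90's paragraph-scoring heuristics (via pyquery/lxml), and the result is written out as a plain-text extract, a bare HTML extract, and/or a Readability-styled HTML page, depending on the -t / -h / -p flags (see echousage() in the script for the full option list).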
#!/usr/bin/env python
# encoding: utf-8
#####
##### Readability.py -- by fish. © 2009, Some Rights Reserved But Some May Be
##### http://objectsinspaceandtime.com/
#####
##### it's a python port of the Readability JavaScript bookmarklet,
##### by Arc90 Labs --
##### http://lab.arc90.com/2009/03/readability.php
##### http://code.google.com/p/arc90labs-readability/
##### this script and the original are both licenced under the Apache License 2.0.
#####
import sys
import os
import os.path
import re
import getopt
import urllib2
import urlparse
import lxml
from pyquery import PyQuery as pq
from lxml import etree

urls = []
roster = {}
outpaths = {}
thisURL = ""

breakre = re.compile('<br/?>[ \r\n\s]*<br/?>', re.IGNORECASE | re.MULTILINE)
fontre = re.compile('<\/?font[^>]*>', re.IGNORECASE | re.MULTILINE)
fuckingbodyre = re.compile('<\/?body[^>]*?>', re.IGNORECASE)
multibreakre = re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}', re.IGNORECASE | re.MULTILINE)
badnamere = re.compile('(comment|meta|footer|footnote|side)', re.IGNORECASE)
goodclsre = re.compile('((^|\\s)(post|hentry|main|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))', re.IGNORECASE)
goodidre = re.compile('^(post|hentry|main|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$', re.IGNORECASE)

readabilityversion = "0.4"
emailsrc = 'http://proto1.arc90.com/readability/email.php'
iframeLoads = 0
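
##### main(): parse the command-line options, fetch each URL, run it through
##### grabarticle(), and write out the requested files. Example invocations
##### (the first URL is just a placeholder, not a real page):
#####     readability.py -t -p -o ./out http://example.com/some-article.html
#####     readability.py --view http://objectsinspaceandtime.com/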
def main(argv):
    view = False
    toview = []
    textout = False
    htmlout = False
    pageout = True
    termout = True
    basepath = os.getcwd()

    try:
        opts, args = getopt.getopt(argv, "wo:thpv", [
            "wtf",
            "out=",
            "text",
            "html",
            "page",
            "view",
        ])
    except getopt.GetoptError:
        print "Error: Bad GetOpt"
        echousage()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-w", "--wtf"):
            echousage()
            sys.exit(0)
        elif opt in ("-o", "--out"):
            if os.path.exists(arg):
                basepath = os.path.realpath(arg)
        elif opt in ("-t", "--text"):
            textout = True
        elif opt in ("-h", "--html"):
            htmlout = True
        elif opt in ("-p", "--page"):
            pageout = True
        elif opt in ("-v", "--view"):
            view = True

    if (len(args) > 0):
        for urlarg in args:
            if (urlarg.startswith('http://')):
                urls.append(urlarg)
            else:
                print "WARNING: Bad URL: "+unicode(urlarg).encode("utf-8")
    else:
        urls.append('http://objectsinspaceandtime.com/')

    for url in urls:
        urlp = urlparse.urlparse(url)
        urlb = urlparse.urlsplit(url)

        # Derive an output filename: path basename, then hostname, then path, then "unknown".
        fileout = os.path.splitext(os.path.basename(urlp.path))[0]
        if (fileout == "" or fileout == None):
            fileout = re.sub("\.", "_", urlb.netloc)
        if (fileout == "" or fileout == None):
            fileout = os.path.splitext(os.path.dirname(urlb.path))[0]
        if (fileout == "" or fileout == None):
            fileout = "unknown"

        print "\nURL: "+unicode(url).encode("utf-8")

        htmlobject = urllib2.urlopen(url)
        thehtml = htmlobject.read()

        (thediv, cleanpage) = grabarticle(thehtml, url)

        if (textout):
            txtfile(os.path.join(basepath, (fileout+"_extract.txt")), thediv.text())
            toview.append(os.path.join(basepath, (fileout+"_extract.txt")))
        if (htmlout):
            txtfile(os.path.join(basepath, (fileout+"_extract.html")), thediv.html())
            toview.append(os.path.join(basepath, (fileout+"_extract.html")))
        if (pageout):
            txtfile(os.path.join(basepath, (fileout+"_readable.html")), readability(thediv, cleanpage, theurlbase=str(urlb.geturl())))
            toview.append(os.path.join(basepath, (fileout+"_readable.html")))

    if (view and len(toview) > 0):
        # not cross-platform at all
        os.execv('/usr/bin/open', (['open', '-a', '/Applications/Safari.app'] + toview))
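
##### grabarticle(): the core of the port. Scores the parent of every
##### <p>/<blockquote>/<span>, clones the highest-scoring container, strips
##### scripts/styles/junk divs, and returns (topdiv, cleaned document).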
def grabarticle(html, url):
    global thisURL
    thisURL = ""+url
    roster[url] = {}

    ### ORIGINAL COMMENT: Replace all doubled-up <BR> tags with <P> tags, and remove fonts.
    nhtml = breakre.sub('</p><p>', html)
    mhtml = fontre.sub('', nhtml)

    d = pq(mhtml, parser='html')
    allparagraphs = d('p, blockquote, span')
    if len(allparagraphs) < 1:
        allparagraphs = d('span, div')

    topdivcount = 0
    td = None
    topdiv = None

    ### ORIGINAL COMMENT: Study all the paragraphs and find the chunk that has the best score.
    ### ORIGINAL COMMENT: A score is determined by things like: Number of <p>'s, commas, special classes, etc.
    try:
        allparagraphs.map(assignscore)
    except KeyError:
        # but why?
        print 'KEY ERROR!'

    print "\nScoreboard Results:"
    for sk, sv in roster[url].iteritems():
        print "Object: "+str(sk)+"\t\tScore: "+str(sv)
    print "\n"

    ### ORIGINAL COMMENT: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
    scoreboard = roster[url]
    for tk in scoreboard.iterkeys():
        if (td == None):
            td = tk
        elif (scoreboard[tk] > scoreboard[td]):
            td = tk

    if td == None:
        out = "ERROR: This page sucks. The content from: " + str(url) + "... was totally unreadable."
        out += "\nroster:"
        out += unicode(roster).encode("utf-8")
        print out
        return (pq("<div><h1>"+out+"</h1></div>"), url)
    else:
        topdiv = pq(td).clone()

    ### ORIGINAL COMMENT: REMOVES ALL STYLESHEETS ...
    #d('head').find('link').filter(lambda i: pq(this).attr('rel') == 'stylesheet').remove()
    d('head').find('link').remove()

    ### ORIGINAL COMMENT: Remove all style tags in head (not doing this on IE) :
    d('head').find('style').remove()
    d('head').find('script').remove()

    ### ORIGINAL COMMENT: Removes all style attributes
    topdiv.find("*").attr('style', 'void:none;')
    topdiv.find("*").removeAttr('style')

    ### ORIGINAL COMMENT: Goes in and removes DIV's that have more non <p> stuff than <p> stuff
    topdiv.find('div').each(killdivs)
    topdiv.find('script').remove()
    d.find('div').each(killdivs)
    d.find('script').remove()

    ### ORIGINAL COMMENT: Removes any consecutive <br />'s into just one <br />
    #hh = multibreakre.sub(topdiv.html(), '<br />')
    hh = multibreakre.sub('<br />', topdiv.html())
    topdiv = pq("<div></div>").html(hh)

    ### ORIGINAL COMMENT: Cleans out junk from the topDiv just in case:
    topdiv.find('form').each(cleanit)
    topdiv.find('object').each(cleanit)
    topdiv.find('table').each(cleanit)
    topdiv.find('h1').each(cleanit)
    topdiv.find('h2').each(cleanit)
    topdiv.find('iframe').each(cleanit)
    d.find('form').each(cleanit)
    d.find('object').each(cleanit)
    d.find('table').each(cleanit)
    d.find('h1').each(cleanit)
    d.find('h2').each(cleanit)
    d.find('iframe').each(cleanit)

    ### ORIGINAL COMMENT: Add the footer and contents:
    return (topdiv, d)
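
##### Scoring, as in the original readability.js: a parent element loses 50 points
##### if its class/id looks like page chrome (comment, footer, side...), gains 25 if
##### it looks like article content (post, hentry, entry-content...), gains 1 point
##### per child paragraph with more than 10 characters of text, plus 1 point per comma.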
## function for each call to assign score
def calculatescore(dompart, itsmyfirsttime):
    ### ORIGINAL COMMENT: Initialize readability data
    ppart = dompart.getparent()
    pscore = 0

    if itsmyfirsttime:
        ### ORIGINAL COMMENT: Look for a special classname
        thecls = pq(ppart).attr('class')
        theid = pq(ppart).attr('id')
        if (thecls != None):
            if (badnamere.search(thecls) != None):
                pscore -= 50
            elif (goodclsre.search(thecls) != None):
                pscore += 25
        ### ORIGINAL COMMENT: Look for a special ID
        if (theid != None):
            if (badnamere.search(pq(ppart).attr('id')) != None):
                pscore -= 50
            elif (goodidre.search(pq(ppart).attr('id')) != None):
                pscore += 25

    ### ORIGINAL COMMENT: Add a point for the paragraph found
    if len(pq(dompart).text()) > 10:
        pscore += 1

    ### ORIGINAL COMMENT: Add points for any commas within this paragraph
    pscore += getcharcount(dompart)
    return pscore

def assignscore(idx, dompart):
    global thisURL
    ppart = dompart.getparent()
    scoreboard = roster[thisURL]
    virginscore = (ppart not in scoreboard)
    iscored = calculatescore(dompart, virginscore)
    if virginscore:
        scoreboard[ppart] = iscored
    else:
        scoreboard[ppart] += iscored

### ORIGINAL COMMENT: Get character count
def getcharcount(dompart, *args):
    metric = ","
    if len(args) > 0:
        metric = str(args[0])
    return pq(dompart).text().count(metric)
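
##### killdivs()/cleanit(): junk-removal callbacks used with pyquery's .each() above.
##### killdivs drops a <div> with fewer than 10 commas when it has no paragraphs, any
##### embeds, or more images, list items, or links than paragraphs; cleanit drops forms,
##### objects, tables, headings and iframes that don't carry enough text.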
def killdivs(thing):
    ### ORIGINAL COMMENT: If the number of commas is less than 10 (bad sign) ...
    if getcharcount(thing) < 10:
        dp = thing
        ip = len(dp.find('p'))
        iimg = len(dp.find('img'))
        ili = len(dp.find('li'))
        ia = len(dp.find('a'))
        iembed = len(dp.find('embed'))
        if (iimg > ip or ili > ip or ia > ip or ip == 0 or iembed > 0):
            thing.remove()

def cleanit(thing):
    ### ORIGINAL COMMENT: If the text content isn't laden with words, remove the child:
    minwords = 10000000
    if thing.is_("table"):
        minwords = 250
    if getcharcount(thing, " ") < minwords:
        thing.remove()
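
##### readability(): wrap the extracted content in the Arc90 page chrome -- the hosted
##### readability.css stylesheets, the page <title> as an <h1>, the tools and footer
##### divs -- and make image/link URLs absolute against the page's base URL.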
def readability(rdiv, rcontext, *args, **kwargs):
    toolshtml = """
        <a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>
        <a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>
        <a href='#' onclick='emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>"""

    readheadhtml = """
        <link rel='stylesheet' href='http://lab.arc90.com/experiments/readability/css/readability.css' media='screen' type='text/css' />
        <link rel='stylesheet' href='http://lab.arc90.com/experiments/readability/css/readability-print.css' media='print' type='text/css' />"""

    readfooterhtml = """
        <a href='http://www.arc90.com'><img src='http://lab.arc90.com/experiments/readability/images/footer.png' /></a>
        <div class='footer-right'><span class='version'>Readability version """+readabilityversion+"""</span></div>"""

    readstyle = 'style-newspaper'
    readsize = 'size-large'
    readmargin = 'margin-wide'

    overlay = pq('<div id="readOverlay"></div>')
    innerdiv = pq('<div id="readInner"></div>')
    articletools = pq('<div id="readTools"></div>')
    articlecontent = pq('<div></div>')
    articletitle = pq('<h1>'+unicode(rcontext('title').text()).encode("utf-8")+'</h1>')
    articlefooter = pq('<div id="readFooter"></div>')

    ### ORIGINAL COMMENT: Grab the title from the <title> tag and inject it as the title.
    articlefooter.html(readfooterhtml)
    articletools.html(toolshtml)
    articlecontent.append((rdiv))
    innerdiv.addClass(readmargin).addClass(readsize)
    innerdiv.append((articletitle))
    innerdiv.append((articlecontent))
    overlay.addClass(readstyle)

    if len(rcontext.find('body')) < 1:
        rcontext.append(pq('<body></body>'))

    rcontext.find('body').empty()
    rcontext.find('head').eq(0).append((readheadhtml))
    rcontext.find('body').eq(0).append((innerdiv))
    rcontext.find('body').eq(0).append((articletools))
    rcontext.find('body').eq(0).addClass(readstyle)
    rcontext.find('body').eq(0).append((articlefooter))

    if (kwargs.get('theurlbase') != None):
        tbb = kwargs['theurlbase']
        rcontext.make_links_absolute(base_url=tbb)
        rcontext.find('img').each(lambda aimg: aimg.attr('src').startswith('http://') and (1) or (aimg.attr('src', urlparse.urljoin(tbb, aimg.attr('src')))))

    return rcontext
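
##### txtfile(): small helper that writes its second argument as a UTF-8 encoded file
##### if the target directory exists; returns True on success, False otherwise.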
def txtfile(where, what):
    if os.path.exists(os.path.split(where)[0]):
        print "Writing file %s..." % where
        fff = open(where, 'w')
        fff.write(unicode(what).encode("utf-8"))
        fff.close()
        return True
    else:
        return False

def echousage():
    print "\n"
    print "readability.py 0.1. (c) 2009 Fish. All rights reserved."
    print "\thttp://objectsinspaceandtime.com/"
    print "Adapted from the Readability JavaScript bookmarklet by Arc90 Labs:"
    print "\thttp://lab.arc90.com/2009/03/readability.php"
    print "Usage:"
    print "%s [-wothpv] url [url, url ...]" % __file__
    print "\t-w\t--wtf\t\tPrint this message"
    print "\t-o DIR\t--out=DIR\tSpecify a path for the output files"
    print "\t-t\t--text\t\tSave text extract as a text file"
    print "\t-h\t--html\t\tSave text extract as an HTML file"
    print "\t-p\t--page\t\tSave text extract within a Readability(tm) HTML page"
    print "\t-v\t--view\t\tView HTML files in the browser after generation"
    print "\n"

if __name__ == '__main__':
    main(sys.argv[1:])