python port of Arc90 readability.js (circa 2009)
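Each URL passed on the command line is fetched with urllib2, the main article content is extracted using Arc90's paragraph-scoring heuristics (via pyquery/lxml), and the result is written out as a plain-text extract, a bare HTML extract, and/or a Readability-styled HTML page, depending on the -t / -h / -p flags (see echousage() in the script for the full option list).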
#!/usr/bin/env python
# encoding: utf-8
#####
##### Readability.py -- by fish. © 2009, Some Rights Reserved But Some May Be
##### http://objectsinspaceandtime.com/
#####
##### it's a python port of the Readability JavaScript bookmarklet,
##### by Arc90 Labs --
##### http://lab.arc90.com/2009/03/readability.php
##### http://code.google.com/p/arc90labs-readability/
##### this script and the original are both licenced under the Apache License 2.0.
#####
import sys
import os
import os.path
import re
import getopt
import urllib2
import urlparse
import lxml
from pyquery import PyQuery as pq
from lxml import etree

urls = []
roster = {}
outpaths = {}
thisURL = ""

breakre = re.compile('<br/?>[ \r\n\s]*<br/?>', re.IGNORECASE | re.MULTILINE)
fontre = re.compile('<\/?font[^>]*>', re.IGNORECASE | re.MULTILINE)
fuckingbodyre = re.compile('<\/?body[^>]*?>', re.IGNORECASE)
multibreakre = re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}', re.IGNORECASE | re.MULTILINE)
badnamere = re.compile('(comment|meta|footer|footnote|side)', re.IGNORECASE)
goodclsre = re.compile('((^|\\s)(post|hentry|main|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))', re.IGNORECASE)
goodidre = re.compile('^(post|hentry|main|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$', re.IGNORECASE)

readabilityversion = "0.4"
emailsrc = 'http://proto1.arc90.com/readability/email.php'
iframeLoads = 0
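
##### main(): parse the command-line options, fetch each URL, run it through
##### grabarticle(), and write out the requested files. Example invocations
##### (the first URL is just a placeholder, not a real page):
#####     readability.py -t -p -o ./out http://example.com/some-article.html
#####     readability.py --view http://objectsinspaceandtime.com/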
def main(argv):
    view = False
    toview = []
    textout = False
    htmlout = False
    pageout = True
    termout = True
    basepath = os.getcwd()

    try:
        opts, args = getopt.getopt(argv, "wo:thpv", [
            "wtf",
            "out=",
            "text",
            "html",
            "page",
            "view",
        ])
    except getopt.GetoptError:
        print "Error: Bad GetOpt"
        echousage()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-w", "--wtf"):
            echousage()
            sys.exit(0)
        elif opt in ("-o", "--out"):
            if os.path.exists(arg):
                basepath = os.path.realpath(arg)
        elif opt in ("-t", "--text"):
            textout = True
        elif opt in ("-h", "--html"):
            htmlout = True
        elif opt in ("-p", "--page"):
            pageout = True
        elif opt in ("-v", "--view"):
            view = True

    if (len(args) > 0):
        for urlarg in args:
            if (urlarg.startswith('http://')):
                urls.append(urlarg)
            else:
                print "WARNING: Bad URL: "+unicode(urlarg).encode("utf-8")
    else:
        urls.append('http://objectsinspaceandtime.com/')

    for url in urls:
        urlp = urlparse.urlparse(url)
        urlb = urlparse.urlsplit(url)

        # Derive an output filename: path basename, then hostname, then path, then "unknown".
        fileout = os.path.splitext(os.path.basename(urlp.path))[0]
        if (fileout == "" or fileout == None):
            fileout = re.sub("\.", "_", urlb.netloc)
        if (fileout == "" or fileout == None):
            fileout = os.path.splitext(os.path.dirname(urlb.path))[0]
        if (fileout == "" or fileout == None):
            fileout = "unknown"

        print "\nURL: "+unicode(url).encode("utf-8")

        htmlobject = urllib2.urlopen(url)
        thehtml = htmlobject.read()

        (thediv, cleanpage) = grabarticle(thehtml, url)

        if (textout):
            txtfile(os.path.join(basepath, (fileout+"_extract.txt")), thediv.text())
            toview.append(os.path.join(basepath, (fileout+"_extract.txt")))
        if (htmlout):
            txtfile(os.path.join(basepath, (fileout+"_extract.html")), thediv.html())
            toview.append(os.path.join(basepath, (fileout+"_extract.html")))
        if (pageout):
            txtfile(os.path.join(basepath, (fileout+"_readable.html")), readability(thediv, cleanpage, theurlbase=str(urlb.geturl())))
            toview.append(os.path.join(basepath, (fileout+"_readable.html")))

    if (view and len(toview) > 0):
        # not cross-platform at all
        os.execv('/usr/bin/open', (['open', '-a', '/Applications/Safari.app'] + toview))
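
##### grabarticle(): the core of the port. Scores the parent of every
##### <p>/<blockquote>/<span>, clones the highest-scoring container, strips
##### scripts/styles/junk divs, and returns (topdiv, cleaned document).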
def grabarticle(html, url):
    global thisURL
    thisURL = ""+url
    roster[url] = {}

    ### ORIGINAL COMMENT: Replace all doubled-up <BR> tags with <P> tags, and remove fonts.
    nhtml = breakre.sub('</p><p>', html)
    mhtml = fontre.sub('', nhtml)

    d = pq(mhtml, parser='html')
    allparagraphs = d('p, blockquote, span')
    if len(allparagraphs) < 1:
        allparagraphs = d('span, div')

    topdivcount = 0
    td = None
    topdiv = None

    ### ORIGINAL COMMENT: Study all the paragraphs and find the chunk that has the best score.
    ### ORIGINAL COMMENT: A score is determined by things like: Number of <p>'s, commas, special classes, etc.
    try:
        allparagraphs.map(assignscore)
    except KeyError:
        # but why?
        print 'KEY ERROR!'

    print "\nScoreboard Results:"
    for sk, sv in roster[url].iteritems():
        print "Object: "+str(sk)+"\t\tScore: "+str(sv)
    print "\n"

    ### ORIGINAL COMMENT: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
    scoreboard = roster[url]
    for tk in scoreboard.iterkeys():
        if (td == None):
            td = tk
        elif (scoreboard[tk] > scoreboard[td]):
            td = tk

    if td == None:
        out = "ERROR: This page sucks. The content from: " + str(url) + "... was totally unreadable."
        out += "\nroster:"
        out += unicode(roster).encode("utf-8")
        print out
        return (pq("<div><h1>"+out+"</h1></div>"), url)
    else:
        topdiv = pq(td).clone()

    ### ORIGINAL COMMENT: REMOVES ALL STYLESHEETS ...
    #d('head').find('link').filter(lambda i: pq(this).attr('rel') == 'stylesheet').remove()
    d('head').find('link').remove()

    ### ORIGINAL COMMENT: Remove all style tags in head (not doing this on IE) :
    d('head').find('style').remove()
    d('head').find('script').remove()

    ### ORIGINAL COMMENT: Removes all style attributes
    topdiv.find("*").attr('style', 'void:none;')
    topdiv.find("*").removeAttr('style')

    ### ORIGINAL COMMENT: Goes in and removes DIV's that have more non <p> stuff than <p> stuff
    topdiv.find('div').each(killdivs)
    topdiv.find('script').remove()
    d.find('div').each(killdivs)
    d.find('script').remove()

    ### ORIGINAL COMMENT: Removes any consecutive <br />'s into just one <br />
    #hh = multibreakre.sub(topdiv.html(), '<br />')
    hh = multibreakre.sub('<br />', topdiv.html())
    topdiv = pq("<div></div>").html(hh)

    ### ORIGINAL COMMENT: Cleans out junk from the topDiv just in case:
    topdiv.find('form').each(cleanit)
    topdiv.find('object').each(cleanit)
    topdiv.find('table').each(cleanit)
    topdiv.find('h1').each(cleanit)
    topdiv.find('h2').each(cleanit)
    topdiv.find('iframe').each(cleanit)
    d.find('form').each(cleanit)
    d.find('object').each(cleanit)
    d.find('table').each(cleanit)
    d.find('h1').each(cleanit)
    d.find('h2').each(cleanit)
    d.find('iframe').each(cleanit)

    ### ORIGINAL COMMENT: Add the footer and contents:
    return (topdiv, d)
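
##### Scoring, as in the original readability.js: a parent element loses 50 points
##### if its class/id looks like page chrome (comment, footer, side...), gains 25 if
##### it looks like article content (post, hentry, entry-content...), gains 1 point
##### per child paragraph with more than 10 characters of text, plus 1 point per comma.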
## function for each call to assign score
def calculatescore(dompart, itsmyfirsttime):
    ### ORIGINAL COMMENT: Initialize readability data
    ppart = dompart.getparent()
    pscore = 0

    if itsmyfirsttime:
        ### ORIGINAL COMMENT: Look for a special classname
        thecls = pq(ppart).attr('class')
        theid = pq(ppart).attr('id')
        if (thecls != None):
            if (badnamere.search(thecls) != None):
                pscore -= 50
            elif (goodclsre.search(thecls) != None):
                pscore += 25
        ### ORIGINAL COMMENT: Look for a special ID
        if (theid != None):
            if (badnamere.search(pq(ppart).attr('id')) != None):
                pscore -= 50
            elif (goodidre.search(pq(ppart).attr('id')) != None):
                pscore += 25

    ### ORIGINAL COMMENT: Add a point for the paragraph found
    if len(pq(dompart).text()) > 10:
        pscore += 1

    ### ORIGINAL COMMENT: Add points for any commas within this paragraph
    pscore += getcharcount(dompart)
    return pscore

def assignscore(idx, dompart):
    global thisURL
    ppart = dompart.getparent()
    scoreboard = roster[thisURL]
    virginscore = (ppart not in scoreboard)
    iscored = calculatescore(dompart, virginscore)
    if virginscore:
        scoreboard[ppart] = iscored
    else:
        scoreboard[ppart] += iscored

### ORIGINAL COMMENT: Get character count
def getcharcount(dompart, *args):
    metric = ","
    if len(args) > 0:
        metric = str(args[0])
    return pq(dompart).text().count(metric)
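
##### killdivs()/cleanit(): junk-removal callbacks used with pyquery's .each() above.
##### killdivs drops a <div> with fewer than 10 commas when it has no paragraphs, any
##### embeds, or more images, list items, or links than paragraphs; cleanit drops forms,
##### objects, tables, headings and iframes that don't carry enough text.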
def killdivs(thing):
    ### ORIGINAL COMMENT: If the number of commas is less than 10 (bad sign) ...
    if getcharcount(thing) < 10:
        dp = thing
        ip = len(dp.find('p'))
        iimg = len(dp.find('img'))
        ili = len(dp.find('li'))
        ia = len(dp.find('a'))
        iembed = len(dp.find('embed'))
        if (iimg > ip or ili > ip or ia > ip or ip == 0 or iembed > 0):
            thing.remove()

def cleanit(thing):
    ### ORIGINAL COMMENT: If the text content isn't laden with words, remove the child:
    minwords = 10000000
    if thing.is_("table"):
        minwords = 250
    if getcharcount(thing, " ") < minwords:
        thing.remove()
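
##### readability(): wrap the extracted content in the Arc90 page chrome -- the hosted
##### readability.css stylesheets, the page <title> as an <h1>, the tools and footer
##### divs -- and make image/link URLs absolute against the page's base URL.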
def readability(rdiv, rcontext, *args, **kwargs):
    toolshtml = """
        <a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>
        <a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>
        <a href='#' onclick='emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>"""

    readheadhtml = """
        <link rel='stylesheet' href='http://lab.arc90.com/experiments/readability/css/readability.css' media='screen' type='text/css' />
        <link rel='stylesheet' href='http://lab.arc90.com/experiments/readability/css/readability-print.css' media='print' type='text/css' />"""

    readfooterhtml = """
        <a href='http://www.arc90.com'><img src='http://lab.arc90.com/experiments/readability/images/footer.png' /></a>
        <div class='footer-right'><span class='version'>Readability version """+readabilityversion+"""</span></div>"""

    readstyle = 'style-newspaper'
    readsize = 'size-large'
    readmargin = 'margin-wide'

    overlay = pq('<div id="readOverlay"></div>')
    innerdiv = pq('<div id="readInner"></div>')
    articletools = pq('<div id="readTools"></div>')
    articlecontent = pq('<div></div>')
    articletitle = pq('<h1>'+unicode(rcontext('title').text()).encode("utf-8")+'</h1>')
    articlefooter = pq('<div id="readFooter"></div>')

    ### ORIGINAL COMMENT: Grab the title from the <title> tag and inject it as the title.
    articlefooter.html(readfooterhtml)
    articletools.html(toolshtml)
    articlecontent.append((rdiv))
    innerdiv.addClass(readmargin).addClass(readsize)
    innerdiv.append((articletitle))
    innerdiv.append((articlecontent))
    overlay.addClass(readstyle)

    if len(rcontext.find('body')) < 1:
        rcontext.append(pq('<body></body>'))

    rcontext.find('body').empty()
    rcontext.find('head').eq(0).append((readheadhtml))
    rcontext.find('body').eq(0).append((innerdiv))
    rcontext.find('body').eq(0).append((articletools))
    rcontext.find('body').eq(0).addClass(readstyle)
    rcontext.find('body').eq(0).append((articlefooter))

    if (kwargs.get('theurlbase') != None):
        tbb = kwargs['theurlbase']
        rcontext.make_links_absolute(base_url=tbb)
        rcontext.find('img').each(lambda aimg: aimg.attr('src').startswith('http://') and (1) or (aimg.attr('src', urlparse.urljoin(tbb, aimg.attr('src')))))

    return rcontext
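
##### txtfile(): small helper that writes its second argument as a UTF-8 encoded file
##### if the target directory exists; returns True on success, False otherwise.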
def txtfile(where, what):
    if os.path.exists(os.path.split(where)[0]):
        print "Writing file %s..." % where
        fff = open(where, 'w')
        fff.write(unicode(what).encode("utf-8"))
        fff.close()
        return True
    else:
        return False

def echousage():
    print "\n"
    print "readability.py 0.1. (c) 2009 Fish. All rights reserved."
    print "\thttp://objectsinspaceandtime.com/"
    print "Adapted from the Readability JavaScript bookmarklet by Arc90 Labs:"
    print "\thttp://lab.arc90.com/2009/03/readability.php"
    print "Usage:"
    print "%s [-wothpv] url [url, url ...]" % __file__
    print "\t-w\t--wtf\t\tPrint this message"
    print "\t-o DIR\t--out=DIR\tSpecify a path for the output files"
    print "\t-t\t--text\t\tSave text extract as a text file"
    print "\t-h\t--html\t\tSave text extract as an HTML file"
    print "\t-p\t--page\t\tSave text extract within a Readability(tm) HTML page"
    print "\t-v\t--view\t\tView HTML files in the browser after generation"
    print "\n"

if __name__ == '__main__':
    main(sys.argv[1:])