|
# -*- coding: utf-8 -*- |
|
require 'mp3info' |
|
require 'nokogiri' |
|
|
|
# Converts HTML into a plain text approximation. The main entry point is
# +HtmlToPlainText.plain_text+; +HtmlToPlainText.truncate+ shortens a string to a
# maximum length, appending an omission marker.
class HtmlToPlainText

  # Tag lookup tables, keyed by tag name (the value is always +true+).

  # Elements whose contents are skipped entirely.
  IGNORE_TAGS = %w(script style object applet iframe).each_with_object({}) { |tag, h| h[tag] = true }.freeze
  # Elements surrounded by a blank line (two line breaks).
  PARAGRAPH_TAGS = %w(p h1 h2 h3 h4 h5 h6 table ol ul dl dd blockquote dialog figure aside section).each_with_object({}) { |tag, h| h[tag] = true }.freeze
  # Elements surrounded by a single line break.
  BLOCK_TAGS = %w(div address li dt center del article header footer nav pre legend tr).each_with_object({}) { |tag, h| h[tag] = true }.freeze
  # Row-group wrappers that may sit between a <table> and its <tr> elements.
  TABLE_SECTION_TAGS = %w(thead tbody tfoot).freeze
  WHITESPACE = [" ", "\n", "\r"].freeze
  PLAINTEXT = "plaintext".freeze
  PRE = "pre".freeze
  BR = "br".freeze
  HR = "hr".freeze
  TD = "td".freeze
  TH = "th".freeze
  TR = "tr".freeze
  OL = "ol".freeze
  UL = "ul".freeze
  LI = "li".freeze
  A = "a".freeze
  TABLE = "table".freeze
  # First label for ordered lists; nesting levels alternate between digits and letters.
  NUMBERS = ["1", "a"].freeze
  ABSOLUTE_URL_PATTERN = /^[a-z]+:\/\/[a-z0-9]/i.freeze
  # Cheap test for "might contain markup or entities".
  HTML_PATTERN = /[<&]/.freeze
  TRAILING_WHITESPACE = /[ \t]+$/.freeze
  BODY_TAG_XPATH = "/html/body".freeze
  CARRIDGE_RETURN_PATTERN = /\r(\n?)/.freeze
  LINE_BREAK_PATTERN = /[\n\r]/.freeze
  NON_PROTOCOL_PATTERN = /:\/?\/?(.*)/.freeze
  NOT_WHITESPACE_PATTERN = /\S/.freeze
  SPACE = " ".freeze
  EMPTY = "".freeze
  NEWLINE = "\n".freeze
  HREF = "href".freeze
  TABLE_SEPARATOR = " | ".freeze

  class << self
    # Shorten +str+ to at most +truncate_at+ characters, including the omission marker.
    #
    # Options:
    # * <tt>:omission</tt>  - text appended to a truncated string (default <tt>'...'</tt>).
    # * <tt>:separator</tt> - when given, cut at the last occurrence of this separator that
    #   fits, instead of in the middle of a word.
    #
    # Returns a new string, or +nil+ when +str+ is +nil+ (which +plain_text+ can return).
    # The +options+ hash passed by the caller is never modified.
    def truncate(str, truncate_at, options = {})
      return nil if str.nil?
      return str.dup unless str.length > truncate_at

      omission = options[:omission] || '...'
      # Clamp at zero: a negative index would count from the end of the string and
      # return almost all of it instead of truncating.
      room = [truncate_at - omission.length, 0].max
      stop = if options[:separator]
        str.rindex(options[:separator], room) || room
      else
        room
      end

      "#{str[0...stop]}#{omission}"
    end

    # Convert some HTML into a plain text approximation.
    #
    # Returns +nil+ when +html+ is +nil+ or when the parsed document has no body.
    # Input containing neither "<" nor "&" is returned as a copy without parsing.
    def plain_text(html)
      return nil if html.nil?
      return html.dup unless html =~ HTML_PATTERN
      body = Nokogiri::HTML::Document.parse(html).xpath(BODY_TAG_XPATH).first
      return unless body
      convert_node_to_plain_text(body).strip.gsub(CARRIDGE_RETURN_PATTERN, NEWLINE)
    end

    private

    # Convert an HTML node to plain text. This method is called recursively with the output
    # buffer and the formatting options for special tags. Text is appended to +out+, which is
    # also the return value.
    def convert_node_to_plain_text(parent, out = '', options = {})
      # Leading break before the element's own content.
      if PARAGRAPH_TAGS.include?(parent.name)
        append_paragraph_breaks(out)
      elsif BLOCK_TAGS.include?(parent.name)
        append_block_breaks(out)
      end

      format_list_item(out, options) if parent.name == LI
      out << "| " if parent.name == TR && data_table?(parent.parent)

      parent.children.each do |node|
        if node.text? || node.cdata?
          text = node.text
          unless options[:pre]
            # Outside <pre>, line breaks and runs of spaces collapse to a single space.
            text = text.gsub(LINE_BREAK_PATTERN, SPACE).squeeze(SPACE)
            text.lstrip! if WHITESPACE.include?(out[-1, 1])
          end
          out << text
        elsif node.name == PLAINTEXT
          out << node.text
        elsif node.element? && !IGNORE_TAGS.include?(node.name)
          convert_node_to_plain_text(node, out, child_options(node, options))

          # Trailing formatting after the element's content.
          if node.name == BR
            out.sub!(TRAILING_WHITESPACE, EMPTY)
            out << NEWLINE
          elsif node.name == HR
            out.sub!(TRAILING_WHITESPACE, EMPTY)
            out << NEWLINE unless out.end_with?(NEWLINE)
            out << "-------------------------------\n"
          elsif node.name == TD || node.name == TH
            # +parent+ is the row here, so its parent is the table or a row group.
            out << (data_table?(parent.parent) ? TABLE_SEPARATOR : SPACE)
          elsif node.name == A
            href = node[HREF]
            if href &&
                href =~ ABSOLUTE_URL_PATTERN &&
                node.text =~ NOT_WHITESPACE_PATTERN &&
                node.text != href &&
                node.text != href[NON_PROTOCOL_PATTERN, 1] # use only text for <a href="mailto:a@b.com">a@b.com</a>
              out << " (#{href}) "
            end
          elsif PARAGRAPH_TAGS.include?(node.name)
            append_paragraph_breaks(out)
          elsif BLOCK_TAGS.include?(node.name)
            append_block_breaks(out)
          end
        end
      end

      out
    end

    # Set formatting options that will be passed to child elements for a tag.
    # Lists and <pre> get a new hash; every other tag shares the hash it was given.
    def child_options(node, options)
      if node.name == UL
        level = options[:ul] || -1
        level += 1
        options.merge(:list => :ul, :ul => level)
      elsif node.name == OL
        level = options[:ol] || -1
        level += 1
        options.merge(:list => :ol, :ol => level, :number => NUMBERS[level % 2])
      elsif node.name == PRE
        options.merge(:pre => true)
      else
        options
      end
    end

    # Add double line breaks between paragraph elements. If line breaks already exist,
    # new ones will only be added to get to two.
    def append_paragraph_breaks(out)
      out.sub!(TRAILING_WHITESPACE, EMPTY)
      if out.end_with?(NEWLINE)
        out << NEWLINE unless out.end_with?("\n\n")
      else
        out << "\n\n"
      end
    end

    # Add a single line break between block elements. If a line break already exists,
    # none will be added.
    def append_block_breaks(out)
      out.sub!(TRAILING_WHITESPACE, EMPTY)
      out << NEWLINE unless out.end_with?(NEWLINE)
    end

    # Add an appropriate bullet or number to a list element.
    def format_list_item(out, options)
      if options[:list] == :ul
        out << "#{'*' * (options[:ul] + 1)} "
      elsif options[:list] == :ol
        number = options[:number]
        # Deliberate mutation: sibling <li> elements share this hash, so advancing
        # the counter here numbers the next item.
        options[:number] = number.next
        out << "#{number}. "
      end
    end

    # True when the table enclosing a row has a border attribute greater than zero.
    # +table+ is the row's parent; if that is a <thead>/<tbody>/<tfoot> wrapper, its own
    # parent is inspected instead so wrapped rows are still recognised.
    def data_table?(table)
      table = table.parent while table && TABLE_SECTION_TAGS.include?(table.name)
      return false unless table
      table.attributes['border'].to_s.to_i > 0
    end
  end
end
|
|
|
# Builder template for the podcast RSS feed (RSS 2.0 with iTunes and Atom extensions).
# Relies on objects supplied by the template context: +xml+, +config+, +blog+ and the
# helpers +image_path+, +tracked_url+, +podcast_url+ and +podcast_source_path+.
xml.instruct!
xml.rss 'xmlns:itunes' => 'http://www.itunes.com/dtds/podcast-1.0.dtd', 'xmlns:atom' => 'http://www.w3.org/2005/Atom', :version => '2.0' do
  xml.channel do
    author = 'John Frank'
    feed_url = URI.join(config[:blog_url], blog.options.prefix.to_s, 'podcast.xml')

    xml.title config[:blog_title]
    xml.description config[:blog_description]
    xml.link feed_url
    xml.atom :link, 'rel' => 'self', 'href' => feed_url
    xml.language 'en-CA'

    # The feed dates come from the first article; skip them when there are no
    # articles instead of failing on nil.
    latest_article = blog.articles.first
    if latest_article
      xml.lastBuildDate latest_article.date.rfc2822
      xml.pubDate latest_article.date.rfc2822
    end

    xml.itunes :author, author
    xml.itunes :keywords, config[:blog_keywords].join(', ')
    xml.itunes :explicit, (config[:blog_clean] ? 'clean' : 'yes')
    xml.itunes :image, :href => URI.join(config[:blog_url], image_path('icon.png'))
    # plain_text may return nil; to_s keeps truncate from failing on it.
    xml.itunes :summary, HtmlToPlainText.truncate(HtmlToPlainText.plain_text(config[:blog_description]).to_s, 1950)
    xml.itunes :owner do
      xml.itunes :name, author
      xml.itunes :email, 'john@example.com'
    end
    xml.itunes :category, :text => 'Science & Medicine' do
      xml.itunes :category, :text => 'Medicine'
    end

    blog.articles.each do |article|
      audio_path = podcast_source_path(article)
      article_url = URI.join(config[:blog_url], article.url)

      xml.item do
        xml.title article.title
        xml.pubDate article.date.rfc2822
        xml.enclosure :url => tracked_url(podcast_url(article)), :length => File.size(audio_path), :type => 'audio/mpeg'
        xml.link article_url
        xml.guid({ :isPermaLink => true }, article_url)
        xml.itunes :author, author
        xml.itunes :summary do
          xml.cdata! HtmlToPlainText.truncate(HtmlToPlainText.plain_text(article.body).to_s, 3950)
        end
        # Block form of Mp3Info.open closes the MP3 file once the duration is read,
        # rather than leaving one handle open per episode.
        xml.itunes :duration, Mp3Info.open(audio_path) { |mp3| mp3.length.to_i }
        # xml.description do
        #   xml.cdata! article.body + partial(:audio_tag, :locals => { :article => article })
        # end
        # Most RSS readers will pull out the link to the enclosure, so no need to include it here.
        xml.description do
          xml.cdata! article.body
        end
      end
    end
  end
end