Last active
October 28, 2021 10:57
Revisions
-
Patrick Lerner revised this gist
Jun 27, 2013 . 1 changed file with 8 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -6,6 +6,8 @@ require 'nokogiri' require 'open-uri' require 'tmpdir' require 'trollop' require 'rbconfig' $is_windows = (RbConfig::CONFIG['host_os'] =~ /mswin|mingw|cygwin/) def clean_string (str) str.tr('0-9', '0-9').sub('h2', 'h2').sub('h3', 'h3').sub('h4', 'h4') @@ -188,7 +190,11 @@ eos File.open(dir + "/" + fileName + ".opf", 'w') { |file| file.write(@opf_file) } if $is_windows system "kindlegen.exe \"#{dir + "/" + fileName}.opf\"" else system "kindlegen \"#{dir + "/" + fileName}.opf\"" end FileUtils.cp dir + "/" + fileName + ".mobi", fileName + ".mobi" } end @@ -228,7 +234,7 @@ backends.each { |b| end KindleOutput.new(article, fileName, {:ruby => opts[:ruby], :horizontal => opts[:horizontal]}) if opts[:open] and not $is_windows system "killall Kindle" kindleFilePath = ENV['HOME'] + "/Library/Application Support/Kindle/My Kindle Content/#{fileName}.mobi" FileUtils.rm kindleFilePath if File.exists? (kindleFilePath) -
Patrick Lerner revised this gist
Jun 27, 2013 . 1 changed file with 3 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,5 +1,6 @@ #!/usr/bin/env ruby # encoding: utf-8 # Version: 0.2a 2013-06-28 require 'nokogiri' require 'open-uri' @@ -196,7 +197,7 @@ end # main part opts = Trollop::options do version "JapNewsToKindle 0.2a (c) 2013 Patrick Lerner [[email protected]]" banner <<-EOS This program dumps Japanese News websites into a kindle compatible mobi file using Amazon's kindlegen (needs to be in path!). @@ -225,7 +226,7 @@ backends.each { |b| else fileName = article.get_title(:ruby => false, :clean => true) end KindleOutput.new(article, fileName, {:ruby => opts[:ruby], :horizontal => opts[:horizontal]}) if opts[:open] system "killall Kindle" -
Patrick Lerner revised this gist
Jun 27, 2013 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -81,7 +81,7 @@ class NHKArticle < Article strip_element_tags lines, 'a' c += clean_string(lines.inner_html.to_s) end c.sub(/.*<p id="news_textbody">/m, '<p id="news_textbody">') end end -
Patrick Lerner created this gist
Jun 27, 2013 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,240 @@ #!/usr/bin/env ruby # encoding: utf-8 require 'nokogiri' require 'open-uri' require 'tmpdir' require 'trollop' def clean_string (str) str.tr('0-9', '0-9').sub('h2', 'h2').sub('h3', 'h3').sub('h4', 'h4') end def strip_element_tags (node, element_name) node.search('.//' + element_name).each do |e| e.replace e.inner_html end end def strip_ruby_tags (node) node.search('.//rt').remove strip_element_tags(node, 'ruby') end class Article def get_title (options = {}) @doc.xpath(@XPath_title).each do |lines| strip_ruby_tags lines if not options[:ruby] return lines.content.to_s if options[:clean] return clean_string(lines.to_s) end end def get_date (options = {}) @doc.xpath(@XPath_time).each do |lines| strip_element_tags lines, 'span' return clean_string(lines.to_s) end end def get_content (options = {:ruby => false}) @doc.xpath(@XPath_article).each do |lines| strip_ruby_tags lines if not options[:ruby] strip_element_tags lines, 'span' strip_element_tags lines, 'a' return clean_string(lines.inner_html.to_s) end end end class NHKEasyArticle < Article def initialize (url) @doc = Nokogiri::HTML(open(url)) @XPath_title = '//*[@id="newstitle"]/h2' @XPath_time = '//*[@id="newsDate"]' @XPath_article = '//*[@id="newsarticle"]' end end class NHKArticle < Article def initialize (url) @doc = Nokogiri::HTML(open(url)) @XPath_title = '//*[@id="news"]/div[2]/div/div/div[1]/h1/span' @XPath_time = '//*[@id="news"]/div[2]/div/div/div[1]/h1/div' @XPath_article = '//*[@id="news"]/div[2]/div/div/div' end def get_title (options = {}) super.gsub 'span', 'h2' end def get_date (options = {}) super.gsub('<div class="time">', '<p id="newsDate">[').gsub('</div>', ']</p>') end def get_content (options = {:ruby => false}) c = '' @doc.xpath(@XPath_article).each do |lines| break if lines.attribute('id').to_s == "news_mkanren" strip_ruby_tags lines if not options[:ruby] strip_element_tags lines, 'span' strip_element_tags lines, 'a' c += clean_string(lines.inner_html.to_s) end c.sub(/.*<p id="news_textbody">/m, '<p id="news_textbody">').sub(/<div id="news_mkanren"><\/div>.*/, '') end end class HTMLOutput def initialize (article, fileName, options = {}) title = article.get_title(:ruby => false, :clean => true) @horizontal_css = <<eos body { font-family: serif; } h2, h3 { font-weight: bold; padding-top: 2em; margin-right: 1em; margin-left: 1em; } h2 { font-size: 120%; } p { text-indent: 1em; } #newsDate { font-size: 90%; font-weight:bold; line-height: 1.5; } eos @vertical_css = <<eos body { -webkit-writing-mode: vertical-rl; } #newsDate { padding-top: 10em; text-indent: -4em; } eos @vertical_css = @horizontal_css + @vertical_css @html_header = <<eos <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xml:lang="ja" xmlns="http://www.w3.org/1999/xhtml"> <head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <meta http-equiv="Content-Style-Type" content="text/css" /> <meta name="generator" content="pandoc" /> <title>{{TITLE}}</title> <link rel="stylesheet" href="{{CSS_FILE}}" type="text/css" /> <link rel="Schema.DC" href="http://purl.org/dc/elements/1.1/" /> <meta name="DC.Title" content="{{TITLE}}" /> <meta name="DC.Creator" content="NHK" /> <meta name="DC.Publisher" content="NHK" /></head> <body> eos @html_footer = <<eos </body> </html> eos @html_header.gsub! '{{TITLE}}', title @html_header.gsub! '{{CSS_FILE}}', fileName + ".css" File.open(fileName + ".css", 'w') { |file| file.write(@horizontal_css) if options[:horizontal] file.write(@vertical_css) if not options[:horizontal] } File.open(fileName + ".html", 'w') { |file| file.write(@html_header.sub('{{CSS_FILE}}', fileName + ".css")) file.write(article.get_title(options)) file.write(article.get_date(options)) file.write(article.get_content(options)) file.write(@html_footer) } end end class KindleOutput def initialize (article, fileName, options = {}) title = article.get_title(:ruby => false, :clean => true) @opf_file = <<eos <?xml version="1.0" encoding="UTF-8"?> <package version="3.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId"> <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/"> <dc:title>{{TITLE}}</dc:title> <dc:contributor>NHK</dc:contributor> <dc:language>ja</dc:language> <dc:publisher>NHK</dc:publisher> </metadata> <manifest> <item id="style" href="{{CSS_FILE}}" media-type="text/css" /> <item id="titlepage" href="{{FILENAME}}.html" media-type="application/xhtml+xml" /> </manifest> <spine toc="tocncx" page-progression-direction="rtl"> <itemref idref="titlepage" /> </spine> </package> eos @opf_file.gsub! '{{TITLE}}', title @opf_file.gsub! '{{FILENAME}}', fileName @opf_file.gsub! '{{CSS_FILE}}', fileName + ".css" Dir.mktmpdir { |dir| HTMLOutput.new(article, dir + "/" + fileName, options) File.open(dir + "/" + fileName + ".opf", 'w') { |file| file.write(@opf_file) } system "kindlegen \"#{dir + "/" + fileName}.opf\"" FileUtils.cp dir + "/" + fileName + ".mobi", fileName + ".mobi" } end end # main part opts = Trollop::options do version "JapNewsToKindle 0.2 (c) 2013 Patrick Lerner [[email protected]]" banner <<-EOS This program dumps Japanese News websites into a kindle compatible mobi file using Amazon's kindlegen (needs to be in path!). Usage: JapNewsToKindle [options] where [options] are: EOS opt :ruby, "Get furigana if possible", :short => 'r' opt :url, "The URL that is supposed to be dumped", :type => String, :short => 'u' opt :out, "The output filename", :type => String, :short => 'O' opt :horizontal, "Use a horizontal layout instead of the default vertical one", :default => false, :short => 'n' opt :open, "Open the generated file in the Kindle Application", :default => false, :short => 'o' end backends = [ [/nhk.or.jp\/news\/easy\/k[0-9]+\/k[0-9]+\.html/, NHKEasyArticle], [/nhk.or.jp\/news\/html\/[0-9]+\/[a-z][0-9]+\.html/, NHKArticle] ] backends.each { |b| if b[0].match(opts[:url]) article = b[1].new(opts[:url]) if opts[:out] fileName = opts[:out] else fileName = article.get_title(:ruby => false, :clean => true) end KindleOutput.new(article, fileName, {:ruby => opts[:ruby]}) if opts[:open] system "killall Kindle" kindleFilePath = ENV['HOME'] + "/Library/Application Support/Kindle/My Kindle Content/#{fileName}.mobi" FileUtils.rm kindleFilePath if File.exists? (kindleFilePath) system "open \"#{fileName}.mobi\"" end exit end } Trollop::die :url, "must match against a backend supported by this program"