PatrickLerner · October 28, 2021 10:57 · Jun 27, 2013 · Jun 27, 2013 · Jun 27, 2013 · Jun 27, 2013
diff --git a/JapNewsToKindle b/JapNewsToKindle
@@ -6,6 +6,8 @@ require 'nokogiri'
 require 'open-uri'
 require 'tmpdir'
 require 'trollop'
+require 'rbconfig'
+$is_windows = (RbConfig::CONFIG['host_os'] =~ /mswin|mingw|cygwin/)
 
 def clean_string (str)
   str.tr('0-9', '０-９').sub('h２', 'h2').sub('h３', 'h3').sub('h４', 'h4')
@@ -188,7 +190,11 @@ eos
       File.open(dir + "/" + fileName + ".opf", 'w') { |file|
         file.write(@opf_file)
       }
-      system "kindlegen \"#{dir + "/" + fileName}.opf\""
+      if $is_windows
+        system "kindlegen.exe \"#{dir + "/" + fileName}.opf\""
+      else
+        system "kindlegen \"#{dir + "/" + fileName}.opf\""
+      end
       FileUtils.cp dir + "/" + fileName + ".mobi", fileName + ".mobi"
     }
   end
@@ -228,7 +234,7 @@ backends.each { |b|
     end
     KindleOutput.new(article, fileName, {:ruby => opts[:ruby], :horizontal => opts[:horizontal]})
 
-    if opts[:open]
+    if opts[:open] and not $is_windows
       system "killall Kindle"
       kindleFilePath = ENV['HOME'] + "/Library/Application Support/Kindle/My Kindle Content/#{fileName}.mobi"
       FileUtils.rm kindleFilePath if File.exists? (kindleFilePath)

diff --git a/JapNewsToKindle b/JapNewsToKindle
@@ -1,5 +1,6 @@
 #!/usr/bin/env ruby
 # encoding: utf-8
+# Version: 0.2a 2013-06-28
 
 require 'nokogiri'
 require 'open-uri'
@@ -196,7 +197,7 @@ end
 # main part
 
 opts = Trollop::options do
-  version "JapNewsToKindle 0.2 (c) 2013 Patrick Lerner [[email protected]]"
+  version "JapNewsToKindle 0.2a (c) 2013 Patrick Lerner [[email protected]]"
   banner <<-EOS
 This program dumps Japanese News websites into a kindle compatible mobi file using Amazon's kindlegen (needs to be in path!).
 
@@ -225,7 +226,7 @@ backends.each { |b|
     else
       fileName = article.get_title(:ruby => false, :clean => true)
     end
-    KindleOutput.new(article, fileName, {:ruby => opts[:ruby]})
+    KindleOutput.new(article, fileName, {:ruby => opts[:ruby], :horizontal => opts[:horizontal]})
 
     if opts[:open]
       system "killall Kindle"

diff --git a/JapNewsToKindle b/JapNewsToKindle
@@ -81,7 +81,7 @@ class NHKArticle < Article
       strip_element_tags lines, 'a'
       c += clean_string(lines.inner_html.to_s)
     end
-    c.sub(/.*<p id="news_textbody">/m, '<p id="news_textbody">').sub(/<div id="news_mkanren"><\/div>.*/, '')
+    c.sub(/.*<p id="news_textbody">/m, '<p id="news_textbody">')
   end
 end
 

diff --git a/JapNewsToKindle b/JapNewsToKindle
@@ -0,0 +1,240 @@
+#!/usr/bin/env ruby
+# encoding: utf-8
+
+require 'nokogiri'
+require 'open-uri'
+require 'tmpdir'
+require 'trollop'
+
+def clean_string (str)
+  str.tr('0-9', '０-９').sub('h２', 'h2').sub('h３', 'h3').sub('h４', 'h4')
+end
+
+def strip_element_tags (node, element_name)
+  node.search('.//' + element_name).each do |e|
+    e.replace e.inner_html
+  end
+end
+
+def strip_ruby_tags (node)
+  node.search('.//rt').remove
+  strip_element_tags(node, 'ruby')
+end
+
+class Article
+  def get_title (options = {})
+    @doc.xpath(@XPath_title).each do |lines|
+      strip_ruby_tags lines if not options[:ruby]
+      return lines.content.to_s if options[:clean]
+      return clean_string(lines.to_s)
+    end
+  end
+
+  def get_date (options = {})
+    @doc.xpath(@XPath_time).each do |lines|
+      strip_element_tags lines, 'span'
+      return clean_string(lines.to_s)
+    end
+  end
+
+  def get_content (options = {:ruby => false})
+    @doc.xpath(@XPath_article).each do |lines|
+      strip_ruby_tags lines if not options[:ruby]
+      strip_element_tags lines, 'span'
+      strip_element_tags lines, 'a'
+      return clean_string(lines.inner_html.to_s)
+    end
+  end
+end
+
+class NHKEasyArticle < Article
+  def initialize (url)
+    @doc = Nokogiri::HTML(open(url))
+    @XPath_title = '//*[@id="newstitle"]/h2'
+    @XPath_time = '//*[@id="newsDate"]'
+    @XPath_article = '//*[@id="newsarticle"]'
+  end
+end
+
+class NHKArticle < Article
+  def initialize (url)
+    @doc = Nokogiri::HTML(open(url))
+    @XPath_title = '//*[@id="news"]/div[2]/div/div/div[1]/h1/span'
+    @XPath_time = '//*[@id="news"]/div[2]/div/div/div[1]/h1/div'
+    @XPath_article = '//*[@id="news"]/div[2]/div/div/div'
+  end
+
+  def get_title (options = {})
+    super.gsub 'span', 'h2'
+  end
+
+  def get_date (options = {})
+    super.gsub('<div class="time">', '<p id="newsDate">[').gsub('</div>', ']</p>')
+  end
+
+  def get_content (options = {:ruby => false})
+    c = ''
+    @doc.xpath(@XPath_article).each do |lines|
+      break if lines.attribute('id').to_s == "news_mkanren"
+      strip_ruby_tags lines if not options[:ruby]
+      strip_element_tags lines, 'span'
+      strip_element_tags lines, 'a'
+      c += clean_string(lines.inner_html.to_s)
+    end
+    c.sub(/.*<p id="news_textbody">/m, '<p id="news_textbody">').sub(/<div id="news_mkanren"><\/div>.*/, '')
+  end
+end
+
+class HTMLOutput
+  def initialize (article, fileName, options = {})
+    title = article.get_title(:ruby => false, :clean => true)
+
+    @horizontal_css = <<eos
+body {
+  font-family: serif; }
+h2, h3 {
+  font-weight: bold;
+  padding-top: 2em;
+  margin-right: 1em;
+  margin-left: 1em; }
+h2 {
+  font-size: 120%; }
+p {
+  text-indent: 1em; }
+#newsDate {
+  font-size: 90%;
+  font-weight:bold;
+  line-height: 1.5; }
+eos
+
+    @vertical_css = <<eos
+body {
+  -webkit-writing-mode: vertical-rl; }
+#newsDate {
+  padding-top: 10em;
+  text-indent: -4em; }
+eos
+  @vertical_css = @horizontal_css + @vertical_css
+
+    @html_header = <<eos
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xml:lang="ja" xmlns="http://www.w3.org/1999/xhtml">
+<head>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+  <meta http-equiv="Content-Style-Type" content="text/css" />
+  <meta name="generator" content="pandoc" />
+  <title>{{TITLE}}</title>
+  <link rel="stylesheet" href="{{CSS_FILE}}" type="text/css" />
+  <link rel="Schema.DC" href="http://purl.org/dc/elements/1.1/" />
+  <meta name="DC.Title" content="{{TITLE}}" />
+  <meta name="DC.Creator" content="NHK" />
+  <meta name="DC.Publisher" content="NHK" /></head>
+<body>
+eos
+
+    @html_footer = <<eos
+</body>
+</html>
+eos
+
+    @html_header.gsub! '{{TITLE}}', title
+    @html_header.gsub! '{{CSS_FILE}}', fileName + ".css"
+
+    File.open(fileName + ".css", 'w') { |file|
+      file.write(@horizontal_css) if options[:horizontal]
+      file.write(@vertical_css) if not options[:horizontal]
+    }
+
+    File.open(fileName + ".html", 'w') { |file|
+      file.write(@html_header.sub('{{CSS_FILE}}', fileName + ".css"))
+      file.write(article.get_title(options))
+      file.write(article.get_date(options))
+      file.write(article.get_content(options))
+      file.write(@html_footer)
+    }
+  end
+end
+
+class KindleOutput
+  def initialize (article, fileName, options = {})
+    title = article.get_title(:ruby => false, :clean => true)
+
+    @opf_file = <<eos
+<?xml version="1.0" encoding="UTF-8"?>
+<package version="3.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">
+ <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/">
+   <dc:title>{{TITLE}}</dc:title> 
+   <dc:contributor>NHK</dc:contributor>
+   <dc:language>ja</dc:language>
+   <dc:publisher>NHK</dc:publisher>
+ </metadata>
+ <manifest>
+  <item id="style" href="{{CSS_FILE}}" media-type="text/css" />
+  <item id="titlepage" href="{{FILENAME}}.html" media-type="application/xhtml+xml" />
+ </manifest>
+ <spine toc="tocncx" page-progression-direction="rtl">
+  <itemref idref="titlepage" />
+ </spine>
+</package>
+eos
+    @opf_file.gsub! '{{TITLE}}', title
+    @opf_file.gsub! '{{FILENAME}}', fileName
+    @opf_file.gsub! '{{CSS_FILE}}', fileName + ".css"
+
+    Dir.mktmpdir { |dir|
+      HTMLOutput.new(article, dir + "/" + fileName, options)
+
+      File.open(dir + "/" + fileName + ".opf", 'w') { |file|
+        file.write(@opf_file)
+      }
+      system "kindlegen \"#{dir + "/" + fileName}.opf\""
+      FileUtils.cp dir + "/" + fileName + ".mobi", fileName + ".mobi"
+    }
+  end
+end
+
+# main part
+
+opts = Trollop::options do
+  version "JapNewsToKindle 0.2 (c) 2013 Patrick Lerner [[email protected]]"
+  banner <<-EOS
+This program dumps Japanese News websites into a kindle compatible mobi file using Amazon's kindlegen (needs to be in path!).
+
+Usage:
+       JapNewsToKindle [options]
+where [options] are:
+EOS
+
+  opt :ruby, "Get furigana if possible", :short => 'r'
+  opt :url, "The URL that is supposed to be dumped", :type => String, :short => 'u'
+  opt :out, "The output filename", :type => String, :short => 'O'
+  opt :horizontal, "Use a horizontal layout instead of the default vertical one", :default => false, :short => 'n'
+  opt :open, "Open the generated file in the Kindle Application", :default => false, :short => 'o'
+end
+
+backends = [
+  [/nhk.or.jp\/news\/easy\/k[0-9]+\/k[0-9]+\.html/, NHKEasyArticle],
+  [/nhk.or.jp\/news\/html\/[0-9]+\/[a-z][0-9]+\.html/, NHKArticle]
+]
+
+backends.each { |b|
+  if b[0].match(opts[:url])
+    article = b[1].new(opts[:url])
+    if opts[:out]
+      fileName = opts[:out]
+    else
+      fileName = article.get_title(:ruby => false, :clean => true)
+    end
+    KindleOutput.new(article, fileName, {:ruby => opts[:ruby]})
+
+    if opts[:open]
+      system "killall Kindle"
+      kindleFilePath = ENV['HOME'] + "/Library/Application Support/Kindle/My Kindle Content/#{fileName}.mobi"
+      FileUtils.rm kindleFilePath if File.exists? (kindleFilePath)
+      system "open \"#{fileName}.mobi\""
+    end
+   exit
+  end
+}
+
+Trollop::die :url, "must match against a backend supported by this program"