Skip to content

Instantly share code, notes, and snippets.

@PatrickLerner
Last active October 28, 2021 10:57

Revisions

  1. Patrick Lerner revised this gist Jun 27, 2013. 1 changed file with 8 additions and 2 deletions.
    10 changes: 8 additions & 2 deletions JapNewsToKindle
    Original file line number Diff line number Diff line change
    @@ -6,6 +6,8 @@ require 'nokogiri'
    require 'open-uri'
    require 'tmpdir'
    require 'trollop'
    require 'rbconfig'
    $is_windows = (RbConfig::CONFIG['host_os'] =~ /mswin|mingw|cygwin/)

    def clean_string (str)
    str.tr('0-9', '0-9').sub('h2', 'h2').sub('h3', 'h3').sub('h4', 'h4')
    @@ -188,7 +190,11 @@ eos
    File.open(dir + "/" + fileName + ".opf", 'w') { |file|
    file.write(@opf_file)
    }
    system "kindlegen \"#{dir + "/" + fileName}.opf\""
    if $is_windows
    system "kindlegen.exe \"#{dir + "/" + fileName}.opf\""
    else
    system "kindlegen \"#{dir + "/" + fileName}.opf\""
    end
    FileUtils.cp dir + "/" + fileName + ".mobi", fileName + ".mobi"
    }
    end
    @@ -228,7 +234,7 @@ backends.each { |b|
    end
    KindleOutput.new(article, fileName, {:ruby => opts[:ruby], :horizontal => opts[:horizontal]})

    if opts[:open]
    if opts[:open] and not $is_windows
    system "killall Kindle"
    kindleFilePath = ENV['HOME'] + "/Library/Application Support/Kindle/My Kindle Content/#{fileName}.mobi"
    FileUtils.rm kindleFilePath if File.exists? (kindleFilePath)
  2. Patrick Lerner revised this gist Jun 27, 2013. 1 changed file with 3 additions and 2 deletions.
    5 changes: 3 additions & 2 deletions JapNewsToKindle
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,6 @@
    #!/usr/bin/env ruby
    # encoding: utf-8
    # Version: 0.2a 2013-06-28

    require 'nokogiri'
    require 'open-uri'
    @@ -196,7 +197,7 @@ end
    # main part

    opts = Trollop::options do
    version "JapNewsToKindle 0.2 (c) 2013 Patrick Lerner [[email protected]]"
    version "JapNewsToKindle 0.2a (c) 2013 Patrick Lerner [[email protected]]"
    banner <<-EOS
    This program dumps Japanese News websites into a kindle compatible mobi file using Amazon's kindlegen (needs to be in path!).
    @@ -225,7 +226,7 @@ backends.each { |b|
    else
    fileName = article.get_title(:ruby => false, :clean => true)
    end
    KindleOutput.new(article, fileName, {:ruby => opts[:ruby]})
    KindleOutput.new(article, fileName, {:ruby => opts[:ruby], :horizontal => opts[:horizontal]})

    if opts[:open]
    system "killall Kindle"
  3. Patrick Lerner revised this gist Jun 27, 2013. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion JapNewsToKindle
    Original file line number Diff line number Diff line change
    @@ -81,7 +81,7 @@ class NHKArticle < Article
    strip_element_tags lines, 'a'
    c += clean_string(lines.inner_html.to_s)
    end
    c.sub(/.*<p id="news_textbody">/m, '<p id="news_textbody">').sub(/<div id="news_mkanren"><\/div>.*/, '')
    c.sub(/.*<p id="news_textbody">/m, '<p id="news_textbody">')
    end
    end

  4. Patrick Lerner created this gist Jun 27, 2013.
    240 changes: 240 additions & 0 deletions JapNewsToKindle
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,240 @@
    #!/usr/bin/env ruby
    # encoding: utf-8

    require 'nokogiri'
    require 'open-uri'
    require 'tmpdir'
    require 'trollop'

    # Applies the script's digit and heading-tag replacements to an HTML fragment.
    #
    # NOTE(review): as written, every replacement maps a string onto itself, so
    # the result is an unchanged copy of +str+. Presumably one side of each pair
    # originally held full-width characters that were normalised away somewhere
    # along the way -- confirm against the upstream script before relying on it.
    #
    # @param str [String] HTML fragment to clean
    # @return [String] a new string; +str+ itself is not modified
    def clean_string(str)
      heading_pairs = [['h2', 'h2'], ['h3', 'h3'], ['h4', 'h4']]
      digits_done = str.tr('0-9', '0-9')

      # String#sub only touches the first occurrence of each tag name.
      heading_pairs.inject(digits_done) do |text, (from, to)|
        text.sub(from, to)
      end
    end

    # Unwraps every descendant element called +element_name+: each matching
    # element is replaced by its own inner HTML, so the tag disappears while
    # its contents stay in place.
    #
    # @param node [#search] node whose descendants are searched
    # @param element_name [String] tag name to unwrap (appended to the './/' XPath prefix)
    # @return the collection returned by +node.search+
    def strip_element_tags(node, element_name)
      xpath = './/' + element_name
      matches = node.search(xpath)
      matches.each { |element| element.replace(element.inner_html) }
    end

    # Removes furigana markup below +node+: every <rt> element is deleted
    # outright, then each <ruby> wrapper is unwrapped via strip_element_tags
    # so that only its remaining contents stay in the document.
    #
    # @param node [#search] node to clean in place
    # @return whatever strip_element_tags returns
    def strip_ruby_tags(node)
      readings = node.search('.//rt')
      readings.remove
      strip_element_tags(node, 'ruby')
    end

    # Common extraction logic for a parsed news page.
    #
    # The class reads @doc (a parsed document responding to #xpath) together
    # with @XPath_title, @XPath_time and @XPath_article; it assigns none of
    # them itself, so they have to be set before any getter is called.
    #
    # Every getter works on the FIRST node its XPath matches. When nothing
    # matches, the (empty) collection returned by #xpath is returned as is.
    class Article
      # @param options [Hash] :ruby  - truthy keeps furigana markup
      #                       :clean - truthy returns the node's text content
      #                                instead of its cleaned markup
      # @return [String] title text or cleaned title markup
      def get_title(options = {})
        first_match(@XPath_title) do |node|
          strip_ruby_tags(node) unless options[:ruby]
          options[:clean] ? node.content.to_s : clean_string(node.to_s)
        end
      end

      # @param options [Hash] accepted for symmetry with the other getters; unused
      # @return [String] cleaned markup of the date node with <span> tags unwrapped
      def get_date(options = {})
        first_match(@XPath_time) do |node|
          strip_element_tags(node, 'span')
          clean_string(node.to_s)
        end
      end

      # @param options [Hash] :ruby - truthy keeps furigana markup
      # @return [String] cleaned inner HTML of the article node with <span>
      #   and <a> tags unwrapped
      def get_content(options = {:ruby => false})
        first_match(@XPath_article) do |node|
          strip_ruby_tags(node) unless options[:ruby]
          %w[span a].each { |tag| strip_element_tags(node, tag) }
          clean_string(node.inner_html.to_s)
        end
      end

      private

      # Yields the first node matching +xpath+ and returns the block's value.
      # With no match the loop body never runs and #each's own return value
      # (the collection) is handed back.
      def first_match(xpath)
        @doc.xpath(xpath).each { |node| return yield(node) }
      end
    end

    # Article backend whose XPath selectors target the "newstitle", "newsDate"
    # and "newsarticle" element ids.
    class NHKEasyArticle < Article
      # Downloads and parses the page at +url+.
      #
      # @param url [String] absolute http(s) URL of the article
      # @raise [URI::InvalidURIError] if +url+ cannot be parsed
      def initialize(url)
        # Use open-uri's URI#open rather than Kernel#open: since Ruby 3.0
        # Kernel#open no longer fetches URLs, and on any version it would run
        # a command for input starting with "|".
        @doc = Nokogiri::HTML(URI.parse(url).open)
        @XPath_title = '//*[@id="newstitle"]/h2'
        @XPath_time = '//*[@id="newsDate"]'
        @XPath_article = '//*[@id="newsarticle"]'
      end
    end

    # Article backend for pages whose content sits below the element with
    # id "news". The getters rewrite the page's markup into the same
    # <h2> / <p id="newsDate"> shape that NHKEasyArticle's selectors pick up.
    class NHKArticle < Article
      # Downloads and parses the page at +url+.
      #
      # @param url [String] absolute http(s) URL of the article
      # @raise [URI::InvalidURIError] if +url+ cannot be parsed
      def initialize(url)
        # Use open-uri's URI#open rather than Kernel#open: since Ruby 3.0
        # Kernel#open no longer fetches URLs, and on any version it would run
        # a command for input starting with "|".
        @doc = Nokogiri::HTML(URI.parse(url).open)
        @XPath_title = '//*[@id="news"]/div[2]/div/div/div[1]/h1/span'
        @XPath_time = '//*[@id="news"]/div[2]/div/div/div[1]/h1/div'
        @XPath_article = '//*[@id="news"]/div[2]/div/div/div'
      end

      # @return [String] the inherited title with every "span" replaced by "h2"
      def get_title(options = {})
        super.gsub 'span', 'h2'
      end

      # @return [String] the inherited date, rewrapped as <p id="newsDate">[...]</p>
      def get_date(options = {})
        super.gsub('<div class="time">', '<p id="newsDate">[').gsub('</div>', ']</p>')
      end

      # Concatenates the cleaned inner HTML of every article node that comes
      # before the node with id "news_mkanren".
      #
      # @param options [Hash] :ruby - truthy keeps furigana markup
      # @return [String] article markup starting at <p id="news_textbody">
      def get_content(options = {:ruby => false})
        wanted = @doc.xpath(@XPath_article).take_while do |node|
          node.attribute('id').to_s != "news_mkanren"
        end

        # map + join avoids re-allocating the accumulator on every node.
        html = wanted.map do |node|
          strip_ruby_tags(node) unless options[:ruby]
          strip_element_tags(node, 'span')
          strip_element_tags(node, 'a')
          clean_string(node.inner_html.to_s)
        end.join

        # Drop everything in front of the text body, then cut from an empty
        # news_mkanren div to the end of that line.
        html.sub(/.*<p id="news_textbody">/m, '<p id="news_textbody">').sub(/<div id="news_mkanren"><\/div>.*/, '')
      end
    end

    # Writes an article to disk as an XHTML page plus the stylesheet it links.
    class HTMLOutput
      # Characters that have to be escaped inside XML text and attribute values.
      XML_ESCAPES = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;' }.freeze

      # Creates "<fileName>.css" and "<fileName>.html".
      #
      # @param article [#get_title, #get_date, #get_content] source of the page parts
      # @param fileName [String] output path without extension
      # @param options [Hash] :horizontal - truthy writes the horizontal
      #   stylesheet, otherwise the vertical one. The whole hash is also
      #   forwarded to the article getters.
      def initialize(article, fileName, options = {})
        # The plain-text title lands in <title> and in an attribute value, so
        # it must be escaped to keep the page well-formed.
        title = escape_xml(article.get_title(:ruby => false, :clean => true))
        # The stylesheet is written next to the page; link it by bare name so
        # the page does not depend on the directory it was generated in.
        css_href = escape_xml(File.basename(fileName) + ".css")

        # Squiggly heredocs: the terminator may be indented and the common
        # leading whitespace is stripped from the content.
        @horizontal_css = <<~EOS
          body {
          font-family: serif; }
          h2, h3 {
          font-weight: bold;
          padding-top: 2em;
          margin-right: 1em;
          margin-left: 1em; }
          h2 {
          font-size: 120%; }
          p {
          text-indent: 1em; }
          #newsDate {
          font-size: 90%;
          font-weight:bold;
          line-height: 1.5; }
        EOS

        # The vertical layout is the horizontal sheet plus overrides.
        @vertical_css = @horizontal_css + <<~EOS
          body {
          -webkit-writing-mode: vertical-rl; }
          #newsDate {
          padding-top: 10em;
          text-indent: -4em; }
        EOS

        @html_header = <<~EOS
          <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
          <html xml:lang="ja" xmlns="http://www.w3.org/1999/xhtml">
          <head>
          <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
          <meta http-equiv="Content-Style-Type" content="text/css" />
          <meta name="generator" content="pandoc" />
          <title>{{TITLE}}</title>
          <link rel="stylesheet" href="{{CSS_FILE}}" type="text/css" />
          <link rel="Schema.DC" href="http://purl.org/dc/elements/1.1/" />
          <meta name="DC.Title" content="{{TITLE}}" />
          <meta name="DC.Creator" content="NHK" />
          <meta name="DC.Publisher" content="NHK" /></head>
          <body>
        EOS

        @html_footer = <<~EOS
          </body>
          </html>
        EOS

        # Block form: a replacement STRING would interpret backslash
        # sequences (\1, \0, ...) that happen to occur in the title.
        @html_header = @html_header.gsub('{{TITLE}}') { title }
                                   .gsub('{{CSS_FILE}}') { css_href }

        File.open(fileName + ".css", 'w') do |file|
          file.write(options[:horizontal] ? @horizontal_css : @vertical_css)
        end

        File.open(fileName + ".html", 'w') do |file|
          file.write(@html_header)
          file.write(article.get_title(options))
          file.write(article.get_date(options))
          file.write(article.get_content(options))
          file.write(@html_footer)
        end
      end

      private

      # Escapes &, <, > and " so +text+ is safe in XML text and attributes.
      def escape_xml(text)
        text.gsub(/[&<>"]/, XML_ESCAPES)
      end
    end

    # Builds "<fileName>.mobi" in the current directory by rendering the
    # article with HTMLOutput into a temporary directory, writing an OPF
    # package file beside it and running Amazon's kindlegen on that package.
    class KindleOutput
      # Characters that have to be escaped inside XML text and attribute values.
      XML_ESCAPES = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;' }.freeze

      # @param article [#get_title, #get_date, #get_content] article to convert
      # @param fileName [String] base name (no extension) of the generated files
      # @param options [Hash] forwarded unchanged to HTMLOutput
      # @raise [Errno::ENOENT] from the final copy if kindlegen produced no .mobi
      def initialize(article, fileName, options = {})
        # Title and file name may come from the scraped page; escape them so
        # the package file stays well-formed XML.
        substitutions = {
          '{{TITLE}}' => escape_xml(article.get_title(:ruby => false, :clean => true)),
          '{{FILENAME}}' => escape_xml(fileName),
          '{{CSS_FILE}}' => escape_xml(fileName + ".css")
        }

        # Squiggly heredoc: the terminator may be indented and the common
        # leading whitespace is stripped, so the XML declaration still starts
        # at the first byte of the file.
        template = <<~EOS
          <?xml version="1.0" encoding="UTF-8"?>
          <package version="3.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">
          <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/">
          <dc:title>{{TITLE}}</dc:title>
          <dc:contributor>NHK</dc:contributor>
          <dc:language>ja</dc:language>
          <dc:publisher>NHK</dc:publisher>
          </metadata>
          <manifest>
          <item id="style" href="{{CSS_FILE}}" media-type="text/css" />
          <item id="titlepage" href="{{FILENAME}}.html" media-type="application/xhtml+xml" />
          </manifest>
          <spine toc="tocncx" page-progression-direction="rtl">
          <itemref idref="titlepage" />
          </spine>
          </package>
        EOS

        # Single pass with a block: inserted values are neither re-scanned
        # for placeholders nor interpreted for backslash sequences.
        @opf_file = template.gsub(/\{\{(?:TITLE|FILENAME|CSS_FILE)\}\}/) { |key| substitutions.fetch(key) }

        Dir.mktmpdir do |dir|
          base = File.join(dir, fileName)
          HTMLOutput.new(article, base, options)

          File.open(base + ".opf", 'w') { |file| file.write(@opf_file) }

          # Argument-list form: no shell is involved, so quotes, backticks or
          # "$(...)" in a scraped title cannot run commands.
          system("kindlegen", base + ".opf")
          FileUtils.cp(base + ".mobi", fileName + ".mobi")
        end
      end

      private

      # Escapes &, <, > and " so +text+ is safe in XML text and attributes.
      def escape_xml(text)
        text.gsub(/[&<>"]/, XML_ESCAPES)
      end
    end

    # main part
    #
    # Parses the command line, picks the first backend whose pattern matches
    # the given URL, converts the article to "<name>.mobi" and, on request,
    # opens the result in the Kindle application.

    opts = Trollop::options do
      # NOTE(review): the bracketed address looks like a redaction placeholder
      # rather than a real e-mail address -- confirm against the upstream script.
      version "JapNewsToKindle 0.2 (c) 2013 Patrick Lerner [[email protected]]"
      banner <<-EOS
    This program dumps Japanese News websites into a kindle compatible mobi file using Amazon's kindlegen (needs to be in path!).
    Usage:
    JapNewsToKindle [options]
    where [options] are:
    EOS

      opt :ruby, "Get furigana if possible", :short => 'r'
      opt :url, "The URL that is supposed to be dumped", :type => String, :short => 'u'
      opt :out, "The output filename", :type => String, :short => 'O'
      opt :horizontal, "Use a horizontal layout instead of the default vertical one", :default => false, :short => 'n'
      opt :open, "Open the generated file in the Kindle Application", :default => false, :short => 'o'
    end

    # [URL pattern, article class] pairs; the first matching pattern wins.
    # Dots in the host name are escaped so they only match a literal ".".
    backends = [
      [%r{nhk\.or\.jp/news/easy/k[0-9]+/k[0-9]+\.html}, NHKEasyArticle],
      [%r{nhk\.or\.jp/news/html/[0-9]+/[a-z][0-9]+\.html}, NHKArticle]
    ]

    _, backend = backends.find { |pattern, _klass| pattern.match(opts[:url]) }
    Trollop::die :url, "must match against a backend supported by this program" unless backend

    article = backend.new(opts[:url])
    # Without --out the book is named after the plain-text article title.
    fileName = opts[:out] || article.get_title(:ruby => false, :clean => true)

    # Forward :horizontal as well; otherwise the command-line switch is
    # parsed but never reaches the stylesheet selection.
    KindleOutput.new(article, fileName, {:ruby => opts[:ruby], :horizontal => opts[:horizontal]})

    if opts[:open]
      # Argument-list form of system: no shell, so characters in the file
      # name (which may come from the scraped title) cannot run commands.
      system("killall", "Kindle")
      kindleFilePath = ENV['HOME'] + "/Library/Application Support/Kindle/My Kindle Content/#{fileName}.mobi"
      # File.exist? -- the File.exists? alias was removed in Ruby 3.2.
      FileUtils.rm(kindleFilePath) if File.exist?(kindleFilePath)
      system("open", "#{fileName}.mobi")
    end