Last active
December 19, 2015 23:35
-
-
Save henrik/4c53a7fed26fced6704d to your computer and use it in GitHub Desktop.
Dagens Nyheter (DN) PDF downloader for paying subscribers.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ruby script for DN subscribers to download the latest DN as a PDF – because it beats their iPad app.
# By Henrik Nyh 2015-12-08 under the MIT license.
#
# INSTRUCTIONS (for advanced users)
#
# Fetch the script dependencies:
#
#   (sudo) gem install mechanize
#
# Put your username and password in a ~/.dnloader file separated by a ":", e.g.:
#
#   echo "me@example.com:myp4ssw0rd" > ~/.dnloader
#
# Optionally, customize where the files end up, here.
# "~" is expanded to the home directory of the user running the script.
DEST_DIR = File.expand_path("~/Dropbox/DN").freeze
# Optionally, customize how many issues to keep (older PDFs in DEST_DIR are deleted):
MAX_ISSUES = 14
# Now make sure the script works:
#
#   ruby dnloader.rb
#
# You can add a cron job to run this script every few minutes. (If the file is already downloaded, nothing happens.)
# E.g. to run it every 5 minutes:
#
#   */5 * * * * ruby /Users/foo/bin/dnloader.rb > /dev/null
require "date"
require "fileutils"
require "mechanize"
# The issue to fetch is always the one dated today (per Date.today on the machine running the script).
DATE = Date.today
# Subscriber page that is checked for a link to today's PDF before downloading.
PDF_INDEX_URL = "https://kund.dn.se/mitt-konto/dn-som-pdf/"
# Site-relative path of today's PDF; the date is embedded as YYYYMMDD.
PDF_PATH = "/service/download/#{DATE.strftime('%Y%m%d')}/DN.pdf"
PDF_URL = "https://kund.dn.se#{PDF_PATH}"
# Local file name uses Date#to_s (YYYY-MM-DD), so names sort chronologically.
PDF_DEST = File.join(DEST_DIR, "DN_#{DATE}.pdf")
# Matches every previously downloaded issue; used when pruning old files.
REMOVAL_GLOB = File.join(DEST_DIR, "DN_*.pdf")
# Nothing to do if today's issue is already on disk; this is what makes
# running the script repeatedly (e.g. from cron) harmless.
if File.exist?(PDF_DEST)
  puts "Already downloaded: #{PDF_DEST}"
  exit
end

FileUtils.mkdir_p(DEST_DIR)

# Credentials are read from ~/.dnloader in the form "username:password".
CONFIG_FILE = File.expand_path("~/.dnloader")
abort("Missing config file! See docs.") unless File.exist?(CONFIG_FILE)

# Split on the first ":" only, so the password itself may contain colons.
username, password = File.read(CONFIG_FILE).strip.split(":", 2)

# Reject a missing half as well as an empty one (e.g. ":secret" or "user:"),
# which would otherwise be sent to the login form as a blank credential.
if [username, password].any? { |part| part.nil? || part.empty? }
  abort("Missing username or password!")
end
# Log in and download (we need a session to get the PDF)
agent = Mechanize.new
# Hand PDF responses to Mechanize::Download, whose #save is used below.
agent.pluggable_parser.pdf = Mechanize::Download

login_page = agent.get("https://auth.dn.se/login?appId=dagensnyheter.se&lc=sv&callback=http%3A%2F%2Fkund.dn.se%2Fservice%2Floginplus%3Fredirect%3D%2F")

# form_with returns nil when no form matches; fail with a readable message
# instead of a NoMethodError on nil if the login page markup has changed.
login_form = login_page.form_with(id: "loginForm")
abort "Error logging in! (Login form not found.)" unless login_form

login_form.field_with(name: "form.username").value = username
login_form.field_with(name: "form.password").value = password
login_result_page = login_form.submit

# The presence of "Logga ut" ("Log out") on the result page is used as the
# sign that the login succeeded.
unless login_result_page.body.include?("Logga ut")
  # If you want to debug it:
  #p login_result_page
  #puts login_result_page.body
  abort "Error logging in!"
end

# For some reason, they will sometimes make "today's" PDF available right after midnight but with yesterday's content.
# But they don't seem to update links until there is a real issue, so we use that.
page = agent.get(PDF_INDEX_URL)
if page.links_with(href: PDF_PATH).none?
  puts "Seems today's issue is not published yet."
  exit
end

agent.get(PDF_URL).save(PDF_DEST)
puts "Downloaded: #{PDF_DEST}"
# Remove old files.
# File names embed the date as YYYY-MM-DD, so a reverse lexical sort puts the
# newest issue first; everything after the first MAX_ISSUES entries is deleted.
files_to_remove = Dir[REMOVAL_GLOB].sort.reverse.drop(MAX_ISSUES)

unless files_to_remove.empty?
  FileUtils.rm(files_to_remove)
  puts "Removed files: #{files_to_remove.inspect}"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This goes together well with the "synced folders" feature of GoodReader for the iPad.