Last active
August 29, 2015 14:22
-
-
Save masciugo/f2f6048cf9a5541cd0e4 to your computer and use it in GitHub Desktop.
download quotidie
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'date'
require 'fileutils'
require 'open-uri'

require 'byebug'
require 'mechanize'
# A newspaper to look for on the listing page.
#
# final_name - name used in log messages and in the saved file name
# regexp     - pattern tested against the node found two siblings before
#              each download link
# move       - flag the caller reads to decide whether the downloaded file
#              goes to the destination directory
# uri        - link of the download, nil until the caller assigns it
class Zeitung
  attr_reader :final_name, :regexp, :move
  attr_accessor :uri

  def initialize(final_name, regexp, move=false)
    @final_name, @regexp, @move = final_name, regexp, move
    @uri = nil
  end
end
# Newspapers to look for, in lookup order: [final name, pattern, move flag].
zeitungs = [
  ["Milano Finanza", /milano finanza/i, true],
  ["Corriere della Sera", /corriere della sera/i, true],
  ["Corriere Economia", /corriere economia/i, true],
  ["Gazzetta dello Sport", /gazzetta dello sport/i, true],
  ["Fatto Quotidiano", /fatto quotidiano/i, true],
  ["Repubblica", /repubblica/i, true],
  ["Sole 24 Ore", /sole 24 ore/i, true],
  ["Stampa", /stampa/i, true],
  ["Centro", /centro/i, true],
  ["Foglio", /foglio/i, false],
  ["Repubblica Roma", /Rep.locale.+\-RM/i, false],
  ["Corriere Milano", /Corriere della Sera Milano/i, false],
  ["Giornale", /giornale/i, false],
  ["Libero", /libero/i, false]
].map { |final_name, regexp, move| Zeitung.new(final_name, regexp, move) }
# Downloads one newspaper and stores it under +filename+.
#
# The data is streamed into "<filename>.temp" in the current directory and
# moved to its final place only after the transfer has finished; the
# temporary file is removed whenever the download does not complete.
#
# zeitung     - object responding to #final_name and #uri; the uri must
#               respond to #host and #path
# filename    - name of the file to create
# destination - directory the finished file is moved into; '' (default) or
#               nil keeps it in the current directory
#
# Failures are reported on stdout and swallowed so the caller can go on with
# the next newspaper. An Interrupt (Ctrl-C) terminates the script.
def download_zeitung(zeitung, filename, destination='')
  puts "zeitung '#{zeitung.final_name}' is being downloaded..."
  temp_name = "#{filename}.temp"
  uri = nil
  begin
    uri = URI::HTTPS.build(host: zeitung.uri.host, path: zeitung.uri.path, query: "directDownload=true")
    File.open(temp_name, "wb") do |saved_file|
      # Kernel#open does not accept URIs on Ruby 3; URI#open is provided by
      # open-uri. copy_stream avoids holding the whole file in memory.
      uri.open("rb") { |remote| IO.copy_stream(remote, saved_file) }
    end
    # File.join copes with a destination lacking the trailing slash, but must
    # be skipped for an empty destination (it would yield "/<filename>").
    target = destination.to_s.empty? ? filename : File.join(destination, filename)
    FileUtils.mv(temp_name, target)
  rescue Interrupt
    exit
  rescue StandardError => e
    puts "problem with '#{filename}' at #{uri}: #{e.message}"
  ensure
    # No-op after a successful move; cleans up after any failure.
    FileUtils.rm_f(temp_name)
  end
end
# Logs into the site, opens the listing page of the requested day and
# downloads every newspaper of +zeitungs+ that has a link on that page.
#
# zeitungs: array of Zeitung objects
# password: the password of the month, entered into the login form
# date:     day to download, as a string accepted by Date.parse; nil means today
# where:    directory the newspapers flagged with +move+ are moved into;
#           nil leaves every file in the current directory
def download_zeitungs(zeitungs, password, date, where)
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  date = date ? Date.parse(date) : Date.today
  date_string = date.strftime("%Y-%m-%d")
  puts "Downloading zeitungs for #{date_string}"
  puts "-----------------------------------------------------"
  agent = Mechanize.new
  agent.get('http://zeitung2.tumblr.com') do |login_page|
    # fill in the authentication form with the caller's password
    form = login_page.forms.first
    form.password = password
    # for some reason a second, pre-filled form comes back and has to be
    # submitted as well
    form2 = agent.submit(form).forms.first
    # the page listing the available days
    quotidie_page = agent.submit(form2)
    # finally the link to the page listing the newspapers of the day
    day_link = quotidie_page.link_with(text: date.strftime("%d.%m.%Y"))
    if day_link.nil?
      puts "no page found for #{date_string}: NOTHING DOWNLOADED"
      next
    end
    agent.get(day_link.href) do |day_page|
      # every link that sits in the newspaper list
      all_quotidie_links = day_page.links.select do |link|
        link.attributes.parent.parent.path == "/html/body/section/section/div/article/div/section[1]/div/div"
      end
      zeitungs.each do |z|
        # the node two siblings before the link is what the pattern is tested against
        link = all_quotidie_links.find { |l| z.regexp.match(l.attributes.previous_sibling.previous_sibling) }
        if link # there is a link for the current zeitung
          z.uri = link.uri
          filename = "#{z.final_name} - #{date_string}.pdf"
          destination = (z.move && where) || ''
          download_zeitung(z, filename, destination)
        else # there is no link for the current zeitung
          puts "zeitung '#{z.final_name}' not downloaded: CANNOT FIND A LINK "
        end
      end
    end
  end
  puts "Time elapsed: #{Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at}s"
  puts
  puts
end
# Other ways to call it (move the flagged newspapers to a directory, or fetch
# a specific day):
# download_zeitungs(zeitungs, 'xxxx', nil, 'Dropbox/quotidie/')
# download_zeitungs(zeitungs, 'xxxx', '2015-06-01', nil)

# Start the download only when this file is executed as a script, so that it
# can be loaded (e.g. from a console) without hitting the network.
# NOTE(review): 'xxxx' looks like a placeholder for the real password - confirm.
download_zeitungs(zeitungs, 'xxxx', nil, nil) if __FILE__ == $PROGRAM_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment