Skip to content

Instantly share code, notes, and snippets.

@tmocellin
Created December 31, 2015 15:44
Show Gist options
  • Save tmocellin/bbbd0cd1a70eb005fb92 to your computer and use it in GitHub Desktop.
Save tmocellin/bbbd0cd1a70eb005fb92 to your computer and use it in GitHub Desktop.
Skeleton rake task for scraping a website with Nokogiri
require 'open-uri'
require 'nokogiri'
require 'securerandom'
namespace :Scrap do
  # Skeleton for a new scraper task. Fill in the empty strings and the `nil`
  # placeholders below (source, URL and selectors) before running it.
  #
  # Flow: download the page, parse it with Nokogiri, build one data hash per
  # article node, hand the hashes to SaveArticle, then record a History entry.
  desc "code structure for new scrapper"
  task sample: :environment do
    src = "" # define the source of the article
    url_to_scrap = "" # URL to scrape; can be the same as src

    # URI.open (provided by open-uri) replaces the bare `open` call, which no
    # longer fetches URLs on Ruby 3.0+. Passing a block makes open-uri close
    # the stream as soon as its content has been read.
    html = URI.open(url_to_scrap, &:read)
    noko_html = Nokogiri::HTML(html) # parsed document to query for articles

    # Replace nil with the expression that selects the article nodes
    # from noko_html.
    articles_html = nil

    # Build one data hash per article node.
    articles = articles_html.map do |article|
      # Replace each nil with the value extracted from `article`
      # (e.g. through a CSS selector).
      img = nil
      url = nil
      title = nil
      desc = nil

      {
        url: url,
        src: src,
        img: img,
        title: title,
        description: desc,
        twitted: false,
        uuid: SecureRandom.hex(10) # 20-character random hex identifier
      }
    end

    # SaveArticle is defined outside this file; its return value is recorded
    # below as the number of articles added (presumably an Integer -- confirm).
    count = SaveArticle(articles)

    # Finally record the run in the history with how many articles were added.
    # NOTE(review): assumes History is an ActiveRecord-style model whose
    # `create` already persists the record, so no extra `save` call is needed.
    History.create(action: "SCRAP", src: url_to_scrap, count: count)
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment