Created
December 31, 2015 15:44
-
-
Save tmocellin/bbbd0cd1a70eb005fb92 to your computer and use it in GitHub Desktop.
Skeleton rake task for scraping a website with Nokogiri
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri' | |
require 'nokogiri' | |
require 'securerandom' | |
namespace :Scrap do
  # Skeleton for a new scraper task. Copy it, then fill in the placeholders
  # marked below (source name, URL, article-list selector, per-article
  # selectors). Run with `rake Scrap:sample`.
  #
  # Flow: download the page, parse it with Nokogiri, build one attribute hash
  # per article node, hand the hashes to SaveArticle, then record a History row.
  desc "code structure for new scrapper"
  task :sample => :environment do
    src = ""          # define the source of the article
    url_to_scrap = "" # define the url to scrape; can be the same as src

    # Fetch through the parsed URI instead of bare Kernel#open: Kernel#open no
    # longer accepts URLs on Ruby >= 3.0 and would execute a command if the
    # string started with "|". OpenURI's #read also closes the stream for us.
    html = URI.parse(url_to_scrap).read
    noko_html = Nokogiri::HTML(html)

    # Replace nil by the expression selecting the article nodes,
    # e.g. noko_html.css("<selector>").
    articles_html = nil

    # `|| []` keeps the unedited skeleton from crashing on nil.each.
    articles = (articles_html || []).map do |article|
      # Replace each nil by the value extracted from `article`
      img = nil
      url = nil
      title = nil
      description = nil # named `description` so rake's `desc` method is not shadowed

      # Attribute hash for one article
      {
        url: url,
        src: src,
        img: img,
        title: title,
        description: description,
        twitted: false,
        uuid: SecureRandom.hex(10)
      }
    end

    # SaveArticle is defined elsewhere; presumably it returns how many
    # articles were actually added -- confirm against its definition.
    count = SaveArticle(articles)

    # Finally record the run in the history with the number of articles added.
    # NOTE(review): assumes History is an ActiveRecord model, whose `create`
    # already persists the record, so the former extra `history.save` is dropped.
    History.create(action: "SCRAP", src: url_to_scrap, count: count)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment