Created
November 2, 2016 06:39
-
-
Save majacaci00/5ebf38be52e8e744ee473c58b7a722d5 to your computer and use it in GitHub Desktop.
In class lab use this file in your "spiders" folder of a scrapy project. Make sure you set your "DOWNLOAD_DELAY" to 4 seconds while you're testing your spider. Remove the delay once you've debugged your spider and then let it fly. Please try to avoid running your crawling processes at full speed more than necessary!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## scrapy crawl indeed_base -o indeed_raw.json
# -*- coding: utf-8 -*-from scrapy.spiders import CrawlSpider, Rule | |
from scrapy.linkextractors import LinkExtractor | |
import scrapy | |
from indeed.items import IndeedItem | |
from scrapy.spiders import CrawlSpider, Rule | |
from bs4 import BeautifulSoup | |
class IndeedSpider(CrawlSpider):
    """Crawl Indeed job-search result pages and yield title/summary items.

    Pagination links (anchors whose href contains 'start') are followed via
    the CrawlSpider rule; every results page is handed to
    parse_indeed_results. Uncomment exactly one entry in start_urls to pick
    the search location before running.
    """
    name = "indeed_base"
    # Domains the offsite middleware allows the crawler to follow into.
    # BUGFIX: was "indeex.hk" (typo) — the Hong Kong start URL
    # (www.indeed.hk below) would have been filtered as offsite.
    allowed_domains = ["indeed.com", "indeed.co.uk", "de.indeed.com", "indeed.com.br", "indeed.es", "indeed.hk"]
    # Receive 301/302 responses ourselves instead of having the redirect
    # middleware consume them.
    handle_httpstatus_list = [301, 302]
    start_urls = [
        # San Francisco
        #"http://www.indeed.com/jobs?q=san+scientist&l=San+Francisco%2C+CA",
        # New York
        #'http://www.indeed.com/jobs?q=data+science&l=New+York%2C+NY',
        # London
        #"http://www.indeed.co.uk/data-scientist-jobs-in-london",
        # Minneapolis
        # "http://www.indeed.com/jobs?q=data+scientist&l=Minneapolis%2C+MN",
        # Texas
        # "http://www.indeed.com/jobs?q=data+scientist&l=Texas",
        # Illinois
        # "http://www.indeed.com/jobs?q=data+scientist&l=Illinois",
        # Massachusetts
        # "http://www.indeed.com/jobs?q=data+scientist&l=Massachusetts",
        # Berlin
        # "http://de.indeed.com/Jobs?q=Data+Science&l=Berlin",
        # Brazil
        # "http://www.indeed.com.br/empregos?q=data+science&l=",
        # Spain
        # "http://www.indeed.es/ofertas?q=data+science&l=",
        # Hong Kong
        # "http://www.indeed.hk/jobs?q=data+science&l=",
    ]
    rules = (
        #Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse_search", follow = True),
        # FIX: deny=('account/login') was a plain string in parentheses, not
        # a one-element tuple; LinkExtractor happened to accept the bare
        # string, but the trailing comma makes the intended tuple explicit.
        Rule(LinkExtractor(deny=('account/login',), allow=(), restrict_xpaths=("//a[contains(@href, 'start')]",)), callback="parse_indeed_results", follow=True),
    )

    def parse_indeed_results(self, response):
        """Yield one IndeedItem per qualifying result row on a results page.

        A row qualifies when at least one of the configured xpath queries
        extracts something. The summary field is flattened to plain text
        with BeautifulSoup because it contains nested spans.
        """
        # Item field name -> xpath query run against each result row.
        # To extract more elements, add entries here.
        xpaths = {
            "title": './/a[@data-tn-element="jobTitle"]/@title',
            "summary": './/span[@class="summary"]',
        }
        for sel in response.xpath("//td[@id='resultsCol']/div"):
            item = IndeedItem()
            total_result_extracted = False
            # Run the xpath queries in sequence against the row.
            for key, xpath_query in xpaths.items():
                extracted = sel.xpath(xpath_query).extract()
                # Only set the field when the query found something.
                if extracted:
                    if key == "summary":
                        # Nested spans represent the summary; BeautifulSoup
                        # pulls everything out as text without complicated
                        # parsing or joining.
                        soup = BeautifulSoup(extracted[0], 'html.parser')
                        item[key] = soup.get_text()
                    else:
                        item[key] = extracted[0]
                    # At least one extracted field qualifies the row.
                    total_result_extracted = True
            # Only emit rows that produced at least one field.
            if total_result_extracted:
                yield item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment