# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Request
import csv
import os

# TODO: use scrapy-proxies to ensure we don't get throttled or banned by craigslist
# when sifting through urls and extracting data (see the custom_settings sketch below)
class JobsSpider(scrapy.Spider):
    name = 'jobs'
    allowed_domains = ['craigslist.org']
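
    # A hedged sketch for the scrapy-proxies TODO above, left commented out so dev
    # runs are unchanged. The keys follow the scrapy-proxies README; the PROXY_LIST
    # path is an assumption. Scrapy's per-spider custom_settings is one place for them.
    # custom_settings = {
    #     'RETRY_TIMES': 10,
    #     'RETRY_HTTP_CODES': [500, 503, 504, 400, 403, 404, 408],
    #     'DOWNLOADER_MIDDLEWARES': {
    #         'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
    #         'scrapy_proxies.RandomProxy': 100,
    #         'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
    #     },
    #     'PROXY_LIST': 'craigslist/data/proxies.txt',  # hypothetical path
    #     'PROXY_MODE': 0,  # 0 = a different random proxy for every request
    # }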
    # This method verifies a craigslist result item's top-level data (title and url)
    def verifyTopLevelData(self, title, url):
        dataIsValid = True
        # verify craigslist result item title
        # NOTE: we most likely won't use this type of check, just testing the ole
        # regex skills in python for future use
        if not re.search('(Web)|(Developer)|(Programmer)|(Front End)|(Back End)', title):
            dataIsValid = False
        # verify craigslist result item url
        if url == "":
            dataIsValid = False
        return dataIsValid
    # This function builds all of the craigslist urls to extract result list items from
    # TEMPLATE: <rootlocation>.craigslist.org/search/<search_category>?query=<search_query>
    # NOTE: called at class-definition time (see start_urls below), so it takes no self
    def gatherCraigsListUrls():
        urls = []
        # craigslist urls are location based; we build urls from these root locations
        # (see data/craigslistLocations.txt)
        rootLocations = open('craigslist/data/craigslistLocations.txt').read().splitlines()
        # craigslist url search categories
        searchCategories = [
            'ggg'
        ]
        # craigslist url search queries
        # TODO: add regex so we can replace whitespace with '+' (see the sketch below)
        searchQueries = [
            'web+developer',
            'web+design'
        ]
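        # A hedged sketch for the whitespace TODO above: keep the queries
        # human-readable and normalize them with re.sub, e.g.
        #   searchQueries = [re.sub(r'\s+', '+', q) for q in ['web developer', 'web design']]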
        # build urls
        for location in rootLocations:
            for category in searchCategories:
                for query in searchQueries:
                    url = 'https://' + location + '.craigslist.org/search/' + category + '?query=' + query
                    urls.append(url)
        return urls
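
    # e.g. with rootLocations = ['sfbay'] (a hypothetical entry), the function above
    # yields urls like: https://sfbay.craigslist.org/search/ggg?query=web+developer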
    start_urls = ['https://losangeles.craigslist.org/search/ggg?query=web+developer']
    # NOTE: gatherCraigsListUrls() is what we will use in production. We don't want to
    # use it in dev, as we don't have a proxy setup and will most likely get blocked.
    # start_urls = gatherCraigsListUrls()
    def parse(self, response):
        # grab the list items from the DOM on the craigslist results page
        posts = response.xpath('//p[@class="result-info"]')
        # loop through the result listings and retrieve top-level data (title and url)
        for post in posts:
            # top-level data
            post_title = post.xpath('a/text()').extract_first("")
            post_relative_url = post.xpath('a/@href').extract_first()
            post_absolute_url = response.urljoin(post_relative_url)
            # verify top-level data, then extract the contents of the craigslist post page
            if self.verifyTopLevelData(post_title, post_absolute_url):
                yield Request(post_absolute_url, callback=self.parse_page,
                              meta={'Title': post_title, 'URL': post_absolute_url})
        # move to the next page (via the next button in the DOM) and repeat the process;
        # the next link is absent on the last results page, so guard against None
        nextpage_relative_url = response.xpath('//a[@class="button next"]/@href').extract_first()
        if nextpage_relative_url:
            nextpage_absolute_url = response.urljoin(nextpage_relative_url)
            yield Request(nextpage_absolute_url, callback=self.parse)
    def parse_page(self, response):
        description = "".join(response.xpath('//*[@id="postingbody"]/text()').extract())
        response.meta['Description'] = description
        # TODO: use the following element to grab the posted time (see the sketch below):
        # <time class="date timeago" datetime="2018-04-30T16:07:39-0700" title="2018-04-30 4:07pm">6 days ago</time>
        # if the post is more than 24 hours old, leave it.
        yield response.meta
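
    # A hedged sketch for the posted-time TODO above: extract_posted_time is a
    # hypothetical helper, not wired into the parse flow yet. It assumes Python 3,
    # where strptime understands the %z offset in the datetime attribute.
    def extract_posted_time(self, response):
        from datetime import datetime, timedelta, timezone
        posted = response.xpath('//time[@class="date timeago"]/@datetime').extract_first()
        if not posted:
            return None
        posted_at = datetime.strptime(posted, '%Y-%m-%dT%H:%M:%S%z')
        # if the post is more than 24 hours old, leave it (per the TODO above)
        if datetime.now(timezone.utc) - posted_at > timedelta(hours=24):
            return None
        return posted_at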