@dailenspencer · Created May 14, 2018 23:17
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Request
# TODO: use scrapy-proxies to ensure we don't get throttled or banned by
# craigslist when sifting through urls and extracting data
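# A rough sketch of what that TODO could look like. scrapy-proxies is
# configured in settings.py rather than in the spider; the settings below
# follow the scrapy-proxies README, and the PROXY_LIST path is a
# hypothetical placeholder, not part of this gist:
#
#   RETRY_TIMES = 10
#   RETRY_HTTP_CODES = [500, 503, 504, 400, 403, 404, 408]
#   DOWNLOADER_MIDDLEWARES = {
#       'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
#       'scrapy_proxies.RandomProxy': 100,
#       'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
#   }
#   PROXY_LIST = '/path/to/proxy/list.txt'
#   PROXY_MODE = 0  # 0 = pick a random proxy for every request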
class JobsSpider(scrapy.Spider):
    name = 'jobs'
    allowed_domains = ['craigslist.org']
    # Verifies a craigslist result item's top-level data (title and url)
    def verifyTopLevelData(self, title, url):
        dataIsValid = True
        # verify craigslist result item title
        # NOTE: we most likely won't use this type of check, just testing the
        # ole regex skills in python for future use
        if not re.search('(Web)|(Developer)|(Programmer)|(Front End)|(Back End)', title):
            dataIsValid = False
        # verify craigslist result item url
        if url == "":
            dataIsValid = False
        return dataIsValid
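        # e.g. (illustrative, not from the original gist): the title check
        # passes "Front End Web Developer" but rejects "Marketing Manager";
        # the match is case sensitive, so "web developer" would also be
        # rejected.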
    # Builds all the craigslist urls to extract result list items from.
    # TEMPLATE: <rootlocation>.craigslist.org/search/<search_category>?query=<search_query>
    # NOTE: takes no self because it is only called at class-definition time
    # to build start_urls (see below)
    def gatherCraigsListUrls():
        urls = []
        # craigslist urls are location based; we build urls from these root
        # locations (see data/craigslistLocations.txt)
        rootLocations = open('craigslist/data/craigslistLocations.txt').read().splitlines()
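        # a hypothetical sample of craigslistLocations.txt (one craigslist
        # subdomain per line; the actual file is not part of this gist):
        #   losangeles
        #   newyork
        #   sfbay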
        # craigslist url search categories
        searchCategories = [
            'ggg'
        ]
        # craigslist url search queries
        # TODO: add regex so we can replace whitespace with '+' (see the
        # sketch after this list)
        searchQueries = [
            'web+developer',
            'web+design'
        ]
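        # A minimal sketch of the whitespace-to-'+' normalization the TODO
        # above describes (an assumed approach, not in the original gist;
        # re is already imported at the top):
        #   searchQueries = [re.sub(r'\s+', '+', q) for q in searchQueries]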
        # build urls
        for location in rootLocations:
            for category in searchCategories:
                for query in searchQueries:
                    url = 'https://' + location + '.craigslist.org/search/' + category + '?query=' + query
                    urls.append(url)
        return urls
    start_urls = ['https://losangeles.craigslist.org/search/ggg?query=web+developer']
    # NOTE: gatherCraigsListUrls() is what we will use in production. We don't
    # want to use it in dev, since we don't have a proxy setup and will most
    # likely get blocked.
    # start_urls = gatherCraigsListUrls()
    def parse(self, response):
        # grab the list items from the DOM on the craigslist result page
        posts = response.xpath('//p[@class="result-info"]')
        # loop through the result listings and retrieve top-level data (title and url)
        for post in posts:
            # top-level data
            post_title = post.xpath('a/text()').extract_first("")
            post_relative_url = post.xpath('a/@href').extract_first()
            post_absolute_url = response.urljoin(post_relative_url)
            # verify top-level data, then extract contents from the craigslist post page
            if self.verifyTopLevelData(post_title, post_absolute_url):
                yield Request(post_absolute_url, callback=self.parse_page,
                              meta={'Title': post_title, 'URL': post_absolute_url})
        # move to the next page (using the next button on the DOM) and repeat;
        # extract_first() returns None on the last page, so guard the request
        nextpage_relative_url = response.xpath('//a[@class="button next"]/@href').extract_first()
        if nextpage_relative_url:
            nextpage_absolute_url = response.urljoin(nextpage_relative_url)
            yield Request(nextpage_absolute_url, callback=self.parse)
    def parse_page(self, response):
        description = "".join(response.xpath('//*[@id="postingbody"]/text()').extract())
        response.meta['Description'] = description
        # TODO: use the following element to grab the posted time:
        # <time class="date timeago" datetime="2018-04-30T16:07:39-0700" title="2018-04-30 4:07pm">6 days ago</time>
        # if the post is more than 24 hours old, leave it (see the sketch below)
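        # A minimal sketch of that posted-time check (assuming "leave it"
        # means skip posts older than 24 hours; would need
        # `from datetime import datetime, timedelta` at the top; the [:19]
        # slice drops the utc offset, so this compares naive local times):
        #   posted = response.xpath('//time[@class="date timeago"]/@datetime').extract_first()
        #   if posted:
        #       posted_at = datetime.strptime(posted[:19], '%Y-%m-%dT%H:%M:%S')
        #       if datetime.now() - posted_at > timedelta(hours=24):
        #           return  # skip posts older than 24 hours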
        yield response.meta