@dailenspencer · Created May 14, 2018 23:17
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Request
# TODO: use scrapy-proxies to ensure we don't get throttled or banned by
# craigslist when sifting through urls and extracting data
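# A rough sketch of what that TODO could look like. scrapy-proxies is
# configured in settings.py rather than in the spider; the settings below
# follow the scrapy-proxies README, and the PROXY_LIST path is a
# hypothetical placeholder, not part of this gist:
#
#   RETRY_TIMES = 10
#   RETRY_HTTP_CODES = [500, 503, 504, 400, 403, 404, 408]
#   DOWNLOADER_MIDDLEWARES = {
#       'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
#       'scrapy_proxies.RandomProxy': 100,
#       'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
#   }
#   PROXY_LIST = '/path/to/proxy/list.txt'
#   PROXY_MODE = 0  # 0 = pick a random proxy for every request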
class JobsSpider(scrapy.Spider):
    name = 'jobs'
    allowed_domains = ['craigslist.org']
    # Verifies a craigslist result item's top-level data (title and url)
    def verifyTopLevelData(self, title, url):
        dataIsValid = True
        # verify craigslist result item title
        # NOTE: we most likely won't use this type of check, just testing the
        # ole regex skills in python for future use
        if not re.search('(Web)|(Developer)|(Programmer)|(Front End)|(Back End)', title):
            dataIsValid = False
        # verify craigslist result item url
        if url == "":
            dataIsValid = False
        return dataIsValid
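        # e.g. (illustrative, not from the original gist): the title check
        # passes "Front End Web Developer" but rejects "Marketing Manager";
        # the match is case sensitive, so "web developer" would also be
        # rejected.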
    # Builds all the craigslist urls to extract result list items from.
    # TEMPLATE: <rootlocation>.craigslist.org/search/<search_category>?query=<search_query>
    # NOTE: takes no self because it is only called at class-definition time
    # to build start_urls (see below)
    def gatherCraigsListUrls():
        urls = []
        # craigslist urls are location based; we build urls from these root
        # locations (see data/craigslistLocations.txt)
        rootLocations = open('craigslist/data/craigslistLocations.txt').read().splitlines()
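        # a hypothetical sample of craigslistLocations.txt (one craigslist
        # subdomain per line; the actual file is not part of this gist):
        #   losangeles
        #   newyork
        #   sfbay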
        # craigslist url search categories
        searchCategories = [
            'ggg'
        ]
        # craigslist url search queries
        # TODO: add regex so we can replace whitespace with '+' (see the
        # sketch after this list)
        searchQueries = [
            'web+developer',
            'web+design'
        ]
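        # A minimal sketch of the whitespace-to-'+' normalization the TODO
        # above describes (an assumed approach, not in the original gist;
        # re is already imported at the top):
        #   searchQueries = [re.sub(r'\s+', '+', q) for q in searchQueries]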
        # build urls
        for location in rootLocations:
            for category in searchCategories:
                for query in searchQueries:
                    url = 'https://' + location + '.craigslist.org/search/' + category + '?query=' + query
                    urls.append(url)
        return urls
    start_urls = ['https://losangeles.craigslist.org/search/ggg?query=web+developer']
    # NOTE: gatherCraigsListUrls() is what we will use in production. We don't
    # want to use it in dev, since we don't have a proxy setup and will most
    # likely get blocked.
    # start_urls = gatherCraigsListUrls()
    def parse(self, response):
        # grab the list items from the DOM on the craigslist result page
        posts = response.xpath('//p[@class="result-info"]')
        # loop through the result listings and retrieve top-level data (title and url)
        for post in posts:
            # top-level data
            post_title = post.xpath('a/text()').extract_first("")
            post_relative_url = post.xpath('a/@href').extract_first()
            post_absolute_url = response.urljoin(post_relative_url)
            # verify top-level data, then extract contents from the craigslist post page
            if self.verifyTopLevelData(post_title, post_absolute_url):
                yield Request(post_absolute_url, callback=self.parse_page,
                              meta={'Title': post_title, 'URL': post_absolute_url})
        # move to the next page (using the next button on the DOM) and repeat;
        # extract_first() returns None on the last page, so guard the request
        nextpage_relative_url = response.xpath('//a[@class="button next"]/@href').extract_first()
        if nextpage_relative_url:
            nextpage_absolute_url = response.urljoin(nextpage_relative_url)
            yield Request(nextpage_absolute_url, callback=self.parse)
    def parse_page(self, response):
        description = "".join(response.xpath('//*[@id="postingbody"]/text()').extract())
        response.meta['Description'] = description
        # TODO: use the following element to grab the posted time:
        # <time class="date timeago" datetime="2018-04-30T16:07:39-0700" title="2018-04-30 4:07pm">6 days ago</time>
        # if the post is more than 24 hours old, leave it (see the sketch below)
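        # A minimal sketch of that posted-time check (assuming "leave it"
        # means skip posts older than 24 hours; would need
        # `from datetime import datetime, timedelta` at the top; the [:19]
        # slice drops the utc offset, so this compares naive local times):
        #   posted = response.xpath('//time[@class="date timeago"]/@datetime').extract_first()
        #   if posted:
        #       posted_at = datetime.strptime(posted[:19], '%Y-%m-%dT%H:%M:%S')
        #       if datetime.now() - posted_at > timedelta(hours=24):
        #           return  # skip posts older than 24 hours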
        yield response.meta