Skip to content

Instantly share code, notes, and snippets.

@renjiege
Created November 25, 2017 23:05
Show Gist options
  • Save renjiege/7e3eb7f837046b1b9003105ef7288656 to your computer and use it in GitHub Desktop.
Save renjiege/7e3eb7f837046b1b9003105ef7288656 to your computer and use it in GitHub Desktop.
A simple web crawler to scrape restaurant information from Yelp.com.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-01-04 01:22:44
# Project: yelp_rockville
from pyspider.libs.base_handler import *
import re
# Street address used as the route origin: Handler.detail_page joins its words
# with "%20" and appends the result as the "start" query parameter of each
# restaurant's directions link.
home = '13005 Twinbrook Parkway'
class Handler(BaseHandler):
    """pyspider handler that scrapes restaurant records from Yelp listings.

    Crawl flow::

        on_start -> index_page -> list_page -> detail_page -> map_page

    ``map_page`` is only scheduled when a detail page exposes a directions
    link; otherwise ``detail_page`` itself returns the record with empty
    ``distance`` and ``time`` fields.
    """

    crawl_config = {
        'headers': {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko)',
        }
    }

    @staticmethod
    def _first_word(text):
        """Return the first whitespace-separated token of *text*, or ''.

        Used for values such as "123 reviews" / "1 review" and
        "4.0 star rating", where only the leading number is wanted. Accepts
        ``None`` (a missing attribute) and empty strings.
        """
        words = (text or "").split()
        return words[0] if words else ""

    @staticmethod
    def _strip_unit(text):
        """Drop the trailing unit word from *text* ("2.3 mi" -> "2.3").

        Unlike a fixed-width slice this also handles units of a different
        length ("1 min" -> "1"). Text without a separable trailing word is
        returned unchanged.
        """
        pieces = (text or "").rsplit(None, 1)
        if len(pieces) == 2:
            return pieces[0]
        return text or ""

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the restaurant category index page."""
        self.crawl('https://www.yelp.com/c/mdburbs/restaurants', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow every category link on the index page to its listing."""
        for each in response.doc('li.arrange_unit ul.ylist li a').items():
            self.crawl(each.attr.href, callback=self.list_page)

    @config(age=10 * 24 * 60 * 60)
    def list_page(self, response):
        """Process one listing page.

        * Pages containing "No Results" are ignored.
        * Pages containing "Search for more" only forward to the
          ``.button-more`` link(s), handled again by this method.
        * Any other page is treated as a result list: each business link is
          sent to ``detail_page`` (carrying the cuisine style text in
          ``save``) and the ``a.next`` pagination link is followed.
        """
        page_text = response.text
        if 'No Results' in page_text:
            return

        if 'Search for more' in page_text:
            for each in response.doc('.button-more').items():
                self.crawl(each.attr.href, callback=self.list_page)
            return

        # The style label is read from the 4th search-header link, falling
        # back to the 3rd when the 4th is absent or empty.
        style = response.doc('a[class="js-search-header-link"]:eq(3)').text()
        if not style:
            style = response.doc('a[class="js-search-header-link"]:eq(2)').text()
        print(style)

        for each in response.doc('.indexed-biz-name a').items():
            self.crawl(each.attr.href, callback=self.detail_page, save={'a': style})
        for each in response.doc('a.next').items():
            self.crawl(each.attr.href, callback=self.list_page)

    @config(age=10 * 24 * 60 * 60)
    def detail_page(self, response):
        """Extract one restaurant record from its detail page.

        Returns ``None`` when the page has no "rating-info" markup. When the
        page has a directions link, a JS-rendered fetch of that link is
        scheduled for ``map_page`` (which emits the final record) and nothing
        is returned here; otherwise the record is returned directly with
        empty ``distance`` and ``time``.
        """
        if 'rating-info' not in response.text:
            return None

        doc = response.doc
        results = {
            "url": response.url,
            "menu": doc('.js-menu-explore').attr.href,
            "claim_status": doc('.claim-status_teaser').text(),
            "style": response.save['a'],
            "tags": doc('.category-str-list').text(),
            "price": doc('.bullet-after .price-range').text(),
            # First token rather than a fixed-width slice: "[:-8]" turned
            # "1 review" into an empty string.
            "reviews": self._first_word(doc('.rating-info .review-count').text()),
            # attr.title is None when the stars element is missing; the
            # helper maps that to "" instead of raising TypeError.
            "rating": self._first_word(
                doc('.rating-info>.biz-rating-very-large>div.i-stars').attr.title
            ),
            "title": doc('.biz-page-title').text(),
            "distance": "",
            "time": "",
        }

        # Check the actual link rather than searching the page text, so a
        # page that mentions "biz-directions" without a usable href still
        # yields a record instead of failing on None + str.
        directions_href = doc('a.biz-directions').attr.href
        if not directions_href:
            return results

        homeaddress = "%20".join(home.split())
        direction = directions_href + '?start=' + homeaddress
        self.crawl(direction, callback=self.map_page, save=results, fetch_type='js')

    @config(priority=2)
    def map_page(self, response):
        """Complete a saved record with distance and travel time.

        ``response.save`` holds the record built by ``detail_page``.

        Raises:
            AssertionError: if either value is missing from the rendered
                page, so the task is reported as failed rather than stored
                with blank fields. Raised explicitly (not via ``assert``) so
                the check survives ``python -O``.
        """
        distance = response.doc('span[jstcache="24"]').text()
        duration = response.doc('.adp-summary span span').text()
        if not distance or not duration:
            raise AssertionError(
                "directions page is missing distance or time: %s" % response.url
            )

        results = dict(response.save)
        results["distance"] = self._strip_unit(distance)
        results["time"] = self._strip_unit(duration)
        return results
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment