A simple pyspider web crawler that scrapes restaurant information from Yelp.com. Starting from Yelp's Maryland-suburbs restaurant index, it follows each cuisine category, pages through the listings, extracts each business's name, rating, review count, price range, tags, and claim status, and, when a directions link is available, records driving distance and time from a fixed home address.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-01-04 01:22:44
# Project: yelp_rockville

from pyspider.libs.base_handler import *
import re

# Start address used when requesting driving directions from Yelp's map page.
home = '13005 Twinbrook Parkway'


class Handler(BaseHandler):
    crawl_config = {
        'headers': {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko)',
        }
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # Seed the crawl once a day from the Maryland-suburbs restaurant index.
        self.crawl('https://www.yelp.com/c/mdburbs/restaurants', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Follow every cuisine/category link on the index page.
        for each in response.doc('li.arrange_unit ul.ylist li a').items():
            self.crawl(each.attr.href, callback=self.list_page)

    @config(age=10 * 24 * 60 * 60)
    def list_page(self, response):
        if re.search('No Results', response.text):
            # Empty category: nothing to crawl.
            pass
        elif re.search('Search for more', response.text):
            # Teaser page: follow the "Search for more" button into the full listing.
            for each in response.doc('.button-more').items():
                self.crawl(each.attr.href, callback=self.list_page)
        else:
            # The cuisine style is read from the search-header breadcrumb links.
            style = response.doc('a[class="js-search-header-link"]:eq(3)').text()
            if not style:
                style = response.doc('a[class="js-search-header-link"]:eq(2)').text()
            print(style)
            # Queue every listed business, passing the style along via `save`.
            for each in response.doc('.indexed-biz-name a').items():
                self.crawl(each.attr.href, callback=self.detail_page, save={'a': style})
            # Follow pagination.
            for each in response.doc('a.next').items():
                self.crawl(each.attr.href, callback=self.list_page)

    @config(age=10 * 24 * 60 * 60)
    def detail_page(self, response):
        if re.search('rating-info', response.text):
            results = {
                "url": response.url,
                "menu": response.doc('.js-menu-explore').attr.href,
                "claim_status": response.doc('.claim-status_teaser').text(),
                "style": response.save['a'],
                "tags": response.doc('.category-str-list').text(),
                "price": response.doc('.bullet-after .price-range').text(),
                # Drop the trailing " reviews" text, keeping only the count.
                "reviews": response.doc('.rating-info .review-count').text()[:-8],
                # Drop the trailing " star rating" text from the title attribute.
                "rating": response.doc('.rating-info>.biz-rating-very-large>div.i-stars').attr.title[:-12],
                "title": response.doc('.biz-page-title').text(),
                "distance": "",
                "time": "",
            }
            if re.search('biz-directions', response.text):
                # Build a directions URL starting from `home`, then let map_page
                # fill in driving distance and time (the map is rendered with JS).
                homeaddress = "%20".join(home.split())
                direction = response.doc('a.biz-directions').attr.href + '?start=' + homeaddress
                self.crawl(direction, callback=self.map_page, save=results, fetch_type='js')
            else:
                return results

    @config(priority=2)
    def map_page(self, response):
        distance = response.doc('span[jstcache="24"]').text()
        time = response.doc('.adp-summary span span').text()
        # Fail (and let pyspider retry) if the directions summary did not render.
        assert distance != "" and time != ""
        return {
            "url": response.save['url'],
            "menu": response.save['menu'],
            "distance": distance[:-3],  # drop the unit suffix from the distance text
            "time": time[:-5],          # drop the unit suffix from the duration text
            "claim_status": response.save['claim_status'],
            "style": response.save['style'],
            "tags": response.save['tags'],
            "price": response.save['price'],
            "reviews": response.save['reviews'],
            "rating": response.save['rating'],
            "title": response.save['title'],
        }
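This is a pyspider project handler, so it is meant to be pasted into a project in pyspider's web UI (running `pyspider` starts the scheduler, fetcher, processor, and web UI, by default at http://localhost:5000) rather than executed directly; the `map_page` step also needs PhantomJS available, since it crawls with `fetch_type='js'`. As an aside on the address handling, below is a minimal sketch, assuming Python 3, of percent-encoding the `start` parameter with the standard library instead of joining on "%20"; `quote` is not used in the gist and is shown only as an alternative.

# Sketch (alternative to "%20".join(home.split()) above): urllib.parse.quote
# percent-encodes spaces and most other unsafe characters in the address.
from urllib.parse import quote

home = '13005 Twinbrook Parkway'
start_param = '?start=' + quote(home)
print(start_param)  # ?start=13005%20Twinbrook%20Parkway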