Created
August 19, 2018 00:43
-
-
Save imran31415/5adc61c74ad490cb3874ff3b71c6d3eb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import html | |
import json | |
import requests | |
from exceptions import ValueError | |
import re, urllib | |
import urllib3 | |
import argparse | |
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) | |
from requests.packages.urllib3.exceptions import InsecureRequestWarning | |
import time | |
from concurrent.futures import ThreadPoolExecutor | |
import sys | |
from threading import Thread | |
import os | |
requests.packages.urllib3.disable_warnings(InsecureRequestWarning) | |
#[#'https://www.yelp.com/biz/kdb-kitchen-den-bar-long-beach', | |
yelp_urls =['https://www.yelp.com/biz/the-atlas-room-washington','https://www.yelp.com/biz/the-rack-brandon','https://www.yelp.com/biz/payard-p%C3%A2tisserie-and-bistro-new-york-2','https://www.yelp.com/biz/maison-giraud-pacific-palisades','https://www.yelp.com/biz/saltbox-san-diego','https://www.yelp.com/biz/carmichaels-chicago-steak-house-chicago','https://www.yelp.com/biz/black-eyed-pea-restaurant-houston-6','https://www.yelp.com/biz/perfecto-mundo-latin-fusion-bistro-commack','https://www.yelp.com/biz/smittys-bbq-boyd','https://www.yelp.com/biz/reston-kabob-reston','https://www.yelp.com/biz/bookmark-cafe-largo','https://www.yelp.com/biz/the-tin-angel-pittsburgh','https://www.yelp.com/biz/briantos-original-hoagies-orlando','https://www.yelp.com/biz/freeway-diner-woodbury','https://www.yelp.com/biz/river-gods-cambridge','https://www.yelp.com/biz/golan-kosher-restaurant-north-hollywood-2','https://www.yelp.com/biz/city-hall-restaurant-new-york-2','https://www.yelp.com/biz/empire-pizza-and-grill-west-chester','https://www.yelp.com/biz/cityzen-washington-2','https://www.yelp.com/biz/three-degrees-los-gatos','https://www.yelp.com/biz/applebees-grill-bar-quakertown','https://www.yelp.com/biz/johnny-carinos-covina','https://www.yelp.com/biz/buffet-de-la-gare-hastings-hdsn','https://www.yelp.com/biz/continental-food-management-la-mirada','https://www.yelp.com/biz/elephant-bar-restaurant-peoria','https://www.yelp.com/biz/sullivans-steakhouse-denver','https://www.yelp.com/biz/yucatan-liquid-stand-coppell','https://www.yelp.com/biz/tomato-pie-morristown','https://www.yelp.com/biz/willett-house-port-chester','https://www.yelp.com/biz/thai-corner-san-antonio-2','https://www.yelp.com/biz/silkes-american-grill-mesa','https://www.yelp.com/biz/t-mex-cantina-fort-lauderdale-2','https://www.yelp.com/biz/casa-oaxaca-washington','https://www.yelp.com/biz/wings-on-wheels-hebron','https://www.yelp.com/biz/siris-thai-french-cuisine-cherry-hill','https://www.yelp.com/biz/nightwood-chicag
o','https://www.yelp.com/biz/cafe-gallery-burlington','https://www.yelp.com/biz/the-hurricane-caf%C3%A9-seattle-2','https://www.yelp.com/biz/231-ellsworth-san-mateo','https://www.yelp.com/biz/la-marmite-williston-park','https://www.yelp.com/biz/the-river-house-palm-beach-gardens-2','https://www.yelp.com/biz/langermanns-baltimore','https://www.yelp.com/biz/del-friscos-grille-phoenix','https://www.yelp.com/biz/carrows-family-restaurant-antioch','https://www.yelp.com/biz/minerva-fine-indian-herndon-va-herndon-5','https://www.yelp.com/biz/the-mason-bar-dallas','https://www.yelp.com/biz/la-cote-cafe-and-wine-bar-seattle','https://www.yelp.com/biz/vareli-new-york','https://www.yelp.com/biz/wendys-wixom','https://www.yelp.com/biz/lanterna-tuscan-bistro-nyack','https://www.yelp.com/biz/yo-taco-duxbury','https://www.yelp.com/biz/bombay-palace-new-york','https://www.yelp.com/biz/cafe-buonaros-naperville','https://www.yelp.com/biz/ponti-seafood-grill-seattle-3','https://www.yelp.com/biz/bill-johnsons-big-apple-restaurants-phoenix-5','https://www.yelp.com/biz/by-word-of-mouth-oakland-park','https://www.yelp.com/biz/anna-maries-pizza-and-restaurant-wharton','https://www.yelp.com/biz/dierdorf-and-harts-steakhouse-saint-louis','https://www.yelp.com/biz/wine-5-cafe-las-vegas','https://www.yelp.com/biz/ernies-restaurant-plymouth','https://www.yelp.com/biz/next-door-pizza-and-pub-lees-summit','https://www.yelp.com/biz/lannys-alta-cocina-mexicana-fort-worth','https://www.yelp.com/biz/jalisco-mexican-restaurant-eastlake','https://www.yelp.com/biz/clio-boston','https://www.yelp.com/biz/uncommon-grounds-aliquippa','https://www.yelp.com/biz/uozumi-restaurant-palmdale','https://www.yelp.com/biz/enzos-pizza-matawan','https://www.yelp.com/biz/the-pointe-cafe-south-san-francisco','https://www.yelp.com/biz/captains-restaurant-and-seafood-market-florida-city','https://www.yelp.com/biz/le-perigord-new-york-4','https://www.yelp.com/biz/i-love-thai-arlington','https://www.yelp.com/biz/bistro-44-be
dford','https://www.yelp.com/biz/ritters-marietta','https://www.yelp.com/biz/rouge-et-blanc-new-york','https://www.yelp.com/biz/assembly-steak-house-and-seafood-grill-englewood-cliffs-2','https://www.yelp.com/biz/american-turkish-restaurant-fort-lauderdale','https://www.yelp.com/biz/r-and-r-bar-b-que-and-catering-service-missouri-2','https://www.yelp.com/biz/sushi-land-long-beach','https://www.yelp.com/biz/longshots-sports-bar-waretown','https://www.yelp.com/biz/salt-creek-barbeque-glendale-heights','https://www.yelp.com/biz/pizza-market-breese','https://www.yelp.com/biz/john-qs-steakhouse-cleveland','https://www.yelp.com/biz/bistro-n-boca-raton-2','https://www.yelp.com/biz/samanthas-restaurant-silver-spring-2','https://www.yelp.com/biz/baha-brothers-sandbar-grill-taunton-3','https://www.yelp.com/biz/cafe-cortina-farmington-hills-5','https://www.yelp.com/biz/big-beaver-tavern-troy','https://www.yelp.com/biz/hogans-restaurant-bloomfield-hills','https://www.yelp.com/biz/the-copper-monkey-beaverton','https://www.yelp.com/biz/clement-street-bar-and-grill-san-francisco','https://www.yelp.com/biz/pepin-scottsdale','https://www.yelp.com/biz/village-belle-philadelphia','https://www.yelp.com/biz/sweet-woodruff-san-francisco','https://www.yelp.com/biz/siam-marina-tinley-park','https://www.yelp.com/biz/luigis-italian-restaurant-centennial-2','https://www.yelp.com/biz/smokin-wills-barbecue-roselle','https://www.yelp.com/biz/voltaire-restaurant-scottsdale','https://www.yelp.com/biz/jus-cookins-restaurant-lakewood-2','https://www.yelp.com/biz/pegs-countryside-cafe-hamel','https://www.yelp.com/biz/rays-grill-fulshear','https://www.yelp.com/biz/cafe-zalute-rosemont','https://www.yelp.com/biz/guard-house-inn-gladwyne','https://www.yelp.com/biz/road-runner-grand-canyon-las-vegas-2','https://www.yelp.com/biz/garage-restaurant-and-cafe-new-york','https://www.yelp.com/biz/los-tapatios-cedar-hill','https://www.yelp.com/biz/chengdu-46-clifton','https://www.yelp.com/biz/moby-dick-house-of-
kabob-fairfax','https://www.yelp.com/biz/natures-food-patch-clearwater','https://www.yelp.com/biz/taco-del-mar-hillsboro-3','https://www.yelp.com/biz/ms-tootsies-rbl-philadelphia','https://www.yelp.com/biz/the-big-c-athletic-club-concord','https://www.yelp.com/biz/west-hanover-pizzeria-hanover','https://www.yelp.com/biz/georges-pastaria-houston','https://www.yelp.com/biz/encuentro-oakland-3','https://www.yelp.com/biz/smokys-bbq-eldersburg','https://www.yelp.com/biz/ruby-tuesday-san-antonio','https://www.yelp.com/biz/saladworks-philadelphia-4','https://www.yelp.com/biz/captain-pizza-middleton','https://www.yelp.com/biz/bob-evans-fredericksburg-3','https://www.yelp.com/biz/frittata-clawson','https://www.yelp.com/biz/the-sandwich-spot-palm-springs','https://www.yelp.com/biz/freds-mexican-cafe-san-diego-4','https://www.yelp.com/biz/geordies-steak-phoenix-2','https://www.yelp.com/biz/five-guys-wayne-5','https://www.yelp.com/biz/zen-sushi-la-crescenta-2','https://www.yelp.com/biz/the-summit-steakhouse-aurora-2','https://www.yelp.com/biz/miramar-bistro-highwood','https://www.yelp.com/biz/mick-o-sheas-baltimore','https://www.yelp.com/biz/dennys-houston-30','https://www.yelp.com/biz/carls-jr-henderson-5','https://www.yelp.com/biz/mexican-town-restaurant-detroit','https://www.yelp.com/biz/sushi-roku-las-vegas','https://www.yelp.com/biz/giant-pizza-king-san-diego','https://www.yelp.com/biz/quiznos-brooklyn-6','https://www.yelp.com/biz/taco-bell-glen-ellyn','https://www.yelp.com/biz/las-tortas-locas-marietta','https://www.yelp.com/biz/smith-and-wollensky-las-vegas-2','https://www.yelp.com/biz/happy-garden-chinese-brighton','https://www.yelp.com/biz/urban-foodie-feed-store-college-park','https://www.yelp.com/biz/the-wolf-oakland','https://www.yelp.com/biz/scuzzis-italian-restaurant-san-antonio-4','https://www.yelp.com/biz/better-gourmet-health-kitchen-staten-island','https://www.yelp.com/biz/the-restaurant-and-cafe-warren','https://www.yelp.com/biz/mcdonalds-houston-214','https:
//www.yelp.com/biz/pyeong-chang-tofu-house-oakland','https://www.yelp.com/biz/maria-rosa-pizzeria-and-family-restaurant-flemington','https://www.yelp.com/biz/legends-sports-bar-and-grill-roseville-2','https://www.yelp.com/biz/villa-reale-pizzeria-and-restaurant-pittsburgh','https://www.yelp.com/biz/the-terrace-cafe-venice','https://www.yelp.com/biz/the-oval-room-washington-2','https://www.yelp.com/biz/high-point-coal-center','https://www.yelp.com/biz/j-and-s-montebello','https://www.yelp.com/biz/cheers-restaurant-and-bar-fort-lauderdale'] | |
def _unquote(value):
    """Percent-decode *value* on both Python 2 and Python 3."""
    # Function-scope import: ``unquote`` lives in different modules on the
    # two major Python versions.
    try:
        from urllib.parse import unquote  # Python 3
    except ImportError:
        from urllib import unquote  # Python 2
    return unquote(value)


def _joined_text(nodes):
    """Concatenate XPath text nodes and trim the surrounding whitespace."""
    return ''.join(nodes).strip()


def _rows_to_pairs(rows, key_xpath, value_xpath):
    """Turn table-like rows into a list of single-entry ``{key: value}`` dicts."""
    return [
        {_joined_text(row.xpath(key_xpath)): _joined_text(row.xpath(value_xpath))}
        for row in rows
    ]


def _extract_website(raw_links):
    """Return the target URL hidden in Yelp's redirect link, or ''."""
    if not raw_links:
        return ''
    match = re.search(r"biz_redir\?url=(.*)&website_link", _unquote(raw_links[0]))
    return match.group(1) if match else ''


def _extract_coordinates(raw_map_links):
    """Return ``(latitude, longitude)`` strings from the map image URL, or ('', '')."""
    if not raw_map_links:
        return '', ''
    match = re.search(
        r"center=([+-]?\d+\.\d+,[+-]?\d+\.\d+)", _unquote(raw_map_links[0])
    )
    if not match:
        return '', ''
    latitude, longitude = match.group(1).split(',')
    return latitude, longitude


def _extract_rating(raw_titles):
    """Return the first number found in the rating title(s), or 0 if none."""
    # Accept whole numbers ("5 star rating") as well as decimals ("4.5 ...").
    match = re.search(r"\d+(?:[.,]\d+)?", _joined_text(raw_titles))
    return match.group(0) if match else 0


def parse_page(url):
    """Download a Yelp business page and extract its summary details.

    Args:
        url: Address of the business page to fetch.

    Returns:
        A dict with the keys ``working_hours``, ``info``,
        ``ratings_histogram`` (each a list of single-entry dicts), ``name``,
        ``phone``, ``ratings``, ``address``, ``health_rating``,
        ``price_range``, ``claimed_status``, ``reviews``, ``category``,
        ``website``, ``latitude``, ``longitude`` and ``url``.  Fields that
        cannot be found on the page are empty strings; ``ratings`` falls
        back to ``0``.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the request times out).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    # NOTE(review): verify=False disables TLS certificate validation; kept
    # because the file deliberately silences the matching warning.
    # The timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url, headers=headers, verify=False, timeout=30).text
    parser = html.fromstring(response)

    raw_name = parser.xpath("//h1[contains(@class,'page-title')]//text()")
    raw_claimed = parser.xpath("//span[contains(@class,'claim-status_icon--claimed')]/parent::div/text()")
    raw_reviews = parser.xpath("//div[contains(@class,'biz-main-info')]//span[contains(@class,'review-count rating-qualifier')]//text()")
    raw_category = parser.xpath('//div[contains(@class,"biz-page-header")]//span[@class="category-str-list"]//a/text()')
    hours_table = parser.xpath("//table[contains(@class,'hours-table')]//tr")
    details_table = parser.xpath("//div[@class='short-def-list']//dl")
    raw_map_link = parser.xpath("//a[@class='biz-map-directions']/img/@src")
    raw_phone = parser.xpath(".//span[@class='biz-phone']//text()")
    raw_address = parser.xpath('//div[@class="mapbox-text"]//div[contains(@class,"map-box-address")]//text()')
    raw_wbsite_link = parser.xpath("//span[contains(@class,'biz-website')]/a/@href")
    raw_price_range = parser.xpath("//dd[contains(@class,'price-description')]//text()")
    raw_health_rating = parser.xpath("//dd[contains(@class,'health-score-description')]//text()")
    rating_histogram = parser.xpath("//table[contains(@class,'histogram')]//tr[contains(@class,'histogram_row')]")
    raw_ratings = parser.xpath("//div[contains(@class,'biz-page-header')]//div[contains(@class,'rating')]/@title")

    latitude, longitude = _extract_coordinates(raw_map_link)

    return {
        'working_hours': _rows_to_pairs(hours_table, ".//th//text()", "./td//text()"),
        'info': _rows_to_pairs(details_table, './/dt//text()', './/dd//text()'),
        'ratings_histogram': _rows_to_pairs(
            rating_histogram,
            ".//th//text()",
            ".//td[@class='histogram_count']//text()",
        ),
        'name': _joined_text(raw_name),
        'phone': _joined_text(raw_phone),
        'ratings': _extract_rating(raw_ratings) if raw_ratings else 0,
        # Collapse every run of whitespace in the address to a single space.
        'address': ' '.join(' '.join(raw_address).split()),
        'health_rating': _joined_text(raw_health_rating),
        'price_range': _joined_text(raw_price_range),
        'claimed_status': _joined_text(raw_claimed),
        'reviews': _joined_text(raw_reviews),
        'category': ','.join(raw_category),
        'website': _extract_website(raw_wbsite_link),
        'latitude': latitude,
        'longitude': longitude,
        'url': url,
    }
def parse_reviews(url):
    """Download one page of a business's reviews and extract each review.

    Args:
        url: Address of the (paginated) review page to fetch.

    Returns:
        A list of ``[date, rating, content]`` lists, one per review block
        found on the page.  A field that is missing from a review block is
        returned as an empty string; blocks with none of the three fields
        are skipped.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the request times out).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0 Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0.'}
    # NOTE(review): verify=False disables TLS certificate validation; kept
    # because the file deliberately silences the matching warning.
    response = requests.get(url, headers=headers, verify=False, timeout=30).text
    parser = html.fromstring(response)
    review_nodes = parser.xpath("//div[contains(@class,'main-section')]//div[contains(@class,'review-list')]//div[contains(@class,'review')]//div[contains(@class,'review-content')]")

    ratings_zipped = []
    for node in review_nodes:
        dates = node.xpath("./div[contains(@class,'biz-rating')]//span[contains(@class,'rating-qualifier')]/text()")
        stars = node.xpath("./div[contains(@class,'biz-rating')]//div[contains(@class,'rating-large')]/@title")
        paragraphs = node.xpath("./p")
        if not (dates or stars or paragraphs):
            # Nothing review-like inside this node; do not emit an empty row.
            continue
        # Guard each lookup so one incomplete review cannot abort the page.
        date = dates[0].strip() if dates else ''
        rating = stars[0] if stars else ''
        content = paragraphs[0].text_content() if paragraphs else ''
        ratings_zipped.append([date, rating, content])

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print('Ratings scraped: {}'.format(len(ratings_zipped)))
    return ratings_zipped
def chunks(l, n):
    """Yield successive n-sized chunks from l.

    Sequences that support ``len()`` and slicing (lists, tuples, strings)
    are sliced, so each chunk has the same type as ``l``.  Any other
    iterable (for example a generator) is consumed lazily and each chunk is
    yielded as a list.  The final chunk may be shorter than ``n``.

    Args:
        l: Sequence or iterable to split.
        n: Maximum number of items per chunk; must be at least 1.

    Raises:
        ValueError: if ``n`` is smaller than 1.  As this is a generator, the
            error surfaces when iteration starts.
    """
    if n < 1:
        raise ValueError(
            'chunk size n must be a positive integer, got {!r}'.format(n)
        )
    try:
        total = len(l)
    except TypeError:
        # No len(): fall back to batching a plain iterator.
        from itertools import islice
        iterator = iter(l)
        while True:
            batch = list(islice(iterator, n))
            if not batch:
                return
            yield batch
    else:
        for start in range(0, total, n):
            yield l[start:start + n]
def parse_pagination(url):
    """Work out how many review offsets a business page spans.

    The page is fetched and the last whitespace-separated token of its
    "page-of-pages" text is read as the number of pages, which is multiplied
    by the page size of 20.

    Args:
        url: Address of the business page to fetch.

    Returns:
        The page count times 20, or 20 (a single page) when the pager text
        is missing or does not end in an integer.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the request times out).
    """
    reviews_per_page = 20
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(url)
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    # NOTE(review): verify=False disables TLS certificate validation; kept
    # because the file deliberately silences the matching warning.
    response = requests.get(url, headers=headers, verify=False, timeout=30)
    print(response)
    parser = html.fromstring(response.text)

    pager_texts = parser.xpath("//div[contains(@class,'page-of-pages')]//text()")
    # Use the first non-blank text node so leading whitespace-only nodes do
    # not hide the pager label.
    pager_label = next((text for text in pager_texts if text.strip()), '')
    try:
        page_count = int(pager_label.split()[-1])
    except (IndexError, ValueError):
        # No pager, or its last token is not a number: assume one page.
        page_count = 1

    results = page_count * reviews_per_page
    print(results)
    return results
def get_businesses_data(data):
    """Scrape the details and all review pages for each business URL.

    For every URL this calls ``parse_pagination`` to size the review
    listing, ``parse_page`` for the business details, and ``parse_reviews``
    once per 20-review offset, pausing half a second between review pages.

    Args:
        data: Iterable of business page URLs.

    Returns:
        A list with one dict per URL, holding the keys ``url``, ``info``,
        ``scraped_reviews`` and ``scraped_reviews_count``.

    Any exception raised by the helper functions propagates to the caller;
    results gathered so far are not returned in that case.
    """
    reviews_per_page = 20
    businesses = []
    start_time = time.time()
    for i, url in enumerate(data):
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print('Starting iteration: {}'.format(i))
        pagination = parse_pagination(url)
        print('Pagination: {}'.format(pagination))
        info = parse_page(url)

        # Drop any existing query string before adding the paging offset.
        base_url = url.split('?')[0]
        scraped_reviews = []
        for offset in range(0, pagination, reviews_per_page):
            paginated_url = '{}?start={}'.format(base_url, offset)
            print('Scraping Reviews: {}'.format(paginated_url))
            scraped_reviews.extend(parse_reviews(paginated_url))
            # Brief pause between requests to avoid hammering the server.
            time.sleep(.5)

        # Build a NEW dict per business: reusing one dict across iterations
        # would make every list entry alias the last business scraped.
        businesses.append({
            'url': url,
            'info': info,
            'scraped_reviews': scraped_reviews,
            'scraped_reviews_count': len(scraped_reviews),
        })
        print('Success iteration: {}'.format(i))
        print('Num of reviews: {}'.format(len(scraped_reviews)))
        print('')
        print('Time Elapsed: {}'.format(time.time() - start_time))
    return businesses
if __name__=="__main__":
    # Scrape one fixed-size slice of yelp_urls per run; ``index`` picks
    # which slice (0 is the first ``size`` URLs, 1 the next, and so on).
    index = 5
    size = 20
    # Derive the offset from ``size`` so the two can never drift apart.
    i = index * size
    output_path = 'results/run_3/output_{}.json'.format(i)

    # Make sure the output directory exists BEFORE the long scrape starts,
    # so the results are not lost to a failing open() at the very end.
    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    chunk = yelp_urls[i:i + size]
    businesses = get_businesses_data(chunk)
    with open(output_path, 'w') as f:
        json.dump(businesses, f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment