Last active
June 5, 2017 04:36
-
-
Save vishwanath79/eb9e4e4e821b3ae02e2ecadd8326a70a to your computer and use it in GitHub Desktop.
Extract hotel data from TripAdvisor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from datetime import datetime | |
from time import time | |
from lxml import html, etree | |
import requests, re | |
import os, sys | |
import unicodecsv as csv | |
import argparse | |
def _extract_review_count(raw_texts):
    """Return the first number in the review-count text nodes as a digit string.

    Thousands separators are removed ("1,234 reviews" -> "1234"). Returns
    None when the text contains no digits at all.
    """
    match = re.search(r'\d[\d,]*', ''.join(raw_texts))
    return match.group(0).replace(',', '') if match else None


def parse(locality, checkin_date, checkout_date, sort):
    """Scrape the TripAdvisor hotel listing for a locality and a stay period.

    The locality is first resolved through TripAdvisor's type-ahead endpoint;
    the first suggestion supplies the geo id and the referer URL used for the
    hotel search request.

    Args:
        locality: free-text place name to search for.
        checkin_date: check-in date; must provide ``strftime``.
        checkout_date: check-out date; must provide ``strftime``.
        sort: value sent as the ``displayedSortOrder`` form field.

    Returns:
        A list with one dict per hotel found. Each dict has the keys
        hotel_name, url, locality, reviews, tripadvisor_rating, checkOut,
        checkIn, hotel_features, price_per_night, no_of_deals and
        booking_provider. Values that could not be extracted are None
        (hotel_features is an empty string instead). The list is empty when
        the type-ahead endpoint returns no suggestion for the locality.
    """
    # XPath expressions, evaluated relative to each hotel node.
    XPATH_HOTEL_LINK = './/div[@class="listing_title"]/a/@href'
    XPATH_REVIEWS = './/span[@class="more review_count"]//text()'
    XPATH_RATING = './/div[@class="rating"]//span[contains(@class,"bubble_rating")]/@alt'
    XPATH_HOTEL_NAME = './/a[contains(@class,"property_title")]//text()'
    XPATH_HOTEL_FEATURES = './/a[contains(@class,"tag")]/text()'
    XPATH_HOTEL_PRICE = './/div[contains(@class,"price")]/text()'
    XPATH_VIEW_DEALS = './/div[contains(@id,"VIEW_ALL_DEALS")]//span[@class="taLnk"]/text()'
    XPATH_BOOKING_PROVIDER = './/div[contains(@class,"providerLogo")]/img/@alt'

    # Seconds to wait on each HTTP call so a stalled connection cannot hang
    # the scraper forever.
    request_timeout = 30

    checkIn = checkin_date.strftime("%Y/%m/%d")
    checkOut = checkout_date.strftime("%Y/%m/%d")
    print("Scraper Inititated for Locality:%s" % locality)

    # TripAdvisor renders its autocomplete list from this endpoint.
    print("Finding search result page URL")
    # Passing the query through `params` lets requests URL-encode it, so
    # localities containing '&', '#', '+' etc. no longer corrupt the URL.
    geo_params = {
        'action': 'API',
        'startTime': str(int(time())),
        'uiOrigin': 'GEOSCOPE',
        'source': 'GEOSCOPE',
        'interleaved': 'true',
        'types': 'geo,theme_park',
        'neighborhood_geos': 'true',
        'link_type': 'hotel',
        'details': 'true',
        'max': '12',
        'injectNeighborhoods': 'true',
        'query': locality,
    }
    api_response = requests.get(
        'https://www.tripadvisor.com/TypeAheadJson',
        params=geo_params,
        timeout=request_timeout,
    ).json()

    results = api_response.get('results') or []
    if not results:
        # Nothing to scrape: report it instead of failing on results[0].
        print("No search results found for Locality:%s" % locality)
        return []

    # Take the TripAdvisor URL and geo id from the first autocomplete result.
    best_match = results[0]
    url_from_autocomplete = "http://www.tripadvisor.com" + best_match['url']
    print('URL found %s' % url_from_autocomplete)
    geo = best_match['value']

    # Stay period in the form expected by the 'staydates' form field.
    date = checkin_date.strftime("%Y_%m_%d") + "_" + checkout_date.strftime("%Y_%m_%d")

    # Form data to get the hotels list from TripAdvisor for the selected dates.
    form_data = {
        'adults': '2',
        'dateBumped': 'NONE',
        'displayedSortOrder': sort,
        'geo': geo,
        'hs': '',
        'isFirstPageLoad': 'false',
        'rad': '0',
        'refineForm': 'true',
        'requestingServlet': 'Hotels',
        'rooms': '1',
        'scid': 'null_coupon',
        'searchAll': 'false',
        'seen': '0',
        'sequence': '7',
        'o': "0",
        'staydates': date
    }
    # Referer is necessary to get the correct response from TripAdvisor; if it
    # is not provided they redirect to the home page.
    headers = {
        'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'en-US,en;q=0.5',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
        'Host': 'www.tripadvisor.com',
        'Pragma': 'no-cache',
        'Referer': url_from_autocomplete,
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0',
        'X-Requested-With': 'XMLHttpRequest'
    }
    print("Downloading search results page")
    page_response = requests.post(
        url="https://www.tripadvisor.com/Hotels",
        data=form_data,
        headers=headers,
        timeout=request_timeout,
    ).text

    print("Parsing results ")
    parser = html.fromstring(page_response)
    hotel_lists = parser.xpath('//div[contains(@class,"hotel_content easyClear sem")]')

    hotel_data = []
    for hotel in hotel_lists:
        raw_booking_provider = hotel.xpath(XPATH_BOOKING_PROVIDER)
        raw_no_of_deals = hotel.xpath(XPATH_VIEW_DEALS)
        raw_hotel_link = hotel.xpath(XPATH_HOTEL_LINK)
        raw_no_of_reviews = hotel.xpath(XPATH_REVIEWS)
        raw_rating = hotel.xpath(XPATH_RATING)
        raw_hotel_name = hotel.xpath(XPATH_HOTEL_NAME)
        raw_hotel_features = hotel.xpath(XPATH_HOTEL_FEATURES)
        raw_hotel_price_per_night = hotel.xpath(XPATH_HOTEL_PRICE)

        url = 'http://www.tripadvisor.com' + raw_hotel_link[0] if raw_hotel_link else None
        # Handles single-digit counts and multiple thousands separators,
        # and yields None instead of raising when no number is present.
        reviews = _extract_review_count(raw_no_of_reviews) if raw_no_of_reviews else None
        rating = ''.join(raw_rating).replace(' of 5 bubbles', '') if raw_rating else None
        name = ''.join(raw_hotel_name).strip() if raw_hotel_name else None
        hotel_features = ','.join(raw_hotel_features)
        print("name is ", name)
        price_per_night = ''.join(raw_hotel_price_per_night).replace('\n', '') if raw_hotel_price_per_night else None
        # Keep only the digits of the "view all N deals" text.
        no_of_deals = re.sub(r'\D+', '', ''.join(raw_no_of_deals)) if raw_no_of_deals else None
        booking_provider = ''.join(raw_booking_provider).strip() if raw_booking_provider else None

        hotel_data.append({
            'hotel_name': name,
            'url': url,
            'locality': locality,
            'reviews': reviews,
            'tripadvisor_rating': rating,
            'checkOut': checkOut,
            'checkIn': checkIn,
            'hotel_features': hotel_features,
            'price_per_night': price_per_night,
            'no_of_deals': no_of_deals,
            'booking_provider': booking_provider
        })
    return hotel_data
if __name__ == '__main__':
    # Command-line entry point: validate the stay period, scrape the hotel
    # listing for the locality and write the rows to tripadvisor_data.csv.
    parser = argparse.ArgumentParser()
    parser.add_argument('checkin_date', help='Hotel Check In Date (Format: YYYY/MM/DD')
    parser.add_argument('checkout_date', help='Hotel Chek Out Date (Format: YYYY/MM/DD)')
    sortorder_help = """
    available sort orders are :\n
    priceLow - hotels with lowest price,
    distLow : Hotels located near to the search center,
    recommended: highest rated hotels based on traveler reviews,
    popularity :Most popular hotels as chosen by Tipadvisor users
    """
    # 'sort' is a required positional, so argparse never falls back to a
    # default for it; the previous dead default has been dropped.
    parser.add_argument('sort', help=sortorder_help)
    parser.add_argument('locality', help='Search Locality')
    args = parser.parse_args()

    locality = args.locality
    sort = args.sort
    # Parse once and compare the datetime objects directly.
    checkin_date = datetime.strptime(args.checkin_date, "%Y/%m/%d")
    checkout_date = datetime.strptime(args.checkout_date, "%Y/%m/%d")
    today = datetime.now()

    if today >= checkin_date or today > checkout_date:
        # The entered date has already passed.
        print("Invalid Checkin date: Please enter a valid checkin and checkout dates,entered date is already passed")
    elif checkin_date >= checkout_date:
        # Also covers check-in == check-out, which previously matched no
        # branch and made the script exit silently without doing anything.
        print("Invalid Checkin date: CheckIn date must be less than checkOut date")
    else:
        data = parse(locality, checkin_date, checkout_date, sort)
        print("Writing to output file tripadvisor_data.csv")
        # unicodecsv encodes the rows itself, hence the binary file mode.
        with open('tripadvisor_data.csv', 'wb') as csvfile:
            fieldnames = ['hotel_name', 'url', 'locality', 'reviews', 'tripadvisor_rating', 'checkIn', 'checkOut',
                          'price_per_night', 'booking_provider', 'no_of_deals', 'hotel_features']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for row in data:
                writer.writerow(row)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import html | |
import requests | |
from collections import OrderedDict | |
import pprint | |
import json | |
import argparse | |
def parse(url):
    """Fetch a TripAdvisor hotel page and extract its details.

    Args:
        url: address of the hotel page to download.

    Returns:
        A dict with the keys address (a dict with street_address, locality,
        zipcode and country), ratings and amenities (OrderedDicts in page
        order), official_description, additional_info, rating, review_count,
        name, rank and highlights. Scalar fields that could not be found on
        the page are None; highlights is an empty string in that case.
    """
    XPATH_RATING = '//div[@id="ratingFilter"]//ul//li'
    XPATH_NAME = '//h1[@id="HEADING"]//text()'
    XPATH_HOTEL_RATING = '//span[@property="ratingValue"]//@content'
    XPATH_REVIEWS = '//a[@property="reviewCount"]/@content'
    XPATH_RANK = '//div[contains(@class,"popRanking")]//text()'
    XPATH_STREET_ADDRESS = "//span[@class='street-address']//text()"
    XPATH_LOCALITY = '//span[@class="format_address"]//span[@class="locality"]//span[@property="v:locality"]//text()'
    XPATH_ZIP = '//span[@property="v:postal-code"]//text()'
    XPATH_COUNTRY = '//span[@class="country-name"]/@content'
    XPATH_AMENITIES = '//div[@id="AMENITIES_TAB"]//div[contains(@class,"amenity_row")]'
    XPATH_HIGHLIGHTS = '//div[@class="property_tags_wrap"]//li//text()'
    XPATH_OFFICIAL_DESCRIPTION = '//div[contains(@class,"additional_info")]//span[contains(@class,"tabs_descriptive_text")]//text()'
    XPATH_ADDITIONAL_INFO = '//div[@class="additional_info_amenities"]//div[@class="content"]//text()'
    # Evaluated relative to each rating / amenity row.
    XPATH_RATING_KEY = './/div[@class="row_label"]//text()'
    XPATH_RATING_VALUE = './/span[@class="row_bar"]/following-sibling::span//text()'
    XPATH_AMENITY_KEY = './/div[@class="amenity_hdr"]//text()'
    XPATH_AMENITY_VALUE = './/div[@class="amenity_lst"]//li/text()'

    # Function-call form of print runs on both Python 2 and Python 3; the
    # former print statement was a SyntaxError on Python 3.
    print("Fetching " + url)
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, timeout=30).text
    parser = html.fromstring(response)

    ratings = parser.xpath(XPATH_RATING)
    raw_name = parser.xpath(XPATH_NAME)
    raw_rank = parser.xpath(XPATH_RANK)
    raw_street_address = parser.xpath(XPATH_STREET_ADDRESS)
    raw_locality = parser.xpath(XPATH_LOCALITY)
    raw_zipcode = parser.xpath(XPATH_ZIP)
    raw_country = parser.xpath(XPATH_COUNTRY)
    raw_review_count = parser.xpath(XPATH_REVIEWS)
    raw_rating = parser.xpath(XPATH_HOTEL_RATING)
    amenities = parser.xpath(XPATH_AMENITIES)
    raw_highlights = parser.xpath(XPATH_HIGHLIGHTS)
    raw_official_description = parser.xpath(XPATH_OFFICIAL_DESCRIPTION)
    raw_additional_info = parser.xpath(XPATH_ADDITIONAL_INFO)

    name = ''.join(raw_name).strip() if raw_name else None
    rank = ''.join(raw_rank).strip() if raw_rank else None
    street_address = raw_street_address[0].strip() if raw_street_address else None
    locality = raw_locality[0].strip() if raw_locality else None
    zipcode = ''.join(raw_zipcode).strip() if raw_zipcode else None
    country = raw_country[0].strip() if raw_country else None
    review_count = ''.join(raw_review_count).strip() if raw_review_count else None
    hotel_rating = ''.join(raw_rating).strip() if raw_rating else None
    # split()/join collapses every run of whitespace into a single space.
    official_description = ' '.join(' '.join(raw_official_description).split()) if raw_official_description else None
    additional_info = ' '.join(''.join(raw_additional_info).split()) if raw_additional_info else None
    # Drop text nodes that are nothing but a newline before joining.
    cleaned_highlights = [text for text in raw_highlights if text != '\n']
    highlights = ','.join(cleaned_highlights).replace('\n', '')

    # OrderedDict preserves the order in which the site lists the entries.
    ratings_dict = OrderedDict()
    for rating in ratings:
        cleaned_rating_key = ''.join(rating.xpath(XPATH_RATING_KEY)).replace('\n', '')
        cleaned_rating_value = ''.join(rating.xpath(XPATH_RATING_VALUE)).replace('\n', '')
        ratings_dict[cleaned_rating_key] = cleaned_rating_value

    amenity_dict = OrderedDict()
    for amenity in amenities:
        raw_amenity_key = amenity.xpath(XPATH_AMENITY_KEY)
        raw_amenity_value = amenity.xpath(XPATH_AMENITY_VALUE)
        # Skip text nodes that consist of a single space.
        cleaned_amenity_value = [text for text in raw_amenity_value if text != ' ']
        amenity_key = ''.join(raw_amenity_key).replace('\n', '')
        amenity_value = ' ,'.join(cleaned_amenity_value).replace('\n', '')
        amenity_dict[amenity_key] = amenity_value

    address = {
        'street_address': street_address,
        'locality': locality,
        'zipcode': zipcode,
        'country': country
    }
    data = {
        'address': address,
        'ratings': ratings_dict,
        'amenities': amenity_dict,
        'official_description': official_description,
        'additional_info': additional_info,
        'rating': hotel_rating,
        'review_count': review_count,
        'name': name,
        'rank': rank,
        'highlights': highlights
    }
    return data
if __name__ == '__main__':
    # Command-line entry point: scrape the hotel page given on the command
    # line and store the extracted details as indented JSON.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('url', help='Tripadvisor hotel url')
    hotel_url = arg_parser.parse_args().url

    hotel_details = parse(hotel_url)

    with open('tripadvisor_hotel_scraped_data.json', 'w') as output_file:
        json.dump(hotel_details, output_file, indent=4)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Updated tripadvisor_scraper.py to work on Python 3.5.2.