vishwanath79 · June 5, 2017 04:36 · vishwanath79 · Jun 5, 2017
diff --git a/tripadvisor_scraper.py b/tripadvisor_scraper.py
 #!/usr/bin/env python
 from datetime import datetime
 from time import time
 from lxml import html, etree
 import requests, re
 import os, sys
 import unicodecsv as csv
 import argparse


 def parse(locality, checkin_date, checkout_date, sort):
    """Scrape the TripAdvisor hotel search results for a locality and stay dates.

    Resolves ``locality`` through the TypeAheadJson autocomplete endpoint,
    posts a hotel search form for the first suggestion returned, and extracts
    one record per listing found in the HTML response.

    Args:
        locality: Free-text place name to search for.
        checkin_date: Check-in day; must provide ``strftime``.
        checkout_date: Check-out day; must provide ``strftime``.
        sort: Value sent as ``displayedSortOrder`` in the search form.

    Returns:
        A list of dicts, one per listing, with the keys ``hotel_name``,
        ``url``, ``locality``, ``reviews``, ``tripadvisor_rating``,
        ``checkIn``, ``checkOut``, ``hotel_features``, ``price_per_night``,
        ``no_of_deals`` and ``booking_provider``.  Values that cannot be
        found in a listing are ``None`` (``hotel_features`` is an empty
        string instead).  An empty list is returned when the autocomplete
        lookup yields no suggestions.
    """
    checkIn = checkin_date.strftime("%Y/%m/%d")
    checkOut = checkout_date.strftime("%Y/%m/%d")
    print("Scraper Inititated for Locality:%s" % locality)
    # TripAdvisor renders its autocomplete list from this API.
    print("Finding search result page URL")
    geo_url = ('https://www.tripadvisor.com/TypeAheadJson?action=API&startTime=' + str(int(time()))
               + '&uiOrigin=GEOSCOPE&source=GEOSCOPE&interleaved=true&types=geo,theme_park'
               + '&neighborhood_geos=true&link_type=hotel&details=true&max=12&injectNeighborhoods=true')
    # The user-supplied query goes through ``params`` so that requests
    # URL-encodes it; characters such as '&' or '#' would otherwise corrupt
    # the query string.
    api_response = requests.get(geo_url, params={'query': locality}).json()
    results = api_response['results']
    if not results:
        # Nothing to scrape: report it instead of failing with an IndexError.
        print("No results found for locality:%s" % locality)
        return []

    # The first autocomplete suggestion supplies both the page URL and the geo id.
    url_from_autocomplete = "http://www.tripadvisor.com" + results[0]['url']
    print('URL found %s' % url_from_autocomplete)
    geo = results[0]['value']

    # Stay dates in the underscore-separated form sent as ``staydates``.
    date = checkin_date.strftime("%Y_%m_%d") + "_" + checkout_date.strftime("%Y_%m_%d")
    # Form data to get the hotels list from TripAdvisor for the selected dates.
    form_data = {
        'adults': '2',
        'dateBumped': 'NONE',
        'displayedSortOrder': sort,
        'geo': geo,
        'hs': '',
        'isFirstPageLoad': 'false',
        'rad': '0',
        'refineForm': 'true',
        'requestingServlet': 'Hotels',
        'rooms': '1',
        'scid': 'null_coupon',
        'searchAll': 'false',
        'seen': '0',
        'sequence': '7',
        'o': "0",
        'staydates': date
    }
    # The Referer header is necessary to get the correct response from
    # TripAdvisor; if it is not provided they redirect to the home page.
    headers = {
        'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'en-US,en;q=0.5',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
        'Host': 'www.tripadvisor.com',
        'Pragma': 'no-cache',
        'Referer': url_from_autocomplete,
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0',
        'X-Requested-With': 'XMLHttpRequest'
    }
    print("Downloading search results page")
    page_response = requests.post(url="https://www.tripadvisor.com/Hotels", data=form_data, headers=headers).text
    print("Parsing results ")
    parser = html.fromstring(page_response)
    hotel_lists = parser.xpath('//div[contains(@class,"hotel_content easyClear sem")]')

    # Per-listing XPath expressions; they do not change between iterations.
    XPATH_HOTEL_LINK = './/div[@class="listing_title"]/a/@href'
    XPATH_REVIEWS = './/span[@class="more review_count"]//text()'
    XPATH_RATING = './/div[@class="rating"]//span[contains(@class,"bubble_rating")]/@alt'
    XPATH_HOTEL_NAME = './/a[contains(@class,"property_title")]//text()'
    XPATH_HOTEL_FEATURES = './/a[contains(@class,"tag")]/text()'
    XPATH_HOTEL_PRICE = './/div[contains(@class,"price")]/text()'
    XPATH_VIEW_DEALS = './/div[contains(@id,"VIEW_ALL_DEALS")]//span[@class="taLnk"]/text()'
    XPATH_BOOKING_PROVIDER = './/div[contains(@class,"providerLogo")]/img/@alt'

    # Digits with optional thousands separators.  Unlike the previous
    # pattern this also matches single-digit counts ("5 reviews") and counts
    # with more than one comma ("1,234,567 reviews").
    review_count_pattern = re.compile(r'\d+(?:,\d+)*')
    non_digit_pattern = re.compile(r'\D+')

    hotel_data = []
    for hotel in hotel_lists:
        raw_booking_provider = hotel.xpath(XPATH_BOOKING_PROVIDER)
        raw_no_of_deals = hotel.xpath(XPATH_VIEW_DEALS)
        raw_hotel_link = hotel.xpath(XPATH_HOTEL_LINK)
        raw_no_of_reviews = hotel.xpath(XPATH_REVIEWS)
        raw_rating = hotel.xpath(XPATH_RATING)
        raw_hotel_name = hotel.xpath(XPATH_HOTEL_NAME)
        raw_hotel_features = hotel.xpath(XPATH_HOTEL_FEATURES)
        raw_hotel_price_per_night = hotel.xpath(XPATH_HOTEL_PRICE)

        url = 'http://www.tripadvisor.com' + raw_hotel_link[0] if raw_hotel_link else None

        # A review label without any digits yields None rather than an IndexError.
        review_match = review_count_pattern.search(raw_no_of_reviews[0]) if raw_no_of_reviews else None
        reviews = review_match.group(0).replace(',', '') if review_match else None
        rating = ''.join(raw_rating).replace(' of 5 bubbles', '') if raw_rating else None
        name = ''.join(raw_hotel_name).strip() if raw_hotel_name else None
        hotel_features = ','.join(raw_hotel_features)
        print("name is ", name)
        price_per_night = ''.join(raw_hotel_price_per_night).replace('\n', '') if raw_hotel_price_per_night else None
        # Keep only the digits of the "view all N deals" link text.
        no_of_deals = non_digit_pattern.sub('', ''.join(raw_no_of_deals)) if raw_no_of_deals else None
        booking_provider = ''.join(raw_booking_provider).strip() if raw_booking_provider else None

        data = {
            'hotel_name': name,
            'url': url,
            'locality': locality,
            'reviews': reviews,
            'tripadvisor_rating': rating,
            'checkOut': checkOut,
            'checkIn': checkIn,
            'hotel_features': hotel_features,
            'price_per_night': price_per_night,
            'no_of_deals': no_of_deals,
            'booking_provider': booking_provider
        }
        hotel_data.append(data)
    return hotel_data


 if __name__ == '__main__':
    # Command-line entry point: validate the stay dates, scrape the hotel
    # listing for the locality and write the records to tripadvisor_data.csv.
    parser = argparse.ArgumentParser()
    parser.add_argument('checkin_date', help='Hotel Check In Date (Format: YYYY/MM/DD')
    parser.add_argument('checkout_date', help='Hotel Chek Out Date (Format: YYYY/MM/DD)')
    sortorder_help = """
    available sort orders are :\n
    priceLow - hotels with lowest price,
    distLow : Hotels located near to the search center,
    recommended: highest rated hotels based on traveler reviews,
    popularity :Most popular hotels as chosen by Tipadvisor users 
    """
    # argparse ignores ``default`` on a required positional; nargs='?' makes
    # the sort order genuinely optional so the default can apply.  Passing all
    # four arguments keeps working exactly as before.
    parser.add_argument('sort', nargs='?', help=sortorder_help, default='popularity')
    parser.add_argument('locality', help='Search Locality')
    args = parser.parse_args()
    locality = args.locality
    sort = args.sort
    checkin_date = datetime.strptime(args.checkin_date, "%Y/%m/%d")
    checkout_date = datetime.strptime(args.checkout_date, "%Y/%m/%d")
    today = datetime.now()

    # Reject dates that have already passed.
    if checkin_date <= today or checkout_date <= today:
        print("Invalid Checkin date: Please enter a valid checkin and checkout dates,entered date is already passed")

    # Check-in must be strictly before check-out; identical dates used to
    # fall through every branch and exit without any message.
    elif checkin_date >= checkout_date:
        print("Invalid Checkin date: CheckIn date must be less than checkOut date")

    else:
        data = parse(locality, checkin_date, checkout_date, sort)

        print("Writing to output file tripadvisor_data.csv")
        # ``csv`` is unicodecsv here, which is why the file is opened in binary mode.
        with open('tripadvisor_data.csv', 'wb') as csvfile:
            fieldnames = ['hotel_name', 'url', 'locality', 'reviews', 'tripadvisor_rating', 'checkIn', 'checkOut',
                          'price_per_night', 'booking_provider', 'no_of_deals', 'hotel_features']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for row in data:
                writer.writerow(row)
diff --git a/tripadvisor_scraper_hotel.py b/tripadvisor_scraper_hotel.py
 from lxml import html
 import requests
 from collections import OrderedDict
 import pprint
 import json
 import argparse

 def parse(url):
 	"""Scrape the details of a single TripAdvisor hotel page.

 	Downloads ``url``, parses the HTML and extracts the hotel's name,
 	rank, address, rating, review count, highlights, descriptions, the
 	per-category rating breakdown and the amenities.

 	Args:
 		url: Address of the hotel page to fetch.

 	Returns:
 		A dict with the keys ``address`` (a dict holding
 		``street_address``, ``locality``, ``zipcode`` and ``country``),
 		``ratings`` and ``amenities`` (OrderedDicts in page order),
 		``official_description``, ``additional_info``, ``rating``,
 		``review_count``, ``name``, ``rank`` and ``highlights``.  Scalar
 		values that are not found on the page are ``None``
 		(``highlights`` is an empty string instead).
 	"""
 	# Parenthesised form so the module is valid on both Python 2 and 3.
 	print("Fetching "+url)
 	response = requests.get(url).text
 	parser = html.fromstring(response)

 	XPATH_RATING = '//div[@id="ratingFilter"]//ul//li'
 	XPATH_NAME = '//h1[@id="HEADING"]//text()'
 	XPATH_HOTEL_RATING = '//span[@property="ratingValue"]//@content'
 	XPATH_REVIEWS = '//a[@property="reviewCount"]/@content'
 	XPATH_RANK = '//div[contains(@class,"popRanking")]//text()'
 	XPATH_STREET_ADDRESS = "//span[@class='street-address']//text()"
 	XPATH_LOCALITY = '//span[@class="format_address"]//span[@class="locality"]//span[@property="v:locality"]//text()'
 	XPATH_ZIP = '//span[@property="v:postal-code"]//text()'
 	XPATH_COUNTRY = '//span[@class="country-name"]/@content'
 	XPATH_AMENITIES = '//div[@id="AMENITIES_TAB"]//div[contains(@class,"amenity_row")]'
 	XPATH_HIGHLIGHTS = '//div[@class="property_tags_wrap"]//li//text()'
 	XPATH_OFFICIAL_DESCRIPTION = '//div[contains(@class,"additional_info")]//span[contains(@class,"tabs_descriptive_text")]//text()'
 	XPATH_ADDITIONAL_INFO = '//div[@class="additional_info_amenities"]//div[@class="content"]//text()'
 	# Relative expressions evaluated against each rating / amenity row.
 	XPATH_RATING_KEY = './/div[@class="row_label"]//text()'
 	XPATH_RATING_VALUE = './/span[@class="row_bar"]/following-sibling::span//text()'
 	XPATH_AMENITY_KEY = './/div[@class="amenity_hdr"]//text()'
 	XPATH_AMENITY_VALUE = './/div[@class="amenity_lst"]//li/text()'

 	ratings = parser.xpath(XPATH_RATING)
 	raw_name = parser.xpath(XPATH_NAME)
 	raw_rank = parser.xpath(XPATH_RANK)
 	raw_street_address = parser.xpath(XPATH_STREET_ADDRESS)
 	raw_locality = parser.xpath(XPATH_LOCALITY)
 	raw_zipcode = parser.xpath(XPATH_ZIP)
 	raw_country = parser.xpath(XPATH_COUNTRY)
 	raw_review_count = parser.xpath(XPATH_REVIEWS)
 	raw_rating = parser.xpath(XPATH_HOTEL_RATING)
 	amenities = parser.xpath(XPATH_AMENITIES)
 	raw_highlights = parser.xpath(XPATH_HIGHLIGHTS)
 	raw_official_description = parser.xpath(XPATH_OFFICIAL_DESCRIPTION)
 	raw_additional_info = parser.xpath(XPATH_ADDITIONAL_INFO)

 	name = ''.join(raw_name).strip() if raw_name else None
 	rank = ''.join(raw_rank).strip() if raw_rank else None
 	street_address = raw_street_address[0].strip() if raw_street_address else None
 	locality = raw_locality[0].strip() if raw_locality else None
 	zipcode = ''.join(raw_zipcode).strip() if raw_zipcode else None
 	country = raw_country[0].strip() if raw_country else None
 	review_count = ''.join(raw_review_count).strip() if raw_review_count else None
 	hotel_rating = ''.join(raw_rating).strip() if raw_rating else None
 	# split()/join collapses every run of whitespace into a single space.
 	official_description = ' '.join(' '.join(raw_official_description).split()) if raw_official_description else None
 	additional_info = ' '.join(''.join(raw_additional_info).split()) if raw_additional_info else None
 	# Drop text nodes that consist of a lone newline before joining.
 	cleaned_highlights = [text for text in raw_highlights if text != '\n']
 	highlights = ','.join(cleaned_highlights).replace('\n','')

 	# OrderedDict preserves the order in which the site lists the entries.
 	ratings_dict = OrderedDict()
 	for rating in ratings:
 		cleaned_rating_key = ''.join(rating.xpath(XPATH_RATING_KEY)).replace('\n','')
 		cleaned_rating_value = ''.join(rating.xpath(XPATH_RATING_VALUE)).replace('\n','')
 		ratings_dict[cleaned_rating_key] = cleaned_rating_value

 	amenity_dict = OrderedDict()
 	for amenity in amenities:
 		raw_amenity_key = amenity.xpath(XPATH_AMENITY_KEY)
 		# Ignore list items whose text is a single space.
 		cleaned_amenity_value = [text for text in amenity.xpath(XPATH_AMENITY_VALUE) if text != ' ']

 		amenity_key = ''.join(raw_amenity_key).replace('\n','')
 		amenity_value = ' ,'.join(cleaned_amenity_value).replace('\n','')
 		amenity_dict[amenity_key] = amenity_value

 	address = {
 		'street_address':street_address,
 		'locality':locality,
 		'zipcode':zipcode,
 		'country':country
 	}

 	data = {
 		'address':address,
 		'ratings':ratings_dict,
 		'amenities':amenity_dict,
 		'official_description':official_description,
 		'additional_info':additional_info,
 		'rating':hotel_rating,
 		'review_count':review_count,
 		'name':name,
 		'rank':rank,
 		'highlights':highlights
 	}

 	return data

 if __name__ == '__main__':
 	# Command-line entry point: scrape the hotel page given as the single
 	# positional argument and dump the result as indented JSON.
 	cli = argparse.ArgumentParser()
 	cli.add_argument('url', help='Tripadvisor hotel url')
 	hotel_details = parse(cli.parse_args().url)
 	with open('tripadvisor_hotel_scraped_data.json', 'w') as output_file:
 		json.dump(hotel_details, output_file, indent=4)
	#!/usr/bin/env python
	from datetime import datetime
	from time import time
	from lxml import html, etree
	import requests, re
	import os, sys
	import unicodecsv as csv
	import argparse


	def parse(locality, checkin_date, checkout_date, sort):
	    """Scrape the TripAdvisor hotel search results for a locality and stay dates.

	    Resolves ``locality`` through the TypeAheadJson autocomplete endpoint,
	    posts a hotel search form for the first suggestion returned, and extracts
	    one record per listing found in the HTML response.

	    Args:
	        locality: Free-text place name to search for.
	        checkin_date: Check-in day; must provide ``strftime``.
	        checkout_date: Check-out day; must provide ``strftime``.
	        sort: Value sent as ``displayedSortOrder`` in the search form.

	    Returns:
	        A list of dicts, one per listing, with the keys ``hotel_name``,
	        ``url``, ``locality``, ``reviews``, ``tripadvisor_rating``,
	        ``checkIn``, ``checkOut``, ``hotel_features``, ``price_per_night``,
	        ``no_of_deals`` and ``booking_provider``.  Values that cannot be
	        found in a listing are ``None`` (``hotel_features`` is an empty
	        string instead).  An empty list is returned when the autocomplete
	        lookup yields no suggestions.
	    """
	    checkIn = checkin_date.strftime("%Y/%m/%d")
	    checkOut = checkout_date.strftime("%Y/%m/%d")
	    print("Scraper Inititated for Locality:%s" % locality)
	    # TripAdvisor renders its autocomplete list from this API.
	    print("Finding search result page URL")
	    geo_url = ('https://www.tripadvisor.com/TypeAheadJson?action=API&startTime=' + str(int(time()))
	               + '&uiOrigin=GEOSCOPE&source=GEOSCOPE&interleaved=true&types=geo,theme_park'
	               + '&neighborhood_geos=true&link_type=hotel&details=true&max=12&injectNeighborhoods=true')
	    # The user-supplied query goes through ``params`` so that requests
	    # URL-encodes it; characters such as '&' or '#' would otherwise corrupt
	    # the query string.
	    api_response = requests.get(geo_url, params={'query': locality}).json()
	    results = api_response['results']
	    if not results:
	        # Nothing to scrape: report it instead of failing with an IndexError.
	        print("No results found for locality:%s" % locality)
	        return []

	    # The first autocomplete suggestion supplies both the page URL and the geo id.
	    url_from_autocomplete = "http://www.tripadvisor.com" + results[0]['url']
	    print('URL found %s' % url_from_autocomplete)
	    geo = results[0]['value']

	    # Stay dates in the underscore-separated form sent as ``staydates``.
	    date = checkin_date.strftime("%Y_%m_%d") + "_" + checkout_date.strftime("%Y_%m_%d")
	    # Form data to get the hotels list from TripAdvisor for the selected dates.
	    form_data = {
	        'adults': '2',
	        'dateBumped': 'NONE',
	        'displayedSortOrder': sort,
	        'geo': geo,
	        'hs': '',
	        'isFirstPageLoad': 'false',
	        'rad': '0',
	        'refineForm': 'true',
	        'requestingServlet': 'Hotels',
	        'rooms': '1',
	        'scid': 'null_coupon',
	        'searchAll': 'false',
	        'seen': '0',
	        'sequence': '7',
	        'o': "0",
	        'staydates': date
	    }
	    # The Referer header is necessary to get the correct response from
	    # TripAdvisor; if it is not provided they redirect to the home page.
	    headers = {
	        # The catch-all media range is "*/*"; this copy had it garbled to "/".
	        'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
	        'Accept-Encoding': 'gzip,deflate',
	        'Accept-Language': 'en-US,en;q=0.5',
	        'Cache-Control': 'no-cache',
	        'Connection': 'keep-alive',
	        'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
	        'Host': 'www.tripadvisor.com',
	        'Pragma': 'no-cache',
	        'Referer': url_from_autocomplete,
	        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0',
	        'X-Requested-With': 'XMLHttpRequest'
	    }
	    print("Downloading search results page")
	    page_response = requests.post(url="https://www.tripadvisor.com/Hotels", data=form_data, headers=headers).text
	    print("Parsing results ")
	    parser = html.fromstring(page_response)
	    hotel_lists = parser.xpath('//div[contains(@class,"hotel_content easyClear sem")]')

	    # Per-listing XPath expressions; they do not change between iterations.
	    XPATH_HOTEL_LINK = './/div[@class="listing_title"]/a/@href'
	    XPATH_REVIEWS = './/span[@class="more review_count"]//text()'
	    XPATH_RATING = './/div[@class="rating"]//span[contains(@class,"bubble_rating")]/@alt'
	    XPATH_HOTEL_NAME = './/a[contains(@class,"property_title")]//text()'
	    XPATH_HOTEL_FEATURES = './/a[contains(@class,"tag")]/text()'
	    XPATH_HOTEL_PRICE = './/div[contains(@class,"price")]/text()'
	    XPATH_VIEW_DEALS = './/div[contains(@id,"VIEW_ALL_DEALS")]//span[@class="taLnk"]/text()'
	    XPATH_BOOKING_PROVIDER = './/div[contains(@class,"providerLogo")]/img/@alt'

	    # Digits with optional thousands separators.  Unlike the previous
	    # pattern this also matches single-digit counts ("5 reviews") and counts
	    # with more than one comma ("1,234,567 reviews").
	    review_count_pattern = re.compile(r'\d+(?:,\d+)*')
	    non_digit_pattern = re.compile(r'\D+')

	    hotel_data = []
	    for hotel in hotel_lists:
	        raw_booking_provider = hotel.xpath(XPATH_BOOKING_PROVIDER)
	        raw_no_of_deals = hotel.xpath(XPATH_VIEW_DEALS)
	        raw_hotel_link = hotel.xpath(XPATH_HOTEL_LINK)
	        raw_no_of_reviews = hotel.xpath(XPATH_REVIEWS)
	        raw_rating = hotel.xpath(XPATH_RATING)
	        raw_hotel_name = hotel.xpath(XPATH_HOTEL_NAME)
	        raw_hotel_features = hotel.xpath(XPATH_HOTEL_FEATURES)
	        raw_hotel_price_per_night = hotel.xpath(XPATH_HOTEL_PRICE)

	        url = 'http://www.tripadvisor.com' + raw_hotel_link[0] if raw_hotel_link else None

	        # A review label without any digits yields None rather than an IndexError.
	        review_match = review_count_pattern.search(raw_no_of_reviews[0]) if raw_no_of_reviews else None
	        reviews = review_match.group(0).replace(',', '') if review_match else None
	        rating = ''.join(raw_rating).replace(' of 5 bubbles', '') if raw_rating else None
	        name = ''.join(raw_hotel_name).strip() if raw_hotel_name else None
	        hotel_features = ','.join(raw_hotel_features)
	        print("name is ", name)
	        price_per_night = ''.join(raw_hotel_price_per_night).replace('\n', '') if raw_hotel_price_per_night else None
	        # Keep only the digits of the "view all N deals" link text.
	        no_of_deals = non_digit_pattern.sub('', ''.join(raw_no_of_deals)) if raw_no_of_deals else None
	        booking_provider = ''.join(raw_booking_provider).strip() if raw_booking_provider else None

	        data = {
	            'hotel_name': name,
	            'url': url,
	            'locality': locality,
	            'reviews': reviews,
	            'tripadvisor_rating': rating,
	            'checkOut': checkOut,
	            'checkIn': checkIn,
	            'hotel_features': hotel_features,
	            'price_per_night': price_per_night,
	            'no_of_deals': no_of_deals,
	            'booking_provider': booking_provider
	        }
	        hotel_data.append(data)
	    return hotel_data


	if __name__ == '__main__':
	    # Command-line entry point: validate the stay dates, scrape the hotel
	    # listing for the locality and write the records to tripadvisor_data.csv.
	    parser = argparse.ArgumentParser()
	    parser.add_argument('checkin_date', help='Hotel Check In Date (Format: YYYY/MM/DD')
	    parser.add_argument('checkout_date', help='Hotel Chek Out Date (Format: YYYY/MM/DD)')
	    sortorder_help = """
	    available sort orders are :\n
	    priceLow - hotels with lowest price,
	    distLow : Hotels located near to the search center,
	    recommended: highest rated hotels based on traveler reviews,
	    popularity :Most popular hotels as chosen by Tipadvisor users
	    """
	    # argparse ignores ``default`` on a required positional; nargs='?' makes
	    # the sort order genuinely optional so the default can apply.  Passing all
	    # four arguments keeps working exactly as before.
	    parser.add_argument('sort', nargs='?', help=sortorder_help, default='popularity')
	    parser.add_argument('locality', help='Search Locality')
	    args = parser.parse_args()
	    locality = args.locality
	    sort = args.sort
	    checkin_date = datetime.strptime(args.checkin_date, "%Y/%m/%d")
	    checkout_date = datetime.strptime(args.checkout_date, "%Y/%m/%d")
	    today = datetime.now()

	    # Reject dates that have already passed.
	    if checkin_date <= today or checkout_date <= today:
	        print("Invalid Checkin date: Please enter a valid checkin and checkout dates,entered date is already passed")

	    # Check-in must be strictly before check-out; identical dates used to
	    # fall through every branch and exit without any message.
	    elif checkin_date >= checkout_date:
	        print("Invalid Checkin date: CheckIn date must be less than checkOut date")

	    else:
	        data = parse(locality, checkin_date, checkout_date, sort)

	        print("Writing to output file tripadvisor_data.csv")
	        # ``csv`` is unicodecsv here, which is why the file is opened in binary mode.
	        with open('tripadvisor_data.csv', 'wb') as csvfile:
	            fieldnames = ['hotel_name', 'url', 'locality', 'reviews', 'tripadvisor_rating', 'checkIn', 'checkOut',
	                          'price_per_night', 'booking_provider', 'no_of_deals', 'hotel_features']
	            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
	            writer.writeheader()
	            for row in data:
	                writer.writerow(row)
	from lxml import html
	import requests
	from collections import OrderedDict
	import pprint
	import json
	import argparse

	def parse(url):
		"""Scrape the details of a single TripAdvisor hotel page.

		Downloads ``url``, parses the HTML and extracts the hotel's name,
		rank, address, rating, review count, highlights, descriptions, the
		per-category rating breakdown and the amenities.

		Args:
			url: Address of the hotel page to fetch.

		Returns:
			A dict with the keys ``address`` (a dict holding
			``street_address``, ``locality``, ``zipcode`` and ``country``),
			``ratings`` and ``amenities`` (OrderedDicts in page order),
			``official_description``, ``additional_info``, ``rating``,
			``review_count``, ``name``, ``rank`` and ``highlights``.  Scalar
			values that are not found on the page are ``None``
			(``highlights`` is an empty string instead).
		"""
		# Parenthesised form so the module is valid on both Python 2 and 3.
		print("Fetching "+url)
		response = requests.get(url).text
		parser = html.fromstring(response)

		XPATH_RATING = '//div[@id="ratingFilter"]//ul//li'
		XPATH_NAME = '//h1[@id="HEADING"]//text()'
		XPATH_HOTEL_RATING = '//span[@property="ratingValue"]//@content'
		XPATH_REVIEWS = '//a[@property="reviewCount"]/@content'
		XPATH_RANK = '//div[contains(@class,"popRanking")]//text()'
		XPATH_STREET_ADDRESS = "//span[@class='street-address']//text()"
		XPATH_LOCALITY = '//span[@class="format_address"]//span[@class="locality"]//span[@property="v:locality"]//text()'
		XPATH_ZIP = '//span[@property="v:postal-code"]//text()'
		XPATH_COUNTRY = '//span[@class="country-name"]/@content'
		XPATH_AMENITIES = '//div[@id="AMENITIES_TAB"]//div[contains(@class,"amenity_row")]'
		XPATH_HIGHLIGHTS = '//div[@class="property_tags_wrap"]//li//text()'
		XPATH_OFFICIAL_DESCRIPTION = '//div[contains(@class,"additional_info")]//span[contains(@class,"tabs_descriptive_text")]//text()'
		XPATH_ADDITIONAL_INFO = '//div[@class="additional_info_amenities"]//div[@class="content"]//text()'
		# Relative expressions evaluated against each rating / amenity row.
		XPATH_RATING_KEY = './/div[@class="row_label"]//text()'
		XPATH_RATING_VALUE = './/span[@class="row_bar"]/following-sibling::span//text()'
		XPATH_AMENITY_KEY = './/div[@class="amenity_hdr"]//text()'
		XPATH_AMENITY_VALUE = './/div[@class="amenity_lst"]//li/text()'

		ratings = parser.xpath(XPATH_RATING)
		raw_name = parser.xpath(XPATH_NAME)
		raw_rank = parser.xpath(XPATH_RANK)
		raw_street_address = parser.xpath(XPATH_STREET_ADDRESS)
		raw_locality = parser.xpath(XPATH_LOCALITY)
		raw_zipcode = parser.xpath(XPATH_ZIP)
		raw_country = parser.xpath(XPATH_COUNTRY)
		raw_review_count = parser.xpath(XPATH_REVIEWS)
		raw_rating = parser.xpath(XPATH_HOTEL_RATING)
		amenities = parser.xpath(XPATH_AMENITIES)
		raw_highlights = parser.xpath(XPATH_HIGHLIGHTS)
		raw_official_description = parser.xpath(XPATH_OFFICIAL_DESCRIPTION)
		raw_additional_info = parser.xpath(XPATH_ADDITIONAL_INFO)

		name = ''.join(raw_name).strip() if raw_name else None
		rank = ''.join(raw_rank).strip() if raw_rank else None
		street_address = raw_street_address[0].strip() if raw_street_address else None
		locality = raw_locality[0].strip() if raw_locality else None
		zipcode = ''.join(raw_zipcode).strip() if raw_zipcode else None
		country = raw_country[0].strip() if raw_country else None
		review_count = ''.join(raw_review_count).strip() if raw_review_count else None
		hotel_rating = ''.join(raw_rating).strip() if raw_rating else None
		# split()/join collapses every run of whitespace into a single space.
		official_description = ' '.join(' '.join(raw_official_description).split()) if raw_official_description else None
		additional_info = ' '.join(''.join(raw_additional_info).split()) if raw_additional_info else None
		# Drop text nodes that consist of a lone newline before joining.
		cleaned_highlights = [text for text in raw_highlights if text != '\n']
		highlights = ','.join(cleaned_highlights).replace('\n','')

		# OrderedDict preserves the order in which the site lists the entries.
		ratings_dict = OrderedDict()
		for rating in ratings:
			cleaned_rating_key = ''.join(rating.xpath(XPATH_RATING_KEY)).replace('\n','')
			cleaned_rating_value = ''.join(rating.xpath(XPATH_RATING_VALUE)).replace('\n','')
			ratings_dict[cleaned_rating_key] = cleaned_rating_value

		amenity_dict = OrderedDict()
		for amenity in amenities:
			raw_amenity_key = amenity.xpath(XPATH_AMENITY_KEY)
			# Ignore list items whose text is a single space.
			cleaned_amenity_value = [text for text in amenity.xpath(XPATH_AMENITY_VALUE) if text != ' ']

			amenity_key = ''.join(raw_amenity_key).replace('\n','')
			amenity_value = ' ,'.join(cleaned_amenity_value).replace('\n','')
			amenity_dict[amenity_key] = amenity_value

		address = {
			'street_address':street_address,
			'locality':locality,
			'zipcode':zipcode,
			'country':country
		}

		data = {
			'address':address,
			'ratings':ratings_dict,
			'amenities':amenity_dict,
			'official_description':official_description,
			'additional_info':additional_info,
			'rating':hotel_rating,
			'review_count':review_count,
			'name':name,
			'rank':rank,
			'highlights':highlights
		}

		return data

	if __name__=='__main__':
		# Command-line entry point: scrape the hotel page given as the single
		# positional argument and dump the result as indented JSON.
		parser = argparse.ArgumentParser()
		parser.add_argument('url',help='Tripadvisor hotel url')
		args = parser.parse_args()
		url = args.url
		scraped_data = parse(url)
		with open('tripadvisor_hotel_scraped_data.json','w') as f:
			json.dump(scraped_data,f,indent=4)