-
-
Save scrapehero/cc8647960afeb2c15a6061d7ab7e594e to your computer and use it in GitHub Desktop.
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# Written as part of https://www.scrapehero.com/how-to-scrape-amazon-product-reviews-using-python/ | |
from lxml import html | |
import json | |
import requests | |
import json,re | |
from dateutil import parser as dateparser | |
from time import sleep | |
def ParseReviews(asin):
    """Scrape the Amazon.com product page for *asin*.

    Returns a dict with the product ``name``, ``price``, ``url``, the
    star-rating histogram (``ratings``) and the reviews shown on the
    detail page (``reviews``).  If the page cannot be parsed after five
    attempts (e.g. Amazon served a captcha or an empty page), returns
    ``{"error": "failed to process the page", "asin": asin}``.
    """
    # Amazon intermittently serves blocked/blank pages; retry a few times.
    for _attempt in range(5):
        try:
            # This script has only been tested with Amazon.com
            amazon_url = 'http://www.amazon.com/dp/' + asin
            # Add some recent user agent to prevent amazon from blocking the request
            # Find some chrome user agent strings here https://udger.com/resources/ua-list/browser-detail?browser=Chrome
            headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
            page = requests.get(amazon_url, headers=headers)
            parser = html.fromstring(page.text)

            XPATH_REVIEW_SECTION_1 = '//div[contains(@id,"reviews-summary")]'
            XPATH_REVIEW_SECTION_2 = '//div[@data-hook="review"]'
            XPATH_AGGREGATE_RATING = '//table[@id="histogramTable"]//tr'
            XPATH_PRODUCT_NAME = '//h1//span[@id="productTitle"]//text()'
            XPATH_PRODUCT_PRICE = '//span[@id="priceblock_ourprice"]/text()'

            raw_product_price = parser.xpath(XPATH_PRODUCT_PRICE)
            product_price = ''.join(raw_product_price).replace(',', '')
            raw_product_name = parser.xpath(XPATH_PRODUCT_NAME)
            product_name = ''.join(raw_product_name).strip()
            total_ratings = parser.xpath(XPATH_AGGREGATE_RATING)

            # Amazon has served (at least) two review markups -- try both.
            reviews = parser.xpath(XPATH_REVIEW_SECTION_1)
            if not reviews:
                reviews = parser.xpath(XPATH_REVIEW_SECTION_2)
            if not reviews:
                # Treated as a bad page: caught below and retried.
                raise ValueError('unable to find reviews in page')

            # Grab the star-rating histogram from the product page.
            ratings_dict = {}
            for ratings in total_ratings:
                extracted_rating = ratings.xpath('./td//a//text()')
                if extracted_rating:
                    rating_key = extracted_rating[0]
                    rating_value = extracted_rating[1]
                    if rating_key:
                        ratings_dict[rating_key] = rating_value

            # Parse the individual reviews.
            reviews_list = []
            for review in reviews:
                XPATH_RATING = './/i[@data-hook="review-star-rating"]//text()'
                XPATH_REVIEW_HEADER = './/a[@data-hook="review-title"]//text()'
                XPATH_REVIEW_POSTED_DATE = './/a[contains(@href,"/profile/")]/parent::span/following-sibling::span/text()'
                XPATH_REVIEW_TEXT_1 = './/div[@data-hook="review-collapsed"]//text()'
                XPATH_REVIEW_TEXT_2 = './/div//span[@data-action="columnbalancing-showfullreview"]/@data-columnbalancing-showfullreview'
                XPATH_REVIEW_COMMENTS = './/span[@data-hook="review-comment"]//text()'
                XPATH_AUTHOR = './/a[contains(@href,"/profile/")]/parent::span//text()'
                XPATH_REVIEW_TEXT_3 = './/div[contains(@id,"dpReviews")]/div/text()'

                raw_review_author = review.xpath(XPATH_AUTHOR)
                raw_review_rating = review.xpath(XPATH_RATING)
                raw_review_header = review.xpath(XPATH_REVIEW_HEADER)
                raw_review_posted_date = review.xpath(XPATH_REVIEW_POSTED_DATE)
                raw_review_text1 = review.xpath(XPATH_REVIEW_TEXT_1)
                raw_review_text2 = review.xpath(XPATH_REVIEW_TEXT_2)
                raw_review_text3 = review.xpath(XPATH_REVIEW_TEXT_3)

                author = ' '.join(' '.join(raw_review_author).split())
                # Drop the "By" prefix.  The original used str.strip('By'),
                # which strips *characters*, mangling names that start or
                # end with 'B'/'y' (e.g. "Sandy" -> "Sand").
                if author.startswith('By'):
                    author = author[2:].strip()

                # Cleaning data.
                review_rating = ''.join(raw_review_rating).replace('out of 5 stars', '')
                review_header = ' '.join(' '.join(raw_review_header).split())
                review_posted_date = dateparser.parse(''.join(raw_review_posted_date)).strftime('%d %b %Y')
                review_text = ' '.join(' '.join(raw_review_text1).split())

                # Long reviews are truncated in the page; the hidden
                # remainder ships as JSON inside a data attribute.
                if raw_review_text2:
                    json_loaded_review_data = json.loads(raw_review_text2[0])
                    hidden_review_text = json_loaded_review_data['rest']
                    # Strip embedded HTML tags before appending.
                    full_review_text = review_text + re.sub('<.*?>', '', hidden_review_text)
                else:
                    full_review_text = review_text
                if not raw_review_text1:
                    # Older review markup fallback.
                    full_review_text = ' '.join(' '.join(raw_review_text3).split())

                raw_review_comments = review.xpath(XPATH_REVIEW_COMMENTS)
                # Keep only the non-alphabetic part (e.g. the count digits).
                review_comments = re.sub('[A-Za-z]', '', ''.join(raw_review_comments)).strip()

                reviews_list.append({
                    'review_comment_count': review_comments,
                    'review_text': full_review_text,
                    'review_posted_date': review_posted_date,
                    'review_header': review_header,
                    'review_rating': review_rating,
                    'review_author': author,
                })

            return {
                'ratings': ratings_dict,
                'reviews': reviews_list,
                'url': amazon_url,
                'price': product_price,
                'name': product_name,
            }
        except ValueError:
            # Bad/blocked page: fall through and retry.  (The original
            # returned the error dict here, so it never actually retried
            # despite the loop.)
            print("Retrying to get the correct response")
    # All attempts exhausted.
    return {"error": "failed to process the page", "asin": asin}
def ReadAsin():
    """Scrape each ASIN in ``AsinList`` and dump the results to data.json."""
    # Add your own ASINs here.
    AsinList = ['B01ETPUQ6E', 'B017HW9DEW']
    extracted_data = []
    for asin in AsinList:
        print("Downloading and processing page http://www.amazon.com/dp/" + asin)
        extracted_data.append(ParseReviews(asin))
        # Throttle requests to reduce the chance of being blocked.
        sleep(5)
    # 'with' guarantees the file is flushed and closed (the original
    # opened it and never closed it).
    with open('data.json', 'w') as f:
        json.dump(extracted_data, f, indent=4)

if __name__ == '__main__':
    ReadAsin()
Vscrape.com provides Amazon scraping tools that avoid IP blocks and bans. Using these tools, anyone can easily scrape millions of records.
Below is Few Tools we provide
1.Amazon Scraping and Reprice tools
2.Amazon competitor products monitor tools
3.FBA scraping tools
4.Buybox Scraping tools
5.Amazon title modifications alert tools
6.Amazon to Ebay Price comparisons
7.Amazon to Ebay automatic scraping and listing tools and maintain price and stocks
8.Aliexpress to Ebay Automatic listing tools and maintain price and stocks
9.Walmart,Bhphotovideo,best buy and many other website to Ebay listing tools and maintain price and stocks
10.Ebay scraping tools and Tracking tools
11.ASIN track tools
12.Ebay Listing tools
13.Scrape million of data from any website. etc.....
Based on your needs, I can develop or modify these tools.
Contact us for a demo.
#1 Web Scraping Software - Vscrape.com | Free Developer Support
Hey, could you explain what the @data-hook attribute (for example on line 26) is meant for?
Please suggest the approach to follow for extraction of all the reviews.
Hi! Thanks for sharing the code! Could you please help, what should I do with that error?
InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
InsecureRequestWarning)
(of course I'll dive into the url provided and try to follow the instructions, but should the error appear by default?)
Thanks again
I'm getting only following output after executing this code. Can you suggest how to make it work?
Downloading and processing page http://www.amazon.com/dp/B01ETPUQ6E
Retrying to get the correct response
Retrying to get the correct response
Retrying to get the correct response
Retrying to get the correct response
Retrying to get the correct response
Downloading and processing page http://www.amazon.com/dp/B017HW9DEW
Retrying to get the correct response
Retrying to get the correct response
Retrying to get the correct response
Retrying to get the correct response
Retrying to get the correct response
I am getting an error:
line 4, in
from lxml import html
ImportError: No module named lxml
Any ideas?
The output is the following if we run the above code.
Downloading and processing page http://www.amazon.com/dp/B01ETPUQ6E
Retrying to get the correct response
Retrying to get the correct response
Retrying to get the correct response
Retrying to get the correct response
Retrying to get the correct response
Downloading and processing page http://www.amazon.com/dp/B017HW9DEW
Retrying to get the correct response
Retrying to get the correct response
Retrying to get the correct response
Retrying to get the correct response
Retrying to get the correct response
It created a .json file which said that the connection to the page was unsuccessful. Please do rectify this.
nice way of scraping amazon reviews. thanks for your tips!!!
before it as a non-tech user, I am using amazon scraping tool https://e-scraper.com/useful-articles/a-hassle-free-method-to-scrape-amazon-reviews/. maybe it helps to somebody too.
Thanks — this code works, but only for a limited number of reviews. How can I get all of the reviews?