Amazon Page Scraper from PDP page
# Fetch the saved PDP HTML from S3 with boto
import boto
from boto.s3.connection import S3Connection

s3 = boto.connect_s3()
mybucket = s3.get_bucket("YOUR_BUCKET_NAME")
print(mybucket)

page_id = "/PAGE_NAME_OR_ID"
key_name = "prefix_or_subfolder" + page_id
uploaded_file = mybucket.get_key(key_name)
uploaded_file.get_contents_to_filename('downloaded-file.html')
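# --- Optional alternative (not in the original gist): the same download done with
# --- boto3, the current AWS SDK. Bucket and key names are the placeholders above.
# import boto3
# boto3.client("s3").download_file("YOUR_BUCKET_NAME",
#                                  "prefix_or_subfolder/PAGE_NAME_OR_ID",
#                                  "downloaded-file.html")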
# Parse the downloaded HTML with BeautifulSoup
import urllib2
from BeautifulSoup import BeautifulSoup
import codecs

f = codecs.open("downloaded-file.html", 'r')
page = f.read()
#print page
#page = urllib2.urlopen("file://downloaded-file.html").read()
#print page
soup = BeautifulSoup(page)
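# --- Optional alternative (not in the original gist): BeautifulSoup 4 replaces the
# --- unmaintained BeautifulSoup 3 import used above; the find()/findAll() calls
# --- below behave the same on a bs4 soup.
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(page, "html.parser")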
# The ASIN lives in the value attribute of a hidden <input id="ASIN">, not in its text
try:
    ASIN = soup.find("input", id="ASIN")
    print(ASIN["value"])
except:
    print("ASIN not found")
try:
    title = soup.find("span", id="productTitle")
    print(title.text)
except:
    print("Title not found")
try:
    price_div = soup.find("div", id="_price")
    price = price_div.find("span")
    print(price.text)
except:
    print("Price not found")
try:
    reviews = soup.find("span", id="acrCustomerReviewText")
    print(reviews.text)
except:
    print("Reviews not found")
try:
    customerReviewDiv = soup.find("span", id="acrPopover")
    print(customerReviewDiv["title"])
except:
    print("CustomerReviewDiv not found")
#
# these can have multiple definitions
#
try:
    availability = soup.find("div", id="availability")
    print(availability.text)
except:
    print("availability not found")
try:
    brand_name = soup.find("a", id="bylineInfo")
    print(brand_name.text)
except:
    print("Brand not found")
#soup.select_one('#name > #numbers').next_sibling
try:
    product_desc_div = soup.find("div", id="productDescription")
    product_desc = product_desc_div.find("p")
    print(product_desc.text)
except:
    print("product_desc not found")
try:
    # BeautifulSoup 3 has no CSS select(); nested find() calls cover
    # the original "div.kindleAuthorRank > div.nodeRank" selector
    author_rank_div = soup.find("div", {"class": "kindleAuthorRank"})
    bookrank = author_rank_div.find("div", {"class": "nodeRank"})
    print(bookrank.text)
except:
    print("bookrank not found")
try:
    # wayfinding-breadcrumbs_feature_div
    breadcrumbs = soup.find("div", id="wayfinding-breadcrumbs_container")
    breadcrumbs_striped = breadcrumbs.text.replace("&amp;", "&")
    breadcrumb_final = breadcrumbs_striped.replace("?", ">")
    print(breadcrumb_final)
except:
    print("No breadcrumb found")
print "=========\n" | |
# Product details table: th holds the spec name, td holds the spec value
try:
    data = []
    table = soup.find('table', id="productDetails_detailBullets_sections1")
    rows = table.findAll('tr')
    for row in rows:
        colHeader = row.findAll('th')[0].string
        cols = row.findAll('td')
        cols = [ele.text.strip() for ele in cols]
        data.append(colHeader)
        data.append([ele for ele in cols if ele])
    #print(data)
    for row in data:
        print("".join(row))  # the joined value was previously computed but never printed
except:
    print("Table 1 not found")
#detail-bullets
try:
    table = []
    table_div = soup.find('table', id="productDetailsTable")
    rows = table_div.findAll('tr')  # was table.findAll(), but table is the empty result list
    for row in rows:
        colHeader = row.findAll('th')[0].string
        cols = row.findAll('td')
        cols = [ele.text.strip() for ele in cols]
        table.append(colHeader)
        table.append([ele for ele in cols if ele])
    print(table)
except:
    print("Table 2 not found")
try:
    table2 = []
    table_div_bullets = soup.find('div', id="detail-bullets")
    #print(table_div_bullets)
    table_div_bullets_internal = table_div_bullets.find('ul')
    #print(table_div_bullets_internal)
    rows = table_div_bullets_internal.findAll('li')
    #print(rows)
    for row in rows:
        print(row.text.replace("&amp;", "&").replace("&gt;", ">"))
except:
    print("Table 3 not found")