Skip to content

Instantly share code, notes, and snippets.

@aditikhandalkar
Last active January 11, 2018 09:27
Show Gist options
  • Save aditikhandalkar/1b15f213ed16e702eee905acbefb21a5 to your computer and use it in GitHub Desktop.
Save aditikhandalkar/1b15f213ed16e702eee905acbefb21a5 to your computer and use it in GitHub Desktop.
Amazon product detail page (PDP) scraper
import boto
from boto.s3.connection import S3Connection
# Download the saved product page from S3 into a local HTML file.
# connect_s3() is called without explicit credentials here.
s3 = boto.connect_s3()
mybucket = s3.get_bucket("YOUR_BUCKET_NAME")
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(mybucket)

# The S3 key is the prefix/subfolder followed by "/<page name or id>".
page_id = "/PAGE_NAME_OR_ID"
key_name = "prefix_or_subfolder" + page_id

uploaded_file = mybucket.get_key(key_name)
if uploaded_file is None:
    # get_key() returns None for a missing key; fail with a clear message
    # instead of an AttributeError on the next line.
    raise IOError("S3 key not found: %s" % key_name)
uploaded_file.get_contents_to_filename('downloaded-file.html')
import urllib2
from BeautifulSoup import BeautifulSoup
import codecs
# Read the downloaded page from disk and parse it into `soup`, which the
# extraction steps below query. The `with` block guarantees the file handle
# is closed even if reading fails.
with codecs.open("downloaded-file.html", 'r') as f:
    page = f.read()
soup = BeautifulSoup(page)
# Print the product ASIN taken from the <input id="ASIN"> element.
# An <input> has no child text, so its `.text` is empty; the value is read
# from the `value` attribute instead.
try:
    ASIN = soup.find("input", id="ASIN")
    print(ASIN["value"])
except (TypeError, KeyError):
    # TypeError: find() returned None; KeyError: the tag has no `value`.
    print("ASIN not found")
# Print the product title, price and review-count text.
# Each lookup is independent: a missing element only produces its own
# "not found" message. find() returns None when nothing matches, so the
# failure mode handled here is AttributeError (a bare `except:` would also
# swallow KeyboardInterrupt and unrelated bugs).
try:
    title = soup.find("span", id="productTitle")
    print(title.text)
except AttributeError:
    print("Title not found")

try:
    # The price text is taken from the first <span> inside <div id="_price">.
    price_div = soup.find("div", id="_price")
    price = price_div.find("span")
    print(price.text)
except AttributeError:
    print("Price not found")

try:
    reviews = soup.find("span", id="acrCustomerReviewText")
    print(reviews.text)
except AttributeError:
    print("reviews not found")
# Print the `title` attribute of <span id="acrPopover">.
try:
    customerReviewDiv = soup.find("span", id="acrPopover")
    print(customerReviewDiv["title"])
except (TypeError, KeyError):
    # TypeError: find() returned None; KeyError: the span has no `title`.
    print("CustomerReviewDiv not found")
#
# Original author's note: these can have multiple definitions.
# Only the first match returned by find() is printed for each.
#
try:
    availability = soup.find("div", id="availability")
    print(availability.text)
except AttributeError:
    # find() returned None: no element with that id on the page.
    print("availability not found")

try:
    brand_name = soup.find("a", id="bylineInfo")
    print(brand_name.text)
except AttributeError:
    print("Data not found")
#soup.select_one('#name > #numbers').next_sibling
# Print the first paragraph of the product description.
# The original repeated this exact try/except twice in a row, so the
# description (or the "not found" message) was printed twice; it now runs once.
try:
    product_desc_div = soup.find("div", id="productDescription")
    product_desc = product_desc_div.find("p")
    print(product_desc.text)
except AttributeError:
    # Either the description <div> or its <p> is missing.
    print("product_desc not found")
# Print the text of every <div class="nodeRank"> that is a direct child of a
# <div class="kindleAuthorRank">, i.e. the selector
# "div.kindleAuthorRank > div.nodeRank".
# The original used soup.select(), which the BeautifulSoup 3 import in this
# file does not provide, and then read `.text` from the result as if it were
# a single tag, so it always fell through to the error message. The same
# query is expressed here with findAll().
# NOTE(review): the attrs match below assumes the class attribute is exactly
# "kindleAuthorRank" / "nodeRank" -- confirm against real pages.
bookrank = []
for rank_container in soup.findAll("div", {"class": "kindleAuthorRank"}):
    # recursive=False restricts the search to direct children (the ">" in
    # the selector).
    bookrank.extend(
        rank_container.findAll("div", {"class": "nodeRank"}, recursive=False)
    )

if bookrank:
    for rank_node in bookrank:
        print(rank_node.text)
else:
    # The original printed "product_desc not found" here, a copy/paste
    # leftover from the previous section.
    print("bookrank not found")
# Print the breadcrumb trail from <div id="wayfinding-breadcrumbs_container">.
# The original had a bare line `wayfinding-breadcrumbs_feature_div` inside the
# try block. Python evaluates that as a subtraction of two undefined names,
# so it raised NameError and this section always printed "No breadcrumb found".
# NOTE(review): that name is presumably the id of a related element -- confirm.
try:
    breadcrumbs = soup.find("div", id="wayfinding-breadcrumbs_container")
    # NOTE(review): the original replaced "&" with "&" (a no-op), presumably
    # an HTML-entity literal lost in copy/paste; decoding "&amp;" -- confirm.
    breadcrumbs_striped = breadcrumbs.text.replace("&amp;", "&")
    # "?" is used as the separator in the extracted text; show it as ">".
    breadcrumb_final = breadcrumbs_striped.replace("?", ">")
    print(breadcrumb_final)
except AttributeError:
    # find() returned None: no breadcrumb container on the page.
    print("No breadcrumb found")
# Visual separator before the detail tables.
print("=========\n")
# Parse and print <table id="productDetails_detailBullets_sections1">.
# `data` alternates: header text, then the list of non-empty cell texts for
# that row. The original built `data` but never printed it: the print was
# commented out and the result of "".join(row) was discarded. That join also
# raised on list rows or None headers, which produced a misleading
# "Table 1 not found".
try:
    data = []
    table = soup.find('table', id="productDetails_detailBullets_sections1")
    for row in table.findAll('tr'):
        # .text (not .string) so a header with nested markup still yields text.
        colHeader = row.findAll('th')[0].text.strip()
        cols = [ele.text.strip() for ele in row.findAll('td')]
        data.append(colHeader)
        data.append([ele for ele in cols if ele])
except (AttributeError, IndexError):
    # AttributeError: the table is missing; IndexError: a row has no <th>.
    print("Table 1 not found")
else:
    # Print only after the whole table parsed, one "header: values" line per row.
    for header, values in zip(data[::2], data[1::2]):
        print("%s: %s" % (header, " ".join(values)))
#detail-bullets
# Parse and print <table id="productDetailsTable">.
# `table` is the output list, alternating header and list of non-empty cell
# texts; `table_div` is the parsed <table> element. The original called
# findAll() on the output list instead of on `table_div`, so it always raised
# and printed "Table 2 not found".
try:
    table = []
    table_div = soup.find('table', id="productDetailsTable")
    for row in table_div.findAll('tr'):
        colHeader = row.findAll('th')[0].string
        cols = [ele.text.strip() for ele in row.findAll('td')]
        table.append(colHeader)
        table.append([ele for ele in cols if ele])
    print(table)
except (AttributeError, IndexError):
    # AttributeError: the table is missing; IndexError: a row has no <th>.
    print("Table 2 not found")
# Print every <li> of the first <ul> inside <div id="detail-bullets">.
try:
    table_div_bullets = soup.find('div', id="detail-bullets")
    table_div_bullets_internal = table_div_bullets.find('ul')
    rows = table_div_bullets_internal.findAll('li')
    for row in rows:
        # NOTE(review): the original replaced "&" with "&" and ">" with ">"
        # (both no-ops), presumably HTML-entity literals lost in copy/paste;
        # decoding "&amp;" and "&gt;" -- confirm.
        print(row.text.replace("&amp;", "&").replace("&gt;", ">"))
except AttributeError:
    # Either the detail-bullets <div> or its <ul> is missing.
    print("Table 3 not found")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment