Amazon Page Scraper from PDP page
# Fetch the saved PDP HTML from S3 with boto
import boto
from boto.s3.connection import S3Connection

s3 = boto.connect_s3()
mybucket = s3.get_bucket("YOUR_BUCKET_NAME")
print(mybucket)

page_id = "/PAGE_NAME_OR_ID"
key_name = "prefix_or_subfolder" + page_id
uploaded_file = mybucket.get_key(key_name)
uploaded_file.get_contents_to_filename('downloaded-file.html')
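# --- Optional alternative (not in the original gist): the same download done with
# --- boto3, the current AWS SDK. Bucket and key names are the placeholders above.
# import boto3
# boto3.client("s3").download_file("YOUR_BUCKET_NAME",
#                                  "prefix_or_subfolder/PAGE_NAME_OR_ID",
#                                  "downloaded-file.html")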
# Parse the downloaded HTML with BeautifulSoup
import urllib2
from BeautifulSoup import BeautifulSoup
import codecs

f = codecs.open("downloaded-file.html", 'r')
page = f.read()
#print page
#page = urllib2.urlopen("file://downloaded-file.html").read()
#print page
soup = BeautifulSoup(page)
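# --- Optional alternative (not in the original gist): BeautifulSoup 4 replaces the
# --- unmaintained BeautifulSoup 3 import used above; the find()/findAll() calls
# --- below behave the same on a bs4 soup.
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(page, "html.parser")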
# The ASIN lives in the value attribute of a hidden <input id="ASIN">, not in its text
try:
    ASIN = soup.find("input", id="ASIN")
    print(ASIN["value"])
except:
    print("ASIN not found")
try:
    title = soup.find("span", id="productTitle")
    print(title.text)
except:
    print("Title not found")
try:
    price_div = soup.find("div", id="_price")
    price = price_div.find("span")
    print(price.text)
except:
    print("Price not found")
try:
    reviews = soup.find("span", id="acrCustomerReviewText")
    print(reviews.text)
except:
    print("Reviews not found")
try:
    customerReviewDiv = soup.find("span", id="acrPopover")
    print(customerReviewDiv["title"])
except:
    print("CustomerReviewDiv not found")
#
# these can have multiple definitions
#
try:
    availability = soup.find("div", id="availability")
    print(availability.text)
except:
    print("availability not found")
try:
    brand_name = soup.find("a", id="bylineInfo")
    print(brand_name.text)
except:
    print("Brand not found")
#soup.select_one('#name > #numbers').next_sibling
try:
    product_desc_div = soup.find("div", id="productDescription")
    product_desc = product_desc_div.find("p")
    print(product_desc.text)
except:
    print("product_desc not found")
try:
    # BeautifulSoup 3 has no CSS select(); nested find() calls cover
    # the original "div.kindleAuthorRank > div.nodeRank" selector
    author_rank_div = soup.find("div", {"class": "kindleAuthorRank"})
    bookrank = author_rank_div.find("div", {"class": "nodeRank"})
    print(bookrank.text)
except:
    print("bookrank not found")
try:
    # wayfinding-breadcrumbs_feature_div
    breadcrumbs = soup.find("div", id="wayfinding-breadcrumbs_container")
    breadcrumbs_striped = breadcrumbs.text.replace("&amp;", "&")
    breadcrumb_final = breadcrumbs_striped.replace("?", ">")
    print(breadcrumb_final)
except:
    print("No breadcrumb found")
print "=========\n" | |
# Product details table: th holds the spec name, td holds the spec value
try:
    data = []
    table = soup.find('table', id="productDetails_detailBullets_sections1")
    rows = table.findAll('tr')
    for row in rows:
        colHeader = row.findAll('th')[0].string
        cols = row.findAll('td')
        cols = [ele.text.strip() for ele in cols]
        data.append(colHeader)
        data.append([ele for ele in cols if ele])
    #print(data)
    for row in data:
        print("".join(row))  # the joined value was previously computed but never printed
except:
    print("Table 1 not found")
#detail-bullets
try:
    table = []
    table_div = soup.find('table', id="productDetailsTable")
    rows = table_div.findAll('tr')  # was table.findAll(), but table is the empty result list
    for row in rows:
        colHeader = row.findAll('th')[0].string
        cols = row.findAll('td')
        cols = [ele.text.strip() for ele in cols]
        table.append(colHeader)
        table.append([ele for ele in cols if ele])
    print(table)
except:
    print("Table 2 not found")
try:
    table2 = []
    table_div_bullets = soup.find('div', id="detail-bullets")
    #print(table_div_bullets)
    table_div_bullets_internal = table_div_bullets.find('ul')
    #print(table_div_bullets_internal)
    rows = table_div_bullets_internal.findAll('li')
    #print(rows)
    for row in rows:
        print(row.text.replace("&amp;", "&").replace("&gt;", ">"))
except:
    print("Table 3 not found")