Created October 13, 2017 19:02
This code scrapes all pins from a Pinterest profile URL that lists a set of boards. It is admittedly a bit clunky. The only way I could find to effectively scrape all pins was to open a web browser for each board, pull in the URLs for all the pins, then process each pin URL one at a time. Processing time is about 2 seconds per pin. This functions as of 13 October 2017.
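Because the script opens a Chrome window for every board, it is worth confirming that Selenium can find ChromeDriver before kicking off a long scrape. A quick check you can run first (this snippet is not part of the gist itself; it just opens and closes a window):

from selenium import webdriver

driver = webdriver.Chrome()          # fails here if chromedriver is not on your PATH
driver.get('https://www.pinterest.com')
driver.quit()
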
import urllib.request
from bs4 import BeautifulSoup
import time
import datetime
import csv
import json
from selenium import webdriver

# This script runs on Selenium with Chrome. Follow the instructions here to install the
# webdriver: http://selenium-python.readthedocs.io/installation.html#drivers
# You probably have to change your PATH.

page_url = 'https://www.pinterest.com/cambriainns/'

def request_until_succeed(url):
    """Fetch a URL, retrying every 5 seconds until it returns HTTP 200."""
    req = urllib.request.Request(url)
    success = False
    while success is False:
        try:
            response = urllib.request.urlopen(req)
            if response.getcode() == 200:
                success = True
        except Exception as e:
            print(e)
            print("Error for URL {}: {}".format(url, datetime.datetime.now()))
            print("Retrying in 5 seconds.")
            time.sleep(5)
    return response.read().decode(response.headers.get_content_charset())

def board_list(page_url):
    """Return the full URLs of every board linked from the profile page."""
    page_source = request_until_succeed(page_url)
    page = BeautifulSoup(page_source, 'html.parser')
    boards = page.find_all('a', 'boardLinkWrapper')
    base_url = 'https://www.pinterest.com'
    board_url_list = []
    for board in boards:
        board_url_list.append(base_url + board['href'])
    return board_url_list

def process_pin(pin, board):
    """Flatten one pin's JSON into a tuple matching the CSV header row."""
    try:
        pin_id = str(pin['id'])
        link = 'https://www.pinterest.com/pin/' + pin['id']
        saves = pin['aggregated_pin_data']['aggregated_stats']['saves']
        done = pin['aggregated_pin_data']['aggregated_stats']['done']
        likes = pin['like_count']
        comments = pin['comment_count']
        repins = pin['repin_count']
        is_repin = pin['is_repin']
        is_video = pin['is_video']
        description_html = pin['description_html']
        board_id = pin['board']['id']
        board_name = pin['board']['name']
        board_url = board
        pinner = pin['pinner']['username']
        pinner_id = pin['pinner']['id']
        keywords = pin['url_keywords']
        pin_type = pin['type']
        attribution = pin['attribution']
        description = pin['description']
        # not all pins have rich metadata
        try:
            amp_valid = pin['rich_metadata']['amp_valid']
            site_name = pin['rich_metadata']['site_name']
            rich_description = pin['rich_metadata']['description']
            link_status = pin['rich_metadata']['link_status']
            title = pin['rich_metadata']['title']
            locale = pin['rich_metadata']['locale']
            url = pin['rich_metadata']['url']
        except (KeyError, TypeError):
            amp_valid = ''
            site_name = ''
            rich_description = ''
            link_status = ''
            title = ''
            locale = ''
            url = ''
        url_keywords = pin['url_keywords']
        created_at = pin['created_at']
        try:
            annotation = pin['pin_join']['visual_annotation']
        except (KeyError, TypeError):
            annotation = ''
    # in case of a weird layout, emit a placeholder row instead of crashing
    except Exception:
        pin_id = 'not able to parse pin data'
        link = ''
        saves = ''
        done = ''
        likes = ''
        comments = ''
        repins = ''
        is_repin = ''
        is_video = ''
        description_html = ''
        board_id = ''
        board_name = ''
        board_url = board
        pinner = ''
        pinner_id = ''
        keywords = ''
        pin_type = ''
        attribution = ''
        description = ''
        amp_valid = ''
        site_name = ''
        rich_description = ''
        link_status = ''
        title = ''
        locale = ''
        url = ''
        url_keywords = ''
        created_at = ''
        annotation = ''
    return (pin_id, link, saves, done, likes, comments, repins, is_repin, is_video, description_html,
            board_id, board_name, board_url, pinner, pinner_id, keywords, pin_type,
            attribution, description, amp_valid, site_name, rich_description,
            link_status, title, locale, url, url_keywords, created_at, annotation)

def scrape_pinterest(page_url):
    """Scrape every pin from every board on the profile and write them to a CSV."""
    csv_name = page_url.replace('https://www.pinterest.com/', '').replace('/', '')
    with open('C:\\Users\\[user_name]\\Desktop\\{}_pinterest.csv'.format(csv_name),
              'w', newline='', encoding='utf-8') as file:
        w = csv.writer(file)
        w.writerow(["pin_id", "link", "saves", "done", "likes", "comments", "repins", "is_repin", "is_video", "description_html",
                    "board_id", "board_name", "board_url", "pinner", "pinner_id", "keywords", "pin_type",
                    "attribution", "description", "amp_valid", "site_name", "rich_description",
                    "link_status", "title", "locale", "url", "url_keywords", "created_at", "annotation"])
        num_processed = 0
        num_errors = 0
        scrape_starttime = datetime.datetime.now()
        # get list of board urls
        board_url_list = board_list(page_url)
        print("There are {} boards.".format(len(board_url_list)))
        for board in board_url_list:
            driver = None
            while True:
                try:
                    # get basic board data
                    page_source = request_until_succeed(board)
                    page = BeautifulSoup(page_source, 'html.parser')
                    script = page.find('script', id='jsInit1', type='application/json')
                    json_data = json.loads(script.contents[0])
                    num_pins = json_data['resourceDataCache'][0]['data']['pin_count']
                    board_name = json_data['resourceDataCache'][0]['data']['name']
                    print("There are {} pins in board '{}'".format(num_pins, board_name))
                    # open board in browser to get pin data
                    driver = webdriver.Chrome()
                    time.sleep(1)
                    driver.get(board)
                    time.sleep(3)
                    page_source = driver.page_source
                    page = BeautifulSoup(page_source, 'html.parser')
                    pin_data = page.find_all('div', 'GrowthUnauthPinImage')
                    # keep scrolling until all pins have lazy-loaded
                    while len(pin_data) < num_pins:
                        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
                        time.sleep(2)
                        page_source = driver.page_source
                        page = BeautifulSoup(page_source, 'html.parser')
                        pin_data = page.find_all('div', 'GrowthUnauthPinImage')
                    driver.quit()
                    # make list of pin urls
                    hrefs = []
                    for div in pin_data:
                        hrefs.append('https://www.pinterest.com' + div.find('a')['href'])
                    # get each pin and process it
                    for href in hrefs[:num_pins]:
                        while True:
                            try:
                                print('pin id: {}'.format(href.replace('https://www.pinterest.com/pin/', '').replace('/', '')))
                                page_source = request_until_succeed(href)
                                page = BeautifulSoup(page_source, 'html.parser')
                                script = page.find('script', id='jsInit1', type='application/json')
                                json_data = json.loads(script.contents[0])
                                # the line below fails a lot with 'list index out of range';
                                # the data that comes back then does not conform to the standard format.
                                # The error handler forces a retry, and it always works the second time.
                                pin = json_data['resourceDataCache'][0]['data']
                                w.writerow(process_pin(pin, board))
                                num_processed += 1
                            except (KeyboardInterrupt, SystemExit):
                                print("Program Stopped.")
                                raise
                            except Exception as e:
                                print(e)
                                print("Error processing pin in '{}'. Retrying.".format(board_name))
                                num_errors += 1
                                time.sleep(1)
                                continue
                            break
                except (KeyboardInterrupt, SystemExit):
                    print("Program Stopped.")
                    raise
                except Exception as e:
                    print(e)
                    print("Error processing board '{}'. Retrying.".format(board))
                    num_errors += 1
                    if driver is not None:
                        driver.quit()
                    time.sleep(1)
                    continue
                break
        print("\nDone!\n{} Pins Processed in {}".format(num_processed, datetime.datetime.now() - scrape_starttime))
        print("{} Errors.".format(num_errors))


scrape_pinterest(page_url)
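
When a run finishes, the output can be sanity-checked with the standard library. A short sketch, assuming the same hard-coded Desktop path used in scrape_pinterest (fill in the [user_name] placeholder and the profile name yourself):

import csv

with open('C:\\Users\\[user_name]\\Desktop\\cambriainns_pinterest.csv', newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        print(row['pin_id'], row['board_name'], row['repins'])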