

@sweetmoniker
Created October 13, 2017 19:02
This code will scrape all pins from a Pinterest profile URL that lists boards. It is admittedly a bit clunky: the only way I could find to effectively scrape all pins was to open a web browser for each board, pull in the URLs for all of its pins, then process each pin URL one at a time. Processing time is about 2 seconds per pin. This functions as of 13 October 2017.
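The script requires the beautifulsoup4 and selenium packages (pip install beautifulsoup4 selenium) in addition to the Chrome webdriver described in the comment at the top of the code.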
import urllib.request
from bs4 import BeautifulSoup
import time
import datetime
import csv
import json
from selenium import webdriver
###This script runs on selenium with Chrome. Follow the instructions here to install the webdriver: http://selenium-python.readthedocs.io/installation.html#drivers You probably have to change your PATH. Google it.###
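###Note: the next line is an illustration, not part of the original gist. If changing PATH is a hassle, Selenium 3 also accepts an explicit driver path (the path shown is a placeholder):###
#driver = webdriver.Chrome(executable_path='C:\\path\\to\\chromedriver.exe')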
page_url = 'https://www.pinterest.com/cambriainns/'
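#fetch a url with urllib, retrying every 5 seconds until a 200 response comes back, then return the decoded page source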
def request_until_succeed(url):
    req = urllib.request.Request(url)
    success = False
    while success is False:
        try:
            response = urllib.request.urlopen(req)
            if response.getcode() == 200:
                success = True
        except Exception as e:
            print(e)
            print("Error for URL {}: {}".format(url, datetime.datetime.now()))
            print("Retrying in 5 seconds.")
            time.sleep(5)
    return response.read().decode(response.headers.get_content_charset())
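#parse the profile page and return absolute urls for every board link ('a.boardLinkWrapper') found on it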
def board_list(page_url):
    page_source = request_until_succeed(page_url)
    page = BeautifulSoup(page_source, 'html.parser')
    boards = page.find_all('a', 'boardLinkWrapper')
    board_url_list = []
    base_url = 'https://www.pinterest.com'
    for i in range(len(boards)):
        board_url_list.append(base_url + boards[i]['href'])
    return board_url_list
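#flatten one pin's JSON into the tuple written as a CSV row; field order matches the header row in scrape_pinterest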
def process_pin(pin, board):
    try:
        pin_id = str(pin['id'])
        link = 'https://www.pinterest.com/pin/' + pin['id']
        saves = pin['aggregated_pin_data']['aggregated_stats']['saves']
        done = pin['aggregated_pin_data']['aggregated_stats']['done']
        likes = pin['like_count']
        comments = pin['comment_count']
        repins = pin['repin_count']
        is_repin = pin['is_repin']
        is_video = pin['is_video']
        description_html = pin['description_html']
        board_id = pin['board']['id']
        board_name = pin['board']['name']
        board_url = board
        pinner = pin['pinner']['username']
        pinner_id = pin['pinner']['id']
        keywords = pin['url_keywords']
        pin_type = pin['type']
        attribution = pin['attribution']
        description = pin['description']
        #not all pins have rich metadata
        try:
            amp_valid = pin['rich_metadata']['amp_valid']
            site_name = pin['rich_metadata']['site_name']
            rich_description = pin['rich_metadata']['description']
            link_status = pin['rich_metadata']['link_status']
            title = pin['rich_metadata']['title']
            locale = pin['rich_metadata']['locale']
            url = pin['rich_metadata']['url']
        except:
            amp_valid = ''
            site_name = ''
            rich_description = ''
            link_status = ''
            title = ''
            locale = ''
            url = ''
        url_keywords = pin['url_keywords']
        created_at = pin['created_at']
        try:
            annotation = pin['pin_join']['visual_annotation']
        except:
            annotation = ''
    #in case weird layout
    except:
        pin_id = 'not able to parse pin data'
        link = ''
        saves = ''
        done = ''
        likes = ''
        comments = ''
        repins = ''
        is_repin = ''
        is_video = ''
        description_html = ''
        board_id = ''
        board_name = ''
        board_url = board
        pinner = ''
        pinner_id = ''
        keywords = ''
        pin_type = ''
        attribution = ''
        description = ''
        amp_valid = ''
        site_name = ''
        rich_description = ''
        link_status = ''
        title = ''
        locale = ''
        url = ''
        url_keywords = ''
        created_at = ''
        annotation = ''
    return (pin_id, link, saves, done, likes, comments, repins, is_repin, is_video, description_html,
            board_id, board_name, board_url, pinner, pinner_id, keywords, pin_type,
            attribution, description, amp_valid, site_name, rich_description,
            link_status, title, locale, url, url_keywords, created_at, annotation)
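#main routine: write a CSV header to the Desktop file, loop over every board on the profile, scroll each board open in Chrome to collect its pin urls, then fetch and parse each pin page into a CSV row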
def scrape_pinterest(page_url):
    with open('C:\\Users\\[user_name]\\Desktop\\{}_pinterest.csv'.format(page_url.replace('https://www.pinterest.com/', '').replace('/', '')), 'w', newline='', encoding='utf-8') as file:
        w = csv.writer(file)
        w.writerow(["pin_id", "link", "saves", "done", "likes", "comments", "repins", "is_repin", "is_video", "description_html",
                    "board_id", "board_name", "board_url", "pinner", "pinner_id", "keywords", "pin_type",
                    "attribution", "description", "amp_valid", "site_name", "rich_description",
                    "link_status", "title", "locale", "url", "url_keywords", "created_at", "annotation"])
        num_processed = 0
        num_errors = 0
        scrape_starttime = datetime.datetime.now()
        #get list of board urls
        board_url_list = board_list(page_url)
        print("There are {} boards.".format(len(board_url_list)))
        for board in board_url_list:
            while True:
                try:
                    #get basic board data
                    page_source = request_until_succeed(board)
                    page = BeautifulSoup(page_source, 'html.parser')
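                    #Pinterest embeds the page's data as JSON in a <script id="jsInit1"> tag; the board's pin count and name come from there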
                    script = page.find('script', id='jsInit1', type='application/json')
                    json_data = json.loads(script.contents[0])
                    num_pins = json_data['resourceDataCache'][0]['data']['pin_count']
                    board_name = json_data['resourceDataCache'][0]['data']['name']
                    print("There are {} pins in board '{}'".format(num_pins, board_name))
                    #open board in browser to get pin data
                    driver = webdriver.Chrome()
                    time.sleep(1)
                    driver.get(board)
                    time.sleep(3)
                    page_source = driver.page_source
                    page = BeautifulSoup(page_source, 'html.parser')
                    pin_data = page.find_all('div', 'GrowthUnauthPinImage')
                    #expand page if not all pins are present
                    while True:
                        if len(pin_data) < num_pins:
                            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
                            time.sleep(2)
                            page_source = driver.page_source
                            page = BeautifulSoup(page_source, 'html.parser')
                            pin_data = page.find_all('div', 'GrowthUnauthPinImage')
                        else:
                            break
                    driver.quit()
                    #make list of pin urls
                    hrefs = []
                    for i in range(len(pin_data)):
                        hrefs.append('https://www.pinterest.com' + pin_data[i].find('a')['href'])
                    #get pin and process
                    for href in range(num_pins):
                        while True:
                            try:
                                print('pin id: {}'.format(hrefs[href].replace('https://www.pinterest.com/pin/', '').replace('/', '')))
                                page_source = request_until_succeed(hrefs[href])
                                page = BeautifulSoup(page_source, 'html.parser')
                                script = page.find('script', id='jsInit1', type='application/json')
                                json_data = json.loads(script.contents[0])
                                #code fails on 'pin = json_data...' a lot with error 'list index out of range.'
                                #The data that comes out when this happens does not conform with the standard format.
                                #Error handler forces a retry, and it always works the second time.
                                pin = json_data['resourceDataCache'][0]['data']
                                w.writerow(process_pin(pin, board))
                                num_processed += 1
                            except Exception as e:
                                print(e)
                                print("Error processing pin in '{}'. Retrying.".format(board_name))
                                num_errors += 1
                                time.sleep(1)
                                continue
                            except (KeyboardInterrupt, SystemExit):
                                print("Program Stopped.")
                                raise
                            break
                except Exception as e:
                    print(e)
                    print("Error processing board '{}'. Retrying.".format(board_name))
                    num_errors += 1
                    driver.quit()
                    time.sleep(1)
                    continue
                except (KeyboardInterrupt, SystemExit):
                    print("Program Stopped.")
                    raise
                break
        print("\nDone!\n{} Pins Processed in {}".format(num_processed, datetime.datetime.now() - scrape_starttime))
        print("{} Errors.".format(num_errors))
    file.close()
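#run the scrape for the profile url defined at the top of the script (edit page_url and the output path placeholder above before running)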
scrape_pinterest(page_url)