Created October 13, 2017 19:02
This code scrapes all pins from a Pinterest profile URL that lists a set of boards. It is admittedly a bit clunky. The only way I could find to effectively scrape all pins was to open a web browser for each board, pull in the URLs for all the pins, then process each pin URL one at a time. Processing time is about 2 seconds per pin. This functions as of 13 October 2017.
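Because the script opens a Chrome window for every board, it is worth confirming that Selenium can find ChromeDriver before kicking off a long scrape. A quick check you can run first (this snippet is not part of the gist itself; it just opens and closes a window):

from selenium import webdriver

driver = webdriver.Chrome()          # fails here if chromedriver is not on your PATH
driver.get('https://www.pinterest.com')
driver.quit()
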
import urllib.request
from bs4 import BeautifulSoup
import time
import datetime
import csv
import json
from selenium import webdriver

# This script runs on Selenium with Chrome. Follow the instructions here to install the
# webdriver: http://selenium-python.readthedocs.io/installation.html#drivers
# You probably have to change your PATH.

page_url = 'https://www.pinterest.com/cambriainns/'

def request_until_succeed(url):
    """Fetch a URL, retrying every 5 seconds until it returns HTTP 200."""
    req = urllib.request.Request(url)
    success = False
    while success is False:
        try:
            response = urllib.request.urlopen(req)
            if response.getcode() == 200:
                success = True
        except Exception as e:
            print(e)
            print("Error for URL {}: {}".format(url, datetime.datetime.now()))
            print("Retrying in 5 seconds.")
            time.sleep(5)
    return response.read().decode(response.headers.get_content_charset())

def board_list(page_url):
    """Return the full URLs of every board linked from the profile page."""
    page_source = request_until_succeed(page_url)
    page = BeautifulSoup(page_source, 'html.parser')
    boards = page.find_all('a', 'boardLinkWrapper')
    base_url = 'https://www.pinterest.com'
    board_url_list = []
    for board in boards:
        board_url_list.append(base_url + board['href'])
    return board_url_list

def process_pin(pin, board):
    """Flatten one pin's JSON into a tuple matching the CSV header row."""
    try:
        pin_id = str(pin['id'])
        link = 'https://www.pinterest.com/pin/' + pin['id']
        saves = pin['aggregated_pin_data']['aggregated_stats']['saves']
        done = pin['aggregated_pin_data']['aggregated_stats']['done']
        likes = pin['like_count']
        comments = pin['comment_count']
        repins = pin['repin_count']
        is_repin = pin['is_repin']
        is_video = pin['is_video']
        description_html = pin['description_html']
        board_id = pin['board']['id']
        board_name = pin['board']['name']
        board_url = board
        pinner = pin['pinner']['username']
        pinner_id = pin['pinner']['id']
        keywords = pin['url_keywords']
        pin_type = pin['type']
        attribution = pin['attribution']
        description = pin['description']
        # not all pins have rich metadata
        try:
            amp_valid = pin['rich_metadata']['amp_valid']
            site_name = pin['rich_metadata']['site_name']
            rich_description = pin['rich_metadata']['description']
            link_status = pin['rich_metadata']['link_status']
            title = pin['rich_metadata']['title']
            locale = pin['rich_metadata']['locale']
            url = pin['rich_metadata']['url']
        except (KeyError, TypeError):
            amp_valid = ''
            site_name = ''
            rich_description = ''
            link_status = ''
            title = ''
            locale = ''
            url = ''
        url_keywords = pin['url_keywords']
        created_at = pin['created_at']
        try:
            annotation = pin['pin_join']['visual_annotation']
        except (KeyError, TypeError):
            annotation = ''
    # in case of a weird layout, emit a placeholder row instead of crashing
    except Exception:
        pin_id = 'not able to parse pin data'
        link = ''
        saves = ''
        done = ''
        likes = ''
        comments = ''
        repins = ''
        is_repin = ''
        is_video = ''
        description_html = ''
        board_id = ''
        board_name = ''
        board_url = board
        pinner = ''
        pinner_id = ''
        keywords = ''
        pin_type = ''
        attribution = ''
        description = ''
        amp_valid = ''
        site_name = ''
        rich_description = ''
        link_status = ''
        title = ''
        locale = ''
        url = ''
        url_keywords = ''
        created_at = ''
        annotation = ''
    return (pin_id, link, saves, done, likes, comments, repins, is_repin, is_video, description_html,
            board_id, board_name, board_url, pinner, pinner_id, keywords, pin_type,
            attribution, description, amp_valid, site_name, rich_description,
            link_status, title, locale, url, url_keywords, created_at, annotation)

def scrape_pinterest(page_url):
    """Scrape every pin from every board on the profile and write them to a CSV."""
    csv_name = page_url.replace('https://www.pinterest.com/', '').replace('/', '')
    with open('C:\\Users\\[user_name]\\Desktop\\{}_pinterest.csv'.format(csv_name),
              'w', newline='', encoding='utf-8') as file:
        w = csv.writer(file)
        w.writerow(["pin_id", "link", "saves", "done", "likes", "comments", "repins", "is_repin", "is_video", "description_html",
                    "board_id", "board_name", "board_url", "pinner", "pinner_id", "keywords", "pin_type",
                    "attribution", "description", "amp_valid", "site_name", "rich_description",
                    "link_status", "title", "locale", "url", "url_keywords", "created_at", "annotation"])
        num_processed = 0
        num_errors = 0
        scrape_starttime = datetime.datetime.now()
        # get list of board urls
        board_url_list = board_list(page_url)
        print("There are {} boards.".format(len(board_url_list)))
        for board in board_url_list:
            driver = None
            while True:
                try:
                    # get basic board data
                    page_source = request_until_succeed(board)
                    page = BeautifulSoup(page_source, 'html.parser')
                    script = page.find('script', id='jsInit1', type='application/json')
                    json_data = json.loads(script.contents[0])
                    num_pins = json_data['resourceDataCache'][0]['data']['pin_count']
                    board_name = json_data['resourceDataCache'][0]['data']['name']
                    print("There are {} pins in board '{}'".format(num_pins, board_name))
                    # open board in browser to get pin data
                    driver = webdriver.Chrome()
                    time.sleep(1)
                    driver.get(board)
                    time.sleep(3)
                    page_source = driver.page_source
                    page = BeautifulSoup(page_source, 'html.parser')
                    pin_data = page.find_all('div', 'GrowthUnauthPinImage')
                    # keep scrolling until all pins have lazy-loaded
                    while len(pin_data) < num_pins:
                        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
                        time.sleep(2)
                        page_source = driver.page_source
                        page = BeautifulSoup(page_source, 'html.parser')
                        pin_data = page.find_all('div', 'GrowthUnauthPinImage')
                    driver.quit()
                    # make list of pin urls
                    hrefs = []
                    for div in pin_data:
                        hrefs.append('https://www.pinterest.com' + div.find('a')['href'])
                    # get each pin and process it
                    for href in hrefs[:num_pins]:
                        while True:
                            try:
                                print('pin id: {}'.format(href.replace('https://www.pinterest.com/pin/', '').replace('/', '')))
                                page_source = request_until_succeed(href)
                                page = BeautifulSoup(page_source, 'html.parser')
                                script = page.find('script', id='jsInit1', type='application/json')
                                json_data = json.loads(script.contents[0])
                                # the line below fails a lot with 'list index out of range';
                                # the data that comes back then does not conform to the standard format.
                                # The error handler forces a retry, and it always works the second time.
                                pin = json_data['resourceDataCache'][0]['data']
                                w.writerow(process_pin(pin, board))
                                num_processed += 1
                            except (KeyboardInterrupt, SystemExit):
                                print("Program Stopped.")
                                raise
                            except Exception as e:
                                print(e)
                                print("Error processing pin in '{}'. Retrying.".format(board_name))
                                num_errors += 1
                                time.sleep(1)
                                continue
                            break
                except (KeyboardInterrupt, SystemExit):
                    print("Program Stopped.")
                    raise
                except Exception as e:
                    print(e)
                    print("Error processing board '{}'. Retrying.".format(board))
                    num_errors += 1
                    if driver is not None:
                        driver.quit()
                    time.sleep(1)
                    continue
                break
        print("\nDone!\n{} Pins Processed in {}".format(num_processed, datetime.datetime.now() - scrape_starttime))
        print("{} Errors.".format(num_errors))


scrape_pinterest(page_url)
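
When a run finishes, the output can be sanity-checked with the standard library. A short sketch, assuming the same hard-coded Desktop path used in scrape_pinterest (fill in the [user_name] placeholder and the profile name yourself):

import csv

with open('C:\\Users\\[user_name]\\Desktop\\cambriainns_pinterest.csv', newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        print(row['pin_id'], row['board_name'], row['repins'])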