This is slightly simplified and more efficient than what I previously posted. This Python code scrapes an Instagram profile as of 6 July 2017. Props go to Minimaxir for the first module and some syntax inspiration.
import urllib.request
import json
from bs4 import BeautifulSoup
import csv
import time
import datetime

page_url = "https://www.instagram.com/bastedmind/"

def request_until_succeed(url):
    """Fetch a URL, retrying every 5 seconds until a 200 response comes back."""
    req = urllib.request.Request(url)
    success = False
    while success is False:
        try:
            response = urllib.request.urlopen(req)
            if response.getcode() == 200:
                success = True
        except Exception as e:
            print(e)
            time.sleep(5)
            print("Error for URL {}: {}".format(url, datetime.datetime.now()))
            print("retrying.")
    return response.read().decode('utf-8')

def get_more_posts(page_url, end_cursor):
    """Fetch the next page of posts and return the parsed window._sharedData JSON."""
    url = page_url + '?max_id=' + end_cursor
    page_source = request_until_succeed(url)
    page = BeautifulSoup(page_source, 'html.parser')
    scripts = page.find_all('script')
    for script in scripts:
        if script.text[:18] == "window._sharedData":
            break
    # Strip the "window._sharedData = " prefix and the trailing semicolon.
    json_data = json.loads(script.contents[0][21:-1])
    return json_data

def process_post(json_data):
    """Flatten one post node into a tuple of fields for the CSV."""
    owner_id = json_data['owner']['id']
    post_id = json_data['id']
    display_src = json_data['display_src']
    media_preview = json_data['media_preview']
    comments_count = json_data['comments']['count']
    gating_info = json_data['gating_info']
    caption = json_data.get('caption', '')
    thumbnail_src = json_data['thumbnail_src']
    comments_disabled = json_data['comments_disabled']
    likes_count = json_data['likes']['count']
    is_video = json_data['is_video']
    dimensions = json_data['dimensions']
    typename = json_data['__typename']
    code = json_data['code']
    link = 'https://www.instagram.com/p/' + code
    video_views = json_data.get('video_views', '')
    # Convert the Unix timestamp to a datetime, then shift from UTC to EST.
    date = datetime.datetime.utcfromtimestamp(json_data['date'])
    date = date + datetime.timedelta(hours=-5)  # EST
    return (owner_id, post_id, display_src, media_preview, comments_count,
            gating_info, caption, thumbnail_src, comments_disabled, likes_count,
            is_video, dimensions, typename, link, video_views, date)

def scrape_instagram(page_url):
    """Scrape every post on a profile page and write the results to a CSV file."""
    scrape_starttime = datetime.datetime.now()
    page_source = request_until_succeed(page_url)
    page = BeautifulSoup(page_source, 'html.parser')
    scripts = page.find_all('script')
    for script in scripts:
        if script.text[:18] == "window._sharedData":
            break
    json_data = json.loads(script.contents[0][21:-1])
    has_next_page = True
    num_posts = json_data['entry_data']['ProfilePage'][0]['user']['media']['count']
    username = json_data['entry_data']['ProfilePage'][0]['user']['username']
    num_processed = 0
    print("Scraping {}'s {} Instagram Posts: {}\n".format(username, num_posts, scrape_starttime))
    # Output path is specific to the author's machine; change it to suit yours.
    with open('C:\\Users\\xhargrav\\Desktop\\{}_Instagram.csv'.format(username),
              'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["owner_id", "post_id", "display_src",
                         "media_preview", "comments_count", "gating_info", "caption",
                         "thumbnail_src", "comments_disabled", "likes_count", "is_video",
                         "dimensions", "typename", "link", "video_views", "date"])
        while has_next_page:
            media = json_data['entry_data']['ProfilePage'][0]['user']['media']
            for post_to_process in media['nodes']:
                writer.writerow(process_post(post_to_process))
                num_processed += 1
                if num_processed % 100 == 0:
                    print("{} Statuses Processed: {}".format(num_processed, datetime.datetime.now()))
            has_next_page = media['page_info']['has_next_page']
            if has_next_page:
                end_cursor = media['page_info']['end_cursor']
                json_data = get_more_posts(page_url, end_cursor)
    print("Done! {} posts processed in {}".format(num_processed, datetime.datetime.now() - scrape_starttime))