-
-
Save ketankr9/6e48c6c205907e6ae35ef789e7a03634 to your computer and use it in GitHub Desktop.
Script for scraping public instagram profile's timeline photos.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pylint: skip-file | |
import time | |
import re | |
import md5 | |
import requests | |
import json | |
INSTAGRAM_URL = "https://www.instagram.com" | |
HASHTAG_ENDPOINT = "/graphql/query/?query_hash={}&variables={}" | |
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36" | |
def get_first_page(username):
    """Fetch a public profile page and pull the first batch of timeline posts.

    Returns a (response, posts, end_cursor) tuple, where posts is the list
    of 'node' dicts parsed out of the embedded window._sharedData JSON.
    """
    response = requests.get(
        INSTAGRAM_URL + "/{}/".format(username),
        headers={"user-agent": USER_AGENT},
    )
    # The profile data is inlined into the HTML as a JS assignment.
    raw = response.text.split("window._sharedData = ")[1].split(";</script>")[0]
    shared_data = json.loads(raw)
    user = shared_data["entry_data"]["ProfilePage"][0]["graphql"]["user"]
    edges = user["edge_owner_to_timeline_media"]["edges"]
    cursor = get_end_cursor_from_html(response.text)
    return (response, [edge["node"] for edge in edges], cursor)
def get_csrf_token(cookies):
    """Return the CSRF token from a cookie jar, or None if it is absent."""
    token = cookies.get("csrftoken")
    return token
def get_query_id(html):
    """Download the ProfilePageContainer bundle and scrape the GraphQL query id."""
    script_match = re.search(r'/static(.*)ProfilePageContainer\.js/(.*).js', html)
    bundle = requests.get(INSTAGRAM_URL + script_match.group(0))
    # The query id sits next to this pagination helper inside the minified JS.
    hits = re.findall('e\\.profilePosts\\.byUserId\\.get\\(t\\)\\)\\?n\\.pagination:n},queryId:"([^"]*)"', bundle.text)
    return hits[0]
def get_user_id(html):
    """Extract the numeric user id from the page's 'logging_page_id' field.

    The field has the form "profilePage_<id>"; the id is the part after '_'.
    """
    page_id = re.search(r'logging_page_id":"([^"]*)"', html).group(1)
    return page_id.split("_")[1]
def get_rhx_gis(html):
    """Extract the rhx_gis signing salt embedded in the profile HTML."""
    match = re.search(r'rhx_gis":"([^"]*)"', html)
    return match.group(1)
def get_end_cursor_from_html(html):
    """Extract the pagination end_cursor embedded in the profile HTML."""
    match = re.search(r'end_cursor":"([^"]*)"', html)
    return match.group(1)
def get_end_cursor_from_json(json_obj):
    """Return the pagination cursor from a GraphQL timeline-media response."""
    media = json_obj["data"]["user"]["edge_owner_to_timeline_media"]
    return media["page_info"]["end_cursor"]
def get_params(id, end_cursor):
    """Build the JSON 'variables' payload for the GraphQL pagination query.

    Built with a format template (not json.dumps) so the byte layout —
    which is fed into the request signature — stays exactly as expected.
    """
    template = '{{"id":"{}","first":12,"after":"{}"}}'
    return template.format(id, end_cursor)
def get_ig_gis(rhx_gis, params):
    """Compute the x-instagram-gis request signature.

    The signature is the MD5 hex digest of "<rhx_gis>:<params>".
    Uses hashlib instead of the ``md5`` module, which is Python 2-only
    and was removed in Python 3; encoding to UTF-8 keeps the call valid
    on both interpreter lines.
    """
    import hashlib  # local import: keeps this fix self-contained
    payload = (rhx_gis + ":" + params).encode("utf-8")
    return hashlib.md5(payload).hexdigest()
def get_posts_from_json(json_obj):
    """Return the list of post 'node' dicts from a GraphQL media response."""
    media = json_obj["data"]["user"]["edge_owner_to_timeline_media"]
    return [edge["node"] for edge in media["edges"]]
def make_cookies(csrf_token):
    """Build the cookie dict for a paginated request (CSRF token only)."""
    return {"csrftoken": csrf_token}
def make_headers(ig_gis):
    """Build request headers: GIS signature, AJAX marker, and browser UA."""
    headers = {"x-instagram-gis": ig_gis}
    headers["x-requested-with"] = "XMLHttpRequest"
    headers["user-agent"] = USER_AGENT
    return headers
def get_next_page(csrf_token, ig_gis, query_id, params):
    """Fetch one page of timeline media from the GraphQL endpoint.

    Returns (posts, end_cursor); raises for a non-2xx HTTP status.
    """
    url = INSTAGRAM_URL + HASHTAG_ENDPOINT.format(query_id, params)
    response = requests.get(
        url,
        headers=make_headers(ig_gis),
        cookies=make_cookies(csrf_token),
    )
    response.raise_for_status()
    payload = response.json()
    return get_posts_from_json(payload), get_end_cursor_from_json(payload)
def scrape_username(username, sleep=3):
    """Yield timeline posts of a public profile, one 'node' dict at a time.

    Scrapes the first page from the profile HTML, then paginates through
    the GraphQL endpoint until end_cursor is exhausted, sleeping `sleep`
    seconds after each yielded post to avoid hammering the server.
    """
    response, posts, end_cursor = get_first_page(username)
    csrf_token = get_csrf_token(response.cookies)
    query_id = get_query_id(response.text)
    rhx_gis = get_rhx_gis(response.text)
    user_id = get_user_id(response.text)  # renamed: 'id' shadowed the builtin
    for post in posts:
        yield post
        time.sleep(sleep)
    # 'is not None' (identity) rather than '!= None': the idiomatic and
    # safer comparison against the None singleton.
    while end_cursor is not None:
        params = get_params(user_id, end_cursor)
        ig_gis = get_ig_gis(rhx_gis, params)
        posts, end_cursor = get_next_page(csrf_token, ig_gis, query_id, params)
        for post in posts:
            yield post
            time.sleep(sleep)
# main
if __name__ == "__main__":
    # print(...) with a single formatted string emits the same
    # "id display_url" line on both Python 2 and Python 3 (the bare
    # Python 2 `print a, b` statement is a syntax error on Python 3).
    # The __main__ guard removes the import-time side effect.
    for post in scrape_username("utsav_ketankr9"):
        print("{} {}".format(post['id'], post['display_url']))
        # do stuff
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment