@ZenulAbidin
Last active May 28, 2020
Rudimentary Instagram scraper that dumps the links of images into a text file and saves the images in their own folder. pip install beautifulsoup4 selenium requests
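To run it, pass the target profile's username as the only argument. A minimal invocation, assuming the script is saved as instagram_scraper.py (the filename is illustrative) and chromedriver is installed and on your PATH:

    python instagram_scraper.py some_username

Links end up in some_username-links.txt and the downloaded images in the some_username/ folder.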
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time
import argparse
import requests
import os
print("Instagram Scraper Copyright (c) 2020 Ali Sherief")
parser = argparse.ArgumentParser(description='Scrapes an intsagram page for image links and alt text')
parser.add_argument('username', metavar='username', type=str,
help='username of the profile to scrape')
args = parser.parse_args()
username=args.username
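# NOTE: webdriver.Chrome() assumes chromedriver is installed and on your
# PATH; it opens a visible Chrome window that drives the scrape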
browser = webdriver.Chrome()
browser.get('https://www.instagram.com/'+username+'/?hl=en')
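# Instagram lazily loads posts as the page scrolls, growing
# document.body.scrollHeight with each batch. The loop below keeps
# scrolling until two consecutive reads return the same height,
# i.e. the end of the profile has been reached.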
PagelengthOld = 0
Pagelength = browser.execute_script("window.scrollTo(0, document.body.scrollHeight); return document.body.scrollHeight;")
l = []
# Extract links from user profile page
# This script skips videos because it's dumb and doesn't
# know how to extract them yet.
# Remove the sleep timers at your own risk. They ensure your
# IP doesn't get blocked, and that Instagram has enough time
# to update a scrolled page.
while PagelengthOld != Pagelength:
    source = browser.page_source
    data = bs(source, 'html.parser')
    body = data.find('body')
    img = body.find_all('img', style='object-fit: cover;')
    for i in img:
        try:
            l.append(i['src'])
        except KeyError:
            pass
    time.sleep(2)
    PagelengthOld = Pagelength
    Pagelength = browser.execute_script("window.scrollTo(0, document.body.scrollHeight); return document.body.scrollHeight;")
    time.sleep(2)
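# One last pass over the fully scrolled page to pick up any images
# loaded after the final scroll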
source = browser.page_source
data = bs(source, 'html.parser')
body = data.find('body')
img = body.find_all('img', style='object-fit: cover;')
for i in img:
    try:
        l.append(i['src'])
    except KeyError:
        pass
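# Deduplicate the collected URLs while preserving first-seen order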
links = []
for ll in l:
    if ll not in links:
        links.append(ll)
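# Dump one URL per line so the list is easy to inspect or re-use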
with open(username + '-links.txt', 'w') as f:
    f.write('\n'.join(links) + '\n')
try:
    os.mkdir(username)
except FileExistsError:
    pass
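# Download each image in order; the imgN.jpg numbering matches the
# line order in the links file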
i = 1
for link in links:
    r = requests.get(link)
    if r.ok:
        with open(os.path.join(username, 'img' + str(i) + '.jpg'), 'wb') as f:
            f.write(r.content)
        print('Image ' + str(i) + ' downloaded')
    i += 1
    time.sleep(2)