Rudimentary Instagram scraper that dumps the links of a profile's images into a text file and saves the images in their own folder. Dependencies: pip install beautifulsoup4 selenium urllib3 requests pandas numpy
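To run it, pass the profile's username as the only argument (the script filename below is an assumption; use whatever name you saved the gist under). Selenium's webdriver.Chrome() also needs a ChromeDriver build matching your installed Chrome on the PATH.

pip install beautifulsoup4 selenium urllib3 requests pandas numpy
python instagram_scraper.py some_username

This writes some_username-links.txt in the current directory and downloads the images into a some_username/ folder as img1.jpg, img2.jpg, and so on.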
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time
import re
from urllib.request import urlopen
import json
import pandas as pd, numpy as np
import argparse
import requests
import os
print("Instagram Scraper Copyright (c) 2020 Ali Sherief") | |
parser = argparse.ArgumentParser(description='Scrapes an intsagram page for image links and alt text') | |
parser.add_argument('username', metavar='username', type=str, | |
help='username of the profile to scrape') | |
args = parser.parse_args() | |
username=args.username | |
browser = webdriver.Chrome() | |
browser.get('https://www.instagram.com/'+username+'/?hl=en') | |
PagelengthOld = 0 | |
Pagelength = browser.execute_script("window.scrollTo(0, document.body.scrollHeight); return document.body.scrollHeight;") | |
l=[] | |
# Extract image links from the user's profile page.
# This script skips videos because it's dumb and doesn't
# know how to extract them yet.
# Remove the sleep timers at your own risk. They ensure your
# IP doesn't get blocked and that Instagram has enough time
# to update a scrolled page. (An alternative based on an
# explicit Selenium wait is sketched after the script.)
while PagelengthOld != Pagelength:
    source = browser.page_source
    data = bs(source, 'html.parser')
    body = data.find('body')
    img = body.find_all('img', style='object-fit: cover;')
    for i in img:
        try:
            l.append(i['src'])
        except KeyError:
            pass
    time.sleep(2)
    PagelengthOld = Pagelength
    Pagelength = browser.execute_script("window.scrollTo(0, document.body.scrollHeight); return document.body.scrollHeight;")
    time.sleep(2)

# One last pass over the fully scrolled page
source = browser.page_source
data = bs(source, 'html.parser')
body = data.find('body')
img = body.find_all('img', style='object-fit: cover;')
for i in img:
    try:
        l.append(i['src'])
    except KeyError:
        pass
# Deduplicate the links while preserving their order
links = []
for ll in l:
    if ll not in links:
        links.append(ll)

with open(username + '-links.txt', 'w') as f:
    f.write('\n'.join(links) + '\n')

try:
    os.mkdir(username)
except FileExistsError:
    pass

# Download each image into the <username> folder
i = 1
for link in links:
    r = requests.get(link)
    if r.ok:
        with open(os.path.join(username, 'img' + str(i) + '.jpg'), 'wb') as f:
            f.write(r.content)
        print('Image ' + str(i) + ' downloaded')
    i += 1
    time.sleep(2)
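The fixed time.sleep(2) calls work, but they always pause for two seconds even when Instagram has already loaded the next batch of posts. Below is an untested sketch of the same scroll step built on Selenium's explicit waits; the helper name scroll_and_wait is mine, not part of the script above, and it would replace each scrollTo execute_script call plus the surrounding sleeps inside the while loop.

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait

def scroll_and_wait(driver, timeout=10):
    # Scroll to the bottom, then wait until document.body.scrollHeight grows.
    # Returns the new height; if nothing loads before the timeout, the old
    # height comes back unchanged, which is exactly the loop's stop condition.
    old_height = driver.execute_script("return document.body.scrollHeight")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        WebDriverWait(driver, timeout).until(
            lambda d: d.execute_script("return document.body.scrollHeight") > old_height
        )
    except TimeoutException:
        pass
    return driver.execute_script("return document.body.scrollHeight")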