Rudimentary Instagram scraper that dumps the links of a profile's images into a text file and saves the images in their own folder. Dependencies: pip install beautifulsoup4 selenium urllib3 requests pandas numpy
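To run it, pass the profile's username as the only argument (the script filename below is an assumption; use whatever name you saved the gist under). Selenium's webdriver.Chrome() also needs a ChromeDriver build matching your installed Chrome on the PATH.

pip install beautifulsoup4 selenium urllib3 requests pandas numpy
python instagram_scraper.py some_username

This writes some_username-links.txt in the current directory and downloads the images into a some_username/ folder as img1.jpg, img2.jpg, and so on.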
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time
import re
from urllib.request import urlopen
import json
import pandas as pd, numpy as np
import argparse
import requests
import os
print("Instagram Scraper Copyright (c) 2020 Ali Sherief") | |
parser = argparse.ArgumentParser(description='Scrapes an intsagram page for image links and alt text') | |
parser.add_argument('username', metavar='username', type=str, | |
help='username of the profile to scrape') | |
args = parser.parse_args() | |
username=args.username | |
browser = webdriver.Chrome() | |
browser.get('https://www.instagram.com/'+username+'/?hl=en') | |
PagelengthOld = 0 | |
Pagelength = browser.execute_script("window.scrollTo(0, document.body.scrollHeight); return document.body.scrollHeight;") | |
l=[] | |
# Extract image links from the user's profile page.
# This script skips videos because it's dumb and doesn't
# know how to extract them yet.
# Remove the sleep timers at your own risk. They ensure your
# IP doesn't get blocked and that Instagram has enough time
# to update a scrolled page. (An alternative based on an
# explicit Selenium wait is sketched after the script.)
while PagelengthOld != Pagelength:
    source = browser.page_source
    data = bs(source, 'html.parser')
    body = data.find('body')
    img = body.find_all('img', style='object-fit: cover;')
    for i in img:
        try:
            l.append(i['src'])
        except KeyError:
            pass
    time.sleep(2)
    PagelengthOld = Pagelength
    Pagelength = browser.execute_script("window.scrollTo(0, document.body.scrollHeight); return document.body.scrollHeight;")
    time.sleep(2)

# One last pass over the fully scrolled page
source = browser.page_source
data = bs(source, 'html.parser')
body = data.find('body')
img = body.find_all('img', style='object-fit: cover;')
for i in img:
    try:
        l.append(i['src'])
    except KeyError:
        pass
# Deduplicate the links while preserving their order
links = []
for ll in l:
    if ll not in links:
        links.append(ll)

with open(username + '-links.txt', 'w') as f:
    f.write('\n'.join(links) + '\n')

try:
    os.mkdir(username)
except FileExistsError:
    pass

# Download each image into the <username> folder
i = 1
for link in links:
    r = requests.get(link)
    if r.ok:
        with open(os.path.join(username, 'img' + str(i) + '.jpg'), 'wb') as f:
            f.write(r.content)
        print('Image ' + str(i) + ' downloaded')
    i += 1
    time.sleep(2)
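The fixed time.sleep(2) calls work, but they always pause for two seconds even when Instagram has already loaded the next batch of posts. Below is an untested sketch of the same scroll step built on Selenium's explicit waits; the helper name scroll_and_wait is mine, not part of the script above, and it would replace each scrollTo execute_script call plus the surrounding sleeps inside the while loop.

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait

def scroll_and_wait(driver, timeout=10):
    # Scroll to the bottom, then wait until document.body.scrollHeight grows.
    # Returns the new height; if nothing loads before the timeout, the old
    # height comes back unchanged, which is exactly the loop's stop condition.
    old_height = driver.execute_script("return document.body.scrollHeight")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        WebDriverWait(driver, timeout).until(
            lambda d: d.execute_script("return document.body.scrollHeight") > old_height
        )
    except TimeoutException:
        pass
    return driver.execute_script("return document.body.scrollHeight")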