Created
January 1, 2017 14:01
-
-
Save mkbouaziz/441182209b8e22721ebdfdf69baaa78d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*-coding:Utf-8 -* | |
import json | |
import os | |
import pickle | |
import sys | |
import time | |
import urllib | |
import requests | |
from io import open as iopen | |
import sys | |
import facebook | |
ID = 'ID_PAGE' | |
TOKEN = '' # access token - replace this with token value before running . | |
SAFE_CHARS = '-_() abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789' | |
def save(res, name='data'): | |
"""Save data to a file""" | |
with open('%s.lst' % name, 'w') as f: | |
pickle.dump(res, f) | |
def read(name='data'): | |
"""Read data from a file""" | |
with open('%s.lst' % name, 'r') as f: | |
res = pickle.load(f) | |
return res | |
def fetch(limit=100000, depth=10, last=None, id=ID, token=TOKEN): | |
"""Fetch the data using Facebook's Graph API""" | |
lst = [] | |
graph = facebook.GraphAPI(token) | |
url = '%s/photos?type=uploaded' % id | |
args = {'fields': ['images','name'], 'limit': limit} | |
res = graph.request('%s/photos?type=uploaded' % id, args) | |
# continue fetching till all photos are found | |
while bool(res.get('paging',{}).get('next')): | |
try: | |
url = res['paging']['next'] | |
response=json.loads(urllib.urlopen(url).read()) | |
res['data'].extend(response.get('data')) | |
res['paging']['next']=response.get('paging',{}).get('next',"") | |
#process(lst, res['data']) | |
except: | |
break | |
# setup toolbar | |
lenData=len(res['data']) | |
print('Reading %s images..' % lenData) | |
toolbar_width = 40 | |
step=lenData / toolbar_width | |
sys.stdout.write("[%s]" % (" " * toolbar_width)) | |
sys.stdout.flush() | |
sys.stdout.write("\b" * (toolbar_width+1)) # return to start of line, after '[' | |
for j,i in enumerate(res['data']): | |
dataPhoto=graph.request(i.get('id')) | |
i['source']=dataPhoto.get('images')[0].get('source') | |
i['name']=dataPhoto.get('name') | |
if not bool(j % step): | |
sys.stdout.write("-") | |
sys.stdout.flush() | |
sys.stdout.write("]\n") | |
process(lst, res['data']) | |
save(url, 'last_url') | |
return lst | |
def process(res, dat): | |
"""Extract required data from a row""" | |
err = [] | |
for d in dat: | |
if not d.get('source'): | |
err.append(d) | |
continue | |
src = d['source'] | |
if d.get('name'): | |
name = ''.join(c for c in d['name'][:99] if c in SAFE_CHARS) + src[-4:] | |
else: | |
name = src[src.rfind('/')+1:] | |
res.append({'name': name, 'src': src}) | |
if err: | |
print '%d errors.' % len(err) | |
print err | |
print '%d photos found.' % len(dat) | |
def download(res): | |
"""Download the list of files""" | |
start = time.clock() | |
if not os.path.isdir(ID): | |
os.mkdir(ID) | |
os.chdir(ID) | |
""" Using an counter to download 100 divisible photos & wait for some time """ | |
delayCounter =1 | |
for i,p in enumerate(res): | |
if delayCounter % 100 ==0 : | |
#print "x - Downloaded %s picture " %delayCounter | |
time.sleep(3) | |
response = requests.get(p.get('src')) | |
ext=response.headers.get('Content-Type').split('/')[-1] | |
file_name='%s.%s' %(i,ext) | |
if response.status_code == requests.codes.ok: | |
with iopen(file_name, 'wb') as file: | |
file.write(response.content) | |
#print delayCounter | |
delayCounter = delayCounter + 1 | |
print "Downloaded %s pictures in %.3f sec." % (len(res), time.clock()-start) | |
if __name__ == '__main__': | |
# download 500 photos, fetch details about 100 at a time | |
lst = fetch(limit=100000, depth=5000) | |
save(lst, 'photos') | |
#lst=read('photos') | |
download(lst) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment