Script to scrape images from a flickr account.
#!/usr/bin/env python
""" Script to scrape images from a flickr account.

Author: Ralph Bean <[email protected]>

Added ability to get specific image sizes
Added thread pooling for faster retrieval of large sets of images
Added flickr api request caching for faster recovery
Author: Nathan Moinvaziri <[email protected]>
"""

import ConfigParser
import urllib
import requests
import os
import sys
import hashlib
import codecs
import json
import collections

from concurrent.futures import ThreadPoolExecutor

config = ConfigParser.ConfigParser()
config.read(['flickr.ini', '/etc/flickr.ini'])


def read_json(path):
    with codecs.open(path, 'r', 'utf-8') as fin:
        data = json.load(fin, object_pairs_hook=collections.OrderedDict)
    return data


def write_json(path, data):
    with codecs.open(path, 'w', 'utf-8') as fout:
        fout.write(json.dumps(data, ensure_ascii=False, indent=4))


def flickr_request(**kwargs):
    flickr_api_key = config.get('general', 'flickr_api_key')
    flickr_url = config.get('general', 'flickr_url')
    cache_dir = config.get('general', 'cache_dir')
    url_params = dict(
        api_key=flickr_api_key,
        format='json',
        nojsoncallback=1,
        **kwargs)
    # Get sha1 to save response in cache
    sha_1 = hashlib.sha1()
    sha_1.update(flickr_url)
    sha_1.update(json.dumps(url_params))
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    cache_path = os.path.join(cache_dir, sha_1.hexdigest())
    if not os.path.exists(cache_path):
        response = requests.get(flickr_url, params=url_params)
        result = response.json()
        if result['stat'] != 'ok':
            print result['stat']
            sys.exit(1)
        write_json(cache_path, result)
    else:
        result = read_json(cache_path)
    return result


def get_flickr_photos_by_person(nsid, page=1):
    # https://secure.flickr.com/services/api/flickr.people.getPhotos.html
    print 'Getting list of photos - {0} (Page {1})'.format(nsid, page)
    return flickr_request(
        method='flickr.people.getPhotos',
        user_id=nsid,
        content_type=1,  # photos only
        page=page,
        per_page=500
    )


def get_flickr_photo_size(photo, size):
    print 'Getting photo size - {0} ({1})'.format(photo['id'], photo['title'])
    # https://secure.flickr.com/services/api/flickr.photos.getSizes.html
    d = flickr_request(
        method='flickr.photos.getSizes',
        photo_id=photo['id']
    )
    for s in d['sizes']['size']:
        if s['label'].lower() == size:
            return s
    return None


def get_photos_for_person(nsid):
    a = get_flickr_photos_by_person(nsid)['photos']
    # Step backwards through the remaining pages of pictures
    photos = a['photo']
    for page in range(a['pages'], 1, -1):
        d = get_flickr_photos_by_person(nsid, page=page)
        photos.extend(d['photos']['photo'])
    return photos


def download_flickr_photo(photo, size):
    # Get the correct size for the photo
    print 'Processing photo - {0} ({1})'.format(photo['id'], photo['title'])
    photo_size = get_flickr_photo_size(photo, size)
    if photo_size is None:
        return
    # Construct url and local output path
    url = photo_size['source']
    url_path, url_ext = os.path.splitext(url)
    output = config.get('general', 'output_dir')
    local = os.path.join(output, photo['title'] + url_ext)
    if not os.path.exists(local) or os.path.getsize(local) == 0:
        print '* Saving url {0}\n as {1}'.format(url, local)
        urllib.urlretrieve(url, local)


def main():
    # https://www.webpagefx.com/tools/idgettr/
    nsid = config.get('general', 'nsid')
    output = config.get('general', 'output_dir')
    if not os.path.exists(output):
        os.makedirs(output)
    output_size = config.get('general', 'output_size')
    workers = config.getint('general', 'max_workers')
    # First get all photos
    photos = get_photos_for_person(nsid)
    print 'Retrieved {0} photos for {1}'.format(len(photos), nsid)
    # Launch thread pool for quickest download
    with ThreadPoolExecutor(max_workers=workers) as executor:
        for photo in photos:
            executor.submit(download_flickr_photo, photo, output_size)


if __name__ == '__main__':
    main()
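The script as written targets Python 2 (ConfigParser, print statements, urllib.urlretrieve). A minimal, untested sketch of the substitutions a Python 3 port would need, assuming nothing else changes:

# Python 3 equivalents (a sketch of the standard-library renames)
from configparser import ConfigParser           # ConfigParser -> configparser
from urllib.request import urlretrieve          # urllib.urlretrieve -> urllib.request.urlretrieve
# print statements become print() calls, and hashlib requires bytes:
# sha_1.update(flickr_url.encode('utf-8'))
# sha_1.update(json.dumps(url_params).encode('utf-8'))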
flickr.ini:

[general]
flickr_url=https://api.flickr.com/services/rest/
flickr_api_key=put-your-api-key-here
output_dir=images
cache_dir=cache
output_size=original
nsid=put-nsid-for-user-account
max_workers=8
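The nsid for an account can be found with the idgettr tool referenced in main(), or looked up through the Flickr API itself via flickr.people.findByUsername. A minimal sketch, using the same API key as flickr.ini and a hypothetical lookup_nsid helper:

import requests

def lookup_nsid(username, api_key):
    # flickr.people.findByUsername resolves a screen name to its NSID
    response = requests.get('https://api.flickr.com/services/rest/', params=dict(
        method='flickr.people.findByUsername',
        api_key=api_key,
        username=username,
        format='json',
        nojsoncallback=1))
    return response.json()['user']['nsid']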