Customizable concurrent URL downloader. Works with Python 2.7 and 3.x, using only the standard library. Can be run as a command-line script or used as a library: simply pass a list of URLs to download_files. For the rest, read the docstring or --help on the command line.
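For library use, a minimal sketch (the module name downloader and the example URLs are placeholders, not part of the gist):

from downloader import download_files  # assumes the file below is saved as downloader.py

# Mix plain URLs (filename derived from the URL path) with (url, filename) tuples.
results = download_files(
    ['https://example.com/archive.zip',
     ('https://example.com/report', 'report.pdf')],
    dest='.', concurrency=4, verbose=True)
# results is a list of (filename, absolute_path) tuples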
#!/usr/bin/python
import os
import sys
import csv
import time
import shutil
from multiprocessing.dummy import Pool as ThreadPool
try:
    # Iterable moved to collections.abc in Python 3.3 and the old alias
    # was removed in 3.10, so try the new location first.
    from collections.abc import Iterable
except ImportError:
    from collections import Iterable
try:
    # Python 2 import locations.
    from urllib2 import urlopen
    from urlparse import urlparse
    from urllib import unquote
except ImportError:
    # Python 3 import locations.
    from urllib.parse import unquote, urlparse
    from urllib.request import urlopen

try:
    basestring
except NameError:
    # Python 3 has no basestring; fall back to str for the isinstance checks.
    basestring = str

def _download_files(tups, dest, concurrency, chunk_size, verbose, _timeout):
    def worker(url, filename=None):
        if filename is None:
            # Derive the filename from the last path segment of the URL.
            filename = os.path.basename(urlparse(unquote(url)).path)
            if not filename:
                raise ValueError("Could not extract filename from URL")
        req = urlopen(url)
        path = os.path.join(dest, filename)
        with open(path, 'wb') as f:
            shutil.copyfileobj(req, f, chunk_size)
        if verbose:
            sys.stdout.write('[+] %s -> %s\n' % (url, filename))
        return filename, os.path.abspath(path)

    pool = ThreadPool(concurrency)
    result = pool.map_async(lambda args: worker(*args), tups)
    # map_async plus polling keeps the main thread responsive to
    # KeyboardInterrupt; a blocking map() would swallow it.
    while not result.ready():
        time.sleep(_timeout)
    return result.get()

def download_files(xs, dest='.', concurrency=10, chunk_size=128*1024, verbose=False, _timeout=.1):
    """
    Spawn a pool of workers that will download the given URLs.

    `xs`: A list of strings, or a list of tuples in the format (url, [filename]).
        Strings must be the URLs to download. The tuple format additionally
        lets you specify the filename for the saved file; if it is None or
        missing, it is extracted from the URL.
    `dest`: The directory to save the downloaded files into; relative or absolute.
    `concurrency`: The number of threads to run in parallel.
    `chunk_size`: The maximum size of each request chunk, in bytes.
    `verbose`: If True, logs each completed download to stdout.
    `_timeout`: The interval, in seconds, between checks for completion.

    Returns a list of tuples in the format (filename, full_path_to_file).
    """
    if not isinstance(xs, list):
        raise ValueError('Input must be a list')
    tups = []
    for x in xs:
        if isinstance(x, basestring):
            tups.append((x, None))
        elif isinstance(x, Iterable):
            # Materialize so len() also works on iterables without a length.
            x = list(x)
            if len(x) not in [1, 2]:
                raise ValueError('Invalid tuple size: %s' % len(x))
            tups.append((x[0], x[1] if len(x) == 2 else None))
        else:
            # Reject entries that are neither strings nor iterables.
            raise ValueError('Invalid entry: %r' % (x,))
    return _download_files(tups, dest, concurrency, chunk_size, verbose, _timeout)

if __name__ == '__main__':
    import argparse

    def _valid_dir(maybe_dir):
        if not os.path.isdir(maybe_dir):
            raise argparse.ArgumentTypeError('%s is not a valid directory' % maybe_dir)
        if not os.access(maybe_dir, os.W_OK):
            raise argparse.ArgumentTypeError('Cannot write to %s' % maybe_dir)
        return maybe_dir

    parser = argparse.ArgumentParser()
    parser.add_argument('urls', type=argparse.FileType('r'),
                        help='File with a URL on each line, optionally followed by a comma and a target filename')
    parser.add_argument('dest', nargs='?', default=os.getcwd(), type=_valid_dir,
                        help='Destination directory to download the files into (defaults to the current directory)')
    parser.add_argument('-t', '--threads', type=int, default=10,
                        help='Number of parallel threads to run (default=%(default)s)')
    parser.add_argument('-c', '--chunk-size', type=int, default=128*1024,
                        help='Size of chunk to download at a time, in bytes (default=%(default)s)')
    args = parser.parse_args()
    download_files(list(csv.reader(args.urls)), args.dest, args.threads, args.chunk_size, verbose=True)
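
From the command line, usage looks like this (downloader.py, urls.txt and the URLs are placeholder names; the input format follows the help text of the urls argument above):

$ cat urls.txt
https://example.com/archive.zip
https://example.com/report,report.pdf

$ python downloader.py urls.txt ./downloads -t 5 -c 65536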