-
-
Save ajeden/b17b407984d5ca78dab713f430608a52 to your computer and use it in GitHub Desktop.
Script to download The dilbert comic strips
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Simple script to download the Dilbert comic strips in a defined period of time.
If no arguments are passed to the script, it will download all the Dilbert comic
strips into the current folder (it may take a while).
Acknowledgments
---------------
This script is strongly based on the work from:
https://community.spiceworks.com/scripts/show/982-download-all-dilbert-comics
2019-05-06: by default, only the missing dates are downloaded.
""" | |
from __future__ import print_function | |
import datetime | |
import os | |
import re | |
import sys | |
import time | |
import argparse | |
from dateutil import rrule, parser | |
from os import walk | |
import glob | |
# for backwards compatibility | |
if sys.version_info[0] > 2: | |
import urllib.request as ul | |
else: | |
import urllib as ul | |
def main():
    """Parse the command line, enter the dump folder and download the strips.

    The dump folder is created when it does not exist yet.  If it cannot be
    created, a warning is printed to stderr and the current folder is used
    instead, so the download still runs.
    """
    args = parse_input_arguments()

    # If a dump folder has been defined, create it (if it does not already
    # exist) and move to it.
    if args.output != '.' and not os.path.isdir(args.output):
        try:
            os.makedirs(args.output)
        except OSError as err:
            # The folder may have been created by someone else between the
            # check and the call; only fall back when it really is missing.
            if not os.path.isdir(args.output):
                print('cannot create folder %s (%s); using the current folder'
                      % (args.output, err), file=sys.stderr)
                args.output = '.'

    os.chdir(args.output)
    download_strips(args.start_date, args.end_date)
def get_last_file_date():
    """Return the first date that still has to be downloaded.

    Looks in the current folder for files named ``YYYY-MM-DD.jpg`` and
    returns the day after the most recent one, formatted as ``YYYY-MM-DD``.
    Files whose 10-character name is not a valid date are ignored.  When no
    dated file is found, the date of the first published strip
    (``1989-04-17``) is returned so that everything gets downloaded.
    """
    first_strip = '1989-04-17'
    latest = None
    for path in glob.glob('./??????????.jpg'):
        # Drop the './' prefix and the '.jpg' suffix, keeping the date part.
        stem = os.path.basename(path)[:10]
        try:
            found = datetime.datetime.strptime(stem, '%Y-%m-%d')
        except ValueError:
            # A 10-character name that is not a date (e.g. 'screenshot.jpg').
            continue
        if latest is None or found > latest:
            latest = found

    if latest is None:
        return first_strip
    next_day = latest + datetime.timedelta(days=1)
    return next_day.strftime('%Y-%m-%d')
def parse_input_arguments():
    """Parse the command-line options of the script.

    Returns:
        The argparse namespace with three attributes:
        ``start_date`` -- a datetime parsed from ``--start``; when the option
        is omitted, the value of get_last_file_date() is used.
        ``end_date`` -- a datetime parsed from ``--end``, or today's date
        (a ``datetime.date``) when the option is omitted.
        ``output`` -- the dump folder, ``'.'`` by default.
    """
    argp = argparse.ArgumentParser(description='Dilbert strips download script augmented.')
    # The default start date is resolved lazily below, so the folder scan is
    # skipped when --start is given and cannot break --help.
    argp.add_argument("-s", "--start",
                      dest="start_date",
                      help="start date (1989-04-17, 1st published strip, defaults to last file found in current directory).",
                      default=None)
    argp.add_argument("-e", "--end",
                      dest="end_date",
                      help="End date (default, today)",
                      default=None)
    argp.add_argument("-o", "--output",
                      dest="output",
                      help="Comics dump folder",
                      default='.')
    args = argp.parse_args()

    if args.end_date is None:
        args.end_date = datetime.datetime.now().date()
    else:
        args.end_date = parser.parse(args.end_date)

    if args.start_date is None:
        # Scans the current directory (main() has not changed folder yet).
        args.start_date = get_last_file_date()
    args.start_date = parser.parse(args.start_date)
    return args
def download_strips(start_date, end_date):
    """Download the strip of every day from start_date until end_date.

    For each day the strip page is resolved to the real image URL, and the
    image is saved in the current folder as ``YYYY-MM-DD.jpg``.
    """
    every_day = rrule.rrule(rrule.DAILY, dtstart=start_date, until=end_date)
    for day in every_day:
        stamp = '{:04d}-{:02d}-{:02d}'.format(day.year, day.month, day.day)
        page_url = 'http://dilbert.com/strip/' + stamp
        target_file = stamp + '.jpg'
        print('getting comic from', stamp)
        image_url = get_true_comic_url(page_url)
        ul.urlretrieve(image_url, target_file)
def get_true_comic_url(comic_url, comic_name='comic'):
    """
    Get the true comic strip url from http://dilbert.com/strip/<date>

    It looks like Scott Adams has protected himself against pointy haired
    pirates by hiding his comic strips within the assets.amuniversal domain.

    This function digs into the comic strip web-page, finds (and returns)
    the URL where the original image lives.

    Args:
        comic_url: address of the strip page to inspect.
        comic_name: unused; kept so existing callers keep working.

    Returns:
        The first ``http://assets.amuniversal.com/...`` URL found in the page.

    Raises:
        ValueError: if the page contains no such URL.
    """
    page = ul.urlopen(comic_url)
    try:
        # On Python 3 str() of the raw bytes yields their repr ("b'...'");
        # that is fine here because the pattern only matches ASCII text.
        html = str(page.read())
    finally:
        page.close()

    # Raw string: '\.' and '\d' are regex escapes, not string escapes.
    comic_strip_pattern = r'http://assets\.amuniversal\.com/[a-zA-Z\d]+'
    match = re.search(comic_strip_pattern, html)
    if match is None:
        raise ValueError('no comic strip image found at ' + comic_url)
    return match.group()
# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment