ajeden · May 7, 2019 07:39
diff --git a/dilbert.py b/dilbert.py
 #!/usr/bin/env python
 """
 Simple script to download the Dilbert comic strips in a defined period of time

 If no arguments are passed to the script, it will download all the Dilbert comic
 strips in the current folder (It may take a while).

 Acknowledgments
 ---------------
 This script is strongly based on the work from:
 https://community.spiceworks.com/scripts/show/982-download-all-dilbert-comics

 2019-05-06: added downloading by default only missing dates.
 """

 from __future__ import print_function

 import datetime
 import os
 import re
 import sys
 import time
 import argparse
 from dateutil import rrule, parser
 from os import walk
 import glob

 # for backwards compatibility
 if sys.version_info[0] > 2:
 	import urllib.request as ul
 else:
 	import urllib as ul


 def main():
 	"""
 	Script entry point: parse the command line, move to the dump folder and
 	download the requested range of strips.

 	The dump folder is created when it does not exist yet. If it cannot be
 	created, a warning is printed on stderr and the current folder is used
 	instead.
 	"""
 	args = parse_input_arguments()

 	# If a dump folder has been defined, create it (if it does not already
 	# exist) and move to it
 	try:
 		if args.output != '.' and not os.path.isdir(args.output):
 			os.makedirs(args.output)
 	except OSError as err:
 		# Best effort: carry on in the current folder, but report the
 		# problem instead of hiding it.
 		print('could not create folder %r (%s), using the current folder'
 			% (args.output, err), file=sys.stderr)
 		args.output = '.'
 	os.chdir(args.output)

 	download_strips(args.start_date, args.end_date)

 def get_last_file_date():
 	"""
 	Work out the date from which the download should resume.

 	Looks in the current folder for strips saved as YYYY-MM-DD.jpg and
 	returns the day after the most recent one, formatted as YYYY-MM-DD.
 	Files matching the glob whose name is not a valid date are ignored.
 	When no strip is found, the date of the first published strip
 	(1989-04-17) is returned.
 	"""
 	strip_dates = []
 	for path in glob.glob('./??????????.jpg'):
 		try:
 			# skip the two-character folder prefix added by the glob
 			# pattern; the next ten characters are the date
 			strip_dates.append(
 				datetime.datetime.strptime(path[2:12], '%Y-%m-%d'))
 		except ValueError:
 			# ten-character .jpg name that is not a date: not a strip
 			continue

 	if not strip_dates:
 		return '1989-04-17'

 	newstartdate = max(strip_dates) + datetime.timedelta(days=1)
 	return newstartdate.strftime('%Y-%m-%d')

 def parse_input_arguments():
 	"""
 	Parse the command line options.

 	Returns the argparse namespace with:
 	- start_date: result of dateutil's parser.parse() on -s/--start; when
 	  the option is not given, the value of get_last_file_date() is parsed
 	  instead.
 	- end_date: result of parser.parse() on -e/--end, or today's date when
 	  the option is not given.
 	- output: dump folder given with -o/--output ('.' by default).
 	"""
 	argp = argparse.ArgumentParser(description='Dilbert strips download script augmented.')

 	# The start date default is resolved after parsing (see below) so the
 	# folder is scanned only when -s/--start was not given; an eager default
 	# would also run, and possibly fail, for -h or an explicit -s.
 	argp.add_argument("-s", "--start",
 						dest="start_date",
 						help="start date (1989-04-17, 1st published strip, defaults to last file found in current directory).",
 						default=None)
 	argp.add_argument("-e", "--end",
 						dest="end_date",
 						help="End date (default, today)",
 						default=None)
 	argp.add_argument("-o", "--output",
 						dest="output",
 						help="Comics dump folder",
 						default='.')

 	args = argp.parse_args()

 	if args.start_date is None:
 		args.start_date = get_last_file_date()
 	args.start_date = parser.parse(args.start_date)

 	if args.end_date is None:
 		args.end_date = datetime.datetime.now().date()
 	else:
 		args.end_date = parser.parse(args.end_date)

 	return args
 	
 def download_strips(start_date, end_date):
 	"""
 	Download one strip per day, from start_date up to end_date, into the
 	current folder as YYYY-MM-DD.jpg.

 	Each image is first written to YYYY-MM-DD.jpg.part and renamed once the
 	transfer has finished, so an interrupted download never leaves a
 	truncated .jpg behind. Any download error is propagated to the caller
 	and stops the run.
 	"""
 	for date in rrule.rrule(rrule.DAILY, dtstart=start_date, until=end_date):
 		comic_date = '%04d-%02d-%02d' % (date.year, date.month, date.day)
 		url = 'http://dilbert.com/strip/' + comic_date
 		comic_name = comic_date + '.jpg'
 		partial_name = comic_name + '.part'
 		print('getting comic from', comic_date)
 		try:
 			ul.urlretrieve(get_true_comic_url(url), partial_name)
 			# os.rename() does not overwrite on every platform
 			if os.path.exists(comic_name):
 				os.remove(comic_name)
 			os.rename(partial_name, comic_name)
 		finally:
 			# only still present when the download or the rename failed
 			if os.path.exists(partial_name):
 				os.remove(partial_name)


 def get_true_comic_url(comic_url, comic_name='comic'):
 	"""
 	get the true comic strip url from http://dilbert.com/strip/<date>

 	It looks like Scott Adams has protected himself against pointy haired
 	pirates by hiding his comic strips within the assets.amuniversal domain.
 	This function digs into the comic strip web-page, finds (and returns)
 	the URL where the original image lives.

 	comic_url  -- address of the strip web-page to inspect
 	comic_name -- not used by this function; kept in the signature so
 	              existing calls keep working

 	Raises ValueError when the page holds no assets.amuniversal.com link.
 	"""

 	page = ul.urlopen(comic_url)
 	try:
 		# decode rather than str(): on Python 3, str() of bytes gives the
 		# "b'...'" repr instead of the page text
 		html = page.read().decode('utf-8', 'replace')
 	finally:
 		page.close()

 	# raw string so that \. and \d reach the regex engine untouched
 	comic_strip_pattern = r'http://assets\.amuniversal\.com/[a-zA-Z\d]+'
 	match = re.search(comic_strip_pattern, html)
 	if match is None:
 		raise ValueError('no comic strip image found in ' + comic_url)
 	return match.group()


 # Run the downloader only when this file is executed as a script, not when
 # it is imported as a module.
 if __name__ == '__main__':
 	main()
	#!/usr/bin/env python
	"""
	Simple script to download the Dilbert comic strips in a defined period of time

	If no arguments are passed to the script, it will download all the Dilbert comic
	strips in the current folder (It may take a while).

	Acknowledgments
	---------------
	This script is strongly based on the work from:
	https://community.spiceworks.com/scripts/show/982-download-all-dilbert-comics

	2019-05-06: added downloading by default only missing dates.
	"""

	from __future__ import print_function

	import datetime
	import os
	import re
	import sys
	import time
	import argparse
	from dateutil import rrule, parser
	from os import walk
	import glob

	# for backwards compatibility
	if sys.version_info[0] > 2:
	import urllib.request as ul
	else:
	import urllib as ul


	def main():
		"""
		Script entry point: parse the command line, move to the dump folder and
		download the requested range of strips.

		The dump folder is created when it does not exist yet. If it cannot be
		created, a warning is printed on stderr and the current folder is used
		instead.
		"""
		args = parse_input_arguments()

		# If a dump folder has been defined, create it (if it does not already
		# exist) and move to it
		try:
			if args.output != '.' and not os.path.isdir(args.output):
				os.makedirs(args.output)
		except OSError as err:
			# Best effort: carry on in the current folder, but report the
			# problem instead of hiding it.
			print('could not create folder %r (%s), using the current folder'
				% (args.output, err), file=sys.stderr)
			args.output = '.'
		os.chdir(args.output)

		download_strips(args.start_date, args.end_date)

	def get_last_file_date():
		"""
		Work out the date from which the download should resume.

		Looks in the current folder for strips saved as YYYY-MM-DD.jpg and
		returns the day after the most recent one, formatted as YYYY-MM-DD.
		Files matching the glob whose name is not a valid date are ignored.
		When no strip is found, the date of the first published strip
		(1989-04-17) is returned.
		"""
		strip_dates = []
		for path in glob.glob('./??????????.jpg'):
			try:
				# skip the two-character folder prefix added by the glob
				# pattern; the next ten characters are the date
				strip_dates.append(
					datetime.datetime.strptime(path[2:12], '%Y-%m-%d'))
			except ValueError:
				# ten-character .jpg name that is not a date: not a strip
				continue

		if not strip_dates:
			return '1989-04-17'

		newstartdate = max(strip_dates) + datetime.timedelta(days=1)
		return newstartdate.strftime('%Y-%m-%d')

	def parse_input_arguments():
		"""
		Parse the command line options.

		Returns the argparse namespace with:
		- start_date: result of dateutil's parser.parse() on -s/--start; when
		  the option is not given, the value of get_last_file_date() is parsed
		  instead.
		- end_date: result of parser.parse() on -e/--end, or today's date when
		  the option is not given.
		- output: dump folder given with -o/--output ('.' by default).
		"""
		argp = argparse.ArgumentParser(description='Dilbert strips download script augmented.')

		# The start date default is resolved after parsing (see below) so the
		# folder is scanned only when -s/--start was not given; an eager default
		# would also run, and possibly fail, for -h or an explicit -s.
		argp.add_argument("-s", "--start",
							dest="start_date",
							help="start date (1989-04-17, 1st published strip, defaults to last file found in current directory).",
							default=None)
		argp.add_argument("-e", "--end",
							dest="end_date",
							help="End date (default, today)",
							default=None)
		argp.add_argument("-o", "--output",
							dest="output",
							help="Comics dump folder",
							default='.')

		args = argp.parse_args()

		if args.start_date is None:
			args.start_date = get_last_file_date()
		args.start_date = parser.parse(args.start_date)

		if args.end_date is None:
			args.end_date = datetime.datetime.now().date()
		else:
			args.end_date = parser.parse(args.end_date)

		return args

	def download_strips(start_date, end_date):
		"""
		Download one strip per day, from start_date up to end_date, into the
		current folder as YYYY-MM-DD.jpg.

		Each image is first written to YYYY-MM-DD.jpg.part and renamed once the
		transfer has finished, so an interrupted download never leaves a
		truncated .jpg behind. Any download error is propagated to the caller
		and stops the run.
		"""
		for date in rrule.rrule(rrule.DAILY, dtstart=start_date, until=end_date):
			comic_date = '%04d-%02d-%02d' % (date.year, date.month, date.day)
			url = 'http://dilbert.com/strip/' + comic_date
			comic_name = comic_date + '.jpg'
			partial_name = comic_name + '.part'
			print('getting comic from', comic_date)
			try:
				ul.urlretrieve(get_true_comic_url(url), partial_name)
				# os.rename() does not overwrite on every platform
				if os.path.exists(comic_name):
					os.remove(comic_name)
				os.rename(partial_name, comic_name)
			finally:
				# only still present when the download or the rename failed
				if os.path.exists(partial_name):
					os.remove(partial_name)


	def get_true_comic_url(comic_url, comic_name='comic'):
		"""
		get the true comic strip url from http://dilbert.com/strip/<date>

		It looks like Scott Adams has protected himself against pointy haired
		pirates by hiding his comic strips within the assets.amuniversal domain.
		This function digs into the comic strip web-page, finds (and returns)
		the URL where the original image lives.

		comic_url  -- address of the strip web-page to inspect
		comic_name -- not used by this function; kept in the signature so
		              existing calls keep working

		Raises ValueError when the page holds no assets.amuniversal.com link.
		"""

		page = ul.urlopen(comic_url)
		try:
			# decode rather than str(): on Python 3, str() of bytes gives the
			# "b'...'" repr instead of the page text
			html = page.read().decode('utf-8', 'replace')
		finally:
			page.close()

		# raw string so that \. and \d reach the regex engine untouched
		comic_strip_pattern = r'http://assets\.amuniversal\.com/[a-zA-Z\d]+'
		match = re.search(comic_strip_pattern, html)
		if match is None:
			raise ValueError('no comic strip image found in ' + comic_url)
		return match.group()


	# Run the downloader only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == '__main__':
		main()