Created
February 9, 2019 16:23
-
-
Save Nosgoroth/0fab5bf3888e114ab42b6b1dcf4159da to your computer and use it in GitHub Desktop.
Download all chapters of Taishou Otome Otogibanashi from Sea Otter Scans as .cbz files, for archival.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, sys, zipfile, re, requests, json, tempfile, time | |
from pprint import pprint | |
class Chapter:
    """A single chapter scraped from the reader: its URL and display title."""

    # Defaults before __init__ assigns per-instance values.
    url = None
    title = None

    def __init__(self, tup):
        """Unpack a (url, title) pair as produced by the chapter-list regex."""
        url, title = tup
        self.url = url
        self.title = title
def getImageListForChapterWithUrl(url):
    """Fetch a chapter reader page and extract its page-image URLs.

    The reader embeds the page list as a JavaScript literal
    (``var pages = [...];``); pull it out with a regex and parse it as JSON.

    Returns a list of image URL strings, or None on any failure.
    """
    resp = requests.get(url)
    match = re.search(r'var pages = (\[.*\]);', resp.text)
    if not match:
        print("Couldn't retrieve image list from page")
        return None
    try:
        pages = json.loads(match.group(1))
    except ValueError:
        # json raises JSONDecodeError, a ValueError subclass, on bad input.
        print("Couldn't parse image list JSON")
        return None
    return [page["url"] for page in pages]
def getChapterListFromBaseUrl(url):
    """Fetch the series index page and scrape its chapter links.

    Matches anchors shaped like
    ``<a href="https://.../read/..." title="Chapter N: ...">``.

    Returns a list of Chapter objects, or None when no links were found.
    """
    resp = requests.get(url)
    # Using regex to parse html. Zalgo is Tony the Pony he COMES
    matches = re.findall(r'<a href="(http[^"]+\/read\/[^"]+)" title="([^"]+)"', resp.text)
    if not matches:
        print("Couldn't retrieve chapter list from page")
        return None
    return [Chapter(pair) for pair in matches]
def downloadImageListToZip(title, images):
    """Download every URL in *images* into a .cbz (zip) archive.

    The archive is named after *title*, sanitized for the filesystem, and
    written to the current directory. Each image is fetched into a temp
    directory, added to the zip (by basename, so the archive has a flat
    layout), then deleted.

    Returns True on full success; False when the target file already
    exists or any image failed. Ctrl-C removes partial output and
    re-raises so the caller can stop cleanly.
    """
    tempdir = tempfile.mkdtemp("taishoo")
    success = True
    zipname = None
    try:
        # Sanitize the title into a safe filename, collapsing runs of
        # disallowed characters/whitespace into single spaces.
        zipname = re.sub(r'[^\d\w\.\-\_]', ' ', title) + ".cbz"
        zipname = re.sub(r'[\s]+', ' ', zipname)
        zipname = re.sub(r'[\s]+\.cbz$', '.cbz', zipname)
        if os.path.exists(zipname):
            print("File already exists")
            return False
        with zipfile.ZipFile(zipname, 'w') as zipobj:
            for i, imageurl in enumerate(images, start=1):
                filepath = None
                try:
                    _, ext = os.path.splitext(imageurl)
                    # Zero-padded index keeps pages sorted in CBZ readers.
                    filepath = os.path.join(tempdir, str(i).zfill(3) + ext)
                    r = requests.get(imageurl, allow_redirects=True)
                    # Don't silently archive an HTML error page as an image.
                    r.raise_for_status()
                    with open(filepath, 'wb') as f:
                        f.write(r.content)
                    # Store by basename so the temp path doesn't leak into
                    # the archive's internal layout.
                    zipobj.write(filepath, os.path.basename(filepath))
                    os.remove(filepath)
                    print("Downloaded image " + str(i))
                except KeyboardInterrupt:
                    if filepath:
                        try:
                            os.remove(filepath)
                        except OSError:
                            pass
                    raise
                except Exception:
                    if filepath:
                        try:
                            os.remove(filepath)
                        except OSError:
                            pass
                    print("Error getting image " + str(i))
                    success = False
                    # Back off briefly before trying the next image.
                    time.sleep(2)
    except KeyboardInterrupt:
        # Remove the partial archive before propagating the interrupt.
        if zipname:
            try:
                os.remove(zipname)
            except OSError:
                pass
        raise
    except Exception:
        if zipname:
            try:
                os.remove(zipname)
            except OSError:
                pass
        print("An error occurred")
        success = False
    finally:
        # Best-effort cleanup of the (now empty) temp directory.
        try:
            os.rmdir(tempdir)
        except OSError:
            pass
    return success
def main():
    """Scrape the series chapter list and download each chapter as a .cbz."""
    try:
        print("Retrieving chapter list...")
        chapters = getChapterListFromBaseUrl("https://reader.seaotterscans.com/series/taishau_wotome_otogibanashi/")
        if not chapters:
            # The scraper returns None on failure; don't crash on len(None).
            print("No chapters found; aborting.")
            return
        print("Found " + str(len(chapters)) + " chapters.")
        errors = False
        for chapter in chapters:
            print("Processing chapter: " + chapter.title)
            images = getImageListForChapterWithUrl(chapter.url)
            if not images:
                # Per-chapter scrape failure: record it and move on.
                print("No images found; skipping chapter.")
                errors = True
                continue
            print("Found " + str(len(images)) + " images.")
            res = downloadImageListToZip(chapter.title, images)
            if res:
                print("Chapter downloaded successfully")
            else:
                print("Chapter downloaded with errors, or not downloaded")
                errors = True
        if errors:
            print("Finished with errors")
        else:
            print("Finished successfully!")
    except KeyboardInterrupt:
        # Ctrl-C ends the run quietly; partial files were already cleaned up.
        pass


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I don't really know, but you could try adding the
verify=False
parameter to the requests calls, according to Stack Overflow. Be aware this disables TLS certificate verification, so only do it if you trust the connection.