Download all chapters of a given manga from mangaeden.com
#!/usr/bin/env python
#
# Install selenium with 'pip install selenium' and the matching
# geckodriver for the Firefox browser.
# Extracts all chapters of the given manga URL, but downloads only the given
# chapter and the following ones, to avoid fetching already-downloaded chapters.
#
# The trick is to use a small piece of js code to trigger each download,
# which completes automatically because Firefox is loaded with a profile
# that never asks before saving mangaeden images.
# Selenium is required because the CloudFront CDN detects (and blocks)
# non-browser clients such as curl, even if you reuse cookies copied from
# a browser.
#
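# Example usage (assuming this file is saved as download_mangaeden.py):
#   python download_mangaeden.py https://www.mangaeden.com/en/en-manga/onepunch-man/10/3/
#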

from selenium import webdriver
import re
import sys
import json
import time
from pathlib import Path
from pprint import pprint

FILENAME_DIGITS = 5  # Save output files with {N} digits -> up to (10**N)-1 images per chapter
FILENAME_SEP = "___"
COOLDOWN_SECONDS_BETWEEN_IMAGES = 1  # Below 1 second you may get 503 errors
COOLDOWN_SECONDS_BETWEEN_CHAPTERS = 15


class MangaURL:
    def __init__(self, user_url):
        self._user_url = user_url
        self._url_re = r"https?:\/\/(?:www\.)?mangaeden\.com\/en\/(.*)\/(.*)\/(.+)\/\d+\/"
        self._manga_url_base = "https://www.mangaeden.com/en/{}/{}/{}/1/"
        self._setup_browser()
        self._extract_manga_name_and_chapter()
        self._main()

    def _setup_browser(self):
        fp = webdriver.FirefoxProfile()
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                          "image/jpeg,image/jpg,image/png")
        self._browser = webdriver.Firefox(firefox_profile=fp)

    def _extract_manga_name_and_chapter(self):
        match = re.compile(self._url_re).match(self._user_url)
        if match:
            self._mangaeden, self.manga_name, self.ch_start = match.groups()
        else:
            raise SystemExit("URL {} is not from mangaeden.com; exit.".format(
                self._user_url))

    def _extract_manga_chapters_reverse(self):
        self._browser.get(self._user_url)
        try:
            xpath_box = '//*[@id="combobox"]'
            chapters_prnt = self._browser.find_elements_by_xpath(xpath_box)[0]
        except IndexError:
            raise SystemExit("Couldn't find the chapter list in the page; "
                             "is the URL correct? Check if the page is a 404.")
        chapter_html_elem = chapters_prnt.find_elements_by_tag_name('option')
        return [el.text for el in chapter_html_elem]

    def _create_ch_url(self, chapter_num):
        return self._manga_url_base.format(self._mangaeden, self.manga_name,
                                           chapter_num)

    def _extract_chapters_to_download(self):
        all_ch = self._extract_manga_chapters_reverse()
        # Chapters in the html page are listed in reverse order, so slice from
        # the starting chapter back to the beginning and reverse the result.
        ch_after_start = all_ch[all_ch.index(self.ch_start)::-1]
        return [(name, self._create_ch_url(name)) for name in ch_after_start]

    def _extract_chapter_pages_list(self, chapter_url):
        self._browser.get(chapter_url)
        regex = r"var\ pages\ =\ \[.*\]"
        match = re.compile(regex).search(self._browser.page_source)
        if not match:
            raise SystemExit("Couldn't find the js array of pages within the "
                             "chapter source page")
        text = match.group()
        # Parse the js array literal as JSON to get the list of image dicts.
        list_img = json.loads(text[text.find("["):])
        return list_img

    def _fetch_all_images(self, ch_name, list_img):
        # Move the browser to the first image's URL to avoid CORS errors due to
        # the different CDN subdomain. Downloads keep running in the background
        # thanks to the injected js.
        self._browser.get("https:" + list_img[0]["fs"])
        for img_dict in list_img:
            img_url = "https:" + img_dict["fs"]
            n_padded = f'{img_dict["n"]+1:0{FILENAME_DIGITS}}'
            js_fetcher = """
                fetch('{}')
                .then(resp => resp.blob())
                .then(blob => {{
                    const url = window.URL.createObjectURL(blob);
                    const a = document.createElement('a');
                    a.style.display = 'none';
                    a.href = url;
                    a.download = '{}.jpg';  // filename to save the file as
                    document.body.appendChild(a);
                    a.click();
                    window.URL.revokeObjectURL(url);
                }})
                .catch(() => alert('oh no!'));
            """.format(img_url, ch_name + FILENAME_SEP + n_padded)
            self._browser.execute_script(js_fetcher)
            print(f'{n_padded}/{len(list_img):0{FILENAME_DIGITS}}:', img_url)
            time.sleep(COOLDOWN_SECONDS_BETWEEN_IMAGES)

    def _rename_downloaded_files(self, ch_name):
        # Path.glob returns a generator, so materialize it to be able to test
        # whether anything was actually downloaded.
        downloaded_files = list(Path.cwd().glob(ch_name + FILENAME_SEP + "*"))
        if not downloaded_files:
            print("Execute this script from within the Download dir that "
                  "files are downloaded into, to reorganize them in subdirs.",
                  file=sys.stderr)
            return
        Path(ch_name).mkdir(exist_ok=True)
        for p in downloaded_files:
            p.rename(Path(ch_name) / p.name[p.name.find(FILENAME_SEP)+len(FILENAME_SEP):])

    def _main(self):
        chapters_list = self._extract_chapters_to_download()
        print("Chapters to download:")
        pprint(chapters_list)
        for ch_name, ch_url in chapters_list:
            list_img = self._extract_chapter_pages_list(ch_url)
            print("Start downloading chapter: {}".format(ch_url))
            self._fetch_all_images(ch_name, list_img)
            print("Waiting {} seconds for files to finish downloading".format(
                COOLDOWN_SECONDS_BETWEEN_CHAPTERS))
            time.sleep(COOLDOWN_SECONDS_BETWEEN_CHAPTERS)
            self._rename_downloaded_files(ch_name)
            print("Chapter {} completed, now waiting {} seconds to cool down".format(
                ch_url, COOLDOWN_SECONDS_BETWEEN_CHAPTERS))
            time.sleep(COOLDOWN_SECONDS_BETWEEN_CHAPTERS)


if __name__ == "__main__":
    try:
        user_url = sys.argv[1]
    except IndexError:
        raise SystemExit("usage: {} mangaeden-url\n\n"
                         "Missing URL of the manga on mangaeden that you "
                         "want to download, e.g. "
                         "https://www.mangaeden.com/en/en-manga/"
                         "onepunch-man/10/3/; the script will download the "
                         "given chapter and all the following "
                         "ones".format(sys.argv[0]))

    manga_url = MangaURL(user_url)
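
Note: the script targets the Selenium 3 API; newer Selenium 4 releases drop the firefox_profile= argument and the find_elements_by_* helpers used above. A minimal sketch of the equivalent calls for _setup_browser and _extract_manga_chapters_reverse, assuming selenium>=4 and the same geckodriver setup (untested here):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

# In Selenium 4, preferences can be set directly on the Options object.
opts = Options()
opts.set_preference("browser.helperApps.neverAsk.saveToDisk",
                    "image/jpeg,image/jpg,image/png")
browser = webdriver.Firefox(options=opts)

# find_elements_by_* is replaced by find_elements(By.*, ...).
browser.get("https://www.mangaeden.com/en/en-manga/onepunch-man/10/3/")
combobox_matches = browser.find_elements(By.XPATH, '//*[@id="combobox"]')
if combobox_matches:
    chapters = [el.text
                for el in combobox_matches[0].find_elements(By.TAG_NAME, 'option')]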

Updated regex (also matches the www2 mirror and the Italian /it/ section):
https?:\/\/(?:www2?\.)?mangaeden\.com\/(?:en|it)\/(.*)\/(.*)\/(.+)\/\d+\/
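
As a quick illustration (a standalone snippet, not part of the gist), the updated pattern captures the same three groups as the original; the second URL below is a made-up example of the /it/ form:

import re

UPDATED_URL_RE = r"https?:\/\/(?:www2?\.)?mangaeden\.com\/(?:en|it)\/(.*)\/(.*)\/(.+)\/\d+\/"

for url in ("https://www.mangaeden.com/en/en-manga/onepunch-man/10/3/",
            "https://www2.mangaeden.com/it/it-manga/onepunch-man/10/3/"):
    match = re.match(UPDATED_URL_RE, url)
    # Each URL should yield ('<lang>-manga', 'onepunch-man', '10') as the captured groups.
    print(url, "->", match.groups() if match else "no match")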