Created
April 10, 2020 07:05
-
-
Save nikAizuddin/7a3f1f0c0cb8ab430bf3643f56f48983 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Download Free Springer books. | |
Requirements: | |
* Make sure you have Python 3.7. It is recommended that you use Python from Anaconda. | |
* Download chromedriver https://chromedriver.chromium.org/downloads and extract it | |
into the same directory as this script. | |
How to Execute: | |
$ conda create --name springer-download | |
$ conda activate springer-download | |
(springer-download) $ conda install python==3.7 | |
(springer-download) $ conda install beautifulsoup4 selenium pandas lxml html5lib requests tqdm | |
(springer-download) $ python springer-download.py [URL] | |
""" | |
import re
import os
import logging
import argparse

import tqdm
import requests
import pandas as pd
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver

logger = logging.getLogger(__name__)
class WebScraper:
    """Drive a Selenium-controlled Chrome browser for page scraping."""

    def __init__(self):
        # Each scraper owns its own browser session (and therefore its
        # own cookie jar).
        self.driver = self._init_driver()

    def __del__(self):
        # getattr guard: if _init_driver() raised, ``driver`` was never
        # assigned and an unconditional quit() would raise AttributeError
        # and mask the original failure.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()

    def _init_driver(self):
        """Initialize a Chrome driver from a locally found chromedriver.

        Returns
        -------
        selenium.webdriver.Chrome
            A ready-to-use browser driver.
        """
        chromedriver_file = self._find_driver_in_cwd()
        return webdriver.Chrome(chromedriver_file)

    @staticmethod
    def _find_driver_in_cwd():
        """Locate the chromedriver executable under the working directory.

        Returns
        -------
        str
            Path of the first ``chromedriver`` / ``chromedriver.exe``
            file found (the original kept walking and used the last
            match; returning the first avoids scanning the whole tree).

        Raises
        ------
        FileNotFoundError
            If no chromedriver file exists under the working directory.
        """
        for root, _directories, files in os.walk('.'):
            for filename in files:
                # Match both the Unix and the Windows executable names.
                if re.match(r'chromedriver(|\.exe)', filename):
                    return os.path.join(root, filename)
        raise FileNotFoundError(
            'chromedriver not found; download it from '
            'https://chromedriver.chromium.org/downloads and extract it '
            'into the same directory as this script')
class SpringerScraper(WebScraper):
    """Scrape the Springer free-books listing and individual book pages."""

    def get_dataframe_table(self, url):
        """Read the HTML table on *url* into Pandas DataFrames.

        Parameters
        ----------
        url : str
            URL to the main page containing the table of books.

        Returns
        -------
        list of pandas.DataFrame
            DataFrames parsed from the page's first ``<table>`` element.
            Note: ``pd.read_html`` always returns a *list*, not a single
            DataFrame (the original docstring was wrong about this).
        """
        self.driver.get(url)
        soup = BeautifulSoup(self.driver.page_source, 'lxml')
        table = soup.find('table')
        return pd.read_html(str(table))

    def download_pdf(self, url, outpdf, timeout=60):
        """Download the PDF linked from the book page at *url*.

        The browser session's cookies are forwarded to ``requests`` so
        the direct PDF request is authorized like the page visit was.

        Parameters
        ----------
        url : str
            URL to the web page containing the PDF link.
        outpdf : str
            PDF filename to be written.
        timeout : int, optional
            Seconds to wait for the HTTP download (new parameter with a
            default, so existing callers are unaffected; previously the
            request could hang forever).
        """
        self.driver.get(url)
        cookies = {c['name']: c['value'] for c in self.driver.get_cookies()}
        try:
            pdfurl = (self.driver
                      .find_element_by_class_name('cta-button-container__item')
                      .find_element_by_css_selector('a')
                      .get_attribute('href'))
        except selenium.common.exceptions.NoSuchElementException as e:
            # Some titles have no download button; warn and skip this
            # book instead of aborting the whole run.
            logger.warning('Unable to download "{}"'.format(url))
            logger.warning(str(e))
            return
        response = requests.get(pdfurl, cookies=cookies, timeout=timeout)
        if not response.ok:
            # Previously a failed request silently wrote the error-page
            # bytes to the .pdf file; warn and skip instead.
            logger.warning('HTTP %s while fetching "%s"',
                           response.status_code, pdfurl)
            return
        with open(outpdf, 'wb') as f:
            f.write(response.content)
def main():
    """Entry point: scrape the book table, then download every PDF."""
    _init_logger()
    args = _parse_args()

    mainscrap = SpringerScraper()
    df = mainscrap.get_dataframe_table(args.url)

    # One shared slug function instead of two identical inline lambdas
    # (titles and categories were transformed with duplicated code).
    def slugify(text):
        return (text.lower().replace(' ', '-').replace('/', '')
                .replace(',', '').replace('&', 'and'))

    # Rows [2:-1]: skip the leading header rows and the trailing footer
    # row of the scraped table.  Column letters 'S', 'A', 'L' are how
    # pd.read_html labels the scraped columns — presumably URL, title
    # and category; verify against the live page.  TODO confirm.
    book_urls = df[0]['S'][2:-1]
    book_titles = df[0]['A'][2:-1].apply(slugify)
    book_categories = df[0]['L'][2:-1].apply(slugify)

    pdfdir = 'downloads'
    # makedirs(..., exist_ok=True) replaces the racy exists()-then-mkdir
    # pattern and creates nested directories in one call.
    os.makedirs(pdfdir, exist_ok=True)
    for book_category in book_categories.drop_duplicates():
        category_dir = os.path.join(pdfdir, book_category)
        if not os.path.exists(category_dir):
            logger.info('Creating "{}"'.format(category_dir))
        os.makedirs(category_dir, exist_ok=True)

    for title, category, url in tqdm.tqdm(
            zip(book_titles, book_categories, book_urls),
            total=len(book_urls)):
        # A fresh scraper (and browser session) per book keeps cookies
        # isolated per download, matching the original behaviour.
        pdfscrap = SpringerScraper()
        outpdf = os.path.join(pdfdir, category, title + '.pdf')
        logger.info('Downloading "{}" into "{}"'.format(url, outpdf))
        pdfscrap.download_pdf(url, outpdf)
        del pdfscrap
def _init_logger():
    """Attach a console handler to the module logger.

    Idempotent: if the logger already has handlers, nothing is added —
    the original version attached a new handler on every call, which
    produced duplicate log lines when called more than once.
    """
    if logger.handlers:
        return
    formatter = logging.Formatter(
        '%(asctime)s %(process)d:%(thread)d:%(levelname)s:%(name)s:'
        '%(lineno)d: %(message)s')
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    # The handler filters at INFO while the logger stays at DEBUG, so any
    # other handler added later could still receive debug records.
    stream_handler.setLevel('INFO')
    logger.addHandler(stream_handler)
    logger.setLevel('DEBUG')
def _parse_args(): | |
"""Parse Arguments from command-line. | |
""" | |
parser = argparse.ArgumentParser( | |
description='Download Free Springer books', | |
formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
parser.add_argument('url', type=str, help='URL to the list of books') | |
args = parser.parse_args() | |
return args | |
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment