Created
October 12, 2020 21:37
-
-
Save lokal-profil/a36e459d16f8f92f447841c79efbeb7e to your computer and use it in GitHub Desktop.
Short script for scraping Litteraturbanken book images
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Short script for scraping Litteraturbanken book images.
import requests
from tqdm import tqdm
def download_single(num, prefix, url):
    """Download one page image and save it as ``<prefix>_<NNNN>.jpg``.

    Args:
        num: Page number; zero-padded to four digits for both the URL and
            the output filename.
        prefix: Filename prefix for the saved image.
        url: URL template containing a single ``{}`` placeholder that
            receives the zero-padded page number.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    num_s = '{0:04}'.format(num)
    full_url = url.format(num_s)
    output_file = '{1}_{0}.jpg'.format(num_s, prefix)
    chunk_size = 1024
    label = 'page_{}'.format(num_s)

    # Use the response as a context manager so the streamed connection is
    # released even if writing the file fails part-way through.
    with requests.get(full_url, stream=True) as r:
        # Check the status before opening the output file; otherwise an
        # HTTP error page would be written to disk under a .jpg name.
        r.raise_for_status()
        with open(output_file, 'wb') as handle, \
                tqdm(desc=label, unit_scale=True, unit='B') as pbar:
            for data in r.iter_content(chunk_size=chunk_size):
                if data:  # filter out keep-alive new chunks
                    pbar.update(len(data))
                    handle.write(data)
def download_all(book_id, prefix, pages, zoom_level=5):
    """Download pages 1..``pages`` of a book, one image file per page.

    Args:
        book_id: Book identifier used in the image URL path.
        prefix: Filename prefix passed on to ``download_single``.
        pages: Number of the last page to fetch; pages are numbered from 1.
            A value of zero or less downloads nothing.
        zoom_level: Zoom level component of the image URL.
    """
    # The doubled braces survive this first format() call as a literal
    # ``{}``, which download_single later fills with the page number.
    url = 'https://litteraturbanken.se/txt/{0}/{0}_{1}/{0}_{1}_{{}}.jpeg'.format(
        book_id, zoom_level)
    for num in range(1, pages + 1):
        download_single(num, prefix, url)
def example():
    """Download 70 pages of book ``lb3040872`` at zoom level 5.

    Files are saved with the prefix ``Prinsarnes_blomsteralfabet``.
    """
    download_all('lb3040872', 'Prinsarnes_blomsteralfabet', 70, 5)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment