Skip to content

Instantly share code, notes, and snippets.

@lokal-profil
Created October 12, 2020 21:37
Show Gist options
  • Save lokal-profil/a36e459d16f8f92f447841c79efbeb7e to your computer and use it in GitHub Desktop.
Save lokal-profil/a36e459d16f8f92f447841c79efbeb7e to your computer and use it in GitHub Desktop.
Short script for scraping Litteraturbanken book images
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Short script for scraping Litteraturbanken (litteraturbanken.se) book images
import requests
from tqdm import tqdm
def download_single(num, prefix, url):
    """Download one page image and save it as ``<prefix>_<NNNN>.jpg``.

    Args:
        num: Page number; zero-padded to four digits for both the URL and
            the output file name.
        prefix: Leading part of the output file name.
        url: URL template containing one ``{}`` placeholder that receives
            the zero-padded page number.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server stops responding.
    """
    num_s = '{0:04}'.format(num)
    full_url = url.format(num_s)
    output_file = '{1}_{0}.jpg'.format(num_s, prefix)
    label = 'page_{}'.format(num_s)
    chunk_size = 1024
    # Seconds to wait for the connection / for the next bytes to arrive, so a
    # stalled server cannot hang the whole scrape.
    timeout = 30

    # The context manager releases the streamed connection even on error.
    with requests.get(full_url, stream=True, timeout=timeout) as r:
        # Fail loudly instead of saving an HTML error page as a .jpg; done
        # before opening the file so a failed request leaves nothing on disk.
        r.raise_for_status()
        with open(output_file, 'wb') as handle, \
                tqdm(desc=label, unit_scale=True, unit='B') as pbar:
            for data in r.iter_content(chunk_size=chunk_size):
                if data:  # filter out keep-alive new chunks
                    handle.write(data)
                    pbar.update(len(data))
def download_all(book_id, prefix, pages, zoom_level=5):
    """Download pages 1 through ``pages`` of a book, one file per page.

    Args:
        book_id: Book identifier inserted into the litteraturbanken.se URL.
        prefix: Leading part of each output file name.
        pages: Number of the last page to fetch (inclusive).
        zoom_level: Value inserted into the URL path after the book id
            (presumably selects the image resolution -- verify against
            the site).
    """
    # The doubled braces survive this format() call as a literal "{}" that
    # download_single later fills with the zero-padded page number.
    page_url_template = (
        'https://litteraturbanken.se/txt/{0}/{0}_{1}/{0}_{1}_{{}}.jpeg'
    ).format(book_id, zoom_level)

    page_numbers = range(1, pages + 1)
    for page_number in page_numbers:
        download_single(page_number, prefix, page_url_template)
def example():
    """Download pages 1-70 of book ``lb3040872`` at zoom level 5.

    Files are saved with the prefix ``Prinsarnes_blomsteralfabet``.
    """
    download_all(
        book_id='lb3040872',
        prefix='Prinsarnes_blomsteralfabet',
        pages=70,
        zoom_level=5,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment