Python script for the English Wikisource: it scrapes ns0 book pages, collecting all their metadata, plus additional metadata from the corresponding Index page.
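The script relies on the microformat IDs exposed by the Wikisource header templates: ws-title, ws-author and ws-year on the book page, and ws-publisher, ws-place and ws-year on the Index page. As a rough sketch (the title and author values below are made up; only the id attributes matter), the markup it scrapes looks like:

    <span id="ws-title">A Hypothetical Title</span>
    <span id="ws-author">Some Author</span>
    <span id="ws-year">1900</span>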
import requests
from bs4 import BeautifulSoup
import unicodecsv
import re

# Uses unicodecsv for generating the CSV; I'm using Python 2.7.
# If you use Python 3.x, go with the stdlib "csv" module and change the
# related instructions accordingly.
# Beware of the file mode: use something like
#   csv.writer(open("FILE.csv", "wt"))
# otherwise it gives you an error.
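# A minimal Python 3 sketch (an untested assumption, not part of the original
# script): the stdlib csv module writes unicode natively, so these two lines
# would replace the unicodecsv setup below:
#   import csv
#   out = csv.writer(open("en_metadata.csv", "w", newline="", encoding="utf-8"))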
# create the output CSV file
out = unicodecsv.writer(open("en_metadata.csv", "wb"), encoding="utf-8")
head = ("TITLE", "TYPE", "URL", "CCE", "SUBJECT", "CREATOR", "CONTRIBUTOR",
        "DESCRIPTION", "PUBLISHER", "DATE", "FORMAT", "SOURCE", "LANGUAGE",
        "RELATION", "COVERAGE", "RIGHTS", "IMAGE", "IMAGE ED.")
out.writerow(head)
# function for retrieving the metadata from the Index page.
# A bit different from itws_scraper.py: here I'm using a regex,
# and I'm also taking many other data which are not in the ns0 page.
def get_index_metadata(index):
    rr = requests.get("https://en.wikisource.org/wiki/" + index)
    # if the cover were the only image on the page, soup.img['src'] would do;
    # instead, grab the first .djvu page thumbnail with a regex
    try:
        image = re.search(r"//upload\.wikimedia\.org/(.*?)\.djvu\.jpg", rr.text)
        cover_url = "https:" + image.group(0)
    except AttributeError:
        # re.search() returned None: no cover image found
        cover_url = None
    data = BeautifulSoup(rr.text, "html.parser")
    try:
        publishers = data.find('td', attrs={"id": "ws-publisher"})
        publisher = publishers.text
    except AttributeError:
        publisher = None
    try:
        places = data.find('td', attrs={"id": "ws-place"})
        place = places.text
    except AttributeError:
        place = None
    try:
        dates = data.find('td', attrs={"id": "ws-year"})
        date = dates.text
    except AttributeError:
        date = None
    return cover_url, publisher, place, date
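# Hypothetical usage sketch (the Index title below is made up):
#   cover_url, publisher, place, date = get_index_metadata("Index:Some_Book.djvu")
#   cover_url is either "https://upload.wikimedia.org/..." or None when no cover is found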
# function for getting all the ns0 metadata.
# It basically scrapes the microformat with BeautifulSoup.
def get_bookmetadata(book):
    book = book.strip()
    book_url = "https://en.wikisource.org/wiki/" + book
    r = requests.get(book_url)
    soup = BeautifulSoup(r.text, "html.parser")
    try:
        # I think there was a problem with overly long titles (they break)
        titles = soup.find('span', attrs={"id": "ws-title"})
        title = titles.text
    except AttributeError:
        title = None
    try:
        authors = soup.find('span', attrs={"id": "ws-author"})
        author = authors.text
    except AttributeError:
        author = None
    try:
        years = soup.find('span', attrs={"id": "ws-year"})
        year = years.text
    except AttributeError:
        year = None
    # group(0) keeps the "Index:" prefix, so the match is usable as a page title
    i = re.search(r"Index:(.*)\.djvu", r.text)
    # not all the books have an Index page; here's the YES branch
    if i is not None:
        index = i.group(0)
        metadata = get_index_metadata(index)
        cover_url = metadata[0]
        publisher = metadata[1]
        place = metadata[2]
        date = metadata[3]
    else:
        index = None
        cover_url = None
        publisher = None
        place = None
        date = None
    # Not sure, but I think there was an issue making this script work for both
    # kinds of books, with and without an Index page, so I wrote two different
    # writerow() instructions for the output CSV.
    # without Source:
    #out.writerow([unicode(title), "E-book Open", u"http://wsexport.wmflabs.org/tool/book.php?lang=en&format=epub&page=" + unicode(book), None, None, unicode(author), None, None, u"Wikisource, the free library. <en.wikisource.org>", None, u"HTML | EPUB", unicode(date), u"English", u"http://en.wikisource.org/wiki/" + unicode(book), None, u"Public domain", cover_url, u"https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Wikisource-logo.svg/229px-Wikisource-logo.svg.png"])
    # with Source:
    out.writerow([unicode(title), "E-book Open", u"http://wsexport.wmflabs.org/tool/book.php?lang=en&format=epub&page=" + unicode(book), None, None, unicode(author), None, None, u"Wikisource, the free library. <en.wikisource.org>", year, u"HTML | EPUB", unicode(date) + " | " + unicode(publisher) + " | " + unicode(place), u"English", u"http://en.wikisource.org/wiki/" + book, None, u"Public domain", cover_url, u"https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Wikisource-logo.svg/229px-Wikisource-logo.svg.png"])
# I read the books from an input list file.
# Any file format (CSV, etc.) is fine; the important datum is the ns0 title.
with open("en_list") as books:
    for book in books:
        get_bookmetadata(book)
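To run it, save a plain-text file named en_list next to the script, with one ns0 page title per line (the titles below are made-up examples):

    A_Hypothetical_Novel
    Another_Made-up_Title

Each title is stripped, appended to https://en.wikisource.org/wiki/ to build the page URL, and one CSV row per book is written to en_metadata.csv.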