Downloads Latin phrases from Wiktionary's Latin phrases appendix.
## Copyright (c) 2018 Christian Calderon
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to deal
## in the Software without restriction, including without limitation the rights
## to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
## copies of the Software, and to permit persons to whom the Software is
## furnished to do so, subject to the following conditions:
##
## The above copyright notice and this permission notice shall be included in all
## copies or substantial portions of the Software.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
## SOFTWARE.
#
# This script uses the requests and Beautiful Soup packages to get Latin phrases from Wiktionary.
# Both packages can be installed with pip using this command: pip install requests beautifulsoup4
# This script was written for Python 3.5, so it may require some modification to run on other Python versions.
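#
# Usage (the file name below is an assumption, not part of the original gist;
# save the script under any name you like):
#     python3 latin_phrases_scraper.py
# The phrases are written to ~/Documents/latin_phrases.txt.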
import os
import time

import requests
from bs4 import BeautifulSoup
PARSER = 'html.parser'
base_url = 'https://en.wiktionary.org/wiki/Appendix:List_of_Latin_phrases_'
# FYI: Wiktionary uses en dashes (U+2013), not ASCII hyphens, in these URLs. '–' != '-'
urls = [base_url + pair for pair in ('(A–E)', '(F–O)', '(P–Z)')]
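# Quick demonstration of the distinction (an illustrative addition, not part
# of the original script): the en dash and the ASCII hyphen-minus are
# different code points, so mixing them up produces a 404.
assert '\u2013' != '\u002d'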
newline = '\n'
latin_phrases = []
for i, url in enumerate(urls):
    r = requests.get(url)
    bad_code = "Don't know how to handle status code %d." % r.status_code
    assert r.status_code == 200, bad_code
    html = r.text
    soup = BeautifulSoup(html, PARSER)
    # The phrases are organized into tables by their starting letter.
    # They all have the class wikitable.
    tables = soup.find_all('table', {'class': 'wikitable'})
    for table in tables:
        # The first three children are two empty lines and a row of
        # formatting info; we can just throw those out.
        rows = table.children
        next(rows); next(rows); next(rows)
        for row in rows:
            # Each row is separated by a newline, and
            # each cell of each row is separated by a newline.
            if str(row) != newline:
                cells = row.children
                next(cells)  # this throws out the newline
                # The first cell's text is always the Latin phrase.
                latin_phrases.append(next(cells).text)
    if i < 2:
        # Be nice to Wiktionary's servers by waiting one second between downloads.
        time.sleep(1)
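# Sanity check (an addition, not part of the original script): make sure the
# scrape actually found something before writing the output file, since the
# parsing above depends on the page layout staying the same.
assert latin_phrases, 'No phrases extracted; the page layout may have changed.'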
fname = os.path.expanduser(
    os.path.join('~', 'Documents', 'latin_phrases.txt'))
with open(fname, encoding='utf-8', mode='w') as f:
    f.write('\n'.join(latin_phrases))
print('Wrote latin phrases to', fname)
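# Optional spot check (a sketch added here, not part of the original script):
# read the file back and report how many phrases were saved.
with open(fname, encoding='utf-8') as f:
    saved = f.read().splitlines()
print('Saved %d phrases.' % len(saved))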