Downloads Latin phrases from Wiktionary's Latin phrases appendix.
## Copyright (c) 2018 Christian Calderon
##
## Permission is hereby granted, free of charge, to any person obtaining a copy
## of this software and associated documentation files (the "Software"), to deal
## in the Software without restriction, including without limitation the rights
## to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
## copies of the Software, and to permit persons to whom the Software is
## furnished to do so, subject to the following conditions:
##
## The above copyright notice and this permission notice shall be included in all
## copies or substantial portions of the Software.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
## OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
## SOFTWARE.
#
# This script uses the requests and Beautiful Soup packages to get Latin phrases from Wiktionary.
# Both packages can be installed with pip using this command: pip install requests beautifulsoup4
# This script was written for Python 3.5, so it may require some modification to run on other Python versions.
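#
# Usage (the file name below is an assumption, not part of the original gist;
# save the script under any name you like):
#     python3 latin_phrases_scraper.py
# The phrases are written to ~/Documents/latin_phrases.txt.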
import os
import time

import requests
from bs4 import BeautifulSoup
PARSER = 'html.parser'
base_url = 'https://en.wiktionary.org/wiki/Appendix:List_of_Latin_phrases_'
# FYI: Wiktionary uses en dashes (U+2013), not ASCII hyphens, in these URLs. '–' != '-'
urls = [base_url + pair for pair in ('(A–E)', '(F–O)', '(P–Z)')]
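# Quick demonstration of the distinction (an illustrative addition, not part
# of the original script): the en dash and the ASCII hyphen-minus are
# different code points, so mixing them up produces a 404.
assert '\u2013' != '\u002d'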
newline = '\n'
latin_phrases = []
for i, url in enumerate(urls):
    r = requests.get(url)
    bad_code = "Don't know how to handle status code %d." % r.status_code
    assert r.status_code == 200, bad_code
    html = r.text
    soup = BeautifulSoup(html, PARSER)
    # The phrases are organized into tables by their starting letter.
    # They all have the class wikitable.
    tables = soup.find_all('table', {'class': 'wikitable'})
    for table in tables:
        # The first three children are two empty lines and a row of
        # formatting info; we can just throw those out.
        rows = table.children
        next(rows); next(rows); next(rows)
        for row in rows:
            # Each row is separated by a newline, and
            # each cell of each row is separated by a newline.
            if str(row) != newline:
                cells = row.children
                next(cells)  # this throws out the newline
                # The first cell's text is always the Latin phrase.
                latin_phrases.append(next(cells).text)
    if i < 2:
        # Be nice to Wiktionary's servers by waiting one second between downloads.
        time.sleep(1)
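# Sanity check (an addition, not part of the original script): make sure the
# scrape actually found something before writing the output file, since the
# parsing above depends on the page layout staying the same.
assert latin_phrases, 'No phrases extracted; the page layout may have changed.'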
fname = os.path.expanduser(
    os.path.join('~', 'Documents', 'latin_phrases.txt'))
with open(fname, encoding='utf-8', mode='w') as f:
    f.write('\n'.join(latin_phrases))
print('Wrote latin phrases to', fname)
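# Optional spot check (a sketch added here, not part of the original script):
# read the file back and report how many phrases were saved.
with open(fname, encoding='utf-8') as f:
    saved = f.read().splitlines()
print('Saved %d phrases.' % len(saved))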