Last active
November 10, 2018 20:48
-
-
Save mhbeals/91930a5c83dfe7f7d40e11e5e673f96f to your computer and use it in GitHub Desktop.
A simple Python script (based on work for the Programming for the Humanities and Social Sciences workshop series at Loughborough University) that displays a barcode of the n most common words in any Gutenberg eBook.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import libraries | |
import re | |
import requests | |
from matplotlib import pyplot as plt | |
# Stop words: common English function words that are excluded from the
# "most common words" ranking. Contractions are spelled without their
# apostrophe ("dont", "theyre") because the script deletes punctuation from
# the text before counting, so that is the form the words take in the data.
# As a side effect a few real words ("hell", "shell", "well", "wed", "ill",
# "id", "were") are also excluded.
stop = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an",
    "and", "any", "are", "arent", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both",
    "but", "by",
    "cant", "cannot", "could", "couldnt",
    "did", "didnt", "do", "does", "doesnt", "doing", "dont", "down", "during",
    "each",
    "few", "for", "from", "further",
    "had", "hadnt", "has", "hasnt", "have", "havent", "having", "he", "hed",
    "hell", "hes", "her", "here", "heres", "hers", "herself", "him",
    "himself", "his", "how", "hows",
    "i", "id", "ill", "im", "ive", "if", "in", "into", "is", "isnt", "it",
    "its", "itself",
    "lets",
    "me", "more", "most", "mustnt", "my", "myself",
    "no", "nor", "not",
    "of", "off", "on", "once", "only", "or", "other", "ought", "our",
    # "ours" and "ourselves" are two separate entries; fused into a single
    # string they could never match a word from the text.
    "ours", "ourselves", "out", "over", "own",
    "same", "shant", "she", "shed", "shell", "shes", "should", "shouldnt",
    "so", "some", "such",
    "than", "that", "thats", "the", "their", "theirs", "them", "themselves",
    "then", "there", "theres", "these", "they", "theyd", "theyll", "theyre",
    "theyve", "this", "those", "through", "to", "too",
    "under", "until", "up",
    "very",
    "was", "wasnt", "we", "wed", "well", "were", "weve", "werent", "what",
    "whats", "when", "whens", "where", "wheres", "which", "while", "who",
    "whos", "whom", "why", "whys", "with", "wont", "would", "wouldnt",
    "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself",
    "yourselves",
]
# Line that follows the title line in Gutenberg plain-text files; everything
# before it is treated as the title.
TITLE_END_MARKER = '\nThis eBook is for the use of anyone'

# Anything that is neither a word character nor whitespace, plus underscores
# (Gutenberg plain text wraps italics in _underscores_, which would otherwise
# stay glued to the word and split its count).
PUNCTUATION = re.compile(r'[^\w\s]|_')

# Header / footer markers as they appear AFTER punctuation removal and
# lower-casing. Both the "this" and the "the" wording are accepted.
BODY_START = re.compile(r'start of (?:this|the) project gutenberg ebook')
BODY_END = re.compile(r'end of (?:this|the) project gutenberg ebook')


def download_text(book_number):
    """Download the plain-text edition of a Gutenberg book.

    book_number -- the Gutenberg book number, as a string.

    Returns the decoded text of the file. Raises requests.HTTPError when the
    server answers with an error status (for example an unknown book number),
    instead of letting the error page be analysed as if it were the book.
    """
    url = ("http://www.gutenberg.org/files/" + book_number + "/"
           + book_number + ".txt")
    # a timeout stops the script from hanging forever on a stalled connection
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # format the data as plain text
    return response.text


def extract_title(raw_text):
    """Return the title line(s) found at the top of the downloaded text.

    The title is taken from the un-normalised text, so it keeps its
    punctuation. Runs of whitespace are collapsed to single spaces, and
    'The' / 'EBook' are rewritten as 'the' / 'eBook' so the title reads well
    in the middle of a sentence.

    If the standard line that follows the title is missing, the first
    non-blank line of the text is used instead of the whole file.
    """
    # drop a leading byte-order mark so it does not end up in the title
    text = raw_text.lstrip('\ufeff')
    title_end = text.find(TITLE_END_MARKER)
    if title_end == -1:
        header = text.strip().partition('\n')[0]
    else:
        header = text[:title_end]
    # remove any extra whitespace
    title = re.sub(r'\s+', ' ', header).strip()
    title = title.replace('The', 'the')
    title = title.replace('EBook', 'eBook')
    return title


def extract_body_words(raw_text):
    """Return the book's body as a list of lower-case words.

    Punctuation is deleted (so "don't" becomes "dont"), the text is
    lower-cased, and the Gutenberg header and footer are trimmed off using
    their start/end marker lines. When a marker cannot be found, that side
    of the text is left untrimmed rather than cut at an arbitrary position.
    """
    # remove punctuation and make lower case
    text = PUNCTUATION.sub('', raw_text).lower()
    # find the end of the header marker
    header = BODY_START.search(text)
    body_start = header.end() if header else 0
    # find the start of the footer marker (only looking after the header)
    footer = BODY_END.search(text, body_start)
    body_end = footer.start() if footer else len(text)
    # trim off the header and footer, then split into a word list
    return text[body_start:body_end].split()


def build_word_index(words):
    """Map every distinct word to the list of positions where it occurs.

    Positions are 1-based and listed in ascending order; the dictionary
    keeps the words in order of first appearance.
    """
    index = {}
    for position, word in enumerate(words, start=1):
        index.setdefault(word, []).append(position)
    return index


def select_top_words(word_index, limit, stop_words=()):
    """Return the `limit` most frequent words that are not stop words.

    word_index -- dictionary of word -> list of positions
    limit      -- maximum number of words to return (<= 0 gives none)
    stop_words -- words to leave out of the ranking

    The result maps word -> positions, ordered from most to least frequent.
    Words with equal counts keep their order of first appearance, because
    the sort is stable.
    """
    excluded = set(stop_words)
    # stop words are filtered out BEFORE ranking, so they can never take one
    # of the requested places
    candidates = [word for word in word_index if word not in excluded]
    candidates.sort(key=lambda word: len(word_index[word]), reverse=True)
    return {word: word_index[word] for word in candidates[:max(limit, 0)]}


def plot_barcode(top_words, wordcount, title):
    """Draw one set of bars per word at the positions where it occurs.

    top_words -- dictionary of word -> list of positions
    wordcount -- upper limit of the x-axis
    title     -- book title shown in the chart title
    """
    # instantiate the figure
    plt.figure(figsize=(20, 5))
    for word, positions in top_words.items():
        # create the legend text
        legend_label = word + ": " + str(len(positions))
        # create the bars
        plt.bar(positions, 1, label=legend_label, width=10)
        print("Completed visualisation of " + word)
    # set the limits of the x-axis and y-axis
    plt.xlim(0, wordcount)
    plt.ylim(0, 1)
    # hide the tick marks on the x and y-axes
    plt.xticks([])
    plt.yticks([])
    # set the chart title (using the number of words actually drawn)
    plt.title(str(len(top_words)) + " Most Common Words in " + title)
    # set the legend placement
    plt.legend(loc="upper right")
    # show the chart, then close it
    plt.show()
    plt.close()


def main():
    """Ask for a book and a word count, then display the word barcode."""
    # Ask the user questions
    text_to_download = input('Provide a Gutenberg Book Number: ').strip()
    number_of_words = int(input('How many words would you like to display? '))
    # download the text
    downloaded_string = download_text(text_to_download)
    # capture the title (before normalising the text)
    title = extract_title(downloaded_string)
    # Let the user know you are processing the right book
    print("\nNow processing " + title)
    # split text into a word list and count the words
    my_words = extract_body_words(downloaded_string)
    # positions run from 1 to len(my_words), so leave one extra unit of room
    wordcount = len(my_words) + 1
    # record every position of every word
    dictionary = build_word_index(my_words)
    # keep only the most frequent words that are not in the stop list
    top_words = select_top_words(dictionary, number_of_words, stop)
    if not top_words:
        print("No words to display.")
        return
    plot_barcode(top_words, wordcount, title)


if __name__ == "__main__":
    main()
# You may need to run twice if importing libraries for the first time
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment