mhbeals · November 10, 2018 20:48
diff --git a/gutenberg_barcodes.py b/gutenberg_barcodes.py
 # import libraries
 import re
 import requests
 from matplotlib import pyplot as plt

 # Stop words: common words that are left out of the "most common words"
 # ranking.  The entries carry no apostrophes ("arent", "cant", ...) because
 # punctuation is removed from the book text before words are compared with
 # this list.  Every entry must be a single word with no whitespace in it,
 # otherwise it can never match a word produced by splitting the text.
 stop = [
     "a", "about", "above", "after", "again", "against", "all", "am", "an",
     "and", "any", "are", "arent", "as", "at", "be", "because", "been",
     "before", "being", "below", "between", "both", "but", "by", "cant",
     "cannot", "could", "couldnt", "did", "didnt", "do", "does", "doesnt",
     "doing", "dont", "down", "during", "each", "few", "for", "from",
     "further", "had", "hadnt", "has", "hasnt", "have", "havent", "having",
     "he", "hed", "hell", "hes", "her", "here", "heres", "hers", "herself",
     "him", "himself", "his", "how", "hows", "i", "id", "ill", "im", "ive",
     "if", "in", "into", "is", "isnt", "it", "its", "itself", "lets", "me",
     "more", "most", "mustnt", "my", "myself", "no", "nor", "not", "of",
     "off", "on", "once", "only", "or", "other", "ought", "our", "ours",
     "ourselves", "out", "over", "own", "same", "shant", "she", "shed",
     "shell", "shes", "should", "shouldnt", "so", "some", "such", "than",
     "that", "thats", "the", "their", "theirs", "them", "themselves", "then",
     "there", "theres", "these", "they", "theyd", "theyll", "theyre",
     "theyve", "this", "those", "through", "to", "too", "under", "until",
     "up", "very", "was", "wasnt", "we", "wed", "well", "were", "weve",
     "werent", "what", "whats", "when", "whens", "where", "wheres", "which",
     "while", "who", "whos", "whom", "why", "whys", "with", "wont", "would",
     "wouldnt", "you", "youd", "youll", "youre", "youve", "your", "yours",
     "yourself", "yourselves",
 ]

 # Ask the user which book to analyse and how many words to display.
 # The book number is stripped so stray spaces cannot end up inside the URL.
 text_to_download = input('Provide a Gutenberg Book Number: ').strip()
 number_of_words = int(input('How many words would you like to display? '))

 # Build the address of the plain-text file for that book number.
 book_url = "https://www.gutenberg.org/files/" + text_to_download + "/" + text_to_download + ".txt"

 # Download the text; the timeout keeps the script from hanging forever on a
 # stalled connection.
 response = requests.get(book_url, timeout=30)

 # Fail loudly on an HTTP error (for example 404 for an unknown book number)
 # instead of analysing the body of an error page as if it were the book.
 response.raise_for_status()

 # format the data as plain text
 downloaded_string = response.text

 # Capture the title (before normalising the text).

 # The title is taken to be everything in front of this header line.
 title_end = downloaded_string.find('\nThis eBook is for the use of anyone')

 # find() returns -1 when the line is absent.  In that case use the first
 # non-blank line of the download instead of slicing with -1, which would
 # turn almost the entire book into the "title".
 if title_end != -1:
     title = downloaded_string[:title_end]
 else:
     title = downloaded_string.strip().split('\n', 1)[0]

 # Collapse every run of whitespace (including line breaks) to one space.
 title = re.sub(r'\s+', ' ', title)

 # Adjust the capitalisation so the text reads well after "Now processing ".
 # count=1 limits each change to the first occurrence, so a "The" inside the
 # book's own title (or a word such as "Theodore") is left untouched.
 title = re.sub('The', 'the', title, count=1)
 title = re.sub('EBook', 'eBook', title, count=1)

 # Let the user know you are processing the right book
 print("\nNow processing " + title)

 # Remove punctuation: delete every character that is neither a word
 # character nor whitespace ('^' included).  Underscores count as word
 # characters and are therefore kept.
 my_text = re.sub(r'[^\w\s]+', '', downloaded_string)

 # make lower case so the markers and the words compare case-insensitively
 my_text = my_text.lower()

 # Find the end of the header: the book body begins right after the start
 # marker.  Both the "this" and the "the" wording of the marker are accepted.
 # Without a marker the text is kept from its first character instead of
 # being sliced from a meaningless index.
 header = re.search(r'\bstart of th(?:is|e) project gutenberg ebook', my_text)
 start = header.end() if header else 0

 # Find the start of the footer, looking only after the header.  Without a
 # marker the text is kept up to its last character.
 footer = re.compile(r'\bend of th(?:is|e) project gutenberg ebook').search(my_text, start)
 end = footer.start() if footer else len(my_text)

 # trim the header and footer from the text
 my_text = my_text[start:end]

 # Split the text into a word list.  wordcount is one more than the number
 # of words, i.e. one past the position number of the last word.
 my_words = my_text.split()
 wordcount = len(my_words) + 1

 # Map each distinct word to the list of positions at which it occurs.
 dictionary = {}

 # enumerate() numbers the words starting at 1.  setdefault() creates the
 # position list the first time a word is seen and returns the existing list
 # afterwards, so every position is appended to the right list in one step.
 for i, word in enumerate(my_words, start=1):
     dictionary.setdefault(word, []).append(i)

 # Build a set of the stop words once so each membership test is cheap.
 stop_words = set(stop)

 # Rank every word that is not a stop word from most to least frequent.
 # The stop words are filtered out here, at selection time, so none of them
 # can reach the chart.  sorted() is stable, so words with the same number
 # of occurrences stay in the order in which the dictionary yields them.
 ranked = sorted(
     (item for item in dictionary.items() if item[0] not in stop_words),
     key=lambda item: len(item[1]),
     reverse=True,
 )

 # Keep the requested number of words together with their position lists.
 # max(..., 0) makes a zero or negative request select nothing.
 top_words = dict(ranked[:max(number_of_words, 0)])
    
 # Open a wide, short figure to hold the barcode
 plt.figure(figsize=(20,5))

 # Draw one group of bars per selected word: a full-height bar at every
 # position where the word occurs, labelled "<word>: <occurrences>"
 for word in top_words:
     positions = top_words[word]
     occurrences = str(len(positions))
     plt.bar(positions, 1, label=word + ": " + occurrences, width=10)

     # Report progress on the console
     print("Completed visualisation of " + word)

 # The y-axis only needs the bar height; the x-axis runs to wordcount
 plt.ylim(0,1)
 plt.xlim(0,wordcount)

 # Hide the tick marks on both axes
 plt.yticks([])
 plt.xticks([])

 # Chart title: requested number of words followed by the book title
 chart_title = str(number_of_words) + " Most Common Words in " + title
 plt.title(chart_title)

 # Put the legend in the upper right corner
 plt.legend(loc="upper right")

 # Display the chart, then close it
 plt.show()
 plt.close()

 #You may need to run twice if importing libraries for the first time
	# import libraries
	import re
	import requests
	from matplotlib import pyplot as plt

	# Stop words: common words that are left out of the "most common words"
	# ranking.  The entries carry no apostrophes ("arent", "cant", ...) because
	# punctuation is removed from the book text before words are compared with
	# this list.  Every entry must be a single word with no whitespace in it,
	# otherwise it can never match a word produced by splitting the text.
	stop = [
		"a", "about", "above", "after", "again", "against", "all", "am", "an",
		"and", "any", "are", "arent", "as", "at", "be", "because", "been",
		"before", "being", "below", "between", "both", "but", "by", "cant",
		"cannot", "could", "couldnt", "did", "didnt", "do", "does", "doesnt",
		"doing", "dont", "down", "during", "each", "few", "for", "from",
		"further", "had", "hadnt", "has", "hasnt", "have", "havent", "having",
		"he", "hed", "hell", "hes", "her", "here", "heres", "hers", "herself",
		"him", "himself", "his", "how", "hows", "i", "id", "ill", "im", "ive",
		"if", "in", "into", "is", "isnt", "it", "its", "itself", "lets", "me",
		"more", "most", "mustnt", "my", "myself", "no", "nor", "not", "of",
		"off", "on", "once", "only", "or", "other", "ought", "our", "ours",
		"ourselves", "out", "over", "own", "same", "shant", "she", "shed",
		"shell", "shes", "should", "shouldnt", "so", "some", "such", "than",
		"that", "thats", "the", "their", "theirs", "them", "themselves", "then",
		"there", "theres", "these", "they", "theyd", "theyll", "theyre",
		"theyve", "this", "those", "through", "to", "too", "under", "until",
		"up", "very", "was", "wasnt", "we", "wed", "well", "were", "weve",
		"werent", "what", "whats", "when", "whens", "where", "wheres", "which",
		"while", "who", "whos", "whom", "why", "whys", "with", "wont", "would",
		"wouldnt", "you", "youd", "youll", "youre", "youve", "your", "yours",
		"yourself", "yourselves",
	]

	# Ask the user which book to analyse and how many words to display.
	# The book number is stripped so stray spaces cannot end up inside the URL.
	text_to_download = input('Provide a Gutenberg Book Number: ').strip()
	number_of_words = int(input('How many words would you like to display? '))

	# Build the address of the plain-text file for that book number.
	book_url = "https://www.gutenberg.org/files/" + text_to_download + "/" + text_to_download + ".txt"

	# Download the text; the timeout keeps the script from hanging forever on a
	# stalled connection.
	response = requests.get(book_url, timeout=30)

	# Fail loudly on an HTTP error (for example 404 for an unknown book number)
	# instead of analysing the body of an error page as if it were the book.
	response.raise_for_status()

	# format the data as plain text
	downloaded_string = response.text

	# Capture the title (before normalising the text).

	# The title is taken to be everything in front of this header line.
	title_end = downloaded_string.find('\nThis eBook is for the use of anyone')

	# find() returns -1 when the line is absent.  In that case use the first
	# non-blank line of the download instead of slicing with -1, which would
	# turn almost the entire book into the "title".
	if title_end != -1:
		title = downloaded_string[:title_end]
	else:
		title = downloaded_string.strip().split('\n', 1)[0]

	# Collapse every run of whitespace (including line breaks) to one space.
	title = re.sub(r'\s+', ' ', title)

	# Adjust the capitalisation so the text reads well after "Now processing ".
	# count=1 limits each change to the first occurrence, so a "The" inside the
	# book's own title (or a word such as "Theodore") is left untouched.
	title = re.sub('The', 'the', title, count=1)
	title = re.sub('EBook', 'eBook', title, count=1)

	# Let the user know you are processing the right book
	print("\nNow processing " + title)

	# Remove punctuation: delete every character that is neither a word
	# character nor whitespace ('^' included).  Underscores count as word
	# characters and are therefore kept.
	my_text = re.sub(r'[^\w\s]+', '', downloaded_string)

	# make lower case so the markers and the words compare case-insensitively
	my_text = my_text.lower()

	# Find the end of the header: the book body begins right after the start
	# marker.  Both the "this" and the "the" wording of the marker are accepted.
	# Without a marker the text is kept from its first character instead of
	# being sliced from a meaningless index.
	header = re.search(r'\bstart of th(?:is|e) project gutenberg ebook', my_text)
	start = header.end() if header else 0

	# Find the start of the footer, looking only after the header.  Without a
	# marker the text is kept up to its last character.
	footer = re.compile(r'\bend of th(?:is|e) project gutenberg ebook').search(my_text, start)
	end = footer.start() if footer else len(my_text)

	# trim the header and footer from the text
	my_text = my_text[start:end]

	# Split the text into a word list.  wordcount is one more than the number
	# of words, i.e. one past the position number of the last word.
	my_words = my_text.split()
	wordcount = len(my_words) + 1

	# Map each distinct word to the list of positions at which it occurs.
	dictionary = {}

	# enumerate() numbers the words starting at 1.  setdefault() creates the
	# position list the first time a word is seen and returns the existing list
	# afterwards, so every position is appended to the right list in one step.
	for i, word in enumerate(my_words, start=1):
		dictionary.setdefault(word, []).append(i)

	# Build a set of the stop words once so each membership test is cheap.
	stop_words = set(stop)

	# Rank every word that is not a stop word from most to least frequent.
	# The stop words are filtered out here, at selection time, so none of them
	# can reach the chart.  sorted() is stable, so words with the same number
	# of occurrences stay in the order in which the dictionary yields them.
	ranked = sorted(
		(item for item in dictionary.items() if item[0] not in stop_words),
		key=lambda item: len(item[1]),
		reverse=True,
	)

	# Keep the requested number of words together with their position lists.
	# max(..., 0) makes a zero or negative request select nothing.
	top_words = dict(ranked[:max(number_of_words, 0)])

	# Open a wide, short figure to hold the barcode
	plt.figure(figsize=(20,5))

	# Draw one group of bars per selected word: a full-height bar at every
	# position where the word occurs, labelled "<word>: <occurrences>"
	for word in top_words:
		positions = top_words[word]
		occurrences = str(len(positions))
		plt.bar(positions, 1, label=word + ": " + occurrences, width=10)

		# Report progress on the console
		print("Completed visualisation of " + word)

	# The y-axis only needs the bar height; the x-axis runs to wordcount
	plt.ylim(0,1)
	plt.xlim(0,wordcount)

	# Hide the tick marks on both axes
	plt.yticks([])
	plt.xticks([])

	# Chart title: requested number of words followed by the book title
	chart_title = str(number_of_words) + " Most Common Words in " + title
	plt.title(chart_title)

	# Put the legend in the upper right corner
	plt.legend(loc="upper right")

	# Display the chart, then close it
	plt.show()
	plt.close()

	#You may need to run twice if importing libraries for the first time