Created
March 25, 2017 15:24
-
-
Save ssophwang/3ee0c8ce3e0fc5f74443dcbcd87386a7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re, random | |
import numpy as np | |
import json | |
# Corpus configuration and the containers shared by the rest of the script.
# The two first-word dictionaries start empty and are filled in further down.
poem_files = ['pg17192.txt', 'Where the Sidewalk Ends by Shel Silverstein_djvu.txt']
poem_lines = []
firstword_count_dict = {}
firstword_prob_dict = {}

# Read every corpus file and keep each line as a list of its \w+ tokens.
# Lines yielding fewer than two tokens (blank lines, single words) are skipped;
# that test also covers the old "len(line) > 1" pre-check, since a line of at
# most one character can never produce two tokens.
for poem_file in poem_files:
    # The context manager closes the handle even if reading fails part-way.
    with open(poem_file, 'r') as f:
        for line in f:
            # Raw string so that \w is passed to the regex engine untouched.
            words = re.findall(r'\w+', line)
            if len(words) > 1:
                poem_lines.append(words)
#print poem_lines | |
# Corpus-wide tallies derived from poem_lines:
#   words    - every token, in reading order
#   unicorns - the distinct tokens
#   lines    - number of kept lines
#   last     - number of lines whose final token is 'weary'
words = [token for poem_line in poem_lines for token in poem_line]
unicorns = set(words)
lines = len(poem_lines)
last = sum(
    1 for poem_line in poem_lines
    if poem_line and poem_line[-1] == 'weary'
)
# firstword_counts = {} | |
# | |
# for line in poem_lines: | |
# if len(line) > 2: | |
# first_word = line[0] | |
# | |
# if first_word in firstword_counts: | |
# firstword_counts[first_word] += 1 | |
# else: | |
# firstword_counts[first_word] = 1 | |
# | |
# print firstword_counts | |
# words = words.replace(',', '').replace('.', ' ') | |
# | |
# word_states = re.findall('\w+', words) | |
# | |
# print word_states | |
# print len(set(word_states)) | |
# | |
# counts_dict = {} | |
# | |
# for i in range(len(word_states)-1): | |
# first_word = word_states[i] | |
# next_word = word_states[i+1] | |
# | |
# if (first_word, next_word) in counts_dict: | |
# counts_dict[(first_word,next_word)] += 1 | |
# else: | |
# counts_dict[(first_word,next_word)] = 1 | |
# | |
# transition_probabilities = {} | |
# s = sum(counts_dict.values()) | |
# | |
# | |
# for key in counts_dict: | |
# transition_probabilities[key] = float(counts_dict[key])/s | |
# print transition_probabilities | |
# # float(counts_dict.keys())/s | |
# #for x in range(10): | |
# # sentence = '' | |
# # for i in range(10): | |
# # word = results[random.randint(0, len(results) - 1)] | |
# # sentence += ' ' + word | |
# # print sentence | |
# next_word_counts_dict = {} | |
# for word_list in poem_lines: | |
# word_list.append('\n') | |
# for i in range(len(word_list) - 1): | |
# thisword = word_list[i] | |
# nextword = word_list[i+1] | |
# | |
# if thisword not in next_word_counts_dict: | |
# next_word_counts_dict[thisword] = {nextword:1} | |
# else: | |
# if nextword not in next_word_counts_dict[thisword]: | |
# next_word_counts_dict[thisword][nextword] = 1 | |
# else: | |
# next_word_counts_dict[thisword][nextword] += 1 | |
# Count how often each token opens a line.
for poem_line in poem_lines:
    if not poem_line:
        continue
    opener = poem_line[0]
    firstword_count_dict[opener] = firstword_count_dict.get(opener, 0) + 1

# Turn the raw counts into a probability distribution over opening tokens.
count_sum = sum(firstword_count_dict.values())
firstword_prob_dict.update(
    (opener, tally / float(count_sum))
    for opener, tally in firstword_count_dict.items()
)
print(firstword_prob_dict)
# Build the transition table: next_word_counts_dict[word][successor] is first
# the number of times `successor` directly follows `word` within a line, and
# after the normalisation pass below, the probability of that transition.
# (The name is historical: once normalised it holds probabilities, not counts.)
next_word_counts_dict = {}
for word_list in poem_lines:
    # Pair every token with the one after it; the last token of a line is
    # paired with '\n', which serves as the end-of-line marker.
    followers = word_list[1:] + ['\n']
    for thisword, nextword in zip(word_list, followers):
        successor_counts = next_word_counts_dict.setdefault(thisword, {})
        successor_counts[nextword] = successor_counts.get(nextword, 0) + 1

# Normalise each row so that its values sum to 1.
for successor_counts in next_word_counts_dict.values():
    # float() keeps the division a true division on Python 2 as well.
    count_sum = float(sum(successor_counts.values()))
    # Only values are reassigned, so iterating the keys while updating is safe.
    for nextword in successor_counts:
        successor_counts[nextword] /= count_sum
# Split the first-word distribution into two parallel lists (tokens and their
# probabilities), the form np.random.choice takes for weighted sampling.
first_states = list(firstword_prob_dict)
pp = [firstword_prob_dict[opener] for opener in first_states]
print(first_states)
print(pp)
#for i in range(len(next_word_counts_dict.items)): | |
#print next_word_counts_dict.items() | |
# Generate and print 1000 lines by walking the Markov chain: draw an opening
# token from the first-word distribution, then repeatedly draw a successor
# from the current token's row of next_word_counts_dict (which holds
# transition probabilities by this point in the script).
for x in range(1000):
    state = np.random.choice(first_states, p=pp)
    sentence = [state]
    # At most nine further draws, so a line has at most ten tokens.
    for i in range(9):
        # A state with no row in the table is terminal. In this script that is
        # the '\n' end-of-line marker, which only ever appears as a successor.
        # Stop here instead of iterating on with nothing to sample from.
        # NOTE(review): the original indentation was lost; this assumes the
        # original draw was guarded by the membership test - confirm.
        if state not in next_word_counts_dict:
            break
        successors = next_word_counts_dict[state]
        next_states = list(successors)
        p = [successors[candidate] for candidate in next_states]
        state = np.random.choice(next_states, p=p)
        sentence.append(state)
    # The end-of-line marker may have been appended as the last state; drop it.
    print(' '.join([w for w in sentence if w != '\n']))
#for word in next_word_counts_dict: | |
# print next_word_counts_dict[word].items() | |
# np.random.choice(word, p=) | |
# word, ':', next_word_counts_dict[word] | |
# Report how many distinct tokens have a row in the transition table.
print(len(next_word_counts_dict))

# Persist the model as two JSON files: the transition table and the first-word
# distribution. The context managers close each file even if serialisation or
# writing raises.
with open('poem_model.json', "w") as output_file:
    json.dump(next_word_counts_dict, output_file)

with open('poem_model_firstword.json', "w") as output_file2:
    json.dump(firstword_prob_dict, output_file2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment