# coding: utf-8
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys, os
import json
import pickle

# Pick the first unused model number so repeated runs do not
# overwrite the artefacts written by earlier ones.
modelNumber = 7
while 'indices_char_%d.dat' % modelNumber in os.listdir('.'):
    modelNumber += 1
print('ModelNumber=' + str(modelNumber))
# Keep only the first 1/frac of the corpus to bound memory use.
frac = 4
with open('un_speeches/compiled_no_breaks.txt', 'r') as inFile:
    text = inFile.read()
for nFrac in range(frac):
    # Report the character boundaries of each prospective chunk.
    print(nFrac * int(len(text) / frac), (nFrac + 1) * int(len(text) / frac))
text = text[0:int(len(text) / frac)]
print('corpus length:', len(text))
chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
# In[3]:

# Cut the text into semi-redundant sequences of maxlen characters,
# stepping by `step`; next_chars holds the prediction target that
# follows each sequence.
maxlen = 30
step = 5
sentences = [text[i:i + maxlen] for i in range(0, len(text) - maxlen, step)]
next_chars = [text[i] for i in range(maxlen, len(text), step)]
print('Steps: %d/%d' % (len(text), step))
print('Range:' + ','.join(map(str, range(0, len(text), step)[0:5])))
print('nb sequences:', len(sentences))
for i in range(5):
    print(sentences[i])
    print(next_chars[i])
    print('')
print('Vectorization...')
# One-hot encode inputs and targets (np.bool is deprecated in
# recent NumPy, so use the builtin bool).
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
# In[5]:

print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

# In[6]:

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

# In[ ]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
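
# A quick, hypothetical illustration of the temperature parameter:
# rescaling the log-probabilities before renormalising sharpens
# (t < 1) or flattens (t > 1) the distribution. The example values
# below are arbitrary, not drawn from the model.
#   sample(np.array([0.1, 0.2, 0.7]), temperature=0.2)  # almost always 2
#   sample(np.array([0.1, 0.2, 0.7]), temperature=1.2)  # far more varied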
# In[ ]: | |
# train the model, output generated text after each iteration | |
for iteration in range(1, 2): | |
print() | |
print('-' * 50) | |
print('Iteration', iteration) | |
model.fit(X, y, | |
batch_size=128,epochs=1 | |
) | |
# Use fit_generator here to process in chunks | |
# https://keras.io/models/sequential/ | |
start_index = random.randint(0, len(text) - maxlen - 1) | |
for diversity in [0.2, 0.35, 0.5, 0.75, 1.0, 1.2]: | |
print() | |
print('----- diversity:', diversity) | |
generated = '' | |
sentence = text[start_index: start_index + maxlen] | |
generated += sentence | |
print('----- Generating with seed: "' + sentence + '"') | |
sys.stdout.write(generated) | |
for i in range(400): | |
x = np.zeros((1, maxlen, len(chars))) | |
for t, char in enumerate(sentence): | |
x[0, t, char_indices[char]] = 1. | |
preds = model.predict(x, verbose=0)[0] | |
next_index = sample(preds, diversity) | |
next_char = indices_char[next_index] | |
generated += next_char | |
sentence = sentence[1:] + next_char | |
sys.stdout.write(next_char) | |
sys.stdout.flush() | |
print() | |
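
# The fit_generator TODO above could look something like this sketch:
# vectorize batches on the fly so the full one-hot tensor never has
# to sit in memory at once. Untested; batch_size and the generator
# name are illustrative choices, not part of the original run.
def batch_generator(sentences, next_chars, batch_size=128):
    # Keras expects generators to yield (inputs, targets) forever.
    while True:
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            targets = next_chars[start:start + batch_size]
            Xb = np.zeros((len(batch), maxlen, len(chars)), dtype=bool)
            yb = np.zeros((len(batch), len(chars)), dtype=bool)
            for i, sentence in enumerate(batch):
                for t, char in enumerate(sentence):
                    Xb[i, t, char_indices[char]] = 1
                yb[i, char_indices[targets[i]]] = 1
            yield Xb, yb

# Drop-in replacement for the model.fit call above:
# model.fit_generator(batch_generator(sentences, next_chars),
#                     steps_per_epoch=len(sentences) // 128, epochs=1)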
# Persist the model and the character mappings needed to reuse it.
model.save('rnn_7_%d.hdf5' % (modelNumber))
# Pickle files must be opened in binary mode under Python 3.
with open('indices_char_%d.dat' % modelNumber, 'wb') as outFile:
    pickle.dump(indices_char, outFile)
with open('char_indices_%d.dat' % modelNumber, 'wb') as outFile:
    pickle.dump(char_indices, outFile)
with open('config_%d.csv' % modelNumber, 'w') as outFile:
    outFile.write('\n'.join(map(str, [maxlen, step, frac, len(sentences)])))
# Despite the .csv extension, the character list is pickled as well.
with open('chars_%d.csv' % modelNumber, 'wb') as outFile:
    pickle.dump(chars, outFile)
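
# Minimal reload sketch, assuming only the files written above:
# load_model is Keras's counterpart to model.save, and the pickles
# round-trip the character mappings. Shown as a sanity check only.
from keras.models import load_model
reloaded = load_model('rnn_7_%d.hdf5' % modelNumber)
with open('indices_char_%d.dat' % modelNumber, 'rb') as inFile:
    assert pickle.load(inFile) == indices_char
with open('char_indices_%d.dat' % modelNumber, 'rb') as inFile:
    assert pickle.load(inFile) == char_indices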