# Utility script imports and a helper for splitting a string on several delimiters.
import pdfminer.high_level
import datetime
import requests
import sys
import os
import re
import unidecode
import collections

def split(delimiters, string, maxsplit=0):
    # Build one regex that matches any of the delimiters and split in a single pass.
    pattern = '|'.join(map(re.escape, delimiters))
    return re.split(pattern, string, maxsplit)
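
A quick usage sketch of split (the delimiter list and input string here are made up for illustration):

# Splits on ';', ',' and spaces in one pass; empty fields are kept by re.split.
print(split([';', ',', ' '], "a;b, c d"))   # -> ['a', 'b', '', 'c', 'd']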

# Helpers: turn raw sentences into padded index arrays and print classifier predictions.
def text_to_index(sentence):
    # Remove punctuation characters except for the apostrophe
    translator = str.maketrans('', '', string.punctuation.replace("'", ''))
    tokens = sentence.translate(translator).lower().split()
    # Index 1 marks the start of a sequence; 2 stands in for out-of-vocabulary tokens.
    return np.array([1] + [word_index[t] if t in word_index else 2 for t in tokens])

def print_predictions(sentences, classifier):
    indexes = [text_to_index(sentence) for sentence in sentences]
    x = sequence.pad_sequences(indexes, maxlen=sentence_size, padding='post')
    length = np.array([min(len(idx), sentence_size) for idx in indexes])
    predict_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={'x': x, 'len': length}, shuffle=False)
    print([p['logistic'][0] for p in classifier.predict(input_fn=predict_input_fn)])
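
A usage sketch: the sentences below are arbitrary examples, and cnn_pretrained_classifier is the Estimator built in the next snippet.

print_predictions(["I absolutely loved this movie",
                   "The plot was dull and predictable"],
                  cnn_pretrained_classifier)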

# CNN classifier whose embedding layer is initialised from the pre-trained GloVe matrix.
def my_initializer(shape=None, dtype=tf.float32, partition_info=None):
    assert dtype is tf.float32
    # Ignore the requested shape and hand back the pre-built GloVe embedding matrix.
    return embedding_matrix

params = {'embedding_initializer': my_initializer}

cnn_pretrained_classifier = tf.estimator.Estimator(
    model_fn=cnn_model_fn,
    model_dir=os.path.join(model_dir, 'cnn_pretrained'),
    params=params)
train_and_evaluate(cnn_pretrained_classifier)

# Start from random uniform values, then overwrite rows with GloVe vectors where available.
embedding_matrix = np.random.uniform(-1, 1, size=(vocab_size, embedding_size))
for w, i in word_index.items():
    v = embeddings.get(w)
    if v is not None and i < vocab_size:
        embedding_matrix[i] = v

# Load pre-trained 50-dimensional GloVe vectors into a word -> vector dict.
embeddings = {}
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.strip().split()
        w = values[0]
        vectors = np.asarray(values[1:], dtype='float32')
        embeddings[w] = vectors
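
A small sanity check on the loaded vectors can help; the token 'movie' is just an example lookup:

print(len(embeddings))            # ~400,000 tokens in the glove.6B vocabulary
print(embeddings['movie'].shape)  # (50,) -- each vector in glove.6B.50d.txt has 50 dimensions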

# LSTM variant: run the embedded sequence through an LSTM and project the final hidden state to a single logit.
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(100)
_, final_states = tf.nn.dynamic_rnn(
    lstm_cell, inputs, sequence_length=features['len'], dtype=tf.float32)
logits = tf.layers.dense(inputs=final_states.h, units=1)
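
For orientation, here is a sketch of how this fragment could be assembled with the embedding layer and classification head from the other snippets into a single model_fn. The signature and exact wiring are assumptions, not the original code:

def lstm_model_fn(features, labels, mode, params):
    # Embed the integer token ids (same call as in the embed_sequence snippet below).
    inputs = tf.contrib.layers.embed_sequence(
        features['x'], vocab_size, embedding_size,
        initializer=params['embedding_initializer'])

    # LSTM over the embedded sequence; only the final hidden state is used.
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(100)
    _, final_states = tf.nn.dynamic_rnn(
        lstm_cell, inputs, sequence_length=features['len'], dtype=tf.float32)
    logits = tf.layers.dense(inputs=final_states.h, units=1)

    # Binary head and Adam training op, as in the head snippet further down.
    head = tf.contrib.estimator.binary_classification_head()
    optimizer = tf.train.AdamOptimizer()

    def _train_op_fn(loss):
        tf.summary.scalar('loss', loss)
        return optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

    return head.create_estimator_spec(
        features=features, mode=mode, labels=labels,
        logits=logits, train_op_fn=_train_op_fn)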

# CNN classifier with a randomly initialised embedding table (no pre-trained vectors).
# Uniform values in [-1, 1); embed_sequence expects an initializer, not a fixed tensor.
initializer = tf.random_uniform_initializer(-1.0, 1.0)

params = {'embedding_initializer': initializer}

cnn_classifier = tf.estimator.Estimator(model_fn=model_fn,
                                        model_dir=os.path.join(model_dir, 'cnn'),
                                        params=params)
train_and_evaluate(cnn_classifier)

# Binary classification head: wires the logits into the loss, eval metrics, and an Adam training op.
head = tf.contrib.estimator.binary_classification_head()
optimizer = tf.train.AdamOptimizer()

def _train_op_fn(loss):
    tf.summary.scalar('loss', loss)
    return optimizer.minimize(
        loss=loss,
        global_step=tf.train.get_global_step())

return head.create_estimator_spec(
    features=features,
    mode=mode,
    labels=labels,
    logits=logits,
    train_op_fn=_train_op_fn)

# Dropout on the embedded inputs (only active in training), followed by a 1-D convolution.
training = (mode == tf.estimator.ModeKeys.TRAIN)
dropout_emb = tf.layers.dropout(inputs=input_layer,
                                rate=0.2,
                                training=training)
conv = tf.layers.conv1d(
    inputs=dropout_emb,
    filters=32,
    kernel_size=3,
    padding="same",
    activation=tf.nn.relu)
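
The snippet stops at the convolution. One plausible way to get from conv to the single logit expected by the classification head is global max pooling followed by a small dense stack; this continuation is an assumption, not part of the original snippet:

# Global max pooling over the time dimension, then dense layers down to one logit.
pool = tf.reduce_max(input_tensor=conv, axis=1)
hidden = tf.layers.dense(inputs=pool, units=250, activation=tf.nn.relu)
dropout_hidden = tf.layers.dropout(inputs=hidden, rate=0.2, training=training)
logits = tf.layers.dense(inputs=dropout_hidden, units=1)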

# Embedding input layer: maps the integer token ids in features['x'] to dense vectors.
input_layer = tf.contrib.layers.embed_sequence(
    features['x'],
    vocab_size,
    embedding_size,
    initializer=params['embedding_initializer'])