Last active
May 7, 2019 08:58
-
-
Save wckdouglas/a49a998b13b8bc4ffe2880cee6c738fb to your computer and use it in GitHub Desktop.
Deep-learning framework modified from DanQ model (a hybrid convolutional and recurrent deep neural network)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# load libraries | |
from keras import backend as K | |
from keras.models import Sequential, model_from_json | |
from keras.layers import Dense, Conv1D,\ | |
Flatten, MaxPool1D, \ | |
Dropout, LSTM, \ | |
Bidirectional | |
from sequencing_tools.fastq_tools import reverse_complement, \ | |
onehot_sequence_encoder | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score | |
def f1_metric(y_true, y_pred): | |
''' | |
https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa | |
''' | |
def recall(y_true, y_pred): | |
"""Recall metric. | |
Only computes a batch-wise average of recall. | |
Computes the recall, a metric for multi-label classification of | |
how many relevant items are selected. | |
""" | |
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) | |
possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) | |
recall = true_positives / (possible_positives + K.epsilon()) | |
return recall | |
def precision(y_true, y_pred): | |
"""Precision metric. | |
Only computes a batch-wise average of precision. | |
Computes the precision, a metric for multi-label classification of | |
how many selected items are relevant. | |
""" | |
true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) | |
predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) | |
precision = true_positives / (predicted_positives + K.epsilon()) | |
return precision | |
precision = precision(y_true, y_pred) | |
recall = recall(y_true, y_pred) | |
return 2*((precision*recall)/(precision+recall+K.epsilon())) | |
# set up onehot sequence encoder | |
acceptable_nuc = list('ACTGN') | |
dna_encoder = onehot_sequence_encoder(''.join(acceptable_nuc)) | |
def generate_data(sequences, labels): | |
''' | |
Wrapper for generating one-hot-encoded sequences | |
''' | |
for i, (seq, label) in enumerate(zip(sequences, labels)): | |
if set(seq).issubset(acceptable_nuc): | |
label = 0 if label == "bound" else 1 | |
yield dna_encoder.transform(seq), label | |
def deep_model(): | |
''' | |
adopted from DanQ: https://www.ncbi.nlm.nih.gov/pubmed/27084946 | |
Convolution layer -> Max Pooling -> Bidirectional LTSM -> Fully connected | |
''' | |
model = Sequential() | |
model.add(Conv1D(filters=160, | |
kernel_size = 26, | |
strides = 1, | |
padding = 'valid', | |
input_shape = (frag_size,5), | |
activation = 'relu')) | |
model.add(MaxPool1D(pool_size=50, strides=13)) | |
model.add(Dropout(0.2)) | |
model.add(Bidirectional(LSTM(64))) | |
model.add(Dropout(0.5)) | |
model.add(Dense(25, activation='relu')) | |
model.add(Dense(1, activation='sigmoid')) | |
model.compile(loss='binary_crossentropy', | |
optimizer='rmsprop', | |
metrics=[f1_metric,'binary_accuracy']) | |
return model | |
# generate data | |
data = [(seq, lab) for seq, lab in generate_data(sequences, labels)] | |
X, y = zip(*data) | |
X = np.array(X) | |
y = np.array(y) | |
# split data from evaluation | |
X_train, X_test, y_train, y_test = train_test_split(X, Y, | |
test_size=0.1, | |
random_state=0) | |
# train model with 25 epochs | |
model = deep_model() | |
model.fit(X_train, | |
y_test, | |
validation_split = 0.1, | |
epochs = 25) | |
# Test on testset | |
y_pred = model.predict_proba(X_test) | |
y_pred = y_pred.flatten() | |
print("Precision: %1.3f" % precision_score(y_test, y_pred.round())) | |
print("Recall: %1.3f" % recall_score(y_test, y_pred.round())) | |
print("F1: %1.3f" % f1_score(y_test, y_pred.round())) | |
print("AUROC: %1.3f" % roc_auc_score(y_test, y_pred)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment