Binary classification with BERT
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_hub as hub
import tensorflow_text as text  # registers the custom ops used by the BERT preprocessing layer
import pandas as pd
## Dataset is obtained from https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
df = pd.read_csv('spam.csv', encoding_errors='ignore')
df.drop(df.columns[2:5], axis=1, inplace=True)  # drop the three unnamed junk columns
df.rename(columns={'v1': 'Category', 'v2': 'Message'}, inplace=True)
df['Category'] = df['Category'].apply(lambda c: 0 if c == 'ham' else 1)  # ham -> 0, spam -> 1
## Train/test split (stratified, so the spam/ham ratio is preserved in both sets)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['Message'], df['Category'], stratify=df['Category'])
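## Quick imbalance check (illustrative sketch): spam is the minority class,
## which is why the split above is stratified and F1/F-beta are tracked below.
print(df['Category'].value_counts(normalize=True))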
## BERT
PREP = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
ENCODER = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
# Optionally warm the TF Hub cache before building the model:
# hub.load(PREP)
# hub.load(ENCODER)
bert_preprocess = hub.KerasLayer(PREP)
bert_encoder = hub.KerasLayer(ENCODER)
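## Sanity check (sketch, with an invented sample string): the preprocessing
## layer maps raw strings to the dict of tensors the encoder expects
## (input_word_ids, input_mask, input_type_ids), each padded/truncated to the
## model's default sequence length of 128.
sample = bert_preprocess(tf.constant(["Free entry in a weekly competition"]))
print({k: v.shape for k, v in sample.items()})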
# BERT layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)
# Classification head: dropout on the pooled [CLS] representation, then a single sigmoid unit
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)
model = tf.keras.Model(inputs=[text_input], outputs=[l])
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy(),
                       tf.keras.metrics.AUC(),
                       tfa.metrics.F1Score(num_classes=1, average='macro', threshold=0.5),
                       # beta=2 weights recall twice as heavily as precision
                       tfa.metrics.FBetaScore(beta=2.0, num_classes=1, average='macro', threshold=0.5)
                      ])
model.fit(X_train, y_train, epochs=5, batch_size=32)
# Evaluate
model.evaluate(X_test, y_test)
# After 5 epochs, I saw: binary_accuracy: 0.9541 - auc: 0.9795 - f1_score: 0.8150 - fbeta_score: 0.7773
y_predicted = (model.predict(X_test).flatten() > 0.5).astype(int)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predicted))
#               precision    recall  f1-score   support
#
#            0       0.96      0.99      0.97      1206
#            1       0.89      0.75      0.82       187
#
#     accuracy                           0.95      1393
#    macro avg       0.92      0.87      0.89      1393
# weighted avg       0.95      0.95      0.95      1393
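## Usage sketch: the model accepts raw strings directly, since tokenization
## happens inside the graph. The two messages below are invented for illustration.
sample_messages = tf.constant([
    "WINNER!! You have been selected for a free cruise. Reply YES to claim.",
    "Hey, are we still on for lunch tomorrow?",
])
for msg, p in zip(sample_messages.numpy(), model.predict(sample_messages).flatten()):
    print(f"{'spam' if p > 0.5 else 'ham'} ({p:.3f}): {msg.decode()}")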