Skip to content

Instantly share code, notes, and snippets.

@nvg
Created September 14, 2022 01:50
Show Gist options
  • Save nvg/01bd9397038e50a9b2c29f856035edb5 to your computer and use it in GitHub Desktop.
Save nvg/01bd9397038e50a9b2c29f856035edb5 to your computer and use it in GitHub Desktop.
Binary classification with BERT
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
## Dataset is obtained from https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
# Read only the two columns that are used: 'v1' (label) and 'v2' (message text).
# Selecting them by name avoids depending on how many extra columns the CSV has.
# encoding_errors='ignore' drops undecodable bytes instead of raising.
df = pd.read_csv('spam.csv', usecols=['v1', 'v2'], encoding_errors='ignore')
df = df.rename(columns={'v1': 'Category', 'v2': 'Message'})

# Encode labels as integers: ham -> 0, spam -> 1.
# Anything else (missing values, typos) is reported explicitly rather than
# being silently counted as spam.
LABEL_TO_ID = {'ham': 0, 'spam': 1}
unexpected_labels = set(df['Category'].unique()) - set(LABEL_TO_ID)
if unexpected_labels:
    raise ValueError(
        f"Unexpected labels in 'Category' column: {sorted(unexpected_labels, key=str)}"
    )
df['Category'] = df['Category'].map(LABEL_TO_ID).astype(int)
## Train/test split
from sklearn.model_selection import train_test_split

# Fixed seed so the same rows land in train/test on every run.
RANDOM_SEED = 42

# stratify= keeps the ham/spam proportion the same in both splits.
# test_size is not given, so scikit-learn's default split ratio applies.
X_train, X_test, y_train, y_test = train_test_split(
    df['Message'],
    df['Category'],
    stratify=df['Category'],
    random_state=RANDOM_SEED,
)
## BERT
# TF Hub handles: the text preprocessing model and the matching uncased BERT
# encoder (L-12 / H-768 / A-12 variant).
PREP = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
ENCODER = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
bert_preprocess = hub.KerasLayer(PREP)
# NOTE(review): hub.KerasLayer is presumably non-trainable by default, so only
# the Dense head below would be trained -- confirm against the TF Hub docs.
bert_encoder = hub.KerasLayer(ENCODER)

# Bert layers: raw string -> preprocessed encoder inputs -> encoder outputs.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers: dropout on the pooled output, then a single sigmoid
# unit that produces the model's one output value per message.
pooled_output = outputs['pooled_output']
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")
spam_probability = output_layer(dropout_layer(pooled_output))

model = tf.keras.Model(inputs=[text_input], outputs=[spam_probability])
# Training configuration: Adam with its default settings and binary
# cross-entropy loss.
optimizer = tf.keras.optimizers.Adam()
loss_fn = tf.keras.losses.BinaryCrossentropy()

# Metrics reported during fit/evaluate.  The two TensorFlow Addons scores are
# configured for a single class with a 0.5 decision threshold.
eval_metrics = [
    tf.keras.metrics.BinaryAccuracy(),
    tf.keras.metrics.AUC(),
    tfa.metrics.F1Score(num_classes=1, average='macro', threshold=0.5),
    tfa.metrics.FBetaScore(beta=2.0, num_classes=1, average='macro', threshold=0.5),
]

model.compile(
    optimizer=optimizer,
    loss=loss_fn,
    metrics=eval_metrics,
)
# Train on the training split: 5 epochs, 32 messages per batch.
EPOCHS = 5
BATCH_SIZE = 32
model.fit(x=X_train, y=y_train, batch_size=BATCH_SIZE, epochs=EPOCHS)

# Evaluate on the held-out split using the loss and metrics set in compile().
model.evaluate(x=X_test, y=y_test)
# Author's recorded result after 5 epochs:
#   binary_accuracy: 0.9541 - auc: 0.9795 - f1_score: 0.8150 - fbeta_score: 0.7773
# Model outputs for the held-out messages (one sigmoid value per message).
y_predicted = model.predict(X_test)

# Turn the outputs into hard 0/1 labels with a single vectorized comparison
# instead of a per-element Python lambda: values above 0.5 become 1 (spam),
# everything else becomes 0 (ham).
y_predicted = (y_predicted.ravel() > 0.5).astype(int)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_predicted))
# Author's recorded output:
#
#               precision    recall  f1-score   support
#
#            0       0.96      0.99      0.97      1206
#            1       0.89      0.75      0.82       187
#
#     accuracy                           0.95      1393
#    macro avg       0.92      0.87      0.89      1393
# weighted avg       0.95      0.95      0.95      1393
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment