Created
January 1, 2022 22:39
-
-
Save BrentonPoke/a0a3e810171de9ba862d3b5e892d02ae to your computer and use it in GitHub Desktop.
Python script that trains intrusion-detection models on IoT network traffic
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% [markdown] | |
# # IoT Intrusion Detection | |
# | |
# The N-BaIoT dataset contains traffic data for 9 IoT devices. The data comprise both benign traffic and a variety of malicious attacks. Here we train deep neural networks to identify cyberattacks on a Provision PT-737E security camera.
# %% [code] {"jupyter":{"outputs_hidden":false}} | |
import datetime | |
import numpy as np | |
import pandas as pd | |
# %% [code] {"jupyter":{"outputs_hidden":false}}
# Load the N-BaIoT CSVs for devices 8, 7, 1 and 2, pool each traffic class
# across devices, down-sample each class, and tag every row with its class
# name in a 'type' column. Device 7 has no Mirai captures, so the Mirai
# classes pool devices 8, 2 and 1 only.

def _load_class(stem, devices):
    """Read and concatenate 'input/<device>.<stem>.csv' for each device."""
    return pd.concat(
        [pd.read_csv('input/{}.{}.csv'.format(dev, stem)) for dev in devices]
    )

# Device concat order mirrors the original script so row order is unchanged.
_GAFGYT_DEVICES = ('8', '7', '1', '2')
_MIRAI_DEVICES = ('8', '2', '1')

# (class label, csv filename stem, devices, sampling fraction).
# Fractions down-sample the larger attack classes (without replacement) so
# the pooled dataset stays tractable and roughly balanced.
_CLASS_SPECS = [
    ('benign',         'benign',         _GAFGYT_DEVICES, 0.25),
    ('gafgyt_combo',   'gafgyt.combo',   _GAFGYT_DEVICES, 0.25),
    ('gafgyt_junk',    'gafgyt.junk',    _GAFGYT_DEVICES, 0.5),
    ('gafgyt_scan',    'gafgyt.scan',    _GAFGYT_DEVICES, 0.5),
    ('gafgyt_tcp',     'gafgyt.tcp',     _GAFGYT_DEVICES, 0.15),
    ('gafgyt_udp',     'gafgyt.udp',     _GAFGYT_DEVICES, 0.15),
    ('mirai_ack',      'mirai.ack',      _MIRAI_DEVICES,  0.15),
    ('mirai_scan',     'mirai.scan',     _MIRAI_DEVICES,  0.15),
    ('mirai_syn',      'mirai.syn',      _MIRAI_DEVICES,  0.15),
    ('mirai_udp',      'mirai.udp',      _MIRAI_DEVICES,  0.15),
    ('mirai_udpplain', 'mirai.udpplain', _MIRAI_DEVICES,  0.15),
]

# NOTE(review): sampling is unseeded (no random_state), so the dataset is
# not reproducible across runs — same as the original script.
_parts = []
for _label, _stem, _devices, _frac in _CLASS_SPECS:
    _df = _load_class(_stem, _devices).sample(frac=_frac, replace=False)
    _df['type'] = _label
    _parts.append(_df)

data = pd.concat(_parts, axis=0, sort=False, ignore_index=True)
# %% [code] {"jupyter":{"outputs_hidden":false}}
# Class distribution of the pooled dataset.
data.groupby('type')['type'].count()
# %% [code] {"jupyter":{"outputs_hidden":false}}
# Shuffle the rows so the classes are interleaved before splitting.
shuffle_idx = np.random.permutation(len(data))
data = data.take(shuffle_idx)
data.head(20)
# %% [code] {"jupyter":{"outputs_hidden":false}}
# One-hot encode the class labels into a separate frame.
labels_full = pd.get_dummies(data['type'], prefix='type')
labels_full.head()
# %% [code] {"jupyter":{"outputs_hidden":false}}
# Remove the label column from the feature frame.
data = data.drop(columns='type')
data.head()
# %% [code] {"jupyter":{"outputs_hidden":false}}
def standardize(df, col):
    """Standardize df[col] in place to zero mean and unit variance.

    Guards against a zero (or NaN) standard deviation: a constant column
    would otherwise become all-NaN via 0/0 division, which then poisons
    downstream training. Such columns are mapped to 0.0 instead.
    """
    std = df[col].std()  # pandas default is the sample std (ddof=1)
    if std == 0 or pd.isna(std):
        df[col] = 0.0
    else:
        df[col] = (df[col] - df[col].mean()) / std
# Standardize every feature column on a working copy.
# Fix: the original iterated data_st.iloc[:, :-1].columns, silently leaving
# the last feature column unstandardized — a leftover from when the 'type'
# label was still the final column; 'type' was already dropped above, so
# every remaining column is a numeric feature.
data_st = data.copy()
for col in data_st.columns:
    standardize(data_st, col)
data_st.head()
# %% [code] {"jupyter":{"outputs_hidden":false}}
# NumPy matrix of the standardized features fed to the neural net.
train_data_st = data_st.to_numpy()
train_data_st
# %% [code] {"jupyter":{"outputs_hidden":false}}
# NumPy matrix of the one-hot labels used as training targets.
labels = labels_full.to_numpy()
labels
# %% [markdown]
# ### Keras model
# %% [code] {"jupyter":{"outputs_hidden":false}}
# import libraries
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn import metrics
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping

# test/train split, 25% held out for validation
x_train_st, x_test_st, y_train_st, y_test_st = train_test_split(
    train_data_st, labels, test_size=0.25, random_state=42)

# TensorBoard logs go to a timestamped directory per run.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Feed-forward classifier: 10 -> 40 -> 10 hidden ReLU units, then a softmax
# over the one-hot label columns.
# Fixes vs the original:
#  * Removed the Dense(1, kernel_initializer='normal') linear layer that sat
#    before the softmax — it collapsed the representation to a single scalar,
#    crippling an 11-class classifier (a leftover from a regression template).
#  * input_dim is only meaningful on the first layer; dropped elsewhere.
model = Sequential()
model.add(Dense(10, input_dim=train_data_st.shape[1], activation='relu'))
model.add(Dense(40, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(labels.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Stop training once validation loss stops improving for 5 epochs.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3,
                        patience=5, verbose=1, mode='auto')
model.fit(x_train_st, y_train_st, validation_data=(x_test_st, y_test_st),
          callbacks=[monitor, tensorboard_callback], verbose=2, epochs=50)
# %% [code] {"jupyter":{"outputs_hidden":false}}
# Accuracy on the held-out split: argmax of the softmax output compared
# against the argmax of the one-hot ground truth.
probabilities = model.predict(x_test_st)
pred_st = probabilities.argmax(axis=1)
y_eval_st = y_test_st.argmax(axis=1)
score_st = metrics.accuracy_score(y_eval_st, pred_st)
print("accuracy: {}".format(score_st))
# %% [code] {"jupyter":{"outputs_hidden":false}}
# Second, wider model (32 -> 72 -> 32 hidden units, 100 epochs) kept
# commented out for reference.
# NOTE(review): like the first model, it includes a Dense(1) linear layer
# before the softmax, which bottlenecks a multi-class classifier to a single
# scalar — remove that layer if this experiment is revived.
# model2 = Sequential()
# model2.add(Dense(32, input_dim=train_data_st.shape[1], activation='relu'))
# model2.add(Dense(72, input_dim=train_data_st.shape[1], activation='relu'))
# model2.add(Dense(32, input_dim=train_data_st.shape[1], activation='relu'))
# model2.add(Dense(1, kernel_initializer='normal'))
# model2.add(Dense(labels.shape[1],activation='softmax'))
# model2.compile(loss='categorical_crossentropy', optimizer='adam')
# monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3,
# patience=5, verbose=1, mode='auto')
# model2.fit(x_train_st,y_train_st,validation_data=(x_test_st,y_test_st),
# callbacks=[monitor, tensorboard_callback], verbose=2, epochs=100)
#
# # %% [code] {"jupyter":{"outputs_hidden":false}}
# # metrics
# pred_st1 = model2.predict(x_test_st)
# pred_st1 = np.argmax(pred_st1,axis=1)
# y_eval_st1 = np.argmax(y_test_st,axis=1)
# score_st1 = metrics.accuracy_score(y_eval_st1, pred_st1)
# print("accuracy: {}".format(score_st1))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment