A neural network from scratch, using only NumPy!
# Adapted from https://github.com/SkalskiP/ILearnDeepLearning.py/blob/master/01_mysteries_of_neural_networks/03_numpy_neural_net/Numpy%20deep%20neural%20network.ipynb
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)
# Define a function to sample a dataset of 2-dimensional points
def get_data(N = 300):
    # Generate a dataset of 2-dimensional points
    X = np.random.randn(N, 2)
    # Assign labels based on the sum of the coordinates
    T = np.where(X[:, 0] + X[:, 1] < 1.0, 0, 1)
    # Turn the labels into an (N, 1) column of 0s and 1s
    Y = np.zeros((N, 1))
    Y[T == 1] = 1
    return X, Y
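
# Note: the labels split the plane along the line x1 + x2 = 1, so the decision
# boundary the network has to learn is linear.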
class NN():
    def __init__(self, architecture):
        self.activations = []
        self.params_values = {}
        self.layers = len(architecture)
        self.grads_momentum = {}
        for i, layer in enumerate(architecture):
            input_size, output_size, activation = layer["input_dim"], layer["output_dim"], layer["activation"]
            self.activations.append(activation)
            self.params_values[f"W{str(i)}"] = np.random.randn(
                output_size, input_size
            ) / np.sqrt(input_size)
            self.params_values[f"b{str(i)}"] = np.zeros((1, output_size))
            self.grads_momentum[f"W{str(i)}"] = np.zeros_like(self.params_values[f"W{str(i)}"])
            self.grads_momentum[f"b{str(i)}"] = np.zeros_like(self.params_values[f"b{str(i)}"])
        self.reset()
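    # Note: each weight matrix above is drawn from a standard normal and scaled by
    # 1 / sqrt(input_size) (LeCun-style initialization), which keeps the scale of
    # the pre-activations roughly constant from layer to layer; biases start at zero.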
    # Reset the gradients and the cache for activations
    def reset(self):
        self.cache = {}
        self.grads = {}
    # Define the ReLU function
    def relu(self, x):
        return np.maximum(0, x)

    # Define the derivative of the ReLU function with respect to its input,
    # applied to the gradient flowing back from the next layer
    def drelu(self, dA, z):
        dA_ = np.copy(dA)
        dA_[z <= 0] = 0
        return dA_

    # Define the sigmoid function
    def sigmoid(self, x):
        return 1. / (1. + np.exp(-x))

    # Define the derivative of the sigmoid function with respect to its input,
    # applied to the gradient flowing back from the next layer
    def dsigmoid(self, dA, z):
        s = self.sigmoid(z)
        return s * (1. - s) * dA
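    # For reference, the derivatives implemented above are
    #   relu'(z)    = 1 if z > 0 else 0        (drelu zeroes dA wherever z <= 0)
    #   sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z))
    # and both drelu and dsigmoid return dA * f'(z), i.e. one chain-rule step.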
    # Define the binary cross-entropy loss
    def bce(self, yhat, y):
        yhat, y = yhat.flatten(), y.flatten()
        cost = -np.mean(y * np.log(yhat + 1e-8) + (1 - y) * np.log(1 - yhat + 1e-8))
        return np.squeeze(cost)

    # Define the derivative of the binary cross-entropy loss with respect to yhat
    def dbce(self, yhat, y):
        return -(y / (yhat + 1e-8) - (1 - y) / (1 - yhat + 1e-8))
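    # For reference, with N samples the loss above is
    #   L = -(1/N) * sum_i [ y_i * log(yhat_i) + (1 - y_i) * log(1 - yhat_i) ]
    # and dbce returns the per-sample derivative
    #   dL_i/dyhat_i = -( y_i / yhat_i - (1 - y_i) / (1 - yhat_i) )
    # The 1e-8 terms guard against log(0) and division by zero.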
    # Define the forward pass of a single linear layer followed by its activation
    def single_forward(self, x, W, b, activation):
        Z = x @ W.T + b
        A = getattr(self, activation)(Z)
        # Return both the activation and the pre-activation so the caller can cache them
        return A, Z
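    # Shapes, for reference: x is (N, input_dim), W is (output_dim, input_dim) and
    # b is (1, output_dim), so Z = x @ W.T + b and A both have shape (N, output_dim).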
    # Compute the full forward pass by going through each layer in the NN
    def forward(self, x):
        A_prev = None
        A_curr = x
        for i in range(self.layers):
            W, b = self.params_values[f"W{str(i)}"], self.params_values[f"b{str(i)}"]
            activation = self.activations[i]
            A_prev = A_curr
            A_curr, Z_curr = self.single_forward(A_prev, W, b, activation)
            # Cache the pre-activation and the layer input for the backward pass
            self.cache[str(i)] = (Z_curr, A_prev)
        return A_curr
    # Backward pass through a single layer
    def single_backward(self, dA_curr, W, Z_curr, A_prev, activation):
        # Normalization constant for the weight and bias gradients
        m = A_prev.shape[1]
        # Propagate the incoming gradient through the derivative of the activation
        dactivation = getattr(self, f"d{activation}")
        dA_curr = dactivation(dA_curr, Z_curr)
        # Compute the gradient with respect to the weights
        dW = np.dot(dA_curr.T, A_prev) / m
        # Compute the gradient with respect to the bias
        db = np.sum(dA_curr, axis = 0, keepdims = True) / m
        # Compute the gradient with respect to the layer input, passed to the previous layer
        dA_curr = np.dot(dA_curr, W)
        return dA_curr, dW, db
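    # For reference, one layer of backpropagation above computes
    #   dZ      = dA * f'(Z)          (via drelu / dsigmoid)
    #   dW      = dZ.T @ A_prev / m   -> shape (output_dim, input_dim)
    #   db      = sum_rows(dZ) / m    -> shape (1, output_dim)
    #   dA_prev = dZ @ W              -> shape (N, input_dim)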
    # Do the full backward step by going through each layer in the NN
    def backward(self, yhat, y):
        # First compute the cost derivative
        dA_curr = self.dbce(yhat, y)
        for i in range(self.layers - 1, -1, -1):
            W = self.params_values[f"W{str(i)}"]
            # Reuse the cached values for the current layer
            Z_curr, A_prev = self.cache[str(i)]
            dA_curr, dW, db = self.single_backward(dA_curr, W, Z_curr, A_prev, self.activations[i])
            self.grads[f"W{str(i)}"] = dW
            self.grads[f"b{str(i)}"] = db
    # Compute the accuracy by thresholding the predictions at 0.5
    def accuracy(self, yhat, y):
        prediction = np.where(yhat > 0.5, 1, 0)
        return np.mean(prediction == y)
    # The full training loop: first the forward pass,
    # then the backward pass,
    # followed by the parameter update step
    def train(self, x, y, learning_rate, epochs, momentum = 0.9, weight_decay = 0.0001):
        losses = []
        accuracies = []
        for _ in range(epochs):
            yhat = self.forward(x)
            loss = self.bce(yhat, y)
            losses.append(loss)
            accuracy = self.accuracy(yhat, y)
            accuracies.append(accuracy)
            self.backward(yhat, y)
            self.update_params(weight_decay, momentum, learning_rate)
        return losses, accuracies
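    # Note: training is full-batch - every epoch runs one forward and one backward
    # pass over the whole dataset followed by a single parameter update.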
    # Update the parameters of the NN using SGD with momentum and weight decay
    def update_params(self, weight_decay, momentum, learning_rate):
        for i in range(self.layers):
            # Weight decay enters simply by adding a scaled copy of each parameter to its gradient
            dW = self.grads[f"W{str(i)}"] + weight_decay * self.params_values[f"W{str(i)}"]
            db = self.grads[f"b{str(i)}"] + weight_decay * self.params_values[f"b{str(i)}"]
            # Update the momentum buffers with the new gradients
            self.grads_momentum[f"W{str(i)}"] = momentum * self.grads_momentum[f"W{str(i)}"] + (1 - momentum) * dW
            self.grads_momentum[f"b{str(i)}"] = momentum * self.grads_momentum[f"b{str(i)}"] + (1 - momentum) * db
            # Finally, update the parameters with the momentum buffer and the learning rate
            self.params_values[f"W{str(i)}"] -= learning_rate * self.grads_momentum[f"W{str(i)}"]
            self.params_values[f"b{str(i)}"] -= learning_rate * self.grads_momentum[f"b{str(i)}"]
        self.reset()
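    # For reference, per parameter tensor theta the update above implements
    #   g     = dL/dtheta + weight_decay * theta     (L2 regularization)
    #   v     = momentum * v + (1 - momentum) * g    (momentum buffer)
    #   theta = theta - learning_rate * v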
X, Y = get_data()
# Plot the dataset, coloured by the labels
plt.scatter(X[:, 0], X[:, 1], c=Y.flatten(), s=50, cmap='RdBu')
plt.savefig('data.png')
plt.close()
nn_architecture = [
    {"input_dim": 2, "output_dim": 4, "activation": "relu"},
    {"input_dim": 4, "output_dim": 6, "activation": "relu"},
    {"input_dim": 6, "output_dim": 6, "activation": "relu"},
    {"input_dim": 6, "output_dim": 4, "activation": "relu"},
    {"input_dim": 4, "output_dim": 1, "activation": "sigmoid"},
]
# Initialize a NN
nn = NN(nn_architecture)
# Train the NN (learning_rate = 0.005, epochs = 1000, momentum = 0.9, weight_decay = 0.1)
losses, accuracies = nn.train(X, Y, 0.005, 1000, 0.9, 0.1)
# Plot the loss over the training epochs
plt.plot(losses)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.savefig('loss.png')
plt.close()
# Plot the accuracy over the training epochs
plt.plot(accuracies)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.savefig('accuracy.png')
plt.close()
# Plot the predictions on the training data, thresholded at 0.5
yhat = nn.forward(X)
yhat = np.where(yhat > 0.5, 1, 0)
plt.scatter(X[:, 0], X[:, 1], c=yhat.flatten(), s=50, cmap='RdBu')
plt.savefig('decision_boundary.png')
plt.close()
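
# Optional extension (not part of the original gist): a minimal sketch that
# visualizes the learned decision boundary on a dense grid rather than only at
# the training points. It assumes the trained `nn` and the dataset `X`, `Y`
# defined above; the file name 'decision_boundary_grid.png' is arbitrary.
xx, yy = np.meshgrid(np.linspace(X[:, 0].min() - 0.5, X[:, 0].max() + 0.5, 200),
                     np.linspace(X[:, 1].min() - 0.5, X[:, 1].max() + 0.5, 200))
grid = np.c_[xx.ravel(), yy.ravel()]
# Run a forward pass over every grid point and reshape the probabilities back to the grid
probs = nn.forward(grid).reshape(xx.shape)
plt.contourf(xx, yy, probs, levels=20, cmap='RdBu', alpha=0.6)
plt.scatter(X[:, 0], X[:, 1], c=Y.flatten(), s=50, cmap='RdBu', edgecolors='k')
plt.savefig('decision_boundary_grid.png')
plt.close()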