A neural network from scratch, using only NumPy!
# Adapted from https://github.com/SkalskiP/ILearnDeepLearning.py/blob/master/01_mysteries_of_neural_networks/03_numpy_neural_net/Numpy%20deep%20neural%20network.ipynb
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)
# Define a function to sample a dataset of 2-dimensional points
def get_data(N = 300):
    # Generate a dataset of 2-dimensional points
    X = np.random.randn(N, 2)
    # Assign labels based on the sum of the coordinates
    T = np.where(X[:, 0] + X[:, 1] < 1.0, 0, 1)
    # Turn the labels into an (N, 1) column of 0s and 1s
    Y = np.zeros((N, 1))
    Y[T == 1] = 1
    return X, Y
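
# Note: the labels split the plane along the line x1 + x2 = 1, so the decision
# boundary the network has to learn is linear.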
class NN():
    def __init__(self, architecture):
        self.activations = []
        self.params_values = {}
        self.layers = len(architecture)
        self.grads_momentum = {}
        for i, layer in enumerate(architecture):
            input_size, output_size, activation = layer["input_dim"], layer["output_dim"], layer["activation"]
            self.activations.append(activation)
            self.params_values[f"W{str(i)}"] = np.random.randn(
                output_size, input_size
            ) / np.sqrt(input_size)
            self.params_values[f"b{str(i)}"] = np.zeros((1, output_size))
            self.grads_momentum[f"W{str(i)}"] = np.zeros_like(self.params_values[f"W{str(i)}"])
            self.grads_momentum[f"b{str(i)}"] = np.zeros_like(self.params_values[f"b{str(i)}"])
        self.reset()
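    # Note: each weight matrix above is drawn from a standard normal and scaled by
    # 1 / sqrt(input_size) (LeCun-style initialization), which keeps the scale of
    # the pre-activations roughly constant from layer to layer; biases start at zero.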
    # Reset the gradients and the cache for activations
    def reset(self):
        self.cache = {}
        self.grads = {}
    # Define the ReLU function
    def relu(self, x):
        return np.maximum(0, x)

    # Define the derivative of the ReLU function with respect to its input,
    # applied to the gradient flowing back from the next layer
    def drelu(self, dA, z):
        dA_ = np.copy(dA)
        dA_[z <= 0] = 0
        return dA_

    # Define the sigmoid function
    def sigmoid(self, x):
        return 1. / (1. + np.exp(-x))

    # Define the derivative of the sigmoid function with respect to its input,
    # applied to the gradient flowing back from the next layer
    def dsigmoid(self, dA, z):
        s = self.sigmoid(z)
        return s * (1. - s) * dA
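    # For reference, the derivatives implemented above are
    #   relu'(z)    = 1 if z > 0 else 0        (drelu zeroes dA wherever z <= 0)
    #   sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z))
    # and both drelu and dsigmoid return dA * f'(z), i.e. one chain-rule step.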
    # Define the binary cross-entropy loss
    def bce(self, yhat, y):
        yhat, y = yhat.flatten(), y.flatten()
        cost = -np.mean(y * np.log(yhat + 1e-8) + (1 - y) * np.log(1 - yhat + 1e-8))
        return np.squeeze(cost)

    # Define the derivative of the binary cross-entropy loss with respect to yhat
    def dbce(self, yhat, y):
        return -(y / (yhat + 1e-8) - (1 - y) / (1 - yhat + 1e-8))
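    # For reference, with N samples the loss above is
    #   L = -(1/N) * sum_i [ y_i * log(yhat_i) + (1 - y_i) * log(1 - yhat_i) ]
    # and dbce returns the per-sample derivative
    #   dL_i/dyhat_i = -( y_i / yhat_i - (1 - y_i) / (1 - yhat_i) )
    # The 1e-8 terms guard against log(0) and division by zero.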
    # Define the forward pass of a single linear layer followed by its activation
    def single_forward(self, x, W, b, activation):
        Z = x @ W.T + b
        A = getattr(self, activation)(Z)
        # Return both the activation and the pre-activation so the caller can cache them
        return A, Z
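    # Shapes, for reference: x is (N, input_dim), W is (output_dim, input_dim) and
    # b is (1, output_dim), so Z = x @ W.T + b and A both have shape (N, output_dim).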
    # Compute the full forward pass by going through each layer in the NN
    def forward(self, x):
        A_prev = None
        A_curr = x
        for i in range(self.layers):
            W, b = self.params_values[f"W{str(i)}"], self.params_values[f"b{str(i)}"]
            activation = self.activations[i]
            A_prev = A_curr
            A_curr, Z_curr = self.single_forward(A_prev, W, b, activation)
            # Cache the pre-activation and the layer input for the backward pass
            self.cache[str(i)] = (Z_curr, A_prev)
        return A_curr
    # Backward pass through a single layer
    def single_backward(self, dA_curr, W, Z_curr, A_prev, activation):
        # Normalization constant for the weight and bias gradients
        m = A_prev.shape[1]
        # Propagate the incoming gradient through the derivative of the activation
        dactivation = getattr(self, f"d{activation}")
        dA_curr = dactivation(dA_curr, Z_curr)
        # Compute the gradient with respect to the weights
        dW = np.dot(dA_curr.T, A_prev) / m
        # Compute the gradient with respect to the bias
        db = np.sum(dA_curr, axis = 0, keepdims = True) / m
        # Compute the gradient with respect to the layer input, passed to the previous layer
        dA_curr = np.dot(dA_curr, W)
        return dA_curr, dW, db
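    # For reference, one layer of backpropagation above computes
    #   dZ      = dA * f'(Z)          (via drelu / dsigmoid)
    #   dW      = dZ.T @ A_prev / m   -> shape (output_dim, input_dim)
    #   db      = sum_rows(dZ) / m    -> shape (1, output_dim)
    #   dA_prev = dZ @ W              -> shape (N, input_dim)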
    # Do the full backward step by going through each layer in the NN
    def backward(self, yhat, y):
        # First compute the cost derivative
        dA_curr = self.dbce(yhat, y)
        for i in range(self.layers - 1, -1, -1):
            W = self.params_values[f"W{str(i)}"]
            # Reuse the cached values for the current layer
            Z_curr, A_prev = self.cache[str(i)]
            dA_curr, dW, db = self.single_backward(dA_curr, W, Z_curr, A_prev, self.activations[i])
            self.grads[f"W{str(i)}"] = dW
            self.grads[f"b{str(i)}"] = db
    # Compute the accuracy by thresholding the predictions at 0.5
    def accuracy(self, yhat, y):
        prediction = np.where(yhat > 0.5, 1, 0)
        return np.mean(prediction == y)
    # The full training loop: first the forward pass,
    # then the backward pass,
    # followed by the parameter update step
    def train(self, x, y, learning_rate, epochs, momentum = 0.9, weight_decay = 0.0001):
        losses = []
        accuracies = []
        for _ in range(epochs):
            yhat = self.forward(x)
            loss = self.bce(yhat, y)
            losses.append(loss)
            accuracy = self.accuracy(yhat, y)
            accuracies.append(accuracy)
            self.backward(yhat, y)
            self.update_params(weight_decay, momentum, learning_rate)
        return losses, accuracies
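    # Note: training is full-batch - every epoch runs one forward and one backward
    # pass over the whole dataset followed by a single parameter update.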
    # Update the parameters of the NN using SGD with momentum and weight decay
    def update_params(self, weight_decay, momentum, learning_rate):
        for i in range(self.layers):
            # Weight decay enters simply by adding a scaled copy of each parameter to its gradient
            dW = self.grads[f"W{str(i)}"] + weight_decay * self.params_values[f"W{str(i)}"]
            db = self.grads[f"b{str(i)}"] + weight_decay * self.params_values[f"b{str(i)}"]
            # Update the momentum buffers with the new gradients
            self.grads_momentum[f"W{str(i)}"] = momentum * self.grads_momentum[f"W{str(i)}"] + (1 - momentum) * dW
            self.grads_momentum[f"b{str(i)}"] = momentum * self.grads_momentum[f"b{str(i)}"] + (1 - momentum) * db
            # Finally, update the parameters with the momentum buffer and the learning rate
            self.params_values[f"W{str(i)}"] -= learning_rate * self.grads_momentum[f"W{str(i)}"]
            self.params_values[f"b{str(i)}"] -= learning_rate * self.grads_momentum[f"b{str(i)}"]
        self.reset()
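    # For reference, per parameter tensor theta the update above implements
    #   g     = dL/dtheta + weight_decay * theta     (L2 regularization)
    #   v     = momentum * v + (1 - momentum) * g    (momentum buffer)
    #   theta = theta - learning_rate * v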
X, Y = get_data()
# Plot the dataset, coloured by the labels
plt.scatter(X[:, 0], X[:, 1], c=Y.flatten(), s=50, cmap='RdBu')
plt.savefig('data.png')
plt.close()
nn_architecture = [
    {"input_dim": 2, "output_dim": 4, "activation": "relu"},
    {"input_dim": 4, "output_dim": 6, "activation": "relu"},
    {"input_dim": 6, "output_dim": 6, "activation": "relu"},
    {"input_dim": 6, "output_dim": 4, "activation": "relu"},
    {"input_dim": 4, "output_dim": 1, "activation": "sigmoid"},
]
# Initialize a NN
nn = NN(nn_architecture)
# Train the NN (learning_rate = 0.005, epochs = 1000, momentum = 0.9, weight_decay = 0.1)
losses, accuracies = nn.train(X, Y, 0.005, 1000, 0.9, 0.1)
# Plot the loss over the training epochs
plt.plot(losses)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.savefig('loss.png')
plt.close()
# Plot the accuracy over the training epochs
plt.plot(accuracies)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.savefig('accuracy.png')
plt.close()
# Plot the predictions on the training data, thresholded at 0.5
yhat = nn.forward(X)
yhat = np.where(yhat > 0.5, 1, 0)
plt.scatter(X[:, 0], X[:, 1], c=yhat.flatten(), s=50, cmap='RdBu')
plt.savefig('decision_boundary.png')
plt.close()
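
# Optional extension (not part of the original gist): a minimal sketch that
# visualizes the learned decision boundary on a dense grid rather than only at
# the training points. It assumes the trained `nn` and the dataset `X`, `Y`
# defined above; the file name 'decision_boundary_grid.png' is arbitrary.
xx, yy = np.meshgrid(np.linspace(X[:, 0].min() - 0.5, X[:, 0].max() + 0.5, 200),
                     np.linspace(X[:, 1].min() - 0.5, X[:, 1].max() + 0.5, 200))
grid = np.c_[xx.ravel(), yy.ravel()]
# Run a forward pass over every grid point and reshape the probabilities back to the grid
probs = nn.forward(grid).reshape(xx.shape)
plt.contourf(xx, yy, probs, levels=20, cmap='RdBu', alpha=0.6)
plt.scatter(X[:, 0], X[:, 1], c=Y.flatten(), s=50, cmap='RdBu', edgecolors='k')
plt.savefig('decision_boundary_grid.png')
plt.close()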