standarderror · September 26, 2016 03:36
diff --git a/Autoencoder.py b/Autoencoder.py
 ### Autoencoder

 ###################################
 ## [1] Parameters & hyperparameters
 ###################################

 # shape of data
 N = shape(X)[1]  # num observation
 D = shape(X)[0]  # num features (dimensionality)

 ## architecture and learning parameters
 h = 200 # size of hidden layer

 # weight vectors are randomly initialised from a uniform distribution
 r = sqrt(6) / sqrt(D + h + 1)
 rand = np.random.RandomState()

 W1 = np.asarray(rand.uniform(low = -r, high = r, size = (h, D)))
 W2 = np.asarray(rand.uniform(low = -r, high = r, size = (D, h)))

 # bias vectors are 0s
 b1 = np.zeros((h, 1))
 b2 = np.zeros((D, 1))

 # hyperparameters
 _lambda = 0.0003 # regularization strength (1 is max)
 rho = 0.1        # sparsity parameters
 beta = 0.3


 ###############################
 ## [2] Define helper functions
 ###############################

 # sigmoid activation function
 def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))

 # unravel theta vector
 def unravel(theta):
    W1 = theta[0 : D*h].reshape(h, D)
    W2 = theta[D*h : 2*D*h].reshape(D, h)
    b1 = theta[2*D*h : 2*D*h+h].reshape(h, 1)
    b2 = theta[2*D*h+h : 2*D*h+h+D].reshape(D, 1)
    return W1, W2, b1, b2

 # ravel theta vector
 def _ravel(W1, W2, b1, b2):
    return numpy.concatenate((W1.flatten(), W2.flatten(), b1.flatten(), b2.flatten()))


 ############################################################
 ## [3] Define backpropagation algorithm to train autoencoder
 ############################################################

 def backpropagate(theta):
    W1, W2, b1, b2 = unravel(theta)

    # Forward propagate to calculate output of hidden layer then output layer
    hidden_layer = sigmoid(np.dot(W1, X) + b1)
    output_layer = sigmoid(np.dot(W2, hidden_layer) + b2)

    # calculate error of outputs
    error = output_layer - X

    ## calculate sparsity penalty term and cost component
    rho_hat = np.sum(hidden_layer, axis = 1) / N   # average activation value of hidden layer
    cost_sparsity = beta * np.sum(rho * np.log(rho / rho_hat) + (1 - rho) * np.log((1 - rho) / (1 - rho_hat)))
    delta_sparsity = beta * (-(rho / rho_hat) + ((1 - rho) / (1 - rho_hat)))

    ## calculate cost
    cost_sq_error = 0.5 * np.sum(np.multiply(error, error)) / N 
    cost_regularisation = 0.5 * _lambda * (np.sum(np.multiply(W1, W1)) + np.sum(np.multiply(W2, W2)))
    cost = cost_sq_error + cost_regularisation + cost_sparsity

    
    ## backpropagate error 
    delta_output = np.multiply(error, np.multiply(output_layer, 1 - output_layer))
    delta_hidden = np.multiply(np.dot(numpy.transpose(W2), delta_output) + np.transpose(np.matrix(delta_sparsity)), 
                             np.multiply(hidden_layer, 1 - hidden_layer))

    ## calculate gradient values
    W1_grad = np.dot(delta_hidden, np.transpose(X))
    W2_grad = np.dot(delta_output, np.transpose(hidden_layer))
    b1_grad = np.sum(delta_hidden, axis = 1)
    b2_grad = np.sum(delta_output, axis = 1)

    ## average and apply regularisation
    W1_grad = W1_grad / N + _lambda * W1 
    W2_grad = W2_grad / N + _lambda * W2 
    b1_grad = b1_grad / N 
    b2_grad = b2_grad / N 
    
    # Ravel weight and bias vectors into a single vector to return to minimise function
    theta_grad = _ravel(np.array(W1_grad), np.array(W2_grad), np.array(b1_grad), np.array(b2_grad))

    return [cost, theta_grad]
diff --git a/plot weights.py b/plot weights.py
 # plot weights
 opt_theta = opt_solution.x
 opt_W1,_,_,_  = unravel(opt_theta)
 weights = [(opt_W1[i,:]/np.sum(opt_W1[i,:]**2)).reshape(sqrt(D), sqrt(D)) for i in xrange(opt_W1.shape[0])]

 for i, w in enumerate(weights):
    plt.subplot(20, 15, i + 1)
    plt.set_cmap('gray')
    plt.axis('off')
    plt.imshow(w)
 plt.subplots_adjust(hspace=0.05, wspace=0.05)
 plt.gcf().set_size_inches(15, 20)
diff --git a/preprocessing.py b/preprocessing.py
 ## Prepare Python & IPython notebook
 %pylab inline

 import numpy as np
 import matplotlib.pyplot as plt
 import scipy.optimize
 import cPickle, gzip

 ## Prepare dataset
 # Import data

 # Load the dataset
 with gzip.open('mnist.pkl.gz', 'rb') as f:   
    train_set, valid_set, test_set = cPickle.load(f)
    
 # grab the training data
 X = train_set[0]

 ## pre-processing
 ## standardise & normalise
 X -= np.mean(X)   # subtract mean
 xstd = np.std(X)   
 # standardise - divide by SD = scales to [-1,1]
 X /= xstd

 # rescale from [-1,1] to [0.1, 0.9] which is optimal for sigmoid function
 rng = X.max(axis=1, keepdims=True) - X.min(axis=1, keepdims=True)
 X = 0.9 - (((0.9 - 0.1) * (X.max(axis=1, keepdims=True) - X)) / rng)

 # transpose to align with our design
 X = X.T
diff --git a/train.py b/train.py
 # combine weights and betas into a single vector for optimisation
 theta = _ravel(W1, W2, b1, b2)

 # Train!
 opt_solution = scipy.optimize.minimize(backpropagate, theta, method = 'L-BFGS-B', jac = True, options = {'maxiter': 1000})
	### Autoencoder

	###################################
	## [1] Parameters & hyperparameters
	###################################

	# shape of data
	N = shape(X)[1] # num observation
	D = shape(X)[0] # num features (dimensionality)

	## architecture and learning parameters
	h = 200 # size of hidden layer

	# weight vectors are randomly initialised from a uniform distribution
	r = sqrt(6) / sqrt(D + h + 1)
	rand = np.random.RandomState()

	W1 = np.asarray(rand.uniform(low = -r, high = r, size = (h, D)))
	W2 = np.asarray(rand.uniform(low = -r, high = r, size = (D, h)))

	# bias vectors are 0s
	b1 = np.zeros((h, 1))
	b2 = np.zeros((D, 1))

	# hyperparameters
	_lambda = 0.0003 # regularization strength (1 is max)
	rho = 0.1 # sparsity parameters
	beta = 0.3


	###############################
	## [2] Define helper functions
	###############################

	# sigmoid activation function
	def sigmoid(x):
	return 1.0 / (1 + np.exp(-x))

	# unravel theta vector
	def unravel(theta):
	W1 = theta[0 : D*h].reshape(h, D)
	W2 = theta[Dh : 2D*h].reshape(D, h)
	b1 = theta[2Dh : 2Dh+h].reshape(h, 1)
	b2 = theta[2Dh+h : 2Dh+h+D].reshape(D, 1)
	return W1, W2, b1, b2

	# ravel theta vector
	def _ravel(W1, W2, b1, b2):
	return numpy.concatenate((W1.flatten(), W2.flatten(), b1.flatten(), b2.flatten()))


	############################################################
	## [3] Define backpropagation algorithm to train autoencoder
	############################################################

	def backpropagate(theta):
	W1, W2, b1, b2 = unravel(theta)

	# Forward propagate to calculate output of hidden layer then output layer
	hidden_layer = sigmoid(np.dot(W1, X) + b1)
	output_layer = sigmoid(np.dot(W2, hidden_layer) + b2)

	# calculate error of outputs
	error = output_layer - X

	## calculate sparsity penalty term and cost component
	rho_hat = np.sum(hidden_layer, axis = 1) / N # average activation value of hidden layer
	cost_sparsity = beta * np.sum(rho * np.log(rho / rho_hat) + (1 - rho) * np.log((1 - rho) / (1 - rho_hat)))
	delta_sparsity = beta * (-(rho / rho_hat) + ((1 - rho) / (1 - rho_hat)))

	## calculate cost
	cost_sq_error = 0.5 * np.sum(np.multiply(error, error)) / N
	cost_regularisation = 0.5 * _lambda * (np.sum(np.multiply(W1, W1)) + np.sum(np.multiply(W2, W2)))
	cost = cost_sq_error + cost_regularisation + cost_sparsity


	## backpropagate error
	delta_output = np.multiply(error, np.multiply(output_layer, 1 - output_layer))
	delta_hidden = np.multiply(np.dot(numpy.transpose(W2), delta_output) + np.transpose(np.matrix(delta_sparsity)),
	np.multiply(hidden_layer, 1 - hidden_layer))

	## calculate gradient values
	W1_grad = np.dot(delta_hidden, np.transpose(X))
	W2_grad = np.dot(delta_output, np.transpose(hidden_layer))
	b1_grad = np.sum(delta_hidden, axis = 1)
	b2_grad = np.sum(delta_output, axis = 1)

	## average and apply regularisation
	W1_grad = W1_grad / N + _lambda * W1
	W2_grad = W2_grad / N + _lambda * W2
	b1_grad = b1_grad / N
	b2_grad = b2_grad / N

	# Ravel weight and bias vectors into a single vector to return to minimise function
	theta_grad = _ravel(np.array(W1_grad), np.array(W2_grad), np.array(b1_grad), np.array(b2_grad))

	return [cost, theta_grad]
	# plot weights
	opt_theta = opt_solution.x
	opt_W1,_,_,_ = unravel(opt_theta)
	weights = [(opt_W1[i,:]/np.sum(opt_W1[i,:]**2)).reshape(sqrt(D), sqrt(D)) for i in xrange(opt_W1.shape[0])]

	for i, w in enumerate(weights):
	plt.subplot(20, 15, i + 1)
	plt.set_cmap('gray')
	plt.axis('off')
	plt.imshow(w)
	plt.subplots_adjust(hspace=0.05, wspace=0.05)
	plt.gcf().set_size_inches(15, 20)
	## Prepare Python & IPython notebook
	%pylab inline

	import numpy as np
	import matplotlib.pyplot as plt
	import scipy.optimize
	import cPickle, gzip

	## Prepare dataset
	# Import data

	# Load the dataset
	with gzip.open('mnist.pkl.gz', 'rb') as f:
	train_set, valid_set, test_set = cPickle.load(f)

	# grab the training data
	X = train_set[0]

	## pre-processing
	## standardise & normalise
	X -= np.mean(X) # subtract mean
	xstd = np.std(X)
	# standardise - divide by SD = scales to [-1,1]
	X /= xstd

	# rescale from [-1,1] to [0.1, 0.9] which is optimal for sigmoid function
	rng = X.max(axis=1, keepdims=True) - X.min(axis=1, keepdims=True)
	X = 0.9 - (((0.9 - 0.1) * (X.max(axis=1, keepdims=True) - X)) / rng)

	# transpose to align with our design
	X = X.T
	# combine weights and betas into a single vector for optimisation
	theta = _ravel(W1, W2, b1, b2)

	# Train!
	opt_solution = scipy.optimize.minimize(backpropagate, theta, method = 'L-BFGS-B', jac = True, options = {'maxiter': 1000})