@rootAvish
Forked from syhw/dnn.py
Last active August 29, 2015 14:27

Revisions

  1. @syhw revised this gist Sep 25, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion dnn.py
    @@ -615,7 +615,7 @@ def new_dnn(dropout=False):
         # TODO if you have a big enough GPU, use these:
         #layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
         #layers_sizes=[2000, 2000, 2000, 2000],
    -    #dropout_rates=[0., 0.5, 0.5, 0.5, 0.5],
    +    #dropout_rates=[0.2, 0.5, 0.5, 0.5, 0.5],
         n_outs=n_outs,
         max_norm=4.,
         fast_drop=True,
  2. @syhw revised this gist Sep 25, 2014. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions dnn.py
    @@ -378,7 +378,7 @@ def __init__(self, numpy_rng, theano_rng=None,
         n_ins=40*3,
         layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
         layers_sizes=[4000, 4000, 4000, 4000],
    -    dropout_rates=[0.0, 0.5, 0.5, 0.5, 0.5],
    +    dropout_rates=[0.2, 0.5, 0.5, 0.5, 0.5],
         n_outs=62 * 3,
         rho=0.9,
         eps=1.E-6,
    @@ -611,7 +611,7 @@ def new_dnn(dropout=False):
         return DropoutNet(numpy_rng=numpy_rng, n_ins=n_features,
         layers_types=[ReLU, ReLU, LogisticRegression],
         layers_sizes=[200, 200],
    -    dropout_rates=[0., 0.5, 0.5],
    +    dropout_rates=[0.2, 0.5, 0.5],
         # TODO if you have a big enough GPU, use these:
         #layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
         #layers_sizes=[2000, 2000, 2000, 2000],
  3. @syhw revised this gist Aug 26, 2014. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions dnn.py
    @@ -1,5 +1,7 @@
     """
     A deep neural network with or w/o dropout in one file.
    +License: Do What The Fuck You Want to Public License http://www.wtfpl.net/
     """

    import numpy, theano, sys, math
  4. @syhw created this gist Aug 9, 2014.
    735 changes: 735 additions & 0 deletions dnn.py
    @@ -0,0 +1,735 @@
    """
    A deep neural network with or w/o dropout in one file.
    """

    import numpy, theano, sys, math
    from theano import tensor as T
    from theano import shared
    from theano.tensor.shared_randomstreams import RandomStreams
    from collections import OrderedDict

    BATCH_SIZE = 100


    def relu_f(vec):
        """ Wrapper to quickly change the rectified linear unit function """
        return (vec + abs(vec)) / 2.

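    # Note: (vec + abs(vec)) / 2. is elementwise equal to max(0, vec);
    # T.maximum(0., vec) would be a mathematically equivalent alternative.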

    def dropout(rng, x, p=0.5):
        """ Zero-out random values in x with probability p using rng """
        if p > 0. and p < 1.:
            seed = rng.randint(2 ** 30)
            srng = theano.tensor.shared_randomstreams.RandomStreams(seed)
            mask = srng.binomial(n=1, p=1.-p, size=x.shape,
                                 dtype=theano.config.floatX)
            return x * mask
        return x


    def fast_dropout(rng, x):
        """ Multiply activations by N(1,1) """
        seed = rng.randint(2 ** 30)
        srng = RandomStreams(seed)
        mask = srng.normal(size=x.shape, avg=1., dtype=theano.config.floatX)
        return x * mask

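    # Illustrative sketch only (hypothetical symbolic variables):
    #   rng = numpy.random.RandomState(123)
    #   x = T.fmatrix('x')
    #   dropped = dropout(rng, x, p=0.5)   # binary mask, zeroes ~half the activations
    #   fast = fast_dropout(rng, x)        # multiplicative N(1, 1) noise ("fast dropout")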

    def build_shared_zeros(shape, name):
        """ Builds a theano shared variable filled with a zeros numpy array """
        return shared(value=numpy.zeros(shape, dtype=theano.config.floatX),
                      name=name, borrow=True)


    class Linear(object):
        """ Basic linear transformation layer (W.X + b) """
        def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False):
            if W is None:
                W_values = numpy.asarray(rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)), dtype=theano.config.floatX)
                W_values *= 4  # This works for sigmoid activated networks!
                W = theano.shared(value=W_values, name='W', borrow=True)
            if b is None:
                b = build_shared_zeros((n_out,), 'b')
            self.input = input
            self.W = W
            self.b = b
            self.params = [self.W, self.b]
            self.output = T.dot(self.input, self.W) + self.b
            if fdrop:
                self.output = fast_dropout(rng, self.output)

        def __repr__(self):
            return "Linear"


    class SigmoidLayer(Linear):
        """ Sigmoid activation layer (sigmoid(W.X + b)) """
        def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False):
            super(SigmoidLayer, self).__init__(rng, input, n_in, n_out, W, b)
            self.pre_activation = self.output
            if fdrop:
                self.pre_activation = fast_dropout(rng, self.pre_activation)
            self.output = T.nnet.sigmoid(self.pre_activation)


    class ReLU(Linear):
        """ Rectified Linear Unit activation layer (max(0, W.X + b)) """
        def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False):
            if b is None:
                b = build_shared_zeros((n_out,), 'b')
            super(ReLU, self).__init__(rng, input, n_in, n_out, W, b)
            self.pre_activation = self.output
            if fdrop:
                self.pre_activation = fast_dropout(rng, self.pre_activation)
            self.output = relu_f(self.pre_activation)

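    # Illustrative layer construction (hypothetical sizes):
    #   rng = numpy.random.RandomState(123)
    #   x = T.fmatrix('x')
    #   hidden = ReLU(rng, input=x, n_in=784, n_out=200)
    #   hidden.output is the symbolic activation; hidden.params == [W, b]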

    class DatasetMiniBatchIterator(object):
        """ Basic mini-batch iterator """
        def __init__(self, x, y, batch_size=BATCH_SIZE, randomize=False):
            self.x = x
            self.y = y
            self.batch_size = batch_size
            self.randomize = randomize
            from sklearn.utils import check_random_state
            self.rng = check_random_state(42)

        def __iter__(self):
            n_samples = self.x.shape[0]
            if self.randomize:
                for _ in xrange(n_samples / BATCH_SIZE):
                    if BATCH_SIZE > 1:
                        i = int(self.rng.rand(1) * ((n_samples+BATCH_SIZE-1) / BATCH_SIZE))
                    else:
                        i = int(math.floor(self.rng.rand(1) * n_samples))
                    yield (self.x[i*self.batch_size:(i+1)*self.batch_size],
                           self.y[i*self.batch_size:(i+1)*self.batch_size])
            else:
                for i in xrange((n_samples + self.batch_size - 1)
                                / self.batch_size):
                    yield (self.x[i*self.batch_size:(i+1)*self.batch_size],
                           self.y[i*self.batch_size:(i+1)*self.batch_size])


    class LogisticRegression:
        """Multi-class Logistic Regression
        """
        def __init__(self, rng, input, n_in, n_out, W=None, b=None):
            if W is not None:
                self.W = W
            else:
                self.W = build_shared_zeros((n_in, n_out), 'W')
            if b is not None:
                self.b = b
            else:
                self.b = build_shared_zeros((n_out,), 'b')

            # P(Y|X) = softmax(W.X + b)
            self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
            self.y_pred = T.argmax(self.p_y_given_x, axis=1)
            self.output = self.y_pred
            self.params = [self.W, self.b]

        def negative_log_likelihood(self, y):
            return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

        def negative_log_likelihood_sum(self, y):
            return -T.sum(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

        def training_cost(self, y):
            """ Wrapper for standard name """
            return self.negative_log_likelihood_sum(y)

        def errors(self, y):
            if y.ndim != self.y_pred.ndim:
                raise TypeError("y should have the same shape as self.y_pred",
                                ("y", y.type, "y_pred", self.y_pred.type))
            if y.dtype.startswith('int'):
                return T.mean(T.neq(self.y_pred, y))
            else:
                print("!!! y should be of int type")
                return T.mean(T.neq(self.y_pred, numpy.asarray(y, dtype='int')))

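    # The negative log-likelihood above indexes the softmax output with
    # [T.arange(y.shape[0]), y], i.e. it picks log P(y_i | x_i) for each example i,
    # then averages (mean_cost) or sums (training_cost) over the mini-batch.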

    class NeuralNet(object):
        """ Neural network (not regularized, without dropout) """
        def __init__(self, numpy_rng, theano_rng=None,
                     n_ins=40*3,
                     layers_types=[Linear, ReLU, ReLU, ReLU, LogisticRegression],
                     layers_sizes=[1024, 1024, 1024, 1024],
                     n_outs=62 * 3,
                     rho=0.9,
                     eps=1.E-6,
                     max_norm=0.,
                     debugprint=False):
            """
            Basic feedforward neural network.
            """
            self.layers = []
            self.params = []
            self.n_layers = len(layers_types)
            self.layers_types = layers_types
            assert self.n_layers > 0
            self.max_norm = max_norm
            self._rho = rho  # "momentum" for adadelta
            self._eps = eps  # epsilon for adadelta
            self._accugrads = []  # for adadelta
            self._accudeltas = []  # for adadelta

            if theano_rng is None:
                theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

            self.x = T.fmatrix('x')
            self.y = T.ivector('y')

            self.layers_ins = [n_ins] + layers_sizes
            self.layers_outs = layers_sizes + [n_outs]

            layer_input = self.x

            for layer_type, n_in, n_out in zip(layers_types,
                                               self.layers_ins, self.layers_outs):
                this_layer = layer_type(rng=numpy_rng,
                                        input=layer_input, n_in=n_in, n_out=n_out)
                assert hasattr(this_layer, 'output')
                self.params.extend(this_layer.params)
                self._accugrads.extend([build_shared_zeros(t.shape.eval(),
                                        'accugrad') for t in this_layer.params])
                self._accudeltas.extend([build_shared_zeros(t.shape.eval(),
                                         'accudelta') for t in this_layer.params])

                self.layers.append(this_layer)
                layer_input = this_layer.output

            assert hasattr(self.layers[-1], 'training_cost')
            assert hasattr(self.layers[-1], 'errors')
            # TODO standardize cost
            self.mean_cost = self.layers[-1].negative_log_likelihood(self.y)
            self.cost = self.layers[-1].training_cost(self.y)
            if debugprint:
                theano.printing.debugprint(self.cost)

            self.errors = self.layers[-1].errors(self.y)

        def __repr__(self):
            dimensions_layers_str = map(lambda x: "x".join(map(str, x)),
                                        zip(self.layers_ins, self.layers_outs))
            return "_".join(map(lambda x: "_".join((x[0].__name__, x[1])),
                                zip(self.layers_types, dimensions_layers_str)))


        def get_SGD_trainer(self):
            """ Returns a plain SGD minibatch trainer with learning rate as param.
            """
            batch_x = T.fmatrix('batch_x')
            batch_y = T.ivector('batch_y')
            learning_rate = T.fscalar('lr')  # learning rate to use
            # compute the gradients with respect to the model parameters
            # using mean_cost so that the learning rate is not too dependent
            # on the batch size
            gparams = T.grad(self.mean_cost, self.params)

            # compute list of weights updates
            updates = OrderedDict()
            for param, gparam in zip(self.params, gparams):
                if self.max_norm:
                    W = param - gparam * learning_rate
                    col_norms = W.norm(2, axis=0)
                    desired_norms = T.clip(col_norms, 0, self.max_norm)
                    updates[param] = W * (desired_norms / (1e-6 + col_norms))
                else:
                    updates[param] = param - gparam * learning_rate

            train_fn = theano.function(inputs=[theano.Param(batch_x),
                                               theano.Param(batch_y),
                                               theano.Param(learning_rate)],
                                       outputs=self.mean_cost,
                                       updates=updates,
                                       givens={self.x: batch_x, self.y: batch_y})

            return train_fn

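        # Note on max_norm above: after each step, every column of W is rescaled so
        # its L2 norm never exceeds self.max_norm (a max-norm constraint of the kind
        # used alongside dropout); the 1e-6 guards against division by zero.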

        def get_adagrad_trainer(self):
            """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate.
            """
            batch_x = T.fmatrix('batch_x')
            batch_y = T.ivector('batch_y')
            learning_rate = T.fscalar('lr')  # learning rate to use
            # compute the gradients with respect to the model parameters
            gparams = T.grad(self.mean_cost, self.params)

            # compute list of weights updates
            updates = OrderedDict()
            for accugrad, param, gparam in zip(self._accugrads, self.params, gparams):
                # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
                agrad = accugrad + gparam * gparam
                dx = - (learning_rate / T.sqrt(agrad + self._eps)) * gparam
                if self.max_norm:
                    W = param + dx
                    col_norms = W.norm(2, axis=0)
                    desired_norms = T.clip(col_norms, 0, self.max_norm)
                    updates[param] = W * (desired_norms / (1e-6 + col_norms))
                else:
                    updates[param] = param + dx
                updates[accugrad] = agrad

            train_fn = theano.function(inputs=[theano.Param(batch_x),
                                               theano.Param(batch_y),
                                               theano.Param(learning_rate)],
                                       outputs=self.mean_cost,
                                       updates=updates,
                                       givens={self.x: batch_x, self.y: batch_y})

            return train_fn

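        # Adagrad update as coded above, per parameter:
        #   accugrad_t = accugrad_{t-1} + g_t**2
        #   dx_t       = -lr * g_t / sqrt(accugrad_t + eps)
        # The accumulator only grows, so the effective step size shrinks over training.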
        def get_adadelta_trainer(self):
            """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and
            self._eps params.
            """
            batch_x = T.fmatrix('batch_x')
            batch_y = T.ivector('batch_y')
            # compute the gradients with respect to the model parameters
            gparams = T.grad(self.mean_cost, self.params)

            # compute list of weights updates
            updates = OrderedDict()
            for accugrad, accudelta, param, gparam in zip(self._accugrads,
                    self._accudeltas, self.params, gparams):
                # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
                agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
                dx = - T.sqrt((accudelta + self._eps)
                              / (agrad + self._eps)) * gparam
                updates[accudelta] = (self._rho * accudelta
                                      + (1 - self._rho) * dx * dx)
                if self.max_norm:
                    W = param + dx
                    col_norms = W.norm(2, axis=0)
                    desired_norms = T.clip(col_norms, 0, self.max_norm)
                    updates[param] = W * (desired_norms / (1e-6 + col_norms))
                else:
                    updates[param] = param + dx
                updates[accugrad] = agrad

            train_fn = theano.function(inputs=[theano.Param(batch_x),
                                               theano.Param(batch_y)],
                                       outputs=self.mean_cost,
                                       updates=updates,
                                       givens={self.x: batch_x, self.y: batch_y})

            return train_fn

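        # Adadelta (Zeiler 2012) as coded above, per parameter:
        #   E[g**2]_t  = rho * E[g**2]_{t-1} + (1 - rho) * g_t**2
        #   dx_t       = -sqrt((E[dx**2]_{t-1} + eps) / (E[g**2]_t + eps)) * g_t
        #   E[dx**2]_t = rho * E[dx**2]_{t-1} + (1 - rho) * dx_t**2
        # so no global learning rate is needed, only rho and eps.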
        def score_classif(self, given_set):
            """ Returns a function computing current classification errors. """
            batch_x = T.fmatrix('batch_x')
            batch_y = T.ivector('batch_y')
            score = theano.function(inputs=[theano.Param(batch_x),
                                            theano.Param(batch_y)],
                                    outputs=self.errors,
                                    givens={self.x: batch_x, self.y: batch_y})

            def scoref():
                """ returned function that scans the entire set given as input """
                return [score(batch_x, batch_y) for batch_x, batch_y in given_set]

            return scoref


    class RegularizedNet(NeuralNet):
        """ Neural net with L1 and L2 regularization """
        def __init__(self, numpy_rng, theano_rng=None,
                     n_ins=100,
                     layers_types=[ReLU, ReLU, ReLU, LogisticRegression],
                     layers_sizes=[1024, 1024, 1024],
                     n_outs=2,
                     rho=0.9,
                     eps=1.E-6,
                     L1_reg=0.,
                     L2_reg=0.,
                     max_norm=0.,
                     debugprint=False):
            """
            Feedforward neural network with added L1 and/or L2 regularization.
            """
            super(RegularizedNet, self).__init__(numpy_rng, theano_rng, n_ins,
                    layers_types, layers_sizes, n_outs, rho, eps, max_norm,
                    debugprint)

            L1 = shared(0.)
            for param in self.params:
                L1 += T.sum(abs(param))
            if L1_reg > 0.:
                self.cost = self.cost + L1_reg * L1
            L2 = shared(0.)
            for param in self.params:
                L2 += T.sum(param ** 2)
            if L2_reg > 0.:
                self.cost = self.cost + L2_reg * L2

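    # Illustrative construction (hypothetical sizes), an L2-regularized net:
    #   net = RegularizedNet(numpy.random.RandomState(123), n_ins=100,
    #                        layers_types=[ReLU, LogisticRegression],
    #                        layers_sizes=[256], n_outs=10,
    #                        L1_reg=0., L2_reg=1e-4)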

    class DropoutNet(NeuralNet):
        """ Neural net with dropout (see Hinton et al.'s paper) """
        def __init__(self, numpy_rng, theano_rng=None,
                     n_ins=40*3,
                     layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
                     layers_sizes=[4000, 4000, 4000, 4000],
                     dropout_rates=[0.0, 0.5, 0.5, 0.5, 0.5],
                     n_outs=62 * 3,
                     rho=0.9,
                     eps=1.E-6,
                     max_norm=0.,
                     fast_drop=False,
                     debugprint=False):
            """
            Feedforward neural network with dropout regularization.
            """
            super(DropoutNet, self).__init__(numpy_rng, theano_rng, n_ins,
                    layers_types, layers_sizes, n_outs, rho, eps, max_norm,
                    debugprint)

            self.dropout_rates = dropout_rates
            if fast_drop:
                if dropout_rates[0]:
                    dropout_layer_input = fast_dropout(numpy_rng, self.x)
                else:
                    dropout_layer_input = self.x
            else:
                dropout_layer_input = dropout(numpy_rng, self.x, p=dropout_rates[0])
            self.dropout_layers = []

            for layer, layer_type, n_in, n_out, dr in zip(self.layers,
                    layers_types, self.layers_ins, self.layers_outs,
                    dropout_rates[1:] + [0]):  # !!! we do not dropout anything
                                               # from the last layer !!!
                if dr:
                    if fast_drop:
                        this_layer = layer_type(rng=numpy_rng,
                                input=dropout_layer_input, n_in=n_in, n_out=n_out,
                                W=layer.W, b=layer.b, fdrop=True)
                    else:
                        this_layer = layer_type(rng=numpy_rng,
                                input=dropout_layer_input, n_in=n_in, n_out=n_out,
                                W=layer.W * 1. / (1. - dr),
                                b=layer.b * 1. / (1. - dr))
                        # N.B. dropout with dr == 1 does not drop anything!!
                        this_layer.output = dropout(numpy_rng, this_layer.output, dr)
                else:
                    this_layer = layer_type(rng=numpy_rng,
                            input=dropout_layer_input, n_in=n_in, n_out=n_out,
                            W=layer.W, b=layer.b)

                assert hasattr(this_layer, 'output')
                self.dropout_layers.append(this_layer)
                dropout_layer_input = this_layer.output

            assert hasattr(self.layers[-1], 'training_cost')
            assert hasattr(self.layers[-1], 'errors')
            # these are the dropout costs
            self.mean_cost = self.dropout_layers[-1].negative_log_likelihood(self.y)
            self.cost = self.dropout_layers[-1].training_cost(self.y)

            # these are the non-dropout errors
            self.errors = self.layers[-1].errors(self.y)

        def __repr__(self):
            return super(DropoutNet, self).__repr__() + "\n"\
                + "dropout rates: " + str(self.dropout_rates)

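    # Note on the scaling above: the dropout layers train with W / (1 - dr) and masked
    # outputs, while the mask-free self.layers share the raw W, so self.errors is
    # evaluated on the expected (correctly scaled) activations at test time.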

    def add_fit_and_score(class_to_chg):
        """ Mutates a class to add the fit() and score() functions to a NeuralNet.
        """
        from types import MethodType
        def fit(self, x_train, y_train, x_dev=None, y_dev=None,
                max_epochs=100, early_stopping=True, split_ratio=0.1,
                method='adadelta', verbose=False, plot=False):
            """
            Fits the neural network to `x_train` and `y_train`.
            If `x_dev` or `y_dev` is not given, it will do a `split_ratio` cross-
            validation split on `x_train` and `y_train` (for early stopping).
            """
            import time, copy
            if x_dev is None or y_dev is None:
                from sklearn.cross_validation import train_test_split
                x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train,
                        test_size=split_ratio, random_state=42)
            if method == 'sgd':
                train_fn = self.get_SGD_trainer()
            elif method == 'adagrad':
                train_fn = self.get_adagrad_trainer()
            elif method == 'adadelta':
                train_fn = self.get_adadelta_trainer()
            train_set_iterator = DatasetMiniBatchIterator(x_train, y_train)
            dev_set_iterator = DatasetMiniBatchIterator(x_dev, y_dev)
            train_scoref = self.score_classif(train_set_iterator)
            dev_scoref = self.score_classif(dev_set_iterator)
            best_dev_loss = numpy.inf
            epoch = 0
            # TODO early stopping (not just cross val, also stop training)
            if plot:
                verbose = True
                self._costs = []
                self._train_errors = []
                self._dev_errors = []
                self._updates = []

            while epoch < max_epochs:
                if not verbose:
                    sys.stdout.write("\r%0.2f%%" % (epoch * 100. / max_epochs))
                    sys.stdout.flush()
                avg_costs = []
                timer = time.time()
                for x, y in train_set_iterator:
                    if method == 'sgd' or method == 'adagrad':
                        avg_cost = train_fn(x, y, lr=1.E-2)  # TODO: you have to
                                                             # play with this
                                                             # learning rate
                                                             # (dataset dependent)
                    elif method == 'adadelta':
                        avg_cost = train_fn(x, y)
                    if type(avg_cost) == list:
                        avg_costs.append(avg_cost[0])
                    else:
                        avg_costs.append(avg_cost)
                if verbose:
                    mean_costs = numpy.mean(avg_costs)
                    mean_train_errors = numpy.mean(train_scoref())
                    print(' epoch %i took %f seconds' %
                          (epoch, time.time() - timer))
                    print(' epoch %i, avg costs %f' %
                          (epoch, mean_costs))
                    print(' epoch %i, training error %f' %
                          (epoch, mean_train_errors))
                    if plot:
                        self._costs.append(mean_costs)
                        self._train_errors.append(mean_train_errors)
                dev_errors = numpy.mean(dev_scoref())
                if plot:
                    self._dev_errors.append(dev_errors)
                if dev_errors < best_dev_loss:
                    best_dev_loss = dev_errors
                    best_params = copy.deepcopy(self.params)
                    if verbose:
                        print('!!! epoch %i, validation error of best model %f' %
                              (epoch, dev_errors))
                epoch += 1
            if not verbose:
                print("")
            for i, param in enumerate(best_params):
                self.params[i] = param

        def score(self, x, y):
            """ error rates """
            iterator = DatasetMiniBatchIterator(x, y)
            scoref = self.score_classif(iterator)
            return numpy.mean(scoref())

        class_to_chg.fit = MethodType(fit, None, class_to_chg)
        class_to_chg.score = MethodType(score, None, class_to_chg)

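    # Illustrative usage sketch (hypothetical data shapes), once fit/score are attached:
    #   add_fit_and_score(DropoutNet)
    #   dnn = DropoutNet(numpy.random.RandomState(123), n_ins=784,
    #                    layers_types=[ReLU, LogisticRegression], layers_sizes=[200],
    #                    dropout_rates=[0.2, 0.5], n_outs=10)
    #   dnn.fit(x_train, y_train, max_epochs=20, method='adadelta', verbose=True)
    #   print(dnn.score(x_test, y_test))  # mean classification error rate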

    if __name__ == "__main__":
        add_fit_and_score(DropoutNet)
        add_fit_and_score(RegularizedNet)

        def nudge_dataset(X, Y):
            """
            This produces a dataset 5 times bigger than the original one,
            by moving the 8x8 images in X around by 1px to left, right, down, up
            """
            from scipy.ndimage import convolve
            direction_vectors = [
                [[0, 1, 0],
                 [0, 0, 0],
                 [0, 0, 0]],
                [[0, 0, 0],
                 [1, 0, 0],
                 [0, 0, 0]],
                [[0, 0, 0],
                 [0, 0, 1],
                 [0, 0, 0]],
                [[0, 0, 0],
                 [0, 0, 0],
                 [0, 1, 0]]]
            shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant',
                                          weights=w).ravel()
            X = numpy.concatenate([X] +
                                  [numpy.apply_along_axis(shift, 1, X, vector)
                                   for vector in direction_vectors])
            Y = numpy.concatenate([Y for _ in range(5)], axis=0)
            return X, Y

        from sklearn import datasets, svm, naive_bayes
        from sklearn import cross_validation, preprocessing
        MNIST = True  # MNIST dataset
        DIGITS = False  # digits dataset
        FACES = True  # faces dataset
        TWENTYNEWSGROUPS = False  # 20 newsgroups dataset
        VERBOSE = True  # prints evolution of the loss/accuracy during the fitting
        SCALE = True  # scale the dataset
        PLOT = True  # plot losses and accuracies

        def train_models(x_train, y_train, x_test, y_test, n_features, n_outs,
                         use_dropout=True, n_epochs=100, numpy_rng=None,
                         svms=False, nb=False, deepnn=True, name=''):
            if svms:
                print("Linear SVM")
                classifier = svm.SVC(gamma=0.001)
                print(classifier)
                classifier.fit(x_train, y_train)
                print("score: %f" % classifier.score(x_test, y_test))

                print("RBF-kernel SVM")
                classifier = svm.SVC(kernel='rbf', class_weight='auto')
                print(classifier)
                classifier.fit(x_train, y_train)
                print("score: %f" % classifier.score(x_test, y_test))

            if nb:
                print("Multinomial Naive Bayes")
                classifier = naive_bayes.MultinomialNB()
                print(classifier)
                classifier.fit(x_train, y_train)
                print("score: %f" % classifier.score(x_test, y_test))

            if deepnn:
                import warnings
                warnings.filterwarnings("ignore")  # TODO remove

                if use_dropout:
                    #n_epochs *= 4 TODO
                    pass

                def new_dnn(dropout=False):
                    if dropout:
                        print("Dropout DNN")
                        return DropoutNet(numpy_rng=numpy_rng, n_ins=n_features,
                                layers_types=[ReLU, ReLU, LogisticRegression],
                                layers_sizes=[200, 200],
                                dropout_rates=[0., 0.5, 0.5],
                                # TODO if you have a big enough GPU, use these:
                                #layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
                                #layers_sizes=[2000, 2000, 2000, 2000],
                                #dropout_rates=[0., 0.5, 0.5, 0.5, 0.5],
                                n_outs=n_outs,
                                max_norm=4.,
                                fast_drop=True,
                                debugprint=0)
                    else:
                        print("Simple (regularized) DNN")
                        return RegularizedNet(numpy_rng=numpy_rng, n_ins=n_features,
                                layers_types=[ReLU, ReLU, LogisticRegression],
                                layers_sizes=[200, 200],
                                n_outs=n_outs,
                                #L1_reg=0.001/x_train.shape[0],
                                #L2_reg=0.001/x_train.shape[0],
                                L1_reg=0.,
                                L2_reg=1./x_train.shape[0],
                                debugprint=0)

                import matplotlib.pyplot as plt
                plt.figure()
                ax1 = plt.subplot(221)
                ax2 = plt.subplot(222)
                ax3 = plt.subplot(223)
                ax4 = plt.subplot(224)  # TODO plot the updates of the weights
                methods = ['sgd', 'adagrad', 'adadelta']
                #methods = ['adadelta'] TODO if you want "good" results asap
                for method in methods:
                    dnn = new_dnn(use_dropout)
                    print dnn, "using", method
                    dnn.fit(x_train, y_train, max_epochs=n_epochs, method=method, verbose=VERBOSE, plot=PLOT)
                    test_error = dnn.score(x_test, y_test)
                    print("score: %f" % (1. - test_error))
                    ax1.plot(numpy.log10(dnn._costs), label=method)
                    ax2.plot(numpy.log10(dnn._train_errors), label=method)
                    ax3.plot(numpy.log10(dnn._dev_errors), label=method)
                    #ax2.plot(dnn._train_errors, label=method)
                    #ax3.plot(dnn._dev_errors, label=method)
                    ax4.plot([test_error for _ in range(10)], label=method)
                ax1.set_xlabel('epoch')
                ax1.set_ylabel('cost (log10)')
                ax2.set_xlabel('epoch')
                ax2.set_ylabel('train error')
                ax3.set_xlabel('epoch')
                ax3.set_ylabel('dev error')
                ax4.set_ylabel('test error')
                plt.legend()
                plt.savefig('training_' + name + '.png')


        if MNIST:
            from sklearn.datasets import fetch_mldata
            mnist = fetch_mldata('MNIST original')
            X = numpy.asarray(mnist.data, dtype='float32')
            if SCALE:
                #X = preprocessing.scale(X)
                X /= 255.
            y = numpy.asarray(mnist.target, dtype='int32')
            print("Total dataset size:")
            print("n samples: %d" % X.shape[0])
            print("n features: %d" % X.shape[1])
            print("n classes: %d" % len(set(y)))
            x_train, x_test, y_train, y_test = cross_validation.train_test_split(
                X, y, test_size=0.2, random_state=42)

            train_models(x_train, y_train, x_test, y_test, X.shape[1],
                         len(set(y)), numpy_rng=numpy.random.RandomState(123),
                         name='MNIST')

        if DIGITS:
            digits = datasets.load_digits()
            data = numpy.asarray(digits.data, dtype='float32')
            target = numpy.asarray(digits.target, dtype='int32')
            nudged_x, nudged_y = nudge_dataset(data, target)
            if SCALE:
                nudged_x = preprocessing.scale(nudged_x)
            x_train, x_test, y_train, y_test = cross_validation.train_test_split(
                nudged_x, nudged_y, test_size=0.2, random_state=42)
            train_models(x_train, y_train, x_test, y_test, nudged_x.shape[1],
                         len(set(target)), numpy_rng=numpy.random.RandomState(123),
                         name='digits')

        if FACES:
            import logging
            logging.basicConfig(level=logging.INFO,
                                format='%(asctime)s %(message)s')
            lfw_people = datasets.fetch_lfw_people(min_faces_per_person=70,
                                                   resize=0.4)
            X = numpy.asarray(lfw_people.data, dtype='float32')
            if SCALE:
                X = preprocessing.scale(X)
            y = numpy.asarray(lfw_people.target, dtype='int32')
            target_names = lfw_people.target_names
            print("Total dataset size:")
            print("n samples: %d" % X.shape[0])
            print("n features: %d" % X.shape[1])
            print("n classes: %d" % target_names.shape[0])
            x_train, x_test, y_train, y_test = cross_validation.train_test_split(
                X, y, test_size=0.2, random_state=42)

            train_models(x_train, y_train, x_test, y_test, X.shape[1],
                         len(set(y)), numpy_rng=numpy.random.RandomState(123),
                         name='faces')

        if TWENTYNEWSGROUPS:
            from sklearn.feature_extraction.text import TfidfVectorizer
            newsgroups_train = datasets.fetch_20newsgroups(subset='train')
            vectorizer = TfidfVectorizer(encoding='latin-1', max_features=10000)
            #vectorizer = HashingVectorizer(encoding='latin-1')
            x_train = vectorizer.fit_transform(newsgroups_train.data)
            x_train = numpy.asarray(x_train.todense(), dtype='float32')
            y_train = numpy.asarray(newsgroups_train.target, dtype='int32')
            newsgroups_test = datasets.fetch_20newsgroups(subset='test')
            x_test = vectorizer.transform(newsgroups_test.data)
            x_test = numpy.asarray(x_test.todense(), dtype='float32')
            y_test = numpy.asarray(newsgroups_test.target, dtype='int32')
            train_models(x_train, y_train, x_test, y_test, x_train.shape[1],
                         len(set(y_train)),
                         numpy_rng=numpy.random.RandomState(123),
                         svms=False, nb=True, deepnn=True,
                         name='20newsgroups')