@rootAvish
Forked from syhw/dnn.py
Last active August 29, 2015 14:27

Revisions

  1. @syhw revised this gist Sep 25, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion dnn.py
    @@ -615,7 +615,7 @@ def new_dnn(dropout=False):
         # TODO if you have a big enough GPU, use these:
         #layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
         #layers_sizes=[2000, 2000, 2000, 2000],
    -    #dropout_rates=[0., 0.5, 0.5, 0.5, 0.5],
    +    #dropout_rates=[0.2, 0.5, 0.5, 0.5, 0.5],
         n_outs=n_outs,
         max_norm=4.,
         fast_drop=True,
  2. @syhw revised this gist Sep 25, 2014. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions dnn.py
    @@ -378,7 +378,7 @@ def __init__(self, numpy_rng, theano_rng=None,
         n_ins=40*3,
         layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
         layers_sizes=[4000, 4000, 4000, 4000],
    -    dropout_rates=[0.0, 0.5, 0.5, 0.5, 0.5],
    +    dropout_rates=[0.2, 0.5, 0.5, 0.5, 0.5],
         n_outs=62 * 3,
         rho=0.9,
         eps=1.E-6,
    @@ -611,7 +611,7 @@ def new_dnn(dropout=False):
         return DropoutNet(numpy_rng=numpy_rng, n_ins=n_features,
         layers_types=[ReLU, ReLU, LogisticRegression],
         layers_sizes=[200, 200],
    -    dropout_rates=[0., 0.5, 0.5],
    +    dropout_rates=[0.2, 0.5, 0.5],
         # TODO if you have a big enough GPU, use these:
         #layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
         #layers_sizes=[2000, 2000, 2000, 2000],
  3. @syhw revised this gist Aug 26, 2014. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions dnn.py
    @@ -1,5 +1,7 @@
     """
     A deep neural network with or w/o dropout in one file.
    +License: Do What The Fuck You Want to Public License http://www.wtfpl.net/
     """

    import numpy, theano, sys, math
  4. @syhw created this gist Aug 9, 2014.
    735 changes: 735 additions & 0 deletions dnn.py
    @@ -0,0 +1,735 @@
    """
    A deep neural network with or w/o dropout in one file.
    """

    import numpy, theano, sys, math
    from theano import tensor as T
    from theano import shared
    from theano.tensor.shared_randomstreams import RandomStreams
    from collections import OrderedDict

    BATCH_SIZE = 100


    def relu_f(vec):
        """ Wrapper to quickly change the rectified linear unit function """
        return (vec + abs(vec)) / 2.

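    # Note: (vec + abs(vec)) / 2. is elementwise equal to max(0, vec);
    # T.maximum(0., vec) would be a mathematically equivalent alternative.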

    def dropout(rng, x, p=0.5):
        """ Zero-out random values in x with probability p using rng """
        if p > 0. and p < 1.:
            seed = rng.randint(2 ** 30)
            srng = theano.tensor.shared_randomstreams.RandomStreams(seed)
            mask = srng.binomial(n=1, p=1.-p, size=x.shape,
                                 dtype=theano.config.floatX)
            return x * mask
        return x


    def fast_dropout(rng, x):
        """ Multiply activations by N(1,1) """
        seed = rng.randint(2 ** 30)
        srng = RandomStreams(seed)
        mask = srng.normal(size=x.shape, avg=1., dtype=theano.config.floatX)
        return x * mask

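    # Illustrative sketch only (hypothetical symbolic variables):
    #   rng = numpy.random.RandomState(123)
    #   x = T.fmatrix('x')
    #   dropped = dropout(rng, x, p=0.5)   # binary mask, zeroes ~half the activations
    #   fast = fast_dropout(rng, x)        # multiplicative N(1, 1) noise ("fast dropout")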

    def build_shared_zeros(shape, name):
        """ Builds a theano shared variable filled with a zeros numpy array """
        return shared(value=numpy.zeros(shape, dtype=theano.config.floatX),
                      name=name, borrow=True)


    class Linear(object):
        """ Basic linear transformation layer (W.X + b) """
        def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False):
            if W is None:
                W_values = numpy.asarray(rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)), dtype=theano.config.floatX)
                W_values *= 4  # This works for sigmoid activated networks!
                W = theano.shared(value=W_values, name='W', borrow=True)
            if b is None:
                b = build_shared_zeros((n_out,), 'b')
            self.input = input
            self.W = W
            self.b = b
            self.params = [self.W, self.b]
            self.output = T.dot(self.input, self.W) + self.b
            if fdrop:
                self.output = fast_dropout(rng, self.output)

        def __repr__(self):
            return "Linear"


    class SigmoidLayer(Linear):
        """ Sigmoid activation layer (sigmoid(W.X + b)) """
        def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False):
            super(SigmoidLayer, self).__init__(rng, input, n_in, n_out, W, b)
            self.pre_activation = self.output
            if fdrop:
                self.pre_activation = fast_dropout(rng, self.pre_activation)
            self.output = T.nnet.sigmoid(self.pre_activation)


    class ReLU(Linear):
        """ Rectified Linear Unit activation layer (max(0, W.X + b)) """
        def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False):
            if b is None:
                b = build_shared_zeros((n_out,), 'b')
            super(ReLU, self).__init__(rng, input, n_in, n_out, W, b)
            self.pre_activation = self.output
            if fdrop:
                self.pre_activation = fast_dropout(rng, self.pre_activation)
            self.output = relu_f(self.pre_activation)

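    # Illustrative layer construction (hypothetical sizes):
    #   rng = numpy.random.RandomState(123)
    #   x = T.fmatrix('x')
    #   hidden = ReLU(rng, input=x, n_in=784, n_out=200)
    #   hidden.output is the symbolic activation; hidden.params == [W, b]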

    class DatasetMiniBatchIterator(object):
        """ Basic mini-batch iterator """
        def __init__(self, x, y, batch_size=BATCH_SIZE, randomize=False):
            self.x = x
            self.y = y
            self.batch_size = batch_size
            self.randomize = randomize
            from sklearn.utils import check_random_state
            self.rng = check_random_state(42)

        def __iter__(self):
            n_samples = self.x.shape[0]
            if self.randomize:
                for _ in xrange(n_samples / BATCH_SIZE):
                    if BATCH_SIZE > 1:
                        i = int(self.rng.rand(1) * ((n_samples+BATCH_SIZE-1) / BATCH_SIZE))
                    else:
                        i = int(math.floor(self.rng.rand(1) * n_samples))
                    yield (self.x[i*self.batch_size:(i+1)*self.batch_size],
                           self.y[i*self.batch_size:(i+1)*self.batch_size])
            else:
                for i in xrange((n_samples + self.batch_size - 1)
                                / self.batch_size):
                    yield (self.x[i*self.batch_size:(i+1)*self.batch_size],
                           self.y[i*self.batch_size:(i+1)*self.batch_size])


    class LogisticRegression:
        """Multi-class Logistic Regression
        """
        def __init__(self, rng, input, n_in, n_out, W=None, b=None):
            if W is not None:
                self.W = W
            else:
                self.W = build_shared_zeros((n_in, n_out), 'W')
            if b is not None:
                self.b = b
            else:
                self.b = build_shared_zeros((n_out,), 'b')

            # P(Y|X) = softmax(W.X + b)
            self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
            self.y_pred = T.argmax(self.p_y_given_x, axis=1)
            self.output = self.y_pred
            self.params = [self.W, self.b]

        def negative_log_likelihood(self, y):
            return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

        def negative_log_likelihood_sum(self, y):
            return -T.sum(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

        def training_cost(self, y):
            """ Wrapper for standard name """
            return self.negative_log_likelihood_sum(y)

        def errors(self, y):
            if y.ndim != self.y_pred.ndim:
                raise TypeError("y should have the same shape as self.y_pred",
                                ("y", y.type, "y_pred", self.y_pred.type))
            if y.dtype.startswith('int'):
                return T.mean(T.neq(self.y_pred, y))
            else:
                print("!!! y should be of int type")
                return T.mean(T.neq(self.y_pred, numpy.asarray(y, dtype='int')))

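    # The negative log-likelihood above indexes the softmax output with
    # [T.arange(y.shape[0]), y], i.e. it picks log P(y_i | x_i) for each example i,
    # then averages (mean_cost) or sums (training_cost) over the mini-batch.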

    class NeuralNet(object):
        """ Neural network (not regularized, without dropout) """
        def __init__(self, numpy_rng, theano_rng=None,
                     n_ins=40*3,
                     layers_types=[Linear, ReLU, ReLU, ReLU, LogisticRegression],
                     layers_sizes=[1024, 1024, 1024, 1024],
                     n_outs=62 * 3,
                     rho=0.9,
                     eps=1.E-6,
                     max_norm=0.,
                     debugprint=False):
            """
            Basic feedforward neural network.
            """
            self.layers = []
            self.params = []
            self.n_layers = len(layers_types)
            self.layers_types = layers_types
            assert self.n_layers > 0
            self.max_norm = max_norm
            self._rho = rho  # "momentum" for adadelta
            self._eps = eps  # epsilon for adadelta
            self._accugrads = []  # for adadelta
            self._accudeltas = []  # for adadelta

            if theano_rng is None:
                theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

            self.x = T.fmatrix('x')
            self.y = T.ivector('y')

            self.layers_ins = [n_ins] + layers_sizes
            self.layers_outs = layers_sizes + [n_outs]

            layer_input = self.x

            for layer_type, n_in, n_out in zip(layers_types,
                                               self.layers_ins, self.layers_outs):
                this_layer = layer_type(rng=numpy_rng,
                                        input=layer_input, n_in=n_in, n_out=n_out)
                assert hasattr(this_layer, 'output')
                self.params.extend(this_layer.params)
                self._accugrads.extend([build_shared_zeros(t.shape.eval(),
                                        'accugrad') for t in this_layer.params])
                self._accudeltas.extend([build_shared_zeros(t.shape.eval(),
                                         'accudelta') for t in this_layer.params])

                self.layers.append(this_layer)
                layer_input = this_layer.output

            assert hasattr(self.layers[-1], 'training_cost')
            assert hasattr(self.layers[-1], 'errors')
            # TODO standardize cost
            self.mean_cost = self.layers[-1].negative_log_likelihood(self.y)
            self.cost = self.layers[-1].training_cost(self.y)
            if debugprint:
                theano.printing.debugprint(self.cost)

            self.errors = self.layers[-1].errors(self.y)

        def __repr__(self):
            dimensions_layers_str = map(lambda x: "x".join(map(str, x)),
                                        zip(self.layers_ins, self.layers_outs))
            return "_".join(map(lambda x: "_".join((x[0].__name__, x[1])),
                                zip(self.layers_types, dimensions_layers_str)))


        def get_SGD_trainer(self):
            """ Returns a plain SGD minibatch trainer with learning rate as param.
            """
            batch_x = T.fmatrix('batch_x')
            batch_y = T.ivector('batch_y')
            learning_rate = T.fscalar('lr')  # learning rate to use
            # compute the gradients with respect to the model parameters
            # using mean_cost so that the learning rate is not too dependent
            # on the batch size
            gparams = T.grad(self.mean_cost, self.params)

            # compute list of weights updates
            updates = OrderedDict()
            for param, gparam in zip(self.params, gparams):
                if self.max_norm:
                    W = param - gparam * learning_rate
                    col_norms = W.norm(2, axis=0)
                    desired_norms = T.clip(col_norms, 0, self.max_norm)
                    updates[param] = W * (desired_norms / (1e-6 + col_norms))
                else:
                    updates[param] = param - gparam * learning_rate

            train_fn = theano.function(inputs=[theano.Param(batch_x),
                                               theano.Param(batch_y),
                                               theano.Param(learning_rate)],
                                       outputs=self.mean_cost,
                                       updates=updates,
                                       givens={self.x: batch_x, self.y: batch_y})

            return train_fn

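        # Note on max_norm above: after each step, every column of W is rescaled so
        # its L2 norm never exceeds self.max_norm (a max-norm constraint of the kind
        # used alongside dropout); the 1e-6 guards against division by zero.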

        def get_adagrad_trainer(self):
            """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate.
            """
            batch_x = T.fmatrix('batch_x')
            batch_y = T.ivector('batch_y')
            learning_rate = T.fscalar('lr')  # learning rate to use
            # compute the gradients with respect to the model parameters
            gparams = T.grad(self.mean_cost, self.params)

            # compute list of weights updates
            updates = OrderedDict()
            for accugrad, param, gparam in zip(self._accugrads, self.params, gparams):
                # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
                agrad = accugrad + gparam * gparam
                dx = - (learning_rate / T.sqrt(agrad + self._eps)) * gparam
                if self.max_norm:
                    W = param + dx
                    col_norms = W.norm(2, axis=0)
                    desired_norms = T.clip(col_norms, 0, self.max_norm)
                    updates[param] = W * (desired_norms / (1e-6 + col_norms))
                else:
                    updates[param] = param + dx
                updates[accugrad] = agrad

            train_fn = theano.function(inputs=[theano.Param(batch_x),
                                               theano.Param(batch_y),
                                               theano.Param(learning_rate)],
                                       outputs=self.mean_cost,
                                       updates=updates,
                                       givens={self.x: batch_x, self.y: batch_y})

            return train_fn

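        # Adagrad update as coded above, per parameter:
        #   accugrad_t = accugrad_{t-1} + g_t**2
        #   dx_t       = -lr * g_t / sqrt(accugrad_t + eps)
        # The accumulator only grows, so the effective step size shrinks over training.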
        def get_adadelta_trainer(self):
            """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and
            self._eps params.
            """
            batch_x = T.fmatrix('batch_x')
            batch_y = T.ivector('batch_y')
            # compute the gradients with respect to the model parameters
            gparams = T.grad(self.mean_cost, self.params)

            # compute list of weights updates
            updates = OrderedDict()
            for accugrad, accudelta, param, gparam in zip(self._accugrads,
                    self._accudeltas, self.params, gparams):
                # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
                agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
                dx = - T.sqrt((accudelta + self._eps)
                              / (agrad + self._eps)) * gparam
                updates[accudelta] = (self._rho * accudelta
                                      + (1 - self._rho) * dx * dx)
                if self.max_norm:
                    W = param + dx
                    col_norms = W.norm(2, axis=0)
                    desired_norms = T.clip(col_norms, 0, self.max_norm)
                    updates[param] = W * (desired_norms / (1e-6 + col_norms))
                else:
                    updates[param] = param + dx
                updates[accugrad] = agrad

            train_fn = theano.function(inputs=[theano.Param(batch_x),
                                               theano.Param(batch_y)],
                                       outputs=self.mean_cost,
                                       updates=updates,
                                       givens={self.x: batch_x, self.y: batch_y})

            return train_fn

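        # Adadelta (Zeiler 2012) as coded above, per parameter:
        #   E[g**2]_t  = rho * E[g**2]_{t-1} + (1 - rho) * g_t**2
        #   dx_t       = -sqrt((E[dx**2]_{t-1} + eps) / (E[g**2]_t + eps)) * g_t
        #   E[dx**2]_t = rho * E[dx**2]_{t-1} + (1 - rho) * dx_t**2
        # so no global learning rate is needed, only rho and eps.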
        def score_classif(self, given_set):
            """ Returns a function computing current classification errors. """
            batch_x = T.fmatrix('batch_x')
            batch_y = T.ivector('batch_y')
            score = theano.function(inputs=[theano.Param(batch_x),
                                            theano.Param(batch_y)],
                                    outputs=self.errors,
                                    givens={self.x: batch_x, self.y: batch_y})

            def scoref():
                """ returned function that scans the entire set given as input """
                return [score(batch_x, batch_y) for batch_x, batch_y in given_set]

            return scoref


    class RegularizedNet(NeuralNet):
        """ Neural net with L1 and L2 regularization """
        def __init__(self, numpy_rng, theano_rng=None,
                     n_ins=100,
                     layers_types=[ReLU, ReLU, ReLU, LogisticRegression],
                     layers_sizes=[1024, 1024, 1024],
                     n_outs=2,
                     rho=0.9,
                     eps=1.E-6,
                     L1_reg=0.,
                     L2_reg=0.,
                     max_norm=0.,
                     debugprint=False):
            """
            Feedforward neural network with added L1 and/or L2 regularization.
            """
            super(RegularizedNet, self).__init__(numpy_rng, theano_rng, n_ins,
                    layers_types, layers_sizes, n_outs, rho, eps, max_norm,
                    debugprint)

            L1 = shared(0.)
            for param in self.params:
                L1 += T.sum(abs(param))
            if L1_reg > 0.:
                self.cost = self.cost + L1_reg * L1
            L2 = shared(0.)
            for param in self.params:
                L2 += T.sum(param ** 2)
            if L2_reg > 0.:
                self.cost = self.cost + L2_reg * L2

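    # Illustrative construction (hypothetical sizes), an L2-regularized net:
    #   net = RegularizedNet(numpy.random.RandomState(123), n_ins=100,
    #                        layers_types=[ReLU, LogisticRegression],
    #                        layers_sizes=[256], n_outs=10,
    #                        L1_reg=0., L2_reg=1e-4)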

    class DropoutNet(NeuralNet):
        """ Neural net with dropout (see Hinton et al.'s paper) """
        def __init__(self, numpy_rng, theano_rng=None,
                     n_ins=40*3,
                     layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
                     layers_sizes=[4000, 4000, 4000, 4000],
                     dropout_rates=[0.0, 0.5, 0.5, 0.5, 0.5],
                     n_outs=62 * 3,
                     rho=0.9,
                     eps=1.E-6,
                     max_norm=0.,
                     fast_drop=False,
                     debugprint=False):
            """
            Feedforward neural network with dropout regularization.
            """
            super(DropoutNet, self).__init__(numpy_rng, theano_rng, n_ins,
                    layers_types, layers_sizes, n_outs, rho, eps, max_norm,
                    debugprint)

            self.dropout_rates = dropout_rates
            if fast_drop:
                if dropout_rates[0]:
                    dropout_layer_input = fast_dropout(numpy_rng, self.x)
                else:
                    dropout_layer_input = self.x
            else:
                dropout_layer_input = dropout(numpy_rng, self.x, p=dropout_rates[0])
            self.dropout_layers = []

            for layer, layer_type, n_in, n_out, dr in zip(self.layers,
                    layers_types, self.layers_ins, self.layers_outs,
                    dropout_rates[1:] + [0]):  # !!! we do not dropout anything
                                               # from the last layer !!!
                if dr:
                    if fast_drop:
                        this_layer = layer_type(rng=numpy_rng,
                                input=dropout_layer_input, n_in=n_in, n_out=n_out,
                                W=layer.W, b=layer.b, fdrop=True)
                    else:
                        this_layer = layer_type(rng=numpy_rng,
                                input=dropout_layer_input, n_in=n_in, n_out=n_out,
                                W=layer.W * 1. / (1. - dr),
                                b=layer.b * 1. / (1. - dr))
                        # N.B. dropout with dr == 1 does not drop anything!!
                        this_layer.output = dropout(numpy_rng, this_layer.output, dr)
                else:
                    this_layer = layer_type(rng=numpy_rng,
                            input=dropout_layer_input, n_in=n_in, n_out=n_out,
                            W=layer.W, b=layer.b)

                assert hasattr(this_layer, 'output')
                self.dropout_layers.append(this_layer)
                dropout_layer_input = this_layer.output

            assert hasattr(self.layers[-1], 'training_cost')
            assert hasattr(self.layers[-1], 'errors')
            # these are the dropout costs
            self.mean_cost = self.dropout_layers[-1].negative_log_likelihood(self.y)
            self.cost = self.dropout_layers[-1].training_cost(self.y)

            # these are the non-dropout errors
            self.errors = self.layers[-1].errors(self.y)

        def __repr__(self):
            return super(DropoutNet, self).__repr__() + "\n"\
                + "dropout rates: " + str(self.dropout_rates)

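    # Note on the scaling above: the dropout layers train with W / (1 - dr) and masked
    # outputs, while the mask-free self.layers share the raw W, so self.errors is
    # evaluated on the expected (correctly scaled) activations at test time.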

    def add_fit_and_score(class_to_chg):
        """ Mutates a class to add the fit() and score() functions to a NeuralNet.
        """
        from types import MethodType
        def fit(self, x_train, y_train, x_dev=None, y_dev=None,
                max_epochs=100, early_stopping=True, split_ratio=0.1,
                method='adadelta', verbose=False, plot=False):
            """
            Fits the neural network to `x_train` and `y_train`.
            If `x_dev` or `y_dev` is not given, it will do a `split_ratio` cross-
            validation split on `x_train` and `y_train` (for early stopping).
            """
            import time, copy
            if x_dev is None or y_dev is None:
                from sklearn.cross_validation import train_test_split
                x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train,
                        test_size=split_ratio, random_state=42)
            if method == 'sgd':
                train_fn = self.get_SGD_trainer()
            elif method == 'adagrad':
                train_fn = self.get_adagrad_trainer()
            elif method == 'adadelta':
                train_fn = self.get_adadelta_trainer()
            train_set_iterator = DatasetMiniBatchIterator(x_train, y_train)
            dev_set_iterator = DatasetMiniBatchIterator(x_dev, y_dev)
            train_scoref = self.score_classif(train_set_iterator)
            dev_scoref = self.score_classif(dev_set_iterator)
            best_dev_loss = numpy.inf
            epoch = 0
            # TODO early stopping (not just cross val, also stop training)
            if plot:
                verbose = True
                self._costs = []
                self._train_errors = []
                self._dev_errors = []
                self._updates = []

            while epoch < max_epochs:
                if not verbose:
                    sys.stdout.write("\r%0.2f%%" % (epoch * 100. / max_epochs))
                    sys.stdout.flush()
                avg_costs = []
                timer = time.time()
                for x, y in train_set_iterator:
                    if method == 'sgd' or method == 'adagrad':
                        avg_cost = train_fn(x, y, lr=1.E-2)  # TODO: you have to
                                                             # play with this
                                                             # learning rate
                                                             # (dataset dependent)
                    elif method == 'adadelta':
                        avg_cost = train_fn(x, y)
                    if type(avg_cost) == list:
                        avg_costs.append(avg_cost[0])
                    else:
                        avg_costs.append(avg_cost)
                if verbose:
                    mean_costs = numpy.mean(avg_costs)
                    mean_train_errors = numpy.mean(train_scoref())
                    print(' epoch %i took %f seconds' %
                          (epoch, time.time() - timer))
                    print(' epoch %i, avg costs %f' %
                          (epoch, mean_costs))
                    print(' epoch %i, training error %f' %
                          (epoch, mean_train_errors))
                    if plot:
                        self._costs.append(mean_costs)
                        self._train_errors.append(mean_train_errors)
                dev_errors = numpy.mean(dev_scoref())
                if plot:
                    self._dev_errors.append(dev_errors)
                if dev_errors < best_dev_loss:
                    best_dev_loss = dev_errors
                    best_params = copy.deepcopy(self.params)
                    if verbose:
                        print('!!! epoch %i, validation error of best model %f' %
                              (epoch, dev_errors))
                epoch += 1
            if not verbose:
                print("")
            for i, param in enumerate(best_params):
                self.params[i] = param

        def score(self, x, y):
            """ error rates """
            iterator = DatasetMiniBatchIterator(x, y)
            scoref = self.score_classif(iterator)
            return numpy.mean(scoref())

        class_to_chg.fit = MethodType(fit, None, class_to_chg)
        class_to_chg.score = MethodType(score, None, class_to_chg)

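    # Illustrative usage sketch (hypothetical data shapes), once fit/score are attached:
    #   add_fit_and_score(DropoutNet)
    #   dnn = DropoutNet(numpy.random.RandomState(123), n_ins=784,
    #                    layers_types=[ReLU, LogisticRegression], layers_sizes=[200],
    #                    dropout_rates=[0.2, 0.5], n_outs=10)
    #   dnn.fit(x_train, y_train, max_epochs=20, method='adadelta', verbose=True)
    #   print(dnn.score(x_test, y_test))  # mean classification error rate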

    if __name__ == "__main__":
        add_fit_and_score(DropoutNet)
        add_fit_and_score(RegularizedNet)

        def nudge_dataset(X, Y):
            """
            This produces a dataset 5 times bigger than the original one,
            by moving the 8x8 images in X around by 1px to left, right, down, up
            """
            from scipy.ndimage import convolve
            direction_vectors = [
                [[0, 1, 0],
                 [0, 0, 0],
                 [0, 0, 0]],
                [[0, 0, 0],
                 [1, 0, 0],
                 [0, 0, 0]],
                [[0, 0, 0],
                 [0, 0, 1],
                 [0, 0, 0]],
                [[0, 0, 0],
                 [0, 0, 0],
                 [0, 1, 0]]]
            shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant',
                                          weights=w).ravel()
            X = numpy.concatenate([X] +
                                  [numpy.apply_along_axis(shift, 1, X, vector)
                                   for vector in direction_vectors])
            Y = numpy.concatenate([Y for _ in range(5)], axis=0)
            return X, Y

        from sklearn import datasets, svm, naive_bayes
        from sklearn import cross_validation, preprocessing
        MNIST = True  # MNIST dataset
        DIGITS = False  # digits dataset
        FACES = True  # faces dataset
        TWENTYNEWSGROUPS = False  # 20 newsgroups dataset
        VERBOSE = True  # prints evolution of the loss/accuracy during the fitting
        SCALE = True  # scale the dataset
        PLOT = True  # plot losses and accuracies

        def train_models(x_train, y_train, x_test, y_test, n_features, n_outs,
                         use_dropout=True, n_epochs=100, numpy_rng=None,
                         svms=False, nb=False, deepnn=True, name=''):
            if svms:
                print("Linear SVM")
                classifier = svm.SVC(gamma=0.001)
                print(classifier)
                classifier.fit(x_train, y_train)
                print("score: %f" % classifier.score(x_test, y_test))

                print("RBF-kernel SVM")
                classifier = svm.SVC(kernel='rbf', class_weight='auto')
                print(classifier)
                classifier.fit(x_train, y_train)
                print("score: %f" % classifier.score(x_test, y_test))

            if nb:
                print("Multinomial Naive Bayes")
                classifier = naive_bayes.MultinomialNB()
                print(classifier)
                classifier.fit(x_train, y_train)
                print("score: %f" % classifier.score(x_test, y_test))

            if deepnn:
                import warnings
                warnings.filterwarnings("ignore")  # TODO remove

                if use_dropout:
                    #n_epochs *= 4 TODO
                    pass

                def new_dnn(dropout=False):
                    if dropout:
                        print("Dropout DNN")
                        return DropoutNet(numpy_rng=numpy_rng, n_ins=n_features,
                                layers_types=[ReLU, ReLU, LogisticRegression],
                                layers_sizes=[200, 200],
                                dropout_rates=[0., 0.5, 0.5],
                                # TODO if you have a big enough GPU, use these:
                                #layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
                                #layers_sizes=[2000, 2000, 2000, 2000],
                                #dropout_rates=[0., 0.5, 0.5, 0.5, 0.5],
                                n_outs=n_outs,
                                max_norm=4.,
                                fast_drop=True,
                                debugprint=0)
                    else:
                        print("Simple (regularized) DNN")
                        return RegularizedNet(numpy_rng=numpy_rng, n_ins=n_features,
                                layers_types=[ReLU, ReLU, LogisticRegression],
                                layers_sizes=[200, 200],
                                n_outs=n_outs,
                                #L1_reg=0.001/x_train.shape[0],
                                #L2_reg=0.001/x_train.shape[0],
                                L1_reg=0.,
                                L2_reg=1./x_train.shape[0],
                                debugprint=0)

                import matplotlib.pyplot as plt
                plt.figure()
                ax1 = plt.subplot(221)
                ax2 = plt.subplot(222)
                ax3 = plt.subplot(223)
                ax4 = plt.subplot(224)  # TODO plot the updates of the weights
                methods = ['sgd', 'adagrad', 'adadelta']
                #methods = ['adadelta'] TODO if you want "good" results asap
                for method in methods:
                    dnn = new_dnn(use_dropout)
                    print dnn, "using", method
                    dnn.fit(x_train, y_train, max_epochs=n_epochs, method=method, verbose=VERBOSE, plot=PLOT)
                    test_error = dnn.score(x_test, y_test)
                    print("score: %f" % (1. - test_error))
                    ax1.plot(numpy.log10(dnn._costs), label=method)
                    ax2.plot(numpy.log10(dnn._train_errors), label=method)
                    ax3.plot(numpy.log10(dnn._dev_errors), label=method)
                    #ax2.plot(dnn._train_errors, label=method)
                    #ax3.plot(dnn._dev_errors, label=method)
                    ax4.plot([test_error for _ in range(10)], label=method)
                ax1.set_xlabel('epoch')
                ax1.set_ylabel('cost (log10)')
                ax2.set_xlabel('epoch')
                ax2.set_ylabel('train error')
                ax3.set_xlabel('epoch')
                ax3.set_ylabel('dev error')
                ax4.set_ylabel('test error')
                plt.legend()
                plt.savefig('training_' + name + '.png')


        if MNIST:
            from sklearn.datasets import fetch_mldata
            mnist = fetch_mldata('MNIST original')
            X = numpy.asarray(mnist.data, dtype='float32')
            if SCALE:
                #X = preprocessing.scale(X)
                X /= 255.
            y = numpy.asarray(mnist.target, dtype='int32')
            print("Total dataset size:")
            print("n samples: %d" % X.shape[0])
            print("n features: %d" % X.shape[1])
            print("n classes: %d" % len(set(y)))
            x_train, x_test, y_train, y_test = cross_validation.train_test_split(
                X, y, test_size=0.2, random_state=42)

            train_models(x_train, y_train, x_test, y_test, X.shape[1],
                         len(set(y)), numpy_rng=numpy.random.RandomState(123),
                         name='MNIST')

        if DIGITS:
            digits = datasets.load_digits()
            data = numpy.asarray(digits.data, dtype='float32')
            target = numpy.asarray(digits.target, dtype='int32')
            nudged_x, nudged_y = nudge_dataset(data, target)
            if SCALE:
                nudged_x = preprocessing.scale(nudged_x)
            x_train, x_test, y_train, y_test = cross_validation.train_test_split(
                nudged_x, nudged_y, test_size=0.2, random_state=42)
            train_models(x_train, y_train, x_test, y_test, nudged_x.shape[1],
                         len(set(target)), numpy_rng=numpy.random.RandomState(123),
                         name='digits')

        if FACES:
            import logging
            logging.basicConfig(level=logging.INFO,
                                format='%(asctime)s %(message)s')
            lfw_people = datasets.fetch_lfw_people(min_faces_per_person=70,
                                                   resize=0.4)
            X = numpy.asarray(lfw_people.data, dtype='float32')
            if SCALE:
                X = preprocessing.scale(X)
            y = numpy.asarray(lfw_people.target, dtype='int32')
            target_names = lfw_people.target_names
            print("Total dataset size:")
            print("n samples: %d" % X.shape[0])
            print("n features: %d" % X.shape[1])
            print("n classes: %d" % target_names.shape[0])
            x_train, x_test, y_train, y_test = cross_validation.train_test_split(
                X, y, test_size=0.2, random_state=42)

            train_models(x_train, y_train, x_test, y_test, X.shape[1],
                         len(set(y)), numpy_rng=numpy.random.RandomState(123),
                         name='faces')

        if TWENTYNEWSGROUPS:
            from sklearn.feature_extraction.text import TfidfVectorizer
            newsgroups_train = datasets.fetch_20newsgroups(subset='train')
            vectorizer = TfidfVectorizer(encoding='latin-1', max_features=10000)
            #vectorizer = HashingVectorizer(encoding='latin-1')
            x_train = vectorizer.fit_transform(newsgroups_train.data)
            x_train = numpy.asarray(x_train.todense(), dtype='float32')
            y_train = numpy.asarray(newsgroups_train.target, dtype='int32')
            newsgroups_test = datasets.fetch_20newsgroups(subset='test')
            x_test = vectorizer.transform(newsgroups_test.data)
            x_test = numpy.asarray(x_test.todense(), dtype='float32')
            y_test = numpy.asarray(newsgroups_test.target, dtype='int32')
            train_models(x_train, y_train, x_test, y_test, x_train.shape[1],
                         len(set(y_train)),
                         numpy_rng=numpy.random.RandomState(123),
                         svms=False, nb=True, deepnn=True,
                         name='20newsgroups')