{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"colab_type": "text", | |
"id": "kR-4eNdK6lYS" | |
}, | |
"source": [ | |
"Deep Learning\n", | |
"=============\n", | |
"\n", | |
"Assignment 3\n", | |
"------------\n", | |
"\n", | |
"Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.\n", | |
"\n", | |
"The goal of this assignment is to explore regularization techniques." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"cellView": "both", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
}, | |
"colab_type": "code", | |
"collapsed": true, | |
"id": "JLpLa8Jt7Vu4" | |
}, | |
"outputs": [], | |
"source": [ | |
"# These are all the modules we'll be using later. Make sure you can import them\n", | |
"# before proceeding further.\n", | |
"from __future__ import print_function\n", | |
"import numpy as np\n", | |
"import tensorflow as tf\n", | |
"from six.moves import cPickle as pickle" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"colab_type": "text", | |
"id": "1HrCK6e17WzV" | |
}, | |
"source": [ | |
"First reload the data we generated in _notmist.ipynb_." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"cellView": "both", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [ | |
{ | |
"item_id": 1 | |
} | |
] | |
}, | |
"colab_type": "code", | |
"collapsed": false, | |
"executionInfo": { | |
"elapsed": 11777, | |
"status": "ok", | |
"timestamp": 1449849322348, | |
"user": { | |
"color": "", | |
"displayName": "", | |
"isAnonymous": false, | |
"isMe": true, | |
"permissionId": "", | |
"photoUrl": "", | |
"sessionId": "0", | |
"userId": "" | |
}, | |
"user_tz": 480 | |
}, | |
"id": "y3-cj1bpmuxc", | |
"outputId": "e03576f1-ebbe-4838-c388-f1777bcc9873" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Training set (200000, 28, 28) (200000,)\n", | |
"Validation set (10000, 28, 28) (10000,)\n", | |
"Test set (10000, 28, 28) (10000,)\n" | |
] | |
} | |
], | |
"source": [ | |
"pickle_file = 'notMNIST.pickle'\n", | |
"\n", | |
"with open(pickle_file, 'rb') as f:\n", | |
" save = pickle.load(f)\n", | |
" train_dataset = save['train_dataset']\n", | |
" train_labels = save['train_labels']\n", | |
" valid_dataset = save['valid_dataset']\n", | |
" valid_labels = save['valid_labels']\n", | |
" test_dataset = save['test_dataset']\n", | |
" test_labels = save['test_labels']\n", | |
" del save # hint to help gc free up memory\n", | |
" print('Training set', train_dataset.shape, train_labels.shape)\n", | |
" print('Validation set', valid_dataset.shape, valid_labels.shape)\n", | |
" print('Test set', test_dataset.shape, test_labels.shape)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"colab_type": "text", | |
"id": "L7aHrm6nGDMB" | |
}, | |
"source": [ | |
"Reformat into a shape that's more adapted to the models we're going to train:\n", | |
"- data as a flat matrix,\n", | |
"- labels as float 1-hot encodings." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"cellView": "both", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [ | |
{ | |
"item_id": 1 | |
} | |
] | |
}, | |
"colab_type": "code", | |
"collapsed": false, | |
"executionInfo": { | |
"elapsed": 11728, | |
"status": "ok", | |
"timestamp": 1449849322356, | |
"user": { | |
"color": "", | |
"displayName": "", | |
"isAnonymous": false, | |
"isMe": true, | |
"permissionId": "", | |
"photoUrl": "", | |
"sessionId": "0", | |
"userId": "" | |
}, | |
"user_tz": 480 | |
}, | |
"id": "IRSyYiIIGIzS", | |
"outputId": "3f8996ee-3574-4f44-c953-5c8a04636582" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Training set (200000, 784) (200000, 10)\n", | |
"Validation set (10000, 784) (10000, 10)\n", | |
"Test set (10000, 784) (10000, 10)\n" | |
] | |
} | |
], | |
"source": [ | |
"image_size = 28\n", | |
"num_labels = 10\n", | |
"\n", | |
"def reformat(dataset, labels):\n", | |
" dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)\n", | |
" # Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]\n", | |
" labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)\n", | |
" return dataset, labels\n", | |
"train_dataset, train_labels = reformat(train_dataset, train_labels)\n", | |
"valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)\n", | |
"test_dataset, test_labels = reformat(test_dataset, test_labels)\n", | |
"print('Training set', train_dataset.shape, train_labels.shape)\n", | |
"print('Validation set', valid_dataset.shape, valid_labels.shape)\n", | |
"print('Test set', test_dataset.shape, test_labels.shape)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"cellView": "both", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
}, | |
"colab_type": "code", | |
"collapsed": true, | |
"id": "RajPLaL_ZW6w" | |
}, | |
"outputs": [], | |
"source": [ | |
"def accuracy(predictions, labels):\n", | |
" return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))\n", | |
" / predictions.shape[0])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"colab_type": "text", | |
"id": "sgLbUAQ1CW-1" | |
}, | |
"source": [ | |
"---\n", | |
"Problem 1\n", | |
"---------\n", | |
"\n", | |
"Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.\n", | |
"\n", | |
"---" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"first the logistic model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"train_subset = 10000\n", | |
"\n", | |
"graph = tf.Graph()\n", | |
"with graph.as_default():\n", | |
" # Load the data.\n", | |
" tf_train_dataset = tf.constant(train_dataset[:train_subset, :])\n", | |
" tf_train_labels = tf.constant(train_labels[:train_subset])\n", | |
" tf_valid_dataset = tf.constant(valid_dataset)\n", | |
" tf_test_dataset = tf.constant(test_dataset)\n", | |
" \n", | |
" # Setup variables.\n", | |
" weights = tf.Variable(\n", | |
" tf.truncated_normal([image_size * image_size, num_labels]))\n", | |
" biases = tf.Variable(tf.zeros([num_labels]))\n", | |
" #beta = tf.Variable(tf.truncated_normal([1]))\n", | |
" \n", | |
" # Compute.\n", | |
" logits = tf.matmul(tf_train_dataset, weights) + biases\n", | |
" loss = tf.reduce_mean(\n", | |
" tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))\n", | |
" l2_regularizer = tf.nn.l2_loss(weights)\n", | |
" loss += 5e-4 * l2_regularizer\n", | |
" \n", | |
" # Optimize.\n", | |
" optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)\n", | |
" \n", | |
" # Predict.\n", | |
" train_prediction = tf.nn.softmax(logits)\n", | |
" valid_prediction = tf.nn.softmax(\n", | |
" tf.matmul(tf_valid_dataset, weights) + biases)\n", | |
" test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Initialized\n", | |
"Loss at step 0: 16.925770\n", | |
"Training accuracy: 12.3%\n", | |
"Validation accuracy: 16.4%\n", | |
"Loss at step 100: 3.695228\n", | |
"Training accuracy: 72.1%\n", | |
"Validation accuracy: 71.4%\n", | |
"Loss at step 200: 3.071672\n", | |
"Training accuracy: 75.0%\n", | |
"Validation accuracy: 74.0%\n", | |
"Loss at step 300: 2.699490\n", | |
"Training accuracy: 76.3%\n", | |
"Validation accuracy: 74.9%\n", | |
"Loss at step 400: 2.429267\n", | |
"Training accuracy: 77.0%\n", | |
"Validation accuracy: 75.7%\n", | |
"Loss at step 500: 2.215348\n", | |
"Training accuracy: 77.9%\n", | |
"Validation accuracy: 75.8%\n", | |
"Loss at step 600: 2.038570\n", | |
"Training accuracy: 78.6%\n", | |
"Validation accuracy: 76.1%\n", | |
"Loss at step 700: 1.888586\n", | |
"Training accuracy: 79.2%\n", | |
"Validation accuracy: 76.2%\n", | |
"Loss at step 800: 1.759213\n", | |
"Training accuracy: 80.0%\n", | |
"Validation accuracy: 76.6%\n", | |
" Test accuracy: 83.3%\n" | |
] | |
} | |
], | |
"source": [ | |
"num_steps = 801\n", | |
"\n", | |
"with tf.Session(graph=graph) as session:\n", | |
" # Init. \n", | |
" tf.initialize_all_variables().run()\n", | |
" print('Initialized')\n", | |
"\n", | |
" for step in range(num_steps):\n", | |
" # Run the computations.\n", | |
" _, l, predictions = session.run([optimizer, loss, train_prediction])\n", | |
" if (step % 100 == 0):\n", | |
" print('Loss at step %d: %f' % (step, l))\n", | |
" print('Training accuracy: %.1f%%' % accuracy(\n", | |
" predictions, train_labels[:train_subset, :]))\n", | |
" print('Validation accuracy: %.1f%%' % accuracy(\n", | |
" valid_prediction.eval(), valid_labels))\n", | |
" print(' Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"now the one-layer network with ReLUs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"batch_size = 128\n", | |
"deep_graph = tf.Graph()\n", | |
"with deep_graph.as_default():\n", | |
" tf_train_dataset = tf.placeholder(tf.float32,\n", | |
" shape=(batch_size, image_size * image_size))\n", | |
" tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))\n", | |
" tf_valid_dataset = tf.constant(valid_dataset)\n", | |
" tf_test_dataset = tf.constant(test_dataset)\n", | |
"\n", | |
" hidden_layer_size = 1024\n", | |
" hidden_weights = tf.Variable(\n", | |
" tf.truncated_normal([image_size * image_size, hidden_layer_size]))\n", | |
" hidden_biases = tf.Variable(tf.zeros([hidden_layer_size]))\n", | |
" hidden_layer = tf.nn.relu(tf.matmul(tf_train_dataset, hidden_weights) + hidden_biases)\n", | |
" \n", | |
" output_weights = tf.Variable(\n", | |
" tf.truncated_normal([hidden_layer_size, num_labels]))\n", | |
" output_biases = tf.Variable(tf.zeros([num_labels]))\n", | |
" logits = tf.matmul(hidden_layer, output_weights) + output_biases\n", | |
"\n", | |
" loss = tf.reduce_mean(\n", | |
" tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))\n", | |
" l2_regularizer = tf.nn.l2_loss(output_weights) + tf.nn.l2_loss(hidden_weights)\n", | |
" loss += 5e-4 * l2_regularizer\n", | |
" optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)\n", | |
" train_prediction = tf.nn.softmax(logits)\n", | |
"\n", | |
" # Setup validation prediction step.\n", | |
" valid_hidden = tf.nn.relu(tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)\n", | |
" valid_logits = tf.matmul(valid_hidden, output_weights) + output_biases\n", | |
" valid_prediction = tf.nn.softmax(valid_logits)\n", | |
"\n", | |
" # And setup the test prediction step.\n", | |
" test_hidden = tf.nn.relu(tf.matmul(tf_test_dataset, hidden_weights) + hidden_biases)\n", | |
" test_logits = tf.matmul(test_hidden, output_weights) + output_biases\n", | |
" test_prediction = tf.nn.softmax(test_logits)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Initialized\n", | |
"Minibatch loss at step 0: 526.310730\n", | |
"Minibatch accuracy: 10.2%\n", | |
"Validation accuracy: 36.6%\n", | |
"Minibatch loss at step 500: 129.465988\n", | |
"Minibatch accuracy: 80.5%\n", | |
"Validation accuracy: 79.5%\n", | |
"Minibatch loss at step 1000: 98.813789\n", | |
"Minibatch accuracy: 78.1%\n", | |
"Validation accuracy: 81.3%\n", | |
"Minibatch loss at step 1500: 75.423996\n", | |
"Minibatch accuracy: 78.9%\n", | |
"Validation accuracy: 82.0%\n", | |
"Minibatch loss at step 2000: 56.996544\n", | |
"Minibatch accuracy: 79.7%\n", | |
"Validation accuracy: 83.5%\n", | |
"Minibatch loss at step 2500: 43.878193\n", | |
"Minibatch accuracy: 82.0%\n", | |
"Validation accuracy: 84.0%\n", | |
"Minibatch loss at step 3000: 33.956970\n", | |
"Minibatch accuracy: 85.2%\n", | |
"Validation accuracy: 84.1%\n", | |
" Test accuracy: 90.1%\n" | |
] | |
} | |
], | |
"source": [ | |
"num_steps = 3001\n", | |
"\n", | |
"with tf.Session(graph=deep_graph) as session:\n", | |
" tf.initialize_all_variables().run()\n", | |
" print(\"Initialized\")\n", | |
" for step in range(num_steps):\n", | |
" # Pick an offset within the training data, which has been randomized.\n", | |
" # Note: we could use better randomization across epochs.\n", | |
" offset = (step * batch_size) % (train_labels.shape[0] - batch_size)\n", | |
" # Generate a minibatch.\n", | |
" batch_data = train_dataset[offset:(offset + batch_size), :]\n", | |
" batch_labels = train_labels[offset:(offset + batch_size), :]\n", | |
" # Prepare a dictionary telling the session where to feed the minibatch.\n", | |
" # The key of the dictionary is the placeholder node of the graph to be fed,\n", | |
" # and the value is the numpy array to feed to it.\n", | |
" feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}\n", | |
" _, l, predictions = session.run(\n", | |
" [optimizer, loss, train_prediction], feed_dict=feed_dict)\n", | |
" if (step % 500 == 0):\n", | |
" print(\"Minibatch loss at step %d: %f\" % (step, l))\n", | |
" print(\"Minibatch accuracy: %.1f%%\" % accuracy(predictions, batch_labels))\n", | |
" print(\"Validation accuracy: %.1f%%\" % accuracy(\n", | |
" valid_prediction.eval(), valid_labels))\n", | |
" print(\" Test accuracy: %.1f%%\" % accuracy(test_prediction.eval(), test_labels))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"colab_type": "text", | |
"id": "na8xX2yHZzNF" | |
}, | |
"source": [ | |
"---\n", | |
"Problem 2\n", | |
"---------\n", | |
"Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?\n", | |
"\n", | |
"---" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Initialized\n", | |
"Minibatch loss at step 0: 510.406250\n", | |
"Minibatch accuracy: 9.4%\n", | |
"Validation accuracy: 26.3%\n", | |
"Minibatch loss at step 500: 122.294800\n", | |
"Minibatch accuracy: 100.0%\n", | |
"Validation accuracy: 77.7%\n", | |
"Minibatch loss at step 1000: 95.239395\n", | |
"Minibatch accuracy: 100.0%\n", | |
"Validation accuracy: 77.7%\n", | |
"Minibatch loss at step 1500: 74.170250\n", | |
"Minibatch accuracy: 100.0%\n", | |
"Validation accuracy: 77.7%\n", | |
"Minibatch loss at step 2000: 57.762047\n", | |
"Minibatch accuracy: 100.0%\n", | |
"Validation accuracy: 77.6%\n", | |
"Minibatch loss at step 2500: 44.983719\n", | |
"Minibatch accuracy: 100.0%\n", | |
"Validation accuracy: 77.6%\n", | |
"Minibatch loss at step 3000: 35.032513\n", | |
"Minibatch accuracy: 100.0%\n", | |
"Validation accuracy: 77.6%\n", | |
" Test accuracy: 84.3%\n" | |
] | |
} | |
], | |
"source": [ | |
"batch_size = 128\n", | |
"deep_graph = tf.Graph()\n", | |
"with deep_graph.as_default():\n", | |
" tf_train_dataset = tf.placeholder(tf.float32,\n", | |
" shape=(batch_size, image_size * image_size))\n", | |
" tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))\n", | |
" tf_valid_dataset = tf.constant(valid_dataset)\n", | |
" tf_test_dataset = tf.constant(test_dataset)\n", | |
"\n", | |
" hidden_layer_size = 1024\n", | |
" hidden_weights = tf.Variable(\n", | |
" tf.truncated_normal([image_size * image_size, hidden_layer_size]))\n", | |
" hidden_biases = tf.Variable(tf.zeros([hidden_layer_size]))\n", | |
" hidden_layer = tf.nn.relu(tf.matmul(tf_train_dataset, hidden_weights) + hidden_biases)\n", | |
" \n", | |
" output_weights = tf.Variable(\n", | |
" tf.truncated_normal([hidden_layer_size, num_labels]))\n", | |
" output_biases = tf.Variable(tf.zeros([num_labels]))\n", | |
" logits = tf.matmul(hidden_layer, output_weights) + output_biases\n", | |
"\n", | |
" loss = tf.reduce_mean(\n", | |
" tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))\n", | |
" l2_regularizer = tf.nn.l2_loss(output_weights) + tf.nn.l2_loss(hidden_weights)\n", | |
" loss += 5e-4 * l2_regularizer\n", | |
" optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)\n", | |
" train_prediction = tf.nn.softmax(logits)\n", | |
"\n", | |
" # Setup validation prediction step.\n", | |
" valid_hidden = tf.nn.relu(tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)\n", | |
" valid_logits = tf.matmul(valid_hidden, output_weights) + output_biases\n", | |
" valid_prediction = tf.nn.softmax(valid_logits)\n", | |
"\n", | |
" # And setup the test prediction step.\n", | |
" test_hidden = tf.nn.relu(tf.matmul(tf_test_dataset, hidden_weights) + hidden_biases)\n", | |
" test_logits = tf.matmul(test_hidden, output_weights) + output_biases\n", | |
" test_prediction = tf.nn.softmax(test_logits)\n", | |
"\n", | |
"num_steps = 3001\n", | |
"\n", | |
"with tf.Session(graph=deep_graph) as session:\n", | |
" tf.initialize_all_variables().run()\n", | |
" print(\"Initialized\")\n", | |
" for step in range(num_steps):\n", | |
" # Pick an offset within the training data, which has been randomized.\n", | |
" # Note: we could use better randomization across epochs.\n", | |
" # offset = (step * batch_size) % (train_labels.shape[0] - batch_size)\n", | |
" offset = batch_size * np.random.choice(np.arange(5))\n", | |
" # Generate a minibatch.\n", | |
" batch_data = train_dataset[offset:(offset + batch_size), :]\n", | |
" batch_labels = train_labels[offset:(offset + batch_size), :]\n", | |
" # Prepare a dictionary telling the session where to feed the minibatch.\n", | |
" # The key of the dictionary is the placeholder node of the graph to be fed,\n", | |
" # and the value is the numpy array to feed to it.\n", | |
" feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}\n", | |
" _, l, predictions = session.run(\n", | |
" [optimizer, loss, train_prediction], feed_dict=feed_dict)\n", | |
" if (step % 500 == 0):\n", | |
" print(\"Minibatch loss at step %d: %f\" % (step, l))\n", | |
" print(\"Minibatch accuracy: %.1f%%\" % accuracy(predictions, batch_labels))\n", | |
" print(\"Validation accuracy: %.1f%%\" % accuracy(\n", | |
" valid_prediction.eval(), valid_labels))\n", | |
" print(\" Test accuracy: %.1f%%\" % accuracy(test_prediction.eval(), test_labels))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"colab_type": "text", | |
"id": "ww3SCBUdlkRc" | |
}, | |
"source": [ | |
"---\n", | |
"Problem 3\n", | |
"---------\n", | |
"Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.\n", | |
"\n", | |
"What happens to our extreme overfitting case?\n", | |
"\n", | |
"---" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"back to the standard ReLU example (sans overfitting and sans L2 regularization)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"batch_size = 128\n", | |
"deep_graph = tf.Graph()\n", | |
"with deep_graph.as_default():\n", | |
" tf_train_dataset = tf.placeholder(tf.float32,\n", | |
" shape=(batch_size, image_size * image_size))\n", | |
" tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))\n", | |
" tf_valid_dataset = tf.constant(valid_dataset)\n", | |
" tf_test_dataset = tf.constant(test_dataset)\n", | |
"\n", | |
" hidden_layer_size = 1024\n", | |
" hidden_weights = tf.Variable(\n", | |
" tf.truncated_normal([image_size * image_size, hidden_layer_size]))\n", | |
" hidden_biases = tf.Variable(tf.zeros([hidden_layer_size]))\n", | |
" hidden_layer = tf.nn.dropout(\n", | |
" tf.nn.relu(tf.matmul(tf_train_dataset, hidden_weights) + hidden_biases), 0.5)\n", | |
" \n", | |
" output_weights = tf.Variable(\n", | |
" tf.truncated_normal([hidden_layer_size, num_labels]))\n", | |
" output_biases = tf.Variable(tf.zeros([num_labels]))\n", | |
" logits = tf.matmul(hidden_layer, output_weights) + output_biases\n", | |
"\n", | |
" loss = tf.reduce_mean(\n", | |
" tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))\n", | |
" #l2_regularizer = tf.nn.l2_loss(output_weights) + tf.nn.l2_loss(hidden_weights)\n", | |
" #loss += 5e-4 * l2_regularizer\n", | |
" optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)\n", | |
" train_prediction = tf.nn.softmax(logits)\n", | |
"\n", | |
" # Setup validation prediction step.\n", | |
" valid_hidden = tf.nn.relu(tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)\n", | |
" valid_logits = tf.matmul(valid_hidden, output_weights) + output_biases\n", | |
" valid_prediction = tf.nn.softmax(valid_logits)\n", | |
"\n", | |
" # And setup the test prediction step.\n", | |
" test_hidden = tf.nn.relu(tf.matmul(tf_test_dataset, hidden_weights) + hidden_biases)\n", | |
" test_logits = tf.matmul(test_hidden, output_weights) + output_biases\n", | |
" test_prediction = tf.nn.softmax(test_logits)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Initialized\n", | |
"Minibatch loss at step 0: 483.910400\n", | |
"Minibatch accuracy: 14.1%\n", | |
"Validation accuracy: 36.8%\n", | |
"Minibatch loss at step 500: 26.865017\n", | |
"Minibatch accuracy: 69.5%\n", | |
"Validation accuracy: 81.0%\n", | |
"Minibatch loss at step 1000: 14.268562\n", | |
"Minibatch accuracy: 71.9%\n", | |
"Validation accuracy: 79.5%\n", | |
"Minibatch loss at step 1500: 16.476973\n", | |
"Minibatch accuracy: 68.8%\n", | |
"Validation accuracy: 80.1%\n", | |
"Minibatch loss at step 2000: 5.323195\n", | |
"Minibatch accuracy: 70.3%\n", | |
"Validation accuracy: 80.5%\n", | |
"Minibatch loss at step 2500: 6.034033\n", | |
"Minibatch accuracy: 72.7%\n", | |
"Validation accuracy: 79.7%\n", | |
"Minibatch loss at step 3000: 4.610834\n", | |
"Minibatch accuracy: 76.6%\n", | |
"Validation accuracy: 78.7%\n", | |
" Test accuracy: 86.1%\n" | |
] | |
} | |
], | |
"source": [ | |
"num_steps = 3001\n", | |
"\n", | |
"with tf.Session(graph=deep_graph) as session:\n", | |
" tf.initialize_all_variables().run()\n", | |
" print(\"Initialized\")\n", | |
" for step in range(num_steps):\n", | |
" offset = (step * batch_size) % (train_labels.shape[0] - batch_size)\n", | |
" batch_data = train_dataset[offset:(offset + batch_size), :]\n", | |
" batch_labels = train_labels[offset:(offset + batch_size), :]\n", | |
" feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}\n", | |
" _, l, predictions = session.run(\n", | |
" [optimizer, loss, train_prediction], feed_dict=feed_dict)\n", | |
" if (step % 500 == 0):\n", | |
" print(\"Minibatch loss at step %d: %f\" % (step, l))\n", | |
" print(\"Minibatch accuracy: %.1f%%\" % accuracy(predictions, batch_labels))\n", | |
" print(\"Validation accuracy: %.1f%%\" % accuracy(\n", | |
" valid_prediction.eval(), valid_labels))\n", | |
" print(\" Test accuracy: %.1f%%\" % accuracy(test_prediction.eval(), test_labels))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"not much effect -- some people say it helps if the networks are larger: https://discussions.udacity.com/t/problem-3-3-dropout-does-not-improve-test-accuarcy/46286/17\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"colab_type": "text", | |
"id": "-b1hTz3VWZjw" | |
}, | |
"source": [ | |
"---\n", | |
"Problem 4\n", | |
"---------\n", | |
"\n", | |
"Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).\n", | |
"\n", | |
"One avenue you can explore is to add multiple layers.\n", | |
"\n", | |
"Another one is to use learning rate decay:\n", | |
"\n", | |
" global_step = tf.Variable(0) # count the number of steps taken.\n", | |
" learning_rate = tf.train.exponential_decay(0.5, step, ...)\n", | |
" optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)\n", | |
" \n", | |
" ---\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Initialized\n", | |
"Minibatch loss at step 0: 2.546920\n", | |
"Minibatch accuracy: 11.7%\n", | |
"Validation accuracy: 25.2%\n", | |
"Minibatch loss at step 500: 0.608449\n", | |
"Minibatch accuracy: 82.8%\n", | |
"Validation accuracy: 84.3%\n", | |
"Minibatch loss at step 1000: 0.530836\n", | |
"Minibatch accuracy: 85.2%\n", | |
"Validation accuracy: 85.5%\n", | |
"Minibatch loss at step 1500: 0.657449\n", | |
"Minibatch accuracy: 85.2%\n", | |
"Validation accuracy: 86.2%\n", | |
"Minibatch loss at step 2000: 0.557790\n", | |
"Minibatch accuracy: 83.6%\n", | |
"Validation accuracy: 87.2%\n", | |
"Minibatch loss at step 2500: 0.611100\n", | |
"Minibatch accuracy: 85.2%\n", | |
"Validation accuracy: 87.5%\n", | |
"Minibatch loss at step 3000: 0.322495\n", | |
"Minibatch accuracy: 90.6%\n", | |
"Validation accuracy: 87.8%\n", | |
"Minibatch loss at step 3500: 0.393889\n", | |
"Minibatch accuracy: 87.5%\n", | |
"Validation accuracy: 88.1%\n", | |
"Minibatch loss at step 4000: 0.451827\n", | |
"Minibatch accuracy: 85.9%\n", | |
"Validation accuracy: 88.2%\n", | |
"Minibatch loss at step 4500: 0.567805\n", | |
"Minibatch accuracy: 84.4%\n", | |
"Validation accuracy: 88.4%\n", | |
"Minibatch loss at step 5000: 0.573152\n", | |
"Minibatch accuracy: 83.6%\n", | |
"Validation accuracy: 88.5%\n", | |
"Minibatch loss at step 5500: 0.517981\n", | |
"Minibatch accuracy: 84.4%\n", | |
"Validation accuracy: 88.9%\n", | |
"Minibatch loss at step 6000: 0.471292\n", | |
"Minibatch accuracy: 90.6%\n", | |
"Validation accuracy: 88.9%\n", | |
"Minibatch loss at step 6500: 0.492043\n", | |
"Minibatch accuracy: 86.7%\n", | |
"Validation accuracy: 89.0%\n", | |
"Minibatch loss at step 7000: 0.312218\n", | |
"Minibatch accuracy: 89.8%\n", | |
"Validation accuracy: 88.8%\n", | |
"Minibatch loss at step 7500: 0.282312\n", | |
"Minibatch accuracy: 93.0%\n", | |
"Validation accuracy: 89.2%\n", | |
"Minibatch loss at step 8000: 0.385935\n", | |
"Minibatch accuracy: 92.2%\n", | |
"Validation accuracy: 89.3%\n", | |
"Minibatch loss at step 8500: 0.512655\n", | |
"Minibatch accuracy: 84.4%\n", | |
"Validation accuracy: 89.3%\n", | |
"Minibatch loss at step 9000: 0.439570\n", | |
"Minibatch accuracy: 87.5%\n", | |
"Validation accuracy: 89.5%\n", | |
"Minibatch loss at step 9500: 0.423484\n", | |
"Minibatch accuracy: 86.7%\n", | |
"Validation accuracy: 89.6%\n", | |
"Minibatch loss at step 10000: 0.372880\n", | |
"Minibatch accuracy: 91.4%\n", | |
"Validation accuracy: 89.6%\n", | |
"Minibatch loss at step 10500: 0.236239\n", | |
"Minibatch accuracy: 91.4%\n", | |
"Validation accuracy: 89.8%\n", | |
"Minibatch loss at step 11000: 0.459948\n", | |
"Minibatch accuracy: 85.2%\n", | |
"Validation accuracy: 89.7%\n", | |
"Minibatch loss at step 11500: 0.438499\n", | |
"Minibatch accuracy: 89.8%\n", | |
"Validation accuracy: 89.7%\n", | |
"Minibatch loss at step 12000: 0.315564\n", | |
"Minibatch accuracy: 89.8%\n", | |
"Validation accuracy: 89.8%\n", | |
"Minibatch loss at step 12500: 0.342032\n", | |
"Minibatch accuracy: 90.6%\n", | |
"Validation accuracy: 89.6%\n", | |
"Minibatch loss at step 13000: 0.373088\n", | |
"Minibatch accuracy: 90.6%\n", | |
"Validation accuracy: 89.7%\n", | |
"Minibatch loss at step 13500: 0.280111\n", | |
"Minibatch accuracy: 93.0%\n", | |
"Validation accuracy: 89.7%\n", | |
"Minibatch loss at step 14000: 0.553383\n", | |
"Minibatch accuracy: 86.7%\n", | |
"Validation accuracy: 89.9%\n", | |
"Minibatch loss at step 14500: 0.263432\n", | |
"Minibatch accuracy: 89.8%\n", | |
"Validation accuracy: 89.9%\n", | |
"Minibatch loss at step 15000: 0.471659\n", | |
"Minibatch accuracy: 88.3%\n", | |
"Validation accuracy: 90.0%\n", | |
"Minibatch loss at step 15500: 0.343309\n", | |
"Minibatch accuracy: 88.3%\n", | |
"Validation accuracy: 89.9%\n", | |
"Minibatch loss at step 16000: 0.523962\n", | |
"Minibatch accuracy: 84.4%\n", | |
"Validation accuracy: 89.8%\n", | |
"Minibatch loss at step 16500: 0.261766\n", | |
"Minibatch accuracy: 93.8%\n", | |
"Validation accuracy: 90.2%\n", | |
"Minibatch loss at step 17000: 0.329776\n", | |
"Minibatch accuracy: 90.6%\n", | |
"Validation accuracy: 90.3%\n", | |
"Minibatch loss at step 17500: 0.369553\n", | |
"Minibatch accuracy: 88.3%\n", | |
"Validation accuracy: 90.1%\n", | |
"Minibatch loss at step 18000: 0.365626\n", | |
"Minibatch accuracy: 89.1%\n", | |
"Validation accuracy: 90.0%\n", | |
"Minibatch loss at step 18500: 0.231381\n", | |
"Minibatch accuracy: 93.8%\n", | |
"Validation accuracy: 90.0%\n", | |
"Minibatch loss at step 19000: 0.432183\n", | |
"Minibatch accuracy: 83.6%\n", | |
"Validation accuracy: 90.3%\n", | |
"Minibatch loss at step 19500: 0.493186\n", | |
"Minibatch accuracy: 82.8%\n", | |
"Validation accuracy: 90.3%\n", | |
" Test accuracy: 95.1%\n" | |
] | |
} | |
], | |
"source": [ | |
"batch_size = 128\n", | |
"\n", | |
"hidden_layer_1_size = 1024\n", | |
"hidden_layer_2_size = 300\n", | |
"hidden_layer_3_size = 50\n", | |
"hidden_layer_1_stddev = np.sqrt(2.0/784) \n", | |
"hidden_layer_2_stddev = np.sqrt(2.0/hidden_layer_1_size)\n", | |
"hidden_layer_3_stddev = np.sqrt(2.0/hidden_layer_2_size)\n", | |
"output_layer_stddev = np.sqrt(2.0/hidden_layer_3_size)\n", | |
"hidden_layer_1_keep_prob = 0.5\n", | |
"hidden_layer_2_keep_prob = 0.7\n", | |
"hidden_layer_3_keep_prob = 0.8\n", | |
"beta_1 = 1e-5\n", | |
"beta_2 = 1e-5\n", | |
"beta_3 = 1e-5\n", | |
"beta_4 = 1e-5\n", | |
"\n", | |
"deep_graph = tf.Graph()\n", | |
"with deep_graph.as_default():\n", | |
" tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))\n", | |
" tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))\n", | |
" tf_valid_dataset = tf.constant(valid_dataset)\n", | |
" tf_test_dataset = tf.constant(test_dataset)\n", | |
"\n", | |
" # first hidden layer\n", | |
" hidden_layer_1_weights = tf.Variable(\n", | |
" tf.truncated_normal(\n", | |
" [image_size * image_size, hidden_layer_1_size], stddev=hidden_layer_1_stddev))\n", | |
" hidden_layer_1_biases = tf.Variable(tf.zeros([hidden_layer_1_size]))\n", | |
" hidden_layer_1 = tf.nn.dropout(\n", | |
" tf.nn.relu(tf.matmul(tf_train_dataset, hidden_layer_1_weights) + hidden_layer_1_biases),\n", | |
" hidden_layer_1_keep_prob)\n", | |
" \n", | |
" # second hidden layer\n", | |
" hidden_layer_2_weights = tf.Variable(\n", | |
" tf.truncated_normal(\n", | |
" [hidden_layer_1_size, hidden_layer_2_size], stddev=hidden_layer_2_stddev))\n", | |
" hidden_layer_2_biases = tf.Variable(tf.zeros([hidden_layer_2_size]))\n", | |
" hidden_layer_2 = tf.nn.dropout(\n", | |
" tf.nn.relu(tf.matmul(hidden_layer_1, hidden_layer_2_weights) + hidden_layer_2_biases),\n", | |
" hidden_layer_2_keep_prob)\n", | |
" \n", | |
" # third hidden layer\n", | |
" hidden_layer_3_weights = tf.Variable(\n", | |
" tf.truncated_normal(\n", | |
" [hidden_layer_2_size, hidden_layer_3_size], stddev=hidden_layer_3_stddev))\n", | |
" hidden_layer_3_biases = tf.Variable(tf.zeros([hidden_layer_3_size]))\n", | |
" hidden_layer_3 = tf.nn.dropout(\n", | |
" tf.nn.relu(tf.matmul(hidden_layer_2, hidden_layer_3_weights) + hidden_layer_3_biases), \n", | |
" hidden_layer_3_keep_prob)\n", | |
" \n", | |
" # output layer\n", | |
" output_weights = tf.Variable(\n", | |
" tf.truncated_normal(\n", | |
" [hidden_layer_3_size, num_labels],\n", | |
" stddev=output_layer_stddev))\n", | |
" output_biases = tf.Variable(tf.zeros([num_labels]))\n", | |
" logits = tf.matmul(hidden_layer_3, output_weights) + output_biases\n", | |
"\n", | |
" # Calculate the loss with regularization\n", | |
" loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))\n", | |
" loss += (beta_1 * tf.nn.l2_loss(hidden_layer_1_weights) +\n", | |
" beta_2 * tf.nn.l2_loss(hidden_layer_2_weights) +\n", | |
" beta_3 * tf.nn.l2_loss(hidden_layer_3_weights) +\n", | |
" beta_4 * tf.nn.l2_loss(output_weights))\n", | |
" \n", | |
" # Learn with exponential rate decay.\n", | |
" global_step = tf.Variable(0, trainable=False)\n", | |
" starter_learning_rate = 0.4\n", | |
" learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 100000, 0.96, staircase=True)\n", | |
" #learning_rate = 0.1\n", | |
" optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)\n", | |
" train_prediction = tf.nn.softmax(logits)\n", | |
"\n", | |
" # Setup validation prediction step.\n", | |
" validation_hidden_layer_1 = tf.nn.relu(tf.matmul(tf_valid_dataset, hidden_layer_1_weights) + hidden_layer_1_biases)\n", | |
" validation_hidden_layer_2 = tf.nn.relu(tf.matmul(validation_hidden_layer_1, hidden_layer_2_weights) + hidden_layer_2_biases)\n", | |
" validation_hidden_layer_3 = tf.nn.relu(tf.matmul(validation_hidden_layer_2, hidden_layer_3_weights) + hidden_layer_3_biases)\n", | |
" validation_logits = tf.matmul(validation_hidden_layer_3, output_weights) + output_biases\n", | |
" validation_prediction = tf.nn.softmax(validation_logits)\n", | |
"\n", | |
" # And setup the test prediction step. \n", | |
" test_hidden_layer_1 = tf.nn.relu(tf.matmul(tf_test_dataset, hidden_layer_1_weights) + hidden_layer_1_biases)\n", | |
" test_hidden_layer_2 = tf.nn.relu(tf.matmul(test_hidden_layer_1, hidden_layer_2_weights) + hidden_layer_2_biases)\n", | |
" test_hidden_layer_3 = tf.nn.relu(tf.matmul(test_hidden_layer_2, hidden_layer_3_weights) + hidden_layer_3_biases)\n", | |
" test_logits = tf.matmul(test_hidden_layer_3, output_weights) + output_biases\n", | |
" test_prediction = tf.nn.softmax(test_logits)\n", | |
"\n", | |
"num_steps = 20000\n", | |
"\n", | |
"with tf.Session(graph=deep_graph) as session:\n", | |
" tf.initialize_all_variables().run()\n", | |
" print(\"Initialized\")\n", | |
" for step in range(num_steps):\n", | |
" offset = (step * batch_size) % (train_labels.shape[0] - batch_size)\n", | |
" batch_data = train_dataset[offset:(offset + batch_size), :]\n", | |
" batch_labels = train_labels[offset:(offset + batch_size), :]\n", | |
" feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}\n", | |
" _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)\n", | |
" if (step % 500 == 0):\n", | |
" print(\"Minibatch loss at step %d: %f\" % (step, l))\n", | |
" print(\"Minibatch accuracy: %.1f%%\" % accuracy(predictions, batch_labels))\n", | |
" print(\"Validation accuracy: %.1f%%\" % accuracy(validation_prediction.eval(), valid_labels))\n", | |
" print(\" Test accuracy: %.1f%%\" % accuracy(test_prediction.eval(), test_labels))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"colab": { | |
"default_view": {}, | |
"name": "3_regularization.ipynb", | |
"provenance": [], | |
"version": "0.3.2", | |
"views": {} | |
}, | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
The problem asks for restricting the training data, which is achieved with offset = batch_size * np.random.choice(np.arange(5)).
The number of steps does not need to be reduced to show the overfitting.
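
A minimal standalone sketch (not part of the notebook) of what that offset expression does: it confines every minibatch to one of the first five batch_size-sized slices of the training set, which is exactly the restriction Problem 2 asks for.

import numpy as np

batch_size = 128
# Sample the offset many times; only five distinct values ever show up,
# so training only ever sees the first 5 * 128 = 640 examples.
offsets = {int(batch_size * np.random.choice(np.arange(5))) for _ in range(10000)}
print(sorted(offsets))  # [0, 128, 256, 384, 512]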
I am curious: why did you initialize from a truncated normal with stddev = sqrt(2 / <input_size>)? Why not just use a truncated normal with, say, stddev = 0.1 for all layers?
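
For reference, a small sketch of the two initializations being compared, written against the same TF API the notebook uses (the 1024-unit first hidden layer is just an example):

import numpy as np
import tensorflow as tf

fan_in = 28 * 28  # inputs feeding the first hidden layer
# Scaled initialization used in the notebook: stddev = sqrt(2 / fan_in), about 0.05 here,
# and it shrinks automatically as a layer's fan-in grows.
w_scaled = tf.Variable(tf.truncated_normal([fan_in, 1024], stddev=np.sqrt(2.0 / fan_in)))
# Fixed alternative asked about above: the same stddev for every layer.
w_fixed = tf.Variable(tf.truncated_normal([fan_in, 1024], stddev=0.1))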
Help! When I run the following code, my loss function diverges. Can someone please explain why?
batch_size = 128
# regularisation parameter
beta = 0.001
# 2 hidden layers, neural network
hidden_nodes1 = 1024
hidden_nodes2 = 512
keep_prob = 0.5  # probability of drop out
initial_learning_rate = 0.5

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    hidden_weights1 = tf.Variable(
        tf.truncated_normal([image_size * image_size, hidden_nodes1]))
    hidden_biases1 = tf.Variable(tf.zeros([hidden_nodes1]))
    hidden_layer1 = tf.nn.relu(tf.matmul(tf_train_dataset, hidden_weights1)
                               + hidden_biases1)
    hidden_layer_drop1 = tf.nn.dropout(hidden_layer1, keep_prob)  # Dropout added

    hidden_weights2 = tf.Variable(
        tf.truncated_normal([hidden_nodes1, hidden_nodes2]))
    hidden_biases2 = tf.Variable(tf.zeros([hidden_nodes2]))
    hidden_layer2 = tf.nn.relu(tf.matmul(hidden_layer_drop1, hidden_weights2)
                               + hidden_biases2)
    hidden_layer_drop2 = tf.nn.dropout(hidden_layer2, keep_prob)  # Dropout added

    weights = tf.Variable(tf.truncated_normal([hidden_nodes2, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = tf.matmul(hidden_layer_drop2, weights) + biases
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels,
                                                                  logits=logits))
    loss = loss + beta * tf.nn.l2_loss(weights)

    # Optimizer. Learning rate decreases with number of cycles.
    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(initial_learning_rate, global_step,
                                               100000, 0.95, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss,
                                                                          global_step=global_step)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_relu1 = tf.nn.relu(tf.matmul(tf_valid_dataset, hidden_weights1) + hidden_biases1)
    valid_relu2 = tf.nn.relu(tf.matmul(valid_relu1, hidden_weights2) + hidden_biases2)
    valid_prediction = tf.nn.softmax(tf.matmul(valid_relu2, weights) + biases)
    test_relu1 = tf.nn.relu(tf.matmul(tf_test_dataset, hidden_weights1) + hidden_biases1)
    test_relu2 = tf.nn.relu(tf.matmul(test_relu1, hidden_weights2) + hidden_biases2)
    test_prediction = tf.nn.softmax(tf.matmul(test_relu2, weights) + biases)
@zhuanquan I would assume the losses are being computed incorrectly somewhere. If I drop your initial learning rate by an order of magnitude or more, it begins to minimize, but it will not work for me either when I start at 0.5.
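
A sketch of that suggestion plugged into the graph posted above; only initial_learning_rate changes, and the exact value is a guess rather than a tuned setting:

initial_learning_rate = 0.05  # an order of magnitude below the original 0.5

global_step = tf.Variable(0)  # count the number of steps taken.
learning_rate = tf.train.exponential_decay(initial_learning_rate, global_step,
                                           100000, 0.95, staircase=True)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss,
                                                                      global_step=global_step)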
@zhuanquan I would suggest initializing your weight variables with a standard deviation between 0.1 and 0.2, i.e. weights = tf.Variable(tf.truncated_normal([size], stddev=stdvalue)).
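
Spelled out against the weight variables in the code above (the 0.1 is the low end of the commenter's suggested range, not a tuned value):

hidden_weights1 = tf.Variable(
    tf.truncated_normal([image_size * image_size, hidden_nodes1], stddev=0.1))
hidden_weights2 = tf.Variable(
    tf.truncated_normal([hidden_nodes1, hidden_nodes2], stddev=0.1))
weights = tf.Variable(
    tf.truncated_normal([hidden_nodes2, num_labels], stddev=0.1))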
Why do you use np.random.choice(np.arange(5)) instead of just np.random.choice(5)? Just looking at the docs: https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.choice.html and wondering what I am missing. Are they the same or slightly different? Or is this way just easier to understand?
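
Per the linked docs, an integer first argument a is sampled as if it were np.arange(a), so the two calls draw from the same five values; a quick illustrative check:

import numpy as np

# Both forms sample uniformly from {0, 1, 2, 3, 4}.
print(sorted({int(np.random.choice(np.arange(5))) for _ in range(1000)}))  # [0, 1, 2, 3, 4]
print(sorted({int(np.random.choice(5)) for _ in range(1000)}))             # [0, 1, 2, 3, 4]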
Why was the number of steps not reduced in the answer to Problem 2? The problem asks to restrict the training data to just a few batches, but num_steps was kept at 3001. Could you please clarify?