ganindu7 · August 28, 2024 13:39
diff --git a/.gitignore b/.gitignore
 # NOTE: gitignore patterns that start with "./" never match anything;
 # a leading "/" anchors the pattern to the directory of this .gitignore.
 /data/*
 logs
 runs
 /img/*
 tiny-imagenet-200
 *.pth
diff --git a/alexnet_replica.ipynb b/alexnet_replica.ipynb
diff --git a/alexnet_with_tensorboard.ipynb b/alexnet_with_tensorboard.ipynb
diff --git a/custom_datasets.py b/custom_datasets.py
 import os 
 import pandas as pd 
 from torchvision.io import read_image
 from torch.utils.data import Dataset

 class CustomImageDataset(Dataset):
     """Dataset that pairs image files with labels listed in a CSV file.

     Column 0 of the annotation table is joined onto ``img_dir`` to locate
     the image file; column 1 is returned as the label.
     """

     def __init__(self, annotation_file, img_dir, transform=None, target_transform=None):
         """Read the annotation CSV and remember the directory and transforms.

         Args:
             annotation_file: anything ``pandas.read_csv`` accepts.
             img_dir: directory that the file names in column 0 are relative to.
             transform: optional callable applied to each loaded image.
             target_transform: optional callable applied to each label.
         """
         self.transform = transform
         self.target_transform = target_transform
         self.img_dir = img_dir
         self.img_labels = pd.read_csv(annotation_file)

     def __len__(self):
         """Return the number of rows in the annotation table."""
         return len(self.img_labels)

     def __getitem__(self, index):
         """Load row ``index`` and return it as an ``(image, label)`` pair."""
         file_name = self.img_labels.iloc[index, 0]
         image = read_image(os.path.join(self.img_dir, file_name))
         label = self.img_labels.iloc[index, 1]
         # Falsy transforms (the default None) leave the value untouched.
         image = self.transform(image) if self.transform else image
         label = self.target_transform(label) if self.target_transform else label
         return image, label

        
        
diff --git a/data b/data
 ../../data
diff --git a/dataloader.py b/dataloader.py
 from torchvision import datasets
 from torchvision.transforms import ToTensor
 import matplotlib.pyplot as plt
 from torch.utils.data import DataLoader

 # Download (if not already present under ./data) and load the FashionMNIST
 # training split, converting every image to a tensor.
 training_data = datasets.FashionMNIST(
     root="data",
     train=True,  # a real bool: any non-empty string, even "False", is truthy
     download=True,
     transform=ToTensor(),
 )

 # Human-readable name for each integer class label.
 labels_map = {
     0: "Tshirt",
     1: "Trouser",
     2: "Pullover",
     3: "Dress",
     4: "Coat",
     5: "Sandal",
     6: "Shirt",
     7: "Sneaker",
     8: "Bag",
     9: "Ankle boot",
 }

 figure = plt.figure(figsize=(8, 8))

 # Wrap the dataset in a shuffling DataLoader and pull a single batch from it.
 train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
 train_features, train_labels = next(iter(train_dataloader))
 print(f"Feature batch shape: {train_features.size()}")
 print(f"Labels batch shape: {train_labels.size()}")

 # Show the first sample of the batch together with its class name.
 img = train_features[0].squeeze()
 label = int(train_labels[0])
 print(f"Label: {labels_map[label]}")
 plt.title(labels_map[label])
 plt.imshow(img, cmap="gray")
 plt.show()
diff --git a/eager_model.py b/eager_model.py
 import os
 import torch 
 from torch import nn
 from torch.utils.data import DataLoader
 from torchvision import datasets, transforms 

 # Prefer the GPU when torch reports CUDA as available; otherwise use the CPU.
 if torch.cuda.is_available():
     device = 'cuda'
 else:
     device = 'cpu'
 print(f'Using {device} device')

 class NeuralNetwork(nn.Module):
     """Fully connected classifier: flatten, then 784 -> 512 -> 512 -> 10.

     A ReLU follows each of the first two linear layers; the last layer
     emits one raw score (logit) per class.
     """

     def __init__(self):
         super().__init__()
         self.flatten = nn.Flatten()
         # Built in forward order; nn.Sequential numbers them 0..4.
         stack_layers = [
             nn.Linear(28*28, 512),
             nn.ReLU(),
             nn.Linear(512, 512),
             nn.ReLU(),
             nn.Linear(512, 10),
         ]
         self.linear_relu_stack = nn.Sequential(*stack_layers)

     def forward(self, x):
         """Flatten ``x`` and return the logits produced by the linear stack."""
         return self.linear_relu_stack(self.flatten(x))


 # Build the network, move it to the selected device and show its structure.
 model = NeuralNetwork()
 model = model.to(device)
 print(model)

 # Feed one random 28x28 input through the (untrained) model and report the
 # class index with the highest softmax probability.
 X = torch.rand(1, 28, 28, device=device)
 logits = model(X)
 pred_prob = nn.Softmax(dim=1)(logits)
 y_pred = pred_prob.argmax(1)
 print(f"Prediction: {y_pred}")
diff --git a/img b/img
 ../../img/
diff --git a/loading_models.py b/loading_models.py
 import torch
 from torchvision import datasets
 from torchvision.transforms import ToTensor
 from quickstart import NeuralNetwork

 # Loading a model means re-creating the model structure and then loading the
 # saved state dictionary into it.
 model = NeuralNetwork()
 # The checkpoint may have been saved from a model living on a CUDA device;
 # map_location lets it load on CPU-only machines as well. This model instance
 # is never moved off the CPU, so "cpu" is the right target.
 model.load_state_dict(torch.load("model.pth", map_location="cpu"))

 # Switch to evaluation mode before running inference.
 model.eval()

 # Class names indexed by the integer label.
 classes = [
     "T-shirt/top",
     "Trouser",
     "Pullover",
     "Dress",
     "Coat",
     "Sandal",
     "Shirt",
     "Sneaker",
     "Bag",
     "Ankle boot",
 ]

 test_data = datasets.FashionMNIST(
     root="data",
     train=False,
     download=True,
     transform=ToTensor(),
 )

 # First test sample: image tensor and its integer label (one lookup only).
 x, y = test_data[0]

 # No gradients are needed for a single prediction.
 with torch.no_grad():
     pred = model(x)
     predicted, actual = classes[pred[0].argmax(0)], classes[y]
     print(f"Predicted: {predicted}, Actual: {actual}")
diff --git a/model_params.py b/model_params.py
 import torch
 from torch import nn
 from torchvision import datasets
 from torchvision.transforms import ToTensor

 from torchvision import datasets, transforms 

 # Pick the compute device once: CUDA when torch can use it, the CPU otherwise.
 _cuda_ok = torch.cuda.is_available()
 device = 'cuda' if _cuda_ok else 'cpu'
 print(f'Using {device} device')

 '''
 Define a NN
 '''

 class NeuralNetwork(nn.Module):
     """Three-layer perceptron over flattened 28x28 inputs with 10 outputs."""

     def __init__(self):
         super().__init__()
         n_inputs, n_hidden, n_outputs = 28*28, 512, 10
         self.flatten = nn.Flatten()
         self.linear_relu_stack = nn.Sequential(
             nn.Linear(n_inputs, n_hidden),
             nn.ReLU(),
             nn.Linear(n_hidden, n_hidden),
             nn.ReLU(),
             nn.Linear(n_hidden, n_outputs),
         )

     def forward(self, x):
         """Return the raw class scores (logits) for input ``x``."""
         flat = self.flatten(x)
         return self.linear_relu_stack(flat)


 # Acquire some training data: the FashionMNIST training split, downloaded
 # into ./data when it is not there yet, with images converted to tensors.
 training_data = datasets.FashionMNIST(
     root="data",
     train=True,  # a real bool: any non-empty string, even "False", is truthy
     download=True,
     transform=ToTensor(),
 )

 # Get some image tensors from the dataset: `listsize` samples drawn at
 # random (with replacement); only the image of each sample is kept.
 listsize = 3
 image_list = []
 for _ in range(listsize):
     sample_idx = torch.randint(len(training_data), size=(1,)).item()
     image_list.append(training_data[sample_idx][0])


 # Instantiate the model and pass an image through it.
 # Note: the model is still untrained.
 model = NeuralNetwork()
 model = model.to(device)
 image = image_list[0].to(device)
 logits = model(image)
 pred_prob = nn.Softmax(dim=1)(logits)
 y_pred = pred_prob.argmax(1)
 print(f"Prediction: {y_pred}")

 # Show the module tree, then the first two entries of every parameter.
 print(f"Model Structure: {model} ")

 for name, param in model.named_parameters():
     print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")
diff --git a/network_components.py b/network_components.py
 import torch
 from torch import nn
 from torchvision import datasets
 from torchvision.transforms import ToTensor
 # import matplotlib.pyplot as plt
 # import  numpy as np

 # FashionMNIST training split, downloaded into ./data when missing, with
 # every image converted to a tensor.
 training_data = datasets.FashionMNIST(
     root="data",
     train=True,  # a real bool: any non-empty string, even "False", is truthy
     download=True,
     transform=ToTensor(),
 )

 # Draw `listsize` random samples (with replacement) and keep only the image
 # tensor of each; the label is not needed here.
 listsize = 3
 image_list = []
 for _ in range(listsize):
     sample_idx = torch.randint(len(training_data), size=(1,)).item()
     image_list.append(training_data[sample_idx][0])

 # Each image pulled from the training data has dimensions
 # [1 = channels, 28, 28], so a stack of n images has dimensions
 # [n, 1, 28, 28]. Squeezing dim 1 removes the singleton channel axis and
 # leaves an [n, 28, 28] tensor. To verify the intermediate shape, print
 # torch.stack(image_list, 0).size().
 images = torch.stack(image_list, 0).squeeze(1)  # image tensor
 print(f"Images are now stacked into a tensor of dims: {images.size()}")

 # The Flatten layer conditions the 2D input by reducing its dimensions.
 flatten = nn.Flatten()
 flat_images = flatten(images)

 print(f"Now we have a tensor of 1D values (flattened 2D values) representing images: {flat_images.size()}")

 # First linear layer on its own: 784 input features -> 512 hidden units.
 layer1 = nn.Linear(in_features=28*28, out_features=512)
 hidden1 = layer1(flat_images)
 print(f"Size of the hidden linear layer [1] = {hidden1.size()}")

 # Compare the activations before and after the ReLU non-linearity.
 print(f"Before ReLU: {hidden1}")
 hidden1 = nn.ReLU()(hidden1)
 print(f"After ReLU: {hidden1}")

 # Sequential net made from the modules above (flatten and layer1 are reused).
 seq_modules = nn.Sequential(
     flatten,     # flattened (from 28x28/2D to 784/1D) feature tensors
     layer1,      # input: flattened features, output 512 units
     nn.ReLU(),
     nn.Linear(512, 10),
 )

 logits = seq_modules(images)

 print(f"logits after seqential network = {logits}")

 # The 'dim' parameter is the dimension along which the values must sum to 1.
 softmax = nn.Softmax(dim=1)
 predicted_probabilities = softmax(logits)

 print(f"predicted probababilities = {predicted_probabilities}")
diff --git a/nn_refresher.ipynb b/nn_refresher.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Using basic torch functionality: We will import torch in the most basic way and use it to perform tasks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch # import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sum:  tensor([5., 7., 9.])\n"
     ]
    }
   ],
   "source": [
    "# create a tensor\n",
    "x = torch.tensor([1.0, 2.0, 3.0])\n",
    "y = torch.tensor([4.0, 5.0, 6.0])\n",
    "\n",
    "# basic operations\n",
    "\n",
    "z = x + y\n",
    "print(\"sum: \", z)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "d =  tensor(4., grad_fn=<MeanBackward0>)\n",
      "Gradients:  tensor([0.6667, 0.6667, 0.6667])\n"
     ]
    }
   ],
   "source": [
    "# Autograd Example \n",
    "\n",
    "a = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)\n",
    "\n",
    "'''\n",
    "In this case 'a' is a user created tensor that has enebaled gradient tracking.\n",
    "\n",
    "Gradient tracking eabled tensor operations are linked to a graph structure that can store information about the operations \n",
    "they are involved in.\n",
    "\n",
    "These user created gradient tracking tensors (that are connected to other tensors with a single edge in the graph structure) are \n",
    "also called leaf tensors, these leaf tensors when operated with other tensors create new tensors (forming new connections in the graph) that \n",
    "can store information as the graph structure grows.\n",
    "\n",
    "a tensor operation (let's say a multiply operation) transforms the value through a gradient and the edge in the \n",
    "graph from the input tensor to the output tensor holds(representatively) the gradient of that particular operation. This also \n",
    "means that the edge to the other operand can hold a gradient represnting the trasnsformation along that path.\n",
    "\n",
    "This behaviour makes certion operaitons problematic in this context. let's say we do an in place operation \n",
    "without creating a new tensor. this causes a circular graph connection that does not involve an another node.\n",
    "this effectively breaks up the gradient storing operation becasue if we backtrack through the edge (now circular becasue \n",
    "it comes back to the symbol a) we should return to a past a where the variable is at it's pre-transform value. The inplace operation\n",
    "eassentally deletes the record of the past.\n",
    "\n",
    "so essentailly in place operations for gradient tracking enabled tensors are invalid\n",
    "\n",
    "a = a * 0.5 # will throw an error \n",
    "\n",
    "'''\n",
    "\n",
    "b = a * 2 # here we are creating a new node in the graph and the gradient will be saved.\n",
    "\n",
    "c = b.sum() # [1.0 + 2.0 + 3.0] x 2 == 12 (gradient only comes from multiplication)\n",
    "\n",
    "\n",
    "d = b.mean() # [1.0 + 2.0 + 3.0] x 2/3  == 4.0 (gradient comes from the multiplication and the divison in the averagining)\n",
    "\n",
    "'''\n",
    "The reason we need to use the mean function is because the gradient is designed to use with tha scaler output \n",
    "such as an overall loss of an objective.\n",
    "'''\n",
    "\n",
    "# c.backward()\n",
    "# print(\"c = \", c)\n",
    "\n",
    "d.backward()\n",
    "print(\"d = \", d)\n",
    "\n",
    "# Gradients \n",
    "\n",
    "print(\"Gradients: \", a.grad)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Crafting a simple network Part 1: Here we will create a primiive network from pytorch fundemantals."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.10/dist-packages/torch/cuda/__init__.py:619: UserWarning: Can't initialize NVML\n",
      "  warnings.warn(\"Can't initialize NVML\")\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1, Loss = 0.21906575560569763\n",
      "Epoch 2, Loss = 0.1309550702571869\n",
      "Epoch 3, Loss = 0.0665150135755539\n",
      "Epoch 4, Loss = 0.18628628551959991\n",
      "Epoch 5, Loss = 0.01886335015296936\n"
     ]
    }
   ],
   "source": [
    "# import torch \n",
    "import torch.nn as nn\n",
    "import torch.optim as optim \n",
    "import torch.utils\n",
    "import torch.utils.data\n",
    "from torchvision import datasets, transforms\n",
    "\n",
    "# Define the network\n",
    "class SuperSimpleNN(nn.Module):\n",
    "    def __init__(self):\n",
    "        super(SuperSimpleNN, self).__init__()\n",
    "        self.fc1 = nn.Linear(28*28, 128)\n",
    "        self.fc2 = nn.Linear(128, 10)\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = x.view(-1, 28*28)\n",
    "        x = torch.relu(self.fc1(x))\n",
    "        x = self.fc2(x)\n",
    "        return x\n",
    "    \n",
    "# Load dataset \n",
    "transform =  transforms.Compose([transforms.ToTensor()])\n",
    "train_dataset = datasets.MNIST(root='./data/MNIST', train=True, download=True, transform=transform)\n",
    "train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)\n",
    "\n",
    "# Initialize network \n",
    "model = SuperSimpleNN()\n",
    "criterion = nn.CrossEntropyLoss()\n",
    "optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
    "\n",
    "# Training loop \n",
    "for epoch in range(5):\n",
    "    for batch_idx, (data, target) in enumerate(train_loader):\n",
    "        optimizer.zero_grad() # reset trainiable params\n",
    "        output = model(data)  # get output  \n",
    "        loss = criterion(output, target) # compare output vs target and obtain loss \n",
    "        loss.backward()  # backpropagate\n",
    "        optimizer.step() # update trainable parameters\n",
    "\n",
    "\n",
    "    print(f'Epoch {epoch+1}, Loss = {loss.item()}')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Crafting a simple network Part 2 :Creating a Network to classify FlashionMNIST labels! "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If we look at the above code we essentially do four things to end up with a better model.\n",
    "\n",
    "1. Get model output: run a forward pass \n",
    "2. Get the loss: compare the model output and target \n",
    "3. backpropagate: run the backward pass and calculate the gradients \n",
    "4. update trainable parameters \n",
    "\n",
    "we do this over a few epochs and the loss will usually trend downwards as shown in the example above. \n",
    "\n",
    "Now let;s try a slighly advanced model, here we will employ validation, save checkpoints and handle overfitting."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "ename": "SyntaxError",
     "evalue": "invalid syntax. Perhaps you forgot a comma? (97522079.py, line 35)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;36m  Cell \u001b[0;32mIn[5], line 35\u001b[0;36m\u001b[0m\n\u001b[0;31m    nn.Linear(self.input_resolution, 128)\u001b[0m\n\u001b[0m    ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax. Perhaps you forgot a comma?\n"
     ]
    }
   ],
   "source": [
    "import torch \n",
    "import torch.nn as nn\n",
    "import torch.optim as optim \n",
    "from torchvision import datasets, transforms\n",
    "\n",
    "from torch.utils.data import DataLoader, random_split\n",
    "\n",
    "'''\n",
    "This is basically the SuperSimpleNN:\n",
    "    has to fc layers with a ReLU nonlinearity inbetween them : \n",
    "                fc layer 1 (hidden) -> (28*28) x 128 (128 neurones)\n",
    "                fc layer 2 (output) -> 128 x 10 (10 neurones)                      \n",
    "'''\n",
    "class simpleNN(nn.Module):\n",
    "    def __init__(self):\n",
    "        super(simpleNN, self).__init__()\n",
    "        self.fc1 = nn.Linear(28*28, 128)\n",
    "        self.fc2 = nn.Linear(128, 10)\n",
    "\n",
    "    def forward(self, x: torch.Tensor) -> torch.Tensor:\n",
    "        x = x.view(-1, 28*28)\n",
    "        x = torch.relu(self.fc1(x))\n",
    "        x = self.fc2(x)\n",
    "        return x\n",
    "\n",
    "\n",
    "'''\n",
    "\n",
    "'''\n",
    "class bitFancyNN(nn.Module):\n",
    "    def __init__(self):\n",
    "        super(bitFancyNN, self).__init__()\n",
    "        \n",
    "        self.conv1 = nn.Conv2d(1, 32, 3, 1)\n",
    "        self.conv2 = nn.Conv2d(32, 64, 3, 1)\n",
    "\n",
    "        \n",
    "\n",
    "        self.fc1_block = nn.Sequential(\n",
    "            nn.Linear(28*28, 128),\n",
    "            nn.ReLU\n",
    "        )\n",
    "        self\n",
    "\n",
    "\n",
    "transform = transforms.Compose([transforms.ToTensor()])\n",
    "dataset =  datasets.MNIST(root='./data/MNIST', train=True, download=True, transform=transform)\n",
    "\n",
    "#split into training a d validation\n",
    "train_size = int(0.8 * len(dataset))\n",
    "val_size = len(dataset) - train_size\n",
    "train_dataset, val_dataset = random_split(dataset, [train_size, val_size])\n",
    "\n",
    "train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)\n",
    "val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)\n",
    "\n",
    "model = simpleNN()\n",
    "criterion = nn.CrossEntropyLoss()\n",
    "optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
    "\n",
    "best_val_loss = float('inf')\n",
    "\n",
    "num_epochs = 5\n",
    "\n",
    "for epoch in range(num_epochs):\n",
    "\n",
    "    model.train() # for each epoch we need to set the model to training mode\n",
    "    for data, target in train_loader: # train loader outputs data and target\n",
    "        optimizer.zero_grad()\n",
    "        output = model(data)\n",
    "        loss = criterion(output, target)\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "    model.eval() # put the model into eval mode \n",
    "    val_loss = 0\n",
    "    with torch.no_grad():\n",
    "        for data, target in val_loader:\n",
    "            output = model(data)\n",
    "            val_loss += criterion(output, target).item()\n",
    "    \n",
    "    val_loss /= len(val_loader)\n",
    "    print(f'Epoch {epoch+1}: Validation Loss = {val_loss}')\n",
    "\n",
    "    # save model if validation loss has decreased\n",
    "    if val_loss < best_val_loss:\n",
    "        best_val_loss = val_loss\n",
    "        torch.save(model.state_dict(), 'best_model.pth')\n",
    "\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import torch # commented out becase we imported this earlier (uncomment if you skip the above steps)\n",
    "from torch import nn\n",
    "from torch.utils.data import DataLoader\n",
    "from torchvision import datasets \n",
    "from torchvision.transforms import ToTensor, Lambda, Compose \n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "FashonMNIST dataset is pre set in the library and this declations will try to download the dataset if not present already."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "Training data\n",
    "'''\n",
    "\n",
    "training_data = datasets.FashionMNIST(\n",
    "                                        root=\"./data/MNIST\",\n",
    "                                        train=\"True\",\n",
    "                                        download=\"True\",\n",
    "                                        transform=ToTensor(),\n",
    "\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "here we set up the parameters for the dataloader, in this case we will set up a custom batch_size "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "set up DataLoader\n",
    "'''\n",
    "batch_size = 64\n",
    "train_dataloader = DataLoader(training_data, batch_size=batch_size)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's say we are setting up a simple single-layer network for binary classification.\n",
    "The input data consists of small images, each 28 pixels in height and 28 pixels in width. As a preprocessing step, these images are flattened by reducing the 2D spatial dimensions into a 1D vector of 784 elements (since 28 × 28 = 784). These flattened vectors are then stacked into batches during the data loading stage.\n",
    "\n",
    "For example, if we choose a batch size of M, the input to the network will have dimensions M × 784.\n",
    "\n",
    "Let's assume we are using a single fully connected layer as our model. This layer will have dimensions 784 × 2, where the two output nodes correspond to the two classes in our binary classification task. After the matrix multiplication between the input batch M × 784 and the layer weights 784 × 2, the resulting output tensor will have dimensions M × 2. This output represents the logits for each class across the batch.\n",
    "\n",
    "Subsequent activation (e.g., softmax) will convert these logits into probabilities, allowing us to classify each image in the batch into one of the two categories we are concerened about.\n",
    "\n",
    "In the example below we are classifying between 10 categories becasue the MNIST dataset comes with labels for 10 categories. \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "Model Creation\n",
    "'''\n",
    "\n",
    "from typing import Callable\n",
    "\n",
    "# Model definition\n",
    "class NeuralNetwork(nn.Module):\n",
    "    def __init__(self):\n",
    "        super(NeuralNetwork, self).__init__()\n",
    "        self.flatten = nn.Flatten()\n",
    "        self.device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "        self.linear_relu_stack = nn.Sequential(\n",
    "            nn.Linear(28*28, 512),\n",
    "            nn.ReLU(),\n",
    "            nn.Linear(512, 512),\n",
    "            nn.ReLU(),\n",
    "            nn.Linear(512, 10) # we have 10 classes so we channel the output into 10 slots \n",
    "        )\n",
    "\n",
    "        self.loss_fn = nn.CrossEntropyLoss()\n",
    "        self.optimizer = torch.optim.SGD(self.parameters(), lr=1e-3)\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = self.flatten(x)\n",
    "        logits = self.linear_relu_stack(x)\n",
    "        return logits\n",
    "\n",
    "\n",
    "def train(dataloader: DataLoader, \n",
    "          model: NeuralNetwork, \n",
    "          loss_fn: Callable,\n",
    "          optimizer: Callable):\n",
    "    size = len(train_dataloader.dataset)\n",
    "    model.train() # set model to training mode\n",
    "\n",
    "    for batch, (X, y) in enumerate(dataloader):\n",
    "        X, y = X.to(model.device), y.to(model.device)\n",
    "\n",
    "        # compute prediction error\n",
    "        pred = model(X)\n",
    "        loss = loss_fn(pred, y)\n",
    "\n",
    "        # backprop\n",
    "        optimizer.zero_grad()\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "        if batch % 100 == 0:\n",
    "            loss, current = loss.item(), batch * len(X)\n",
    "            print(f\"loss: {loss:7f} [{current:>5d}/{size:>5d}]\")\n",
    "\n",
    "\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# device= \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "model = NeuralNetwork()\n",
    "model = model.to(model.device)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NeuralNetwork(\n",
      "  (flatten): Flatten(start_dim=1, end_dim=-1)\n",
      "  (linear_relu_stack): Sequential(\n",
      "    (0): Linear(in_features=784, out_features=512, bias=True)\n",
      "    (1): ReLU()\n",
      "    (2): Linear(in_features=512, out_features=512, bias=True)\n",
      "    (3): ReLU()\n",
      "    (4): Linear(in_features=512, out_features=10, bias=True)\n",
      "  )\n",
      "  (loss_fn): CrossEntropyLoss()\n",
      ")\n"
     ]
    }
   ],
   "source": [
    "print(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1\n",
      "----------\n",
      "loss: 2.300676 [    0/60000]\n",
      "loss: 2.289026 [ 6400/60000]\n",
      "loss: 2.269229 [12800/60000]\n",
      "loss: 2.268934 [19200/60000]\n",
      "loss: 2.243828 [25600/60000]\n",
      "loss: 2.217938 [32000/60000]\n",
      "loss: 2.218239 [38400/60000]\n",
      "loss: 2.186219 [44800/60000]\n",
      "loss: 2.193820 [51200/60000]\n",
      "loss: 2.150481 [57600/60000]\n",
      "Epoch 2\n",
      "----------\n",
      "loss: 2.159364 [    0/60000]\n",
      "loss: 2.147613 [ 6400/60000]\n",
      "loss: 2.091816 [12800/60000]\n",
      "loss: 2.114179 [19200/60000]\n",
      "loss: 2.043714 [25600/60000]\n",
      "loss: 1.988512 [32000/60000]\n",
      "loss: 2.013317 [38400/60000]\n",
      "loss: 1.930811 [44800/60000]\n",
      "loss: 1.953270 [51200/60000]\n",
      "loss: 1.858937 [57600/60000]\n",
      "Epoch 3\n",
      "----------\n",
      "loss: 1.905293 [    0/60000]\n",
      "loss: 1.867958 [ 6400/60000]\n",
      "loss: 1.757275 [12800/60000]\n",
      "loss: 1.804702 [19200/60000]\n",
      "loss: 1.668520 [25600/60000]\n",
      "loss: 1.630944 [32000/60000]\n",
      "loss: 1.655383 [38400/60000]\n",
      "loss: 1.555281 [44800/60000]\n",
      "loss: 1.590403 [51200/60000]\n",
      "loss: 1.472981 [57600/60000]\n",
      "Epoch 4\n",
      "----------\n",
      "loss: 1.569801 [    0/60000]\n",
      "loss: 1.527082 [ 6400/60000]\n",
      "loss: 1.383564 [12800/60000]\n",
      "loss: 1.460762 [19200/60000]\n",
      "loss: 1.321409 [25600/60000]\n",
      "loss: 1.330832 [32000/60000]\n",
      "loss: 1.348618 [38400/60000]\n",
      "loss: 1.268446 [44800/60000]\n",
      "loss: 1.307505 [51200/60000]\n",
      "loss: 1.208527 [57600/60000]\n",
      "Epoch 5\n",
      "----------\n",
      "loss: 1.313115 [    0/60000]\n",
      "loss: 1.287318 [ 6400/60000]\n",
      "loss: 1.124761 [12800/60000]\n",
      "loss: 1.239573 [19200/60000]\n",
      "loss: 1.100252 [25600/60000]\n",
      "loss: 1.134562 [32000/60000]\n",
      "loss: 1.162831 [38400/60000]\n",
      "loss: 1.091245 [44800/60000]\n",
      "loss: 1.136182 [51200/60000]\n",
      "loss: 1.056075 [57600/60000]\n"
     ]
    }
   ],
   "source": [
    "epochs = 5\n",
    "for epoch in range(epochs):\n",
    "    print(f\"Epoch {epoch+1}\\n----------\")\n",
    "    train(train_dataloader, model, model.loss_fn, model.optimizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
diff --git a/quickstart.py b/quickstart.py
 import torch 
 from torch import nn
 from torch.utils.data import DataLoader
 from torchvision import datasets
 from torchvision.transforms import ToTensor, Lambda, Compose
 import matplotlib.pyplot as plt


 '''
 Model creation
 '''

 # Model definition 
 class NeuralNetwork(nn.Module):
     """Fully connected FashionMNIST classifier that carries its own training tools.

     Besides the layers, each instance stores the device string chosen at
     construction time (``self.device``), a cross-entropy loss
     (``self.loss_fn``) and an SGD optimizer over its own parameters
     (``self.optimizer``).
     """

     def __init__(self):
         super().__init__()
         self.flatten = nn.Flatten()
         # Only records the preferred device; the caller still has to move
         # the module there with .to(...).
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         layers = (
             nn.Linear(28 * 28, 512),
             nn.ReLU(),
             nn.Linear(512, 512),
             nn.ReLU(),
             nn.Linear(512, 10),
         )
         self.linear_relu_stack = nn.Sequential(*layers)

         self.loss_fn = nn.CrossEntropyLoss()
         self.optimizer = torch.optim.SGD(self.parameters(), lr=1e-3)

     def forward(self, x):
         """Flatten ``x`` and return the logits from the linear stack."""
         return self.linear_relu_stack(self.flatten(x))



 def train(dataloader, model, loss_fn, optimizer):
     """Run one training pass over ``dataloader``, stepping after every batch.

     Each batch is moved to ``model.device``, the loss is computed with
     ``loss_fn`` and back-propagated, and ``optimizer`` updates the
     parameters. The loss and a running sample count are printed every
     100th batch. Returns nothing.
     """
     size = len(dataloader.dataset)
     model.train()
     for batch, (X, y) in enumerate(dataloader):
         X = X.to(model.device)
         y = y.to(model.device)

         # Forward pass and prediction error.
         loss = loss_fn(model(X), y)

         # Backward pass and parameter update.
         optimizer.zero_grad()
         loss.backward()
         optimizer.step()

         if batch % 100 != 0:
             continue
         # Progress report: batch index times the size of this batch.
         current = batch * len(X)
         print(f"loss: {loss.item():>7f} [{current:>5d}/{size:>5d}]")

 def test(dataloader, model, loss_fn):
     """Evaluate ``model`` on ``dataloader`` and print accuracy and average loss.

     The model is switched to eval mode and gradients are disabled. The
     printed loss is the mean of the per-batch losses; the printed accuracy
     is the share of correctly classified samples in the whole dataset.
     """
     size = len(dataloader.dataset)
     num_batches = len(dataloader)
     model.eval()
     test_loss = 0
     correct = 0
     with torch.no_grad():
         for X, y in dataloader:
             X = X.to(model.device)
             y = y.to(model.device)
             pred = model(X)
             test_loss += loss_fn(pred, y).item()
             # A sample counts as a hit when its highest-scoring class equals the label.
             hits = (pred.argmax(1) == y).type(torch.float)
             correct += hits.sum().item()
     test_loss /= num_batches
     correct /= size
     print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")




 if __name__ == "__main__":

     # Download the FashionMNIST training and test splits from open datasets.
     training_data = datasets.FashionMNIST(
         root="data",
         train=True,
         download=True,
         transform=ToTensor(),
     )
     test_data = datasets.FashionMNIST(
         root="data",
         train=False,
         download=True,
         transform=ToTensor(),
     )

     # Pass the data to a DataLoader: it wraps an iterable over the dataset
     # and supports automatic batching and sampling. With a batch size of 64,
     # each element it yields is a batch of 64 features and labels.
     batch_size = 64
     train_dataloader = DataLoader(training_data, batch_size=batch_size)
     test_dataloader = DataLoader(test_data, batch_size=batch_size)

     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = NeuralNetwork().to(device)

     # Alternate one training pass and one evaluation pass per epoch, using
     # the loss function and optimizer stored on the model itself.
     epochs = 5
     for t in range(epochs):
         print(f"Epoch {t+1}\n-----------------------")
         train(train_dataloader, model, model.loss_fn, model.optimizer)
         test(test_dataloader, model, model.loss_fn)
     print("Done")

     # Persist only the learned parameters (the state dict).
     torch.save(model.state_dict(), "model.pth")
     print("saved model")


diff --git a/visualise_data.py b/visualise_data.py
 import torch
 # from torch.utils.data import Dataset
 from torchvision import datasets
 from torchvision.transforms import ToTensor
 import matplotlib.pyplot as plt

 # Load the FashionMNIST training split from the local "data" directory,
 # downloading it first if necessary. `train` is a boolean flag: the previous
 # string "True" only worked because any non-empty string is truthy.
 training_data = datasets.FashionMNIST(root="data",
                                       train=True,
                                       download=True,
                                       transform=ToTensor()
                                      )

 # Human-readable name for each integer class label.
 labels_map = {
    0: "Tshirt",
    1: "Trouser",
    2: "Pullover",
    3: "Dress",
    4: "Coat",
    5: "Sandal",
    6: "Shirt",
    7: "Sneaker",
    8: "Bag",
    9: "Ankle boot",
 }

 # Show a 3x3 grid of randomly chosen training images with their class names.
 figure = plt.figure(figsize=(8,8))
 cols, rows = 3, 3
 for i in range(1, cols*rows + 1):
    sample_idx = torch.randint(len(training_data), size=(1,)).item()
    img, label = training_data[sample_idx]
    figure.add_subplot(rows, cols, i)
    plt.title(labels_map[label])
    plt.axis("off")
    # squeeze() drops singleton dimensions so imshow receives a 2-D image.
    plt.imshow(img.squeeze(), cmap="gray")
 plt.show()
	import os
	import pandas as pd
	from torchvision.io import read_image
	from torch.utils.data import Dataset

	class CustomImageDataset(Dataset):
	    """Dataset of images listed in a CSV annotation file.

	    The CSV is read with ``pandas.read_csv``; column 0 of each row is the
	    image file name (relative to ``img_dir``) and column 1 is its label.
	    """

	    def __init__(self, annotation_file, img_dir, transform=None, target_transform=None):
	        """Load the annotation table and remember where the images live.

	        Args:
	            annotation_file: Path (or file-like object) accepted by
	                ``pandas.read_csv``.
	            img_dir: Directory that the file names in column 0 are joined onto.
	            transform: Optional callable applied to each loaded image.
	            target_transform: Optional callable applied to each label.
	        """
	        self.img_labels = pd.read_csv(annotation_file)
	        self.img_dir = img_dir
	        self.transform = transform
	        self.target_transform = target_transform

	    def __len__(self):
	        """Return the number of rows in the annotation table."""
	        return len(self.img_labels)

	    def __getitem__(self, index):
	        """Return the ``(image, label)`` pair for row ``index``."""
	        img_path = os.path.join(self.img_dir, self.img_labels.iloc[index, 0])
	        image = read_image(img_path)
	        label = self.img_labels.iloc[index, 1]
	        # Compare against None rather than truthiness: a valid transform
	        # object may itself evaluate as falsy (e.g. an empty container).
	        if self.transform is not None:
	            image = self.transform(image)
	        if self.target_transform is not None:
	            label = self.target_transform(label)
	        return image, label
	from torchvision import datasets
	from torchvision.transforms import ToTensor
	import matplotlib.pyplot as plt
	from torch.utils.data import DataLoader

	# Load the FashionMNIST training split from the local "data" directory,
	# downloading it first if necessary. `train` is a boolean flag: the previous
	# string "True" only worked because any non-empty string is truthy.
	training_data = datasets.FashionMNIST(
	    root="data",
	    train=True,
	    download=True,
	    transform=ToTensor(),
	)

	# Human-readable name for each integer class label.
	labels_map = {
	    0: "Tshirt",
	    1: "Trouser",
	    2: "Pullover",
	    3: "Dress",
	    4: "Coat",
	    5: "Sandal",
	    6: "Shirt",
	    7: "Sneaker",
	    8: "Bag",
	    9: "Ankle boot",
	}

	figure = plt.figure(figsize=(8,8))
	# Draw one shuffled batch of 64 samples and inspect its first element.
	train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
	train_features, train_labels = next(iter(train_dataloader))
	print(f"Feature batch shape: {train_features.size()}")
	print(f"Labels batch shape: {train_labels.size()}")
	# squeeze() drops singleton dimensions so imshow receives a 2-D image.
	img = train_features[0].squeeze()
	# Convert the label tensor element to a plain int so it can key labels_map.
	label = int(train_labels[0])
	print(f"Label: {labels_map[label]}")
	plt.title(labels_map[label])
	plt.imshow(img, cmap="gray")
	plt.show()
	import os
	import torch
	from torch import nn
	from torch.utils.data import DataLoader
	from torchvision import datasets, transforms

	# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
	if torch.cuda.is_available():
	    device = 'cuda'
	else:
	    device = 'cpu'
	print(f'Using {device} device')

	class NeuralNetwork(nn.Module):
	    """Fully connected classifier: flatten, then 784 -> 512 -> 512 -> 10 with ReLU in between."""

	    def __init__(self):
	        super().__init__()
	        # Collapses every dimension after the batch dimension into one vector.
	        self.flatten = nn.Flatten()
	        self.linear_relu_stack = nn.Sequential(
	            nn.Linear(28*28, 512),
	            nn.ReLU(),
	            nn.Linear(512, 512),
	            nn.ReLU(),
	            nn.Linear(512, 10),
	        )

	    def forward(self, x):
	        """Return the raw (un-normalised) class scores for ``x``."""
	        x = self.flatten(x)
	        logits = self.linear_relu_stack(x)
	        return logits


	# Build the network, move it to the selected device and show its layers.
	model = NeuralNetwork()
	model = model.to(device)
	print(model)

	# Push one random 28x28 input through the model and report the predicted class.
	X = torch.rand(1, 28, 28, device=device)
	logits = model(X)
	# Softmax over dim 1 turns the logits into per-class probabilities.
	pred_prob = nn.Softmax(dim=1)(logits)
	y_pred = torch.argmax(pred_prob, dim=1)
	print(f"Prediction: {y_pred}")
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Using basic torch functionality: We will import torch in the most basic way and use it to perform tasks."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import torch # import torch"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"sum: tensor([5., 7., 9.])\n"
	]
	}
	],
	"source": [
	"# create a tensor\n",
	"x = torch.tensor([1.0, 2.0, 3.0])\n",
	"y = torch.tensor([4.0, 5.0, 6.0])\n",
	"\n",
	"# basic operations\n",
	"\n",
	"z = x + y\n",
	"print(\"sum: \", z)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"d = tensor(4., grad_fn=<MeanBackward0>)\n",
	"Gradients: tensor([0.6667, 0.6667, 0.6667])\n"
	]
	}
	],
	"source": [
	"# Autograd Example \n",
	"\n",
	"a = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)\n",
	"\n",
	"'''\n",
	"In this case 'a' is a user-created tensor that has gradient tracking enabled.\n",
	"\n",
	"Operations on gradient-tracking tensors are linked into a graph structure that can store information about the operations \n",
	"they are involved in.\n",
	"\n",
	"These user-created gradient-tracking tensors (that are connected to other tensors with a single edge in the graph structure) are \n",
	"also called leaf tensors. When these leaf tensors are operated on with other tensors they create new tensors (forming new connections in the graph) that \n",
	"can store information as the graph structure grows.\n",
	"\n",
	"A tensor operation (let's say a multiply operation) transforms the value through a gradient and the edge in the \n",
	"graph from the input tensor to the output tensor holds (representatively) the gradient of that particular operation. This also \n",
	"means that the edge to the other operand can hold a gradient representing the transformation along that path.\n",
	"\n",
	"This behaviour makes certain operations problematic in this context. Let's say we do an in-place operation \n",
	"without creating a new tensor. This causes a circular graph connection that does not involve another node.\n",
	"This effectively breaks up the gradient storing operation because if we backtrack through the edge (now circular because \n",
	"it comes back to the symbol a) we should return to a past a where the variable is at its pre-transform value. The in-place operation\n",
	"essentially deletes the record of the past.\n",
	"\n",
	"So, essentially, in-place operations on leaf tensors that have gradient tracking enabled are invalid.\n",
	"\n",
	"a *= 0.5 # in-place update of the leaf tensor: will throw an error \n",
	"\n",
	"'''\n",
	"\n",
	"b = a * 2 # here we are creating a new node in the graph and the gradient will be saved.\n",
	"\n",
	"c = b.sum() # [1.0 + 2.0 + 3.0] x 2 == 12 (gradient only comes from multiplication)\n",
	"\n",
	"\n",
	"d = b.mean() # [1.0 + 2.0 + 3.0] x 2/3 == 4.0 (gradient comes from the multiplication and the division in the averaging)\n",
	"\n",
	"'''\n",
	"The reason we need a reduction such as the mean function is that backward() is designed to be used with a scalar output \n",
	"such as an overall loss of an objective.\n",
	"'''\n",
	"\n",
	"# c.backward()\n",
	"# print(\"c = \", c)\n",
	"\n",
	"d.backward()\n",
	"print(\"d = \", d)\n",
	"\n",
	"# Gradients \n",
	"\n",
	"print(\"Gradients: \", a.grad)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Crafting a simple network Part 1: Here we will create a primitive network from PyTorch fundamentals."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/usr/local/lib/python3.10/dist-packages/torch/cuda/__init__.py:619: UserWarning: Can't initialize NVML\n",
	" warnings.warn(\"Can't initialize NVML\")\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Epoch 1, Loss = 0.21906575560569763\n",
	"Epoch 2, Loss = 0.1309550702571869\n",
	"Epoch 3, Loss = 0.0665150135755539\n",
	"Epoch 4, Loss = 0.18628628551959991\n",
	"Epoch 5, Loss = 0.01886335015296936\n"
	]
	}
	],
	"source": [
	"# import torch \n",
	"import torch.nn as nn\n",
	"import torch.optim as optim \n",
	"import torch.utils\n",
	"import torch.utils.data\n",
	"from torchvision import datasets, transforms\n",
	"\n",
	"# Define the network\n",
	"class SuperSimpleNN(nn.Module):\n",
	" def __init__(self):\n",
	" super(SuperSimpleNN, self).__init__()\n",
	" self.fc1 = nn.Linear(28*28, 128)\n",
	" self.fc2 = nn.Linear(128, 10)\n",
	"\n",
	" def forward(self, x):\n",
	" x = x.view(-1, 28*28)\n",
	" x = torch.relu(self.fc1(x))\n",
	" x = self.fc2(x)\n",
	" return x\n",
	" \n",
	"# Load dataset \n",
	"transform = transforms.Compose([transforms.ToTensor()])\n",
	"train_dataset = datasets.MNIST(root='./data/MNIST', train=True, download=True, transform=transform)\n",
	"train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)\n",
	"\n",
	"# Initialize network \n",
	"model = SuperSimpleNN()\n",
	"criterion = nn.CrossEntropyLoss()\n",
	"optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
	"\n",
	"# Training loop \n",
	"for epoch in range(5):\n",
	" for batch_idx, (data, target) in enumerate(train_loader):\n",
	" optimizer.zero_grad() # reset trainiable params\n",
	" output = model(data) # get output \n",
	" loss = criterion(output, target) # compare output vs target and obtain loss \n",
	" loss.backward() # backpropagate\n",
	" optimizer.step() # update trainable parameters\n",
	"\n",
	"\n",
	" print(f'Epoch {epoch+1}, Loss = {loss.item()}')\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Crafting a simple network Part 2: Creating a network to classify FashionMNIST labels!"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"If we look at the above code we essentially do four things to end up with a better model.\n",
	"\n",
	"1. Get model output: run a forward pass \n",
	"2. Get the loss: compare the model output and target \n",
	"3. Backpropagate: run the backward pass and calculate the gradients \n",
	"4. Update trainable parameters \n",
	"\n",
	"We do this over a few epochs and the loss will usually trend downwards as shown in the example above. \n",
	"\n",
	"Now let's try a slightly more advanced model; here we will employ validation, save checkpoints and handle overfitting."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"ename": "SyntaxError",
	"evalue": "invalid syntax. Perhaps you forgot a comma? (97522079.py, line 35)",
	"output_type": "error",
	"traceback": [
	"\u001b[0;36m Cell \u001b[0;32mIn[5], line 35\u001b[0;36m\u001b[0m\n\u001b[0;31m nn.Linear(self.input_resolution, 128)\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax. Perhaps you forgot a comma?\n"
	]
	}
	],
	"source": [
	"import torch \n",
	"import torch.nn as nn\n",
	"import torch.optim as optim \n",
	"from torchvision import datasets, transforms\n",
	"\n",
	"from torch.utils.data import DataLoader, random_split\n",
	"\n",
	"'''\n",
	"This is basically the SuperSimpleNN:\n",
	" has two fc layers with a ReLU nonlinearity in between them: \n",
	" fc layer 1 (hidden) -> (28*28) x 128 (128 neurones)\n",
	" fc layer 2 (output) -> 128 x 10 (10 neurones) \n",
	"'''\n",
	"class simpleNN(nn.Module):\n",
	" def __init__(self):\n",
	" super(simpleNN, self).__init__()\n",
	" self.fc1 = nn.Linear(28*28, 128)\n",
	" self.fc2 = nn.Linear(128, 10)\n",
	"\n",
	" def forward(self, x: torch.Tensor) -> torch.Tensor:\n",
	" x = x.view(-1, 28*28)\n",
	" x = torch.relu(self.fc1(x))\n",
	" x = self.fc2(x)\n",
	" return x\n",
	"\n",
	"\n",
	"'''\n",
	"\n",
	"'''\n",
	"class bitFancyNN(nn.Module):\n",
	" def __init__(self):\n",
	" super(bitFancyNN, self).__init__()\n",
	" \n",
	" self.conv1 = nn.Conv2d(1, 32, 3, 1)\n",
	" self.conv2 = nn.Conv2d(32, 64, 3, 1)\n",
	"\n",
	" \n",
	"\n",
	" self.fc1_block = nn.Sequential(\n",
	" nn.Linear(28*28, 128),\n",
	" nn.ReLU\n",
	" )\n",
	" self\n",
	"\n",
	"\n",
	"transform = transforms.Compose([transforms.ToTensor()])\n",
	"dataset = datasets.MNIST(root='./data/MNIST', train=True, download=True, transform=transform)\n",
	"\n",
	"#split into training a d validation\n",
	"train_size = int(0.8 * len(dataset))\n",
	"val_size = len(dataset) - train_size\n",
	"train_dataset, val_dataset = random_split(dataset, [train_size, val_size])\n",
	"\n",
	"train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)\n",
	"val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)\n",
	"\n",
	"model = simpleNN()\n",
	"criterion = nn.CrossEntropyLoss()\n",
	"optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
	"\n",
	"best_val_loss = float('inf')\n",
	"\n",
	"num_epochs = 5\n",
	"\n",
	"for epoch in range(num_epochs):\n",
	"\n",
	" model.train() # for each epoch we need to set the model to training mode\n",
	" for data, target in train_loader: # train loader outputs data and target\n",
	" optimizer.zero_grad()\n",
	" output = model(data)\n",
	" loss = criterion(output, target)\n",
	" loss.backward()\n",
	" optimizer.step()\n",
	"\n",
	" model.eval() # put the model into eval mode \n",
	" val_loss = 0\n",
	" with torch.no_grad():\n",
	" for data, target in val_loader:\n",
	" output = model(data)\n",
	" val_loss += criterion(output, target).item()\n",
	" \n",
	" val_loss /= len(val_loader)\n",
	" print(f'Epoch {epoch+1}: Validation Loss = {val_loss}')\n",
	"\n",
	" # save model if validation loss has decreased\n",
	" if val_loss < best_val_loss:\n",
	" best_val_loss = val_loss\n",
	" torch.save(model.state_dict(), 'best_model.pth')\n",
	"\n",
	" \n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"# import torch # commented out because we imported this earlier (uncomment if you skip the above steps)\n",
	"from torch import nn\n",
	"from torch.utils.data import DataLoader\n",
	"from torchvision import datasets \n",
	"from torchvision.transforms import ToTensor, Lambda, Compose \n",
	"import matplotlib.pyplot as plt\n",
	"\n",
	"\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"The FashionMNIST dataset ships with the library, and this declaration will try to download the dataset if it is not already present."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"'''\n",
	"Training data\n",
	"'''\n",
	"\n",
	"training_data = datasets.FashionMNIST(\n",
	" root=\"./data/MNIST\",\n",
	" train=\"True\",\n",
	" download=\"True\",\n",
	" transform=ToTensor(),\n",
	"\n",
	")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Here we set up the parameters for the DataLoader; in this case we will set a custom batch_size."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"'''\n",
	"set up DataLoader\n",
	"'''\n",
	"batch_size = 64\n",
	"train_dataloader = DataLoader(training_data, batch_size=batch_size)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Let's say we are setting up a simple single-layer network for binary classification.\n",
	"The input data consists of small images, each 28 pixels in height and 28 pixels in width. As a preprocessing step, these images are flattened by reducing the 2D spatial dimensions into a 1D vector of 784 elements (since 28 × 28 = 784). These flattened vectors are then stacked into batches during the data loading stage.\n",
	"\n",
	"For example, if we choose a batch size of M, the input to the network will have dimensions M × 784.\n",
	"\n",
	"Let's assume we are using a single fully connected layer as our model. This layer will have dimensions 784 × 2, where the two output nodes correspond to the two classes in our binary classification task. After the matrix multiplication between the input batch M × 784 and the layer weights 784 × 2, the resulting output tensor will have dimensions M × 2. This output represents the logits for each class across the batch.\n",
	"\n",
	"Subsequent activation (e.g., softmax) will convert these logits into probabilities, allowing us to classify each image in the batch into one of the two categories we are concerned with.\n",
	"\n",
	"In the example below we are classifying between 10 categories because the FashionMNIST dataset comes with labels for 10 categories. \n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"'''\n",
	"Model Creation\n",
	"'''\n",
	"\n",
	"from typing import Callable\n",
	"\n",
	"# Model definition\n",
	"class NeuralNetwork(nn.Module):\n",
	" def __init__(self):\n",
	" super(NeuralNetwork, self).__init__()\n",
	" self.flatten = nn.Flatten()\n",
	" self.device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
	" self.linear_relu_stack = nn.Sequential(\n",
	" nn.Linear(28*28, 512),\n",
	" nn.ReLU(),\n",
	" nn.Linear(512, 512),\n",
	" nn.ReLU(),\n",
	" nn.Linear(512, 10) # we have 10 classes so we channel the output into 10 slots \n",
	" )\n",
	"\n",
	" self.loss_fn = nn.CrossEntropyLoss()\n",
	" self.optimizer = torch.optim.SGD(self.parameters(), lr=1e-3)\n",
	"\n",
	" def forward(self, x):\n",
	" x = self.flatten(x)\n",
	" logits = self.linear_relu_stack(x)\n",
	" return logits\n",
	"\n",
	"\n",
	"def train(dataloader: DataLoader, \n",
	" model: NeuralNetwork, \n",
	" loss_fn: Callable,\n",
	" optimizer: Callable):\n",
	" size = len(train_dataloader.dataset)\n",
	" model.train() # set model to training mode\n",
	"\n",
	" for batch, (X, y) in enumerate(dataloader):\n",
	" X, y = X.to(model.device), y.to(model.device)\n",
	"\n",
	" # compute prediction error\n",
	" pred = model(X)\n",
	" loss = loss_fn(pred, y)\n",
	"\n",
	" # backprop\n",
	" optimizer.zero_grad()\n",
	" loss.backward()\n",
	" optimizer.step()\n",
	"\n",
	" if batch % 100 == 0:\n",
	" loss, current = loss.item(), batch * len(X)\n",
	" print(f\"loss: {loss:7f} [{current:>5d}/{size:>5d}]\")\n",
	"\n",
	"\n",
	" "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"# device= \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
	"model = NeuralNetwork()\n",
	"model = model.to(model.device)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"NeuralNetwork(\n",
	" (flatten): Flatten(start_dim=1, end_dim=-1)\n",
	" (linear_relu_stack): Sequential(\n",
	" (0): Linear(in_features=784, out_features=512, bias=True)\n",
	" (1): ReLU()\n",
	" (2): Linear(in_features=512, out_features=512, bias=True)\n",
	" (3): ReLU()\n",
	" (4): Linear(in_features=512, out_features=10, bias=True)\n",
	" )\n",
	" (loss_fn): CrossEntropyLoss()\n",
	")\n"
	]
	}
	],
	"source": [
	"print(model)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Epoch 1\n",
	"----------\n",
	"loss: 2.300676 [ 0/60000]\n",
	"loss: 2.289026 [ 6400/60000]\n",
	"loss: 2.269229 [12800/60000]\n",
	"loss: 2.268934 [19200/60000]\n",
	"loss: 2.243828 [25600/60000]\n",
	"loss: 2.217938 [32000/60000]\n",
	"loss: 2.218239 [38400/60000]\n",
	"loss: 2.186219 [44800/60000]\n",
	"loss: 2.193820 [51200/60000]\n",
	"loss: 2.150481 [57600/60000]\n",
	"Epoch 2\n",
	"----------\n",
	"loss: 2.159364 [ 0/60000]\n",
	"loss: 2.147613 [ 6400/60000]\n",
	"loss: 2.091816 [12800/60000]\n",
	"loss: 2.114179 [19200/60000]\n",
	"loss: 2.043714 [25600/60000]\n",
	"loss: 1.988512 [32000/60000]\n",
	"loss: 2.013317 [38400/60000]\n",
	"loss: 1.930811 [44800/60000]\n",
	"loss: 1.953270 [51200/60000]\n",
	"loss: 1.858937 [57600/60000]\n",
	"Epoch 3\n",
	"----------\n",
	"loss: 1.905293 [ 0/60000]\n",
	"loss: 1.867958 [ 6400/60000]\n",
	"loss: 1.757275 [12800/60000]\n",
	"loss: 1.804702 [19200/60000]\n",
	"loss: 1.668520 [25600/60000]\n",
	"loss: 1.630944 [32000/60000]\n",
	"loss: 1.655383 [38400/60000]\n",
	"loss: 1.555281 [44800/60000]\n",
	"loss: 1.590403 [51200/60000]\n",
	"loss: 1.472981 [57600/60000]\n",
	"Epoch 4\n",
	"----------\n",
	"loss: 1.569801 [ 0/60000]\n",
	"loss: 1.527082 [ 6400/60000]\n",
	"loss: 1.383564 [12800/60000]\n",
	"loss: 1.460762 [19200/60000]\n",
	"loss: 1.321409 [25600/60000]\n",
	"loss: 1.330832 [32000/60000]\n",
	"loss: 1.348618 [38400/60000]\n",
	"loss: 1.268446 [44800/60000]\n",
	"loss: 1.307505 [51200/60000]\n",
	"loss: 1.208527 [57600/60000]\n",
	"Epoch 5\n",
	"----------\n",
	"loss: 1.313115 [ 0/60000]\n",
	"loss: 1.287318 [ 6400/60000]\n",
	"loss: 1.124761 [12800/60000]\n",
	"loss: 1.239573 [19200/60000]\n",
	"loss: 1.100252 [25600/60000]\n",
	"loss: 1.134562 [32000/60000]\n",
	"loss: 1.162831 [38400/60000]\n",
	"loss: 1.091245 [44800/60000]\n",
	"loss: 1.136182 [51200/60000]\n",
	"loss: 1.056075 [57600/60000]\n"
	]
	}
	],
	"source": [
	"epochs = 5\n",
	"for epoch in range(epochs):\n",
	" print(f\"Epoch {epoch+1}\\n----------\")\n",
	" train(train_dataloader, model, model.loss_fn, model.optimizer)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"\n",
	"\n"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.10.12"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}
	import torch
	# from torch.utils.data import Dataset
	from torchvision import datasets
	from torchvision.transforms import ToTensor
	import matplotlib.pyplot as plt

	# Load the FashionMNIST training split from the local "data" directory,
	# downloading it first if necessary. `train` is a boolean flag: the previous
	# string "True" only worked because any non-empty string is truthy.
	training_data = datasets.FashionMNIST(
	    root="data",
	    train=True,
	    download=True,
	    transform=ToTensor(),
	)

	# Human-readable name for each integer class label.
	labels_map = {
	    0: "Tshirt",
	    1: "Trouser",
	    2: "Pullover",
	    3: "Dress",
	    4: "Coat",
	    5: "Sandal",
	    6: "Shirt",
	    7: "Sneaker",
	    8: "Bag",
	    9: "Ankle boot",
	}

	# Show a 3x3 grid of randomly chosen training images with their class names.
	figure = plt.figure(figsize=(8,8))
	cols, rows = 3, 3
	for i in range(1, cols*rows + 1):
	    sample_idx = torch.randint(len(training_data), size=(1,)).item()
	    img, label = training_data[sample_idx]
	    figure.add_subplot(rows, cols, i)
	    plt.title(labels_map[label])
	    plt.axis("off")
	    # squeeze() drops singleton dimensions so imshow receives a 2-D image.
	    plt.imshow(img.squeeze(), cmap="gray")
	plt.show()