microgpt
| """ | |
| The most atomic way to train and inference a GPT in pure, dependency-free Python. | |
| This file is the complete algorithm. | |
| Everything else is just efficiency. | |
| @karpathy | |
| """ | |
| import os # os.path.exists | |
| import math # math.log, math.exp | |
| import random # random.seed, random.choices, random.gauss, random.shuffle | |
| # Let there be order among chaos | |
| random.seed(42) | |
| # Let there be an input dataset `docs`: list[str] of documents (e.g. a dataset of names) | |
| if not os.path.exists('input.txt'): | |
| import urllib.request | |
| names_url = 'https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt' | |
| urllib.request.urlretrieve(names_url, 'input.txt') | |
| docs = [l.strip() for l in open('input.txt').read().strip().split('\n') if l.strip()] # list[str] of documents | |
| random.shuffle(docs) | |
| print(f"num docs: {len(docs)}") | |
| # Let there be a Tokenizer to translate strings to discrete symbols and back | |
| chars = ['<BOS>'] + sorted(set(''.join(docs))) # character-level tokenizer with a BOS delimiter | |
| vocab_size = len(chars) | |
| stoi = { ch:i for i, ch in enumerate(chars) } # encoding: map string to integer | |
| itos = { i:ch for i, ch in enumerate(chars) } # decoding: map integer to string | |
| BOS = stoi['<BOS>'] | |
| print(f"vocab size: {vocab_size}") | |
# Let there be an Autograd to apply the chain rule recursively across a computation graph and so
# calculate the gradients of the loss with respect to model parameters.
class Value:
    """Stores a single scalar value and its gradient."""
    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op # the op that produced this node, for graphviz / debugging / etc
    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')
        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward
        return out
    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')
        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out
    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')
        def _backward():
            self.grad += (other * self.data**(other-1)) * out.grad
        out._backward = _backward
        return out
    def log(self):
        out = Value(math.log(self.data), (self,), 'log')
        def _backward():
            self.grad += (1 / self.data) * out.grad
        out._backward = _backward
        return out
    def exp(self):
        out = Value(math.exp(self.data), (self,), 'exp')
        def _backward():
            self.grad += out.data * out.grad
        out._backward = _backward
        return out
    def relu(self):
        out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')
        def _backward():
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward
        return out
    def backward(self):
        # topological order all of the children in the graph
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)
        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1
        for v in reversed(topo):
            v._backward()
    def __neg__(self): return self * -1
    def __radd__(self, other): return self + other
    def __sub__(self, other): return self + (-other)
    def __rsub__(self, other): return other + (-self)
    def __rmul__(self, other): return self * other
    def __truediv__(self, other): return self * other**-1
    def __rtruediv__(self, other): return other * self**-1
    def __repr__(self): return f"Value(data={self.data}, grad={self.grad})"
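# A tiny worked example of the autograd engine (illustrative only; the throwaway values `_a`,
# `_b`, `_y` are not used by the model): for y = a*b + a with a=2, b=3 the chain rule gives
# dy/da = b + 1 = 4 and dy/db = a = 2.
_a, _b = Value(2.0), Value(3.0)
_y = _a * _b + _a
_y.backward()
assert _a.grad == 4.0 and _b.grad == 2.0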
# Initialize the parameters, to store the knowledge of the model.
n_embd = 16 # embedding dimension
n_head = 4 # number of attention heads
n_layer = 1 # number of layers
block_size = 8 # maximum sequence length
head_dim = n_embd // n_head # dimension of each head
matrix = lambda nout, nin, std=0.02: [[Value(random.gauss(0, std)) for _ in range(nin)] for _ in range(nout)]
state_dict = {'wte': matrix(vocab_size, n_embd), 'wpe': matrix(block_size, n_embd), 'lm_head': matrix(vocab_size, n_embd)}
for i in range(n_layer):
    state_dict[f'layer{i}.attn_wq'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wk'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wv'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wo'] = matrix(n_embd, n_embd, std=0)
    state_dict[f'layer{i}.mlp_fc1'] = matrix(4 * n_embd, n_embd)
    state_dict[f'layer{i}.mlp_fc2'] = matrix(n_embd, 4 * n_embd, std=0)
params = [p for mat in state_dict.values() for row in mat for p in row] # flatten params into a single list[Value]
print(f"num params: {len(params)}")
# Define the model architecture: a stateless function mapping token sequence and parameters to logits over what comes next.
# Follow GPT-2, blessed among the GPTs, with minor differences: layernorm -> rmsnorm, no biases, GeLU -> ReLU^2
def linear(x, w):
    return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w]
def softmax(logits):
    max_val = max(val.data for val in logits)
    exps = [(val - max_val).exp() for val in logits]
    total = sum(exps)
    return [e / total for e in exps]
def rmsnorm(x):
    ms = sum(xi * xi for xi in x) / len(x)
    scale = (ms + 1e-5) ** -0.5
    return [xi * scale for xi in x]
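# In equations: softmax(z)_i = exp(z_i - max(z)) / sum_j exp(z_j - max(z)), with the max subtracted
# purely for numerical stability, and rmsnorm(x)_i = x_i / sqrt(mean(x^2) + 1e-5); unlike layernorm
# there is no mean subtraction and no learned scale or bias here.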
def gpt(token_id, pos_id, keys, values):
    tok_emb = state_dict['wte'][token_id] # token embedding
    pos_emb = state_dict['wpe'][pos_id] # position embedding
    x = [t + p for t, p in zip(tok_emb, pos_emb)] # joint token and position embedding
    x = rmsnorm(x)
    for li in range(n_layer):
        # 1) Multi-head attention block
        x_residual = x
        x = rmsnorm(x)
        q = linear(x, state_dict[f'layer{li}.attn_wq'])
        k = linear(x, state_dict[f'layer{li}.attn_wk'])
        v = linear(x, state_dict[f'layer{li}.attn_wv'])
        keys[li].append(k)
        values[li].append(v)
        x_attn = []
        for h in range(n_head):
            hs = h * head_dim
            q_h = q[hs:hs+head_dim]
            k_h = [ki[hs:hs+head_dim] for ki in keys[li]]
            v_h = [vi[hs:hs+head_dim] for vi in values[li]]
            attn_logits = [sum(q_h[j] * k_h[t][j] for j in range(head_dim)) / head_dim**0.5 for t in range(len(k_h))]
            attn_weights = softmax(attn_logits)
            head_out = [sum(attn_weights[t] * v_h[t][j] for t in range(len(v_h))) for j in range(head_dim)]
            x_attn.extend(head_out)
        x = linear(x_attn, state_dict[f'layer{li}.attn_wo'])
        x = [a + b for a, b in zip(x, x_residual)]
        # 2) MLP block
        x_residual = x
        x = rmsnorm(x)
        x = linear(x, state_dict[f'layer{li}.mlp_fc1'])
        x = [xi.relu() ** 2 for xi in x]
        x = linear(x, state_dict[f'layer{li}.mlp_fc2'])
        x = [a + b for a, b in zip(x, x_residual)]
    logits = linear(x, state_dict['lm_head'])
    return logits
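# Note: gpt() scores one token per call. The caller owns `keys`/`values`, a per-layer KV cache that
# accumulates one entry per position, so attention at position t only ever sees positions 0..t;
# causal masking falls out of this incremental decoding rather than an explicit mask.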
# Let there be Adam, the blessed optimizer and its buffers
learning_rate, beta1, beta2, eps_adam = 1e-2, 0.9, 0.95, 1e-8
m = [0.0] * len(params) # first moment buffer
v = [0.0] * len(params) # second moment buffer
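# The update in the loop below is standard Adam with bias correction and a linearly decaying lr:
#   m_t = beta1*m + (1-beta1)*g        v_t = beta2*v + (1-beta2)*g^2
#   m_hat = m_t / (1 - beta1^t)        v_hat = v_t / (1 - beta2^t)
#   p <- p - lr_t * m_hat / (sqrt(v_hat) + eps)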
# Repeat in sequence
num_steps = 500 # number of training steps
for step in range(num_steps):
    # Take single document, tokenize it, surround it with BOS special token on both sides
    doc = docs[step % len(docs)]
    tokens = [BOS] + [stoi[ch] for ch in doc] + [BOS]
    n = min(block_size, len(tokens) - 1)
    # Forward the token sequence through the model, building up the computation graph all the way to the loss.
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    losses = []
    for pos_id in range(n):
        token_id, target_id = tokens[pos_id], tokens[pos_id + 1]
        logits = gpt(token_id, pos_id, keys, values)
        probs = softmax(logits)
        loss_t = -probs[target_id].log()
        losses.append(loss_t)
    loss = (1 / n) * sum(losses) # final average loss over the document sequence. May yours be low.
    # Backward the loss, calculating the gradients with respect to all model parameters.
    loss.backward()
    # Adam optimizer update: update the model parameters based on the corresponding gradients.
    lr_t = learning_rate * (1 - step / num_steps)
    for i, p in enumerate(params):
        m[i] = beta1 * m[i] + (1 - beta1) * p.grad
        v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2
        m_hat = m[i] / (1 - beta1 ** (step + 1))
        v_hat = v[i] / (1 - beta2 ** (step + 1))
        p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam)
        p.grad = 0
    print(f"step {step+1:4d} / {num_steps:4d} | loss {loss.data:.4f}")
# Inference: may the model babble back to us
temperature = 0.6 # in (0, 1], control the "creativity" of generated text, low to high
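# Dividing the logits by a temperature below 1 sharpens the softmax toward the most likely next
# token; a temperature of 1 would sample from the model's unmodified distribution.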
| print("\n--- inference ---") | |
| for sample_idx in range(20): | |
| keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)] | |
| token_id = BOS | |
| print(f"sample {sample_idx+1}: ", end="") | |
| for pos_id in range(block_size): | |
| logits = gpt(token_id, pos_id, keys, values) | |
| probs = softmax([l / temperature for l in logits]) | |
| token_id = random.choices(range(vocab_size), weights=[p.data for p in probs])[0] | |
| if token_id == BOS: | |
| break | |
| print(itos[token_id], end="") | |
| print() |
Thanks for this awesome distillation
Recommend running with uv run --python pypy karpathy.py for speed and better garbage collection; CPython gets heavy on memory.
Let there be order among chaos
Let there be Adam, the blessed optimizer and its buffers
Let there be art, and there was art.
Thank you @karpathy for your elegant work!
Hello, future history books 👋
Kinda weird how the dual vector version was deleted when it's actually a lot simpler.
It's prettier when you call it art.
uv run --python pypy microgpt.py
thanks for this
"final average loss over the document sequence. May yours be low." :)
now everyone will be saying they trained a neural network from scratch haha @karpathy
goat
thank you, functional programmer