| """ | |
| The most atomic way to train and inference a GPT in pure, dependency-free Python. | |
| This file is the complete algorithm. | |
| Everything else is just efficiency. | |
| @karpathy | |
| """ | |
| import os # os.path.exists | |
| import math # math.log, math.exp | |
| import random # random.seed, random.choices, random.gauss, random.shuffle | |
| # Let there be order among chaos | |
| random.seed(42) | |
| # Let there be an input dataset `docs`: list[str] of documents (e.g. a dataset of names) | |
| if not os.path.exists('input.txt'): | |
| import urllib.request | |
| names_url = 'https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt' | |
| urllib.request.urlretrieve(names_url, 'input.txt') | |
| docs = [l.strip() for l in open('input.txt').read().strip().split('\n') if l.strip()] # list[str] of documents | |
| random.shuffle(docs) | |
| print(f"num docs: {len(docs)}") | |
| # Let there be a Tokenizer to translate strings to discrete symbols and back | |
| chars = ['<BOS>'] + sorted(set(''.join(docs))) # character-level tokenizer with a BOS delimiter | |
| vocab_size = len(chars) | |
| stoi = { ch:i for i, ch in enumerate(chars) } # encoding: map string to integer | |
| itos = { i:ch for i, ch in enumerate(chars) } # decoding: map integer to string | |
| BOS = stoi['<BOS>'] | |
| print(f"vocab size: {vocab_size}") | |
| # Let there be an Autograd to apply the chain rule recursively across a computation graph and so | |
| # calculate the gradients of the loss with respect to model parameters. | |
| class Value: | |
| """Stores a single scalar value and its gradient.""" | |
| def __init__(self, data, _children=(), _op=''): | |
| self.data = data | |
| self.grad = 0 | |
| self._backward = lambda: None | |
| self._prev = set(_children) | |
| self._op = _op # the op that produced this node, for graphviz / debugging / etc | |
| def __add__(self, other): | |
| other = other if isinstance(other, Value) else Value(other) | |
| out = Value(self.data + other.data, (self, other), '+') | |
| def _backward(): | |
| self.grad += out.grad | |
| other.grad += out.grad | |
| out._backward = _backward | |
| return out | |
| def __mul__(self, other): | |
| other = other if isinstance(other, Value) else Value(other) | |
| out = Value(self.data * other.data, (self, other), '*') | |
| def _backward(): | |
| self.grad += other.data * out.grad | |
| other.grad += self.data * out.grad | |
| out._backward = _backward | |
| return out | |
| def __pow__(self, other): | |
| assert isinstance(other, (int, float)), "only supporting int/float powers for now" | |
| out = Value(self.data**other, (self,), f'**{other}') | |
| def _backward(): | |
| self.grad += (other * self.data**(other-1)) * out.grad | |
| out._backward = _backward | |
| return out | |
| def log(self): | |
| out = Value(math.log(self.data), (self,), 'log') | |
| def _backward(): | |
| self.grad += (1 / self.data) * out.grad | |
| out._backward = _backward | |
| return out | |
| def exp(self): | |
| out = Value(math.exp(self.data), (self,), 'exp') | |
| def _backward(): | |
| self.grad += out.data * out.grad | |
| out._backward = _backward | |
| return out | |
| def relu(self): | |
| out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU') | |
| def _backward(): | |
| self.grad += (out.data > 0) * out.grad | |
| out._backward = _backward | |
| return out | |
| def backward(self): | |
| # topological order all of the children in the graph | |
| topo = [] | |
| visited = set() | |
| def build_topo(v): | |
| if v not in visited: | |
| visited.add(v) | |
| for child in v._prev: | |
| build_topo(child) | |
| topo.append(v) | |
| build_topo(self) | |
| # go one variable at a time and apply the chain rule to get its gradient | |
| self.grad = 1 | |
| for v in reversed(topo): | |
| v._backward() | |
| def __neg__(self): return self * -1 | |
| def __radd__(self, other): return self + other | |
| def __sub__(self, other): return self + (-other) | |
| def __rsub__(self, other): return other + (-self) | |
| def __rmul__(self, other): return self * other | |
| def __truediv__(self, other): return self * other**-1 | |
| def __rtruediv__(self, other): return other * self**-1 | |
| def __repr__(self): return f"Value(data={self.data}, grad={self.grad})" | |
| # Initialize the parameters, to store the knowledge of the model. | |
| n_embd = 16 # embedding dimension | |
| n_head = 4 # number of attention heads | |
| n_layer = 1 # number of layers | |
| block_size = 8 # maximum sequence length | |
| head_dim = n_embd // n_head # dimension of each head | |
| matrix = lambda nout, nin, std=0.02: [[Value(random.gauss(0, std)) for _ in range(nin)] for _ in range(nout)] | |
| state_dict = {'wte': matrix(vocab_size, n_embd), 'wpe': matrix(block_size, n_embd), 'lm_head': matrix(vocab_size, n_embd)} | |
| for i in range(n_layer): | |
| state_dict[f'layer{i}.attn_wq'] = matrix(n_embd, n_embd) | |
| state_dict[f'layer{i}.attn_wk'] = matrix(n_embd, n_embd) | |
| state_dict[f'layer{i}.attn_wv'] = matrix(n_embd, n_embd) | |
| state_dict[f'layer{i}.attn_wo'] = matrix(n_embd, n_embd, std=0) | |
| state_dict[f'layer{i}.mlp_fc1'] = matrix(4 * n_embd, n_embd) | |
| state_dict[f'layer{i}.mlp_fc2'] = matrix(n_embd, 4 * n_embd, std=0) | |
| params = [p for mat in state_dict.values() for row in mat for p in row] # flatten params into a single list[Value] | |
| print(f"num params: {len(params)}") | |
| # Define the model architecture: a stateless function mapping token sequence and parameters to logits over what comes next. | |
| # Follow GPT-2, blessed among the GPTs, with minor differences: layernorm -> rmsnorm, no biases, GeLU -> ReLU^2 | |
| def linear(x, w): | |
| return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w] | |
| def softmax(logits): | |
| max_val = max(val.data for val in logits) | |
| exps = [(val - max_val).exp() for val in logits] | |
| total = sum(exps) | |
| return [e / total for e in exps] | |
| def rmsnorm(x): | |
| ms = sum(xi * xi for xi in x) / len(x) | |
| scale = (ms + 1e-5) ** -0.5 | |
| return [xi * scale for xi in x] | |
| def gpt(token_id, pos_id, keys, values): | |
| tok_emb = state_dict['wte'][token_id] # token embedding | |
| pos_emb = state_dict['wpe'][pos_id] # position embedding | |
| x = [t + p for t, p in zip(tok_emb, pos_emb)] # joint token and position embedding | |
| x = rmsnorm(x) | |
| for li in range(n_layer): | |
| # 1) Multi-head attention block | |
| x_residual = x | |
| x = rmsnorm(x) | |
| q = linear(x, state_dict[f'layer{li}.attn_wq']) | |
| k = linear(x, state_dict[f'layer{li}.attn_wk']) | |
| v = linear(x, state_dict[f'layer{li}.attn_wv']) | |
| keys[li].append(k) | |
| values[li].append(v) | |
| x_attn = [] | |
| for h in range(n_head): | |
| hs = h * head_dim | |
| q_h = q[hs:hs+head_dim] | |
| k_h = [ki[hs:hs+head_dim] for ki in keys[li]] | |
| v_h = [vi[hs:hs+head_dim] for vi in values[li]] | |
| attn_logits = [sum(q_h[j] * k_h[t][j] for j in range(head_dim)) / head_dim**0.5 for t in range(len(k_h))] | |
| attn_weights = softmax(attn_logits) | |
| head_out = [sum(attn_weights[t] * v_h[t][j] for t in range(len(v_h))) for j in range(head_dim)] | |
| x_attn.extend(head_out) | |
| x = linear(x_attn, state_dict[f'layer{li}.attn_wo']) | |
| x = [a + b for a, b in zip(x, x_residual)] | |
| # 2) MLP block | |
| x_residual = x | |
| x = rmsnorm(x) | |
| x = linear(x, state_dict[f'layer{li}.mlp_fc1']) | |
| x = [xi.relu() ** 2 for xi in x] | |
| x = linear(x, state_dict[f'layer{li}.mlp_fc2']) | |
| x = [a + b for a, b in zip(x, x_residual)] | |
| logits = linear(x, state_dict['lm_head']) | |
| return logits | |
| # Let there be Adam, the blessed optimizer and its buffers | |
| learning_rate, beta1, beta2, eps_adam = 1e-2, 0.9, 0.95, 1e-8 | |
| m = [0.0] * len(params) # first moment buffer | |
| v = [0.0] * len(params) # second moment buffer | |
| # Repeat in sequence | |
| num_steps = 500 # number of training steps | |
| for step in range(num_steps): | |
| # Take single document, tokenize it, surround it with BOS special token on both sides | |
| doc = docs[step % len(docs)] | |
| tokens = [BOS] + [stoi[ch] for ch in doc] + [BOS] | |
| n = min(block_size, len(tokens) - 1) | |
| # Forward the token sequence through the model, building up the computation graph all the way to the loss. | |
| keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)] | |
| losses = [] | |
| for pos_id in range(n): | |
| token_id, target_id = tokens[pos_id], tokens[pos_id + 1] | |
| logits = gpt(token_id, pos_id, keys, values) | |
| probs = softmax(logits) | |
| loss_t = -probs[target_id].log() | |
| losses.append(loss_t) | |
| loss = (1 / n) * sum(losses) # final average loss over the document sequence. May yours be low. | |
| # Backward the loss, calculating the gradients with respect to all model parameters. | |
| loss.backward() | |
| # Adam optimizer update: update the model parameters based on the corresponding gradients. | |
| lr_t = learning_rate * (1 - step / num_steps) | |
| for i, p in enumerate(params): | |
| m[i] = beta1 * m[i] + (1 - beta1) * p.grad | |
| v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2 | |
| m_hat = m[i] / (1 - beta1 ** (step + 1)) | |
| v_hat = v[i] / (1 - beta2 ** (step + 1)) | |
| p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam) | |
| p.grad = 0 | |
| print(f"step {step+1:4d} / {num_steps:4d} | loss {loss.data:.4f}") | |
| # Inference: may the model babble back to us | |
| temperature = 0.6 # in (0, 1], control the "creativity" of generated text, low to high | |
| print("\n--- inference ---") | |
| for sample_idx in range(20): | |
| keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)] | |
| token_id = BOS | |
| print(f"sample {sample_idx+1}: ", end="") | |
| for pos_id in range(block_size): | |
| logits = gpt(token_id, pos_id, keys, values) | |
| probs = softmax([l / temperature for l in logits]) | |
| token_id = random.choices(range(vocab_size), weights=[p.data for p in probs])[0] | |
| if token_id == BOS: | |
| break | |
| print(itos[token_id], end="") | |
| print() |
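A quick way to convince yourself the little autograd engine is doing the right thing is a finite-difference check. A minimal sketch, assuming the `Value` class above is in scope (the expression `f` and the step size `h` are just illustrative choices, not part of the gist):

```python
# finite-difference check of the Value autograd (illustrative; assumes Value is defined above)
def f(a, b):
    return (a * b + b ** 3).log() + (a / b).exp()

a, b = Value(2.0), Value(3.0)
out = f(a, b)
out.backward()  # analytic gradients via the chain rule

h = 1e-6  # central-difference step
da = (f(Value(2.0 + h), Value(3.0)).data - f(Value(2.0 - h), Value(3.0)).data) / (2 * h)
db = (f(Value(2.0), Value(3.0 + h)).data - f(Value(2.0), Value(3.0 - h)).data) / (2 * h)
print(a.grad, da)  # the two columns should agree to several decimal places
print(b.grad, db)
```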
Now also available here on a single page :)
https://karpathy.ai/microgpt.html
🐑
This is absolute cinema!
Where’s my signed (and numbered) karpathy original print?
here is the same version, but in Haskell:
```haskell
{-# LANGUAGE RecordWildCards #-}
module Main where
import Control.Monad
import Data.Char
import Data.IORef
import qualified Data.Map.Strict as M
import qualified Data.Set as S
import System.Directory
import System.Process
import System.Random
import qualified Data.List as L
data Value = Value
{ vid :: !Int
, vdata :: !(IORef Double)
, vgrad :: !(IORef Double)
, vback :: !(IORef (IO ()))
, vprev :: !(IORef [Value])
, vop :: !String
}
type StateDict = M.Map String [[Value]]
newId :: IORef Int -> IO Int
newId r = atomicModifyIORef' r (\i -> (i + 1, i))
mkValue :: IORef Int -> Double -> [Value] -> String -> IO Value
mkValue idRef x prev op = do
i <- newId idRef
d <- newIORef x
g <- newIORef 0.0
b <- newIORef (pure ())
p <- newIORef prev
pure Value { vid = i, vdata = d, vgrad = g, vback = b, vprev = p, vop = op }
readData :: Value -> IO Double
readData Value{..} = readIORef vdata
readGrad :: Value -> IO Double
readGrad Value{..} = readIORef vgrad
addGrad :: Value -> Double -> IO ()
addGrad Value{..} x = modifyIORef' vgrad (+ x)
setGrad :: Value -> Double -> IO ()
setGrad Value{..} x = writeIORef vgrad x
setData :: Value -> Double -> IO ()
setData Value{..} x = writeIORef vdata x
setBack :: Value -> IO () -> IO ()
setBack Value{..} act = writeIORef vback act
getBack :: Value -> IO (IO ())
getBack Value{..} = readIORef vback
getPrev :: Value -> IO [Value]
getPrev Value{..} = readIORef vprev
valConst :: IORef Int -> Double -> IO Value
valConst idRef x = mkValue idRef x [] "const"
vAdd :: IORef Int -> Value -> Value -> IO Value
vAdd idRef a b = do
ad <- readData a
bd <- readData b
out <- mkValue idRef (ad + bd) [a,b] "+"
setBack out $ do
og <- readGrad out
addGrad a og
addGrad b og
pure out
vMul :: IORef Int -> Value -> Value -> IO Value
vMul idRef a b = do
ad <- readData a
bd <- readData b
out <- mkValue idRef (ad * bd) [a,b] "*"
setBack out $ do
og <- readGrad out
addGrad a (bd * og)
addGrad b (ad * og)
pure out
vNeg :: IORef Int -> Value -> IO Value
vNeg idRef a = do
m1 <- valConst idRef (-1.0)
vMul idRef a m1
vSub :: IORef Int -> Value -> Value -> IO Value
vSub idRef a b = do
nb <- vNeg idRef b
vAdd idRef a nb
vPow :: IORef Int -> Value -> Double -> IO Value
vPow idRef a pwr = do
ad <- readData a
out <- mkValue idRef (ad ** pwr) [a] ("**" ++ show pwr)
setBack out $ do
og <- readGrad out
addGrad a ((pwr * (ad ** (pwr - 1.0))) * og)
pure out
vLog :: IORef Int -> Value -> IO Value
vLog idRef a = do
ad <- readData a
out <- mkValue idRef (log ad) [a] "log"
setBack out $ do
og <- readGrad out
addGrad a ((1.0 / ad) * og)
pure out
vExp :: IORef Int -> Value -> IO Value
vExp idRef a = do
ad <- readData a
let ed = exp ad
out <- mkValue idRef ed [a] "exp"
setBack out $ do
og <- readGrad out
addGrad a (ed * og)
pure out
vReLU :: IORef Int -> Value -> IO Value
vReLU idRef a = do
ad <- readData a
let od = if ad < 0 then 0 else ad
out <- mkValue idRef od [a] "ReLU"
setBack out $ do
og <- readGrad out
addGrad a ((if od > 0 then 1 else 0) * og)
pure out
sumValues :: IORef Int -> [Value] -> IO Value
sumValues idRef xs = do
z <- valConst idRef 0.0
foldM (vAdd idRef) z xs
meanValues :: IORef Int -> [Value] -> IO Value
meanValues idRef xs = do
s <- sumValues idRef xs
n <- valConst idRef (fromIntegral (length xs))
inv <- vPow idRef n (-1.0)
vMul idRef s inv
backward :: Value -> IO ()
backward loss = do
topoRef <- newIORef ([] :: [Value])
let dfs visited v = do
let i = vid v
if S.member i visited
then pure visited
else do
prevs <- getPrev v
visited' <- foldM dfs (S.insert i visited) prevs
modifyIORef' topoRef (v :)
pure visited'
_ <- dfs S.empty loss
topo <- readIORef topoRef
setGrad loss 1.0
forM_ topo $ \v -> getBack v >>= id
boxMuller :: Double -> Double -> (Double, Double)
boxMuller u1 u2 =
let r = sqrt (-2.0 * log (max 1e-12 u1))
t = 2.0 * pi * u2
in (r * cos t, r * sin t)
gauss :: IORef StdGen -> Double -> Double -> IO Double
gauss genRef mu sigma = do
g <- readIORef genRef
let (u1, g1) = randomR (0.0, 1.0) g
(u2, g2) = randomR (0.0, 1.0) g1
(z0, _) = boxMuller u1 u2
writeIORef genRef g2
pure (mu + sigma * z0)
randInt :: IORef StdGen -> Int -> Int -> IO Int
randInt genRef lo hi = do
g <- readIORef genRef
let (x, g') = randomR (lo, hi) g
writeIORef genRef g'
pure x
shuffle :: IORef StdGen -> [a] -> IO [a]
shuffle genRef xs = do
let n = length xs
arr <- newIORef xs
let swapAt i j ys =
let xi = ys !! i
xj = ys !! j
in [ if k == i then xj else if k == j then xi else ys !! k | k <- [0..n-1] ]
forM_ [n-1, n-2 .. 1] $ \i -> do
j <- randInt genRef 0 i
ys <- readIORef arr
writeIORef arr (swapAt i j ys)
readIORef arr
matrix :: IORef Int -> IORef StdGen -> Int -> Int -> Double -> IO [[Value]]
matrix idRef genRef nout nin std = do
  forM [1..nout] $ \_ ->
    forM [1..nin] $ \_ -> do
      x <- gauss genRef 0.0 std
      mkValue idRef x [] "param"
linear :: IORef Int -> [Value] -> [[Value]] -> IO [Value]
linear idRef x w = forM w $ \wo -> do
  prods <- forM (zip wo x) $ \(wi, xi) -> vMul idRef wi xi
  sumValues idRef prods
softmax :: IORef Int -> [Value] -> IO [Value]
softmax idRef logits = do
ds <- mapM readData logits
let mx = maximum ds
mxv <- valConst idRef mx
exps <- forM logits $ \v -> do
dv <- vSub idRef v mxv
vExp idRef dv
total <- sumValues idRef exps
inv <- vPow idRef total (-1.0)
forM exps $ \e -> vMul idRef e inv
rmsnorm :: IORef Int -> [Value] -> IO [Value]
rmsnorm idRef x = do
sqs <- forM x $ \xi -> vMul idRef xi xi
ms <- meanValues idRef sqs
eps <- valConst idRef 1e-5
denom <- vAdd idRef ms eps
scale <- vPow idRef denom (-0.5)
forM x $ \xi -> vMul idRef xi scale
slice :: Int -> Int -> [a] -> [a]
slice s l = take l . drop s
zipWithM' :: (a -> b -> IO c) -> [a] -> [b] -> IO [c]
zipWithM' f as bs = sequence (zipWith f as bs)
gpt :: IORef Int
    -> StateDict
    -> Int -> Int
    -> [[[Value]]] -> [[[Value]]]
    -> Int -> Int -> Int -> Int -> Int
    -> IO ([Value], [[[Value]]], [[[Value]]])
gpt idRef st tokenId posId keys values nLayer nHead headDim nEmbd vocabSize = do
  let wte = st M.! "wte"
      wpe = st M.! "wpe"
  let tokEmb = wte !! tokenId
      posEmb = wpe !! posId
  x0 <- zipWithM' (vAdd idRef) tokEmb posEmb
  x1 <- rmsnorm idRef x0
  -- fold over layers, threading both the activations and the per-layer KV cache
  (xFinal, keys', values') <- foldM (\(x, ks, vs) li -> do
        let xRes1 = x
        xN1 <- rmsnorm idRef x
        q <- linear idRef xN1 (st M.! ("layer" ++ show li ++ ".attn_wq"))
        k <- linear idRef xN1 (st M.! ("layer" ++ show li ++ ".attn_wk"))
        v <- linear idRef xN1 (st M.! ("layer" ++ show li ++ ".attn_wv"))
        -- append this position's key/value to the cache for layer li
        let ksLi = (ks !! li) ++ [k]
            vsLi = (vs !! li) ++ [v]
            ks' = take li ks ++ [ksLi] ++ drop (li + 1) ks
            vs' = take li vs ++ [vsLi] ++ drop (li + 1) vs
        xAttnHeads <- forM [0..nHead-1] $ \h -> do
          let hs = h * headDim
              qh = slice hs headDim q
              kh = map (slice hs headDim) ksLi
              vh = map (slice hs headDim) vsLi
          attnLogits <- forM [0..length kh - 1] $ \t -> do
            dots <- forM [0..headDim-1] $ \j -> vMul idRef (qh !! j) (kh !! t !! j)
            s <- sumValues idRef dots
            denom <- valConst idRef (sqrt (fromIntegral headDim))
            inv <- vPow idRef denom (-1.0)
            vMul idRef s inv
          weights <- softmax idRef attnLogits
          forM [0..headDim-1] $ \j -> do
            terms <- forM [0..length vh - 1] $ \t -> vMul idRef (weights !! t) (vh !! t !! j)
            sumValues idRef terms
        let xAttn = concat xAttnHeads
        xProj <- linear idRef xAttn (st M.! ("layer" ++ show li ++ ".attn_wo"))
        x2 <- zipWithM' (vAdd idRef) xProj xRes1
        let xRes2 = x2
        xN2 <- rmsnorm idRef x2
        xFc1 <- linear idRef xN2 (st M.! ("layer" ++ show li ++ ".mlp_fc1"))
        xAct <- forM xFc1 $ \xi -> do
          r <- vReLU idRef xi
          vPow idRef r 2.0
        xFc2 <- linear idRef xAct (st M.! ("layer" ++ show li ++ ".mlp_fc2"))
        xOut <- zipWithM' (vAdd idRef) xFc2 xRes2
        pure (xOut, ks', vs')
      ) (x1, keys, values) [0..nLayer-1]
  logits <- linear idRef xFinal (st M.! "lm_head")
  pure (logits, keys', values')
categorical :: IORef StdGen -> [Double] -> IO Int
categorical genRef ws = do
let total = sum ws
g <- readIORef genRef
let (r, g') = randomR (0.0, total) g
writeIORef genRef g'
pure (go r 0 ws)
where
go _ i [] = max 0 (i - 1)
go r i (w:rest) = if r <= w then i else go (r - w) (i + 1) rest
main :: IO ()
main = do
let seed = 42 :: Int
genRef <- newIORef (mkStdGen seed)
idRef <- newIORef 0
exists <- doesFileExist "input.txt"
unless exists $ do
let url = "https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt"
callCommand ("curl -L " ++ url ++ " -o input.txt")
raw <- readFile "input.txt"
let docs0 = filter (not . null) (map (dropWhileEnd isSpace . dropWhile isSpace) (lines raw))
docs <- shuffle genRef docs0
putStrLn ("num docs: " ++ show (length docs))
let chars = "" : (L.sort . S.toList . S.fromList . concat $ docs)
let vocabSize = length chars
let stoi = M.fromList (zip chars [0..])
let itos = M.fromList (zip [0..] chars)
let bos = stoi M.! ""
putStrLn ("vocab size: " ++ show vocabSize)
let nEmbd = 16
let nHead = 4
let nLayer = 1
let blockSize = 8
  let headDim = nEmbd `div` nHead
wte <- matrix idRef genRef vocabSize nEmbd 0.02
wpe <- matrix idRef genRef blockSize nEmbd 0.02
lm <- matrix idRef genRef vocabSize nEmbd 0.02
let initSD = M.fromList [("wte", wte), ("wpe", wpe), ("lm_head", lm)]
  sd <- foldM (\m li -> do
          wq <- matrix idRef genRef nEmbd nEmbd 0.02
          wk <- matrix idRef genRef nEmbd nEmbd 0.02
          wv <- matrix idRef genRef nEmbd nEmbd 0.02
          wo <- matrix idRef genRef nEmbd nEmbd 0.0
          fc1 <- matrix idRef genRef (4*nEmbd) nEmbd 0.02
          fc2 <- matrix idRef genRef nEmbd (4*nEmbd) 0.0
          pure $
            M.insert ("layer" ++ show li ++ ".attn_wq") wq $
            M.insert ("layer" ++ show li ++ ".attn_wk") wk $
            M.insert ("layer" ++ show li ++ ".attn_wv") wv $
            M.insert ("layer" ++ show li ++ ".attn_wo") wo $
            M.insert ("layer" ++ show li ++ ".mlp_fc1") fc1 $
            M.insert ("layer" ++ show li ++ ".mlp_fc2") fc2 m
        ) initSD [0..nLayer-1]
let mats = M.elems sd
let params = [ p | mat <- mats, row <- mat, p <- row ]
putStrLn ("num params: " ++ show (length params))
let learningRate = 1e-2
let beta1 = 0.9
let beta2 = 0.95
let epsAdam = 1e-8
mBuf <- newIORef (replicate (length params) 0.0)
vBuf <- newIORef (replicate (length params) 0.0)
let numSteps = 500 :: Int
forM_ [0..numSteps-1] $ \step -> do
    let doc = docs !! (step `mod` length docs)
let toks = [bos] ++ map (\ch -> stoi M.! [ch]) doc ++ [bos]
let n = min blockSize (length toks - 1)
let keys0 = replicate nLayer []
let values0 = replicate nLayer []
lossesRef <- newIORef ([] :: [Value])
    let loopKV pos keys values
          | pos >= n = pure (keys, values)
          | otherwise = do
              let tokenId = toks !! pos
              let targetId = toks !! (pos + 1)
              -- forward one token; gpt returns the logits plus the grown KV cache
              (logits, keys', values') <- gpt idRef sd tokenId pos keys values nLayer nHead headDim nEmbd vocabSize
              probs <- softmax idRef logits
              lt <- vLog idRef (probs !! targetId)
              nlt <- vNeg idRef lt
              modifyIORef' lossesRef (\ls -> ls ++ [nlt])
              loopKV (pos + 1) keys' values'
    _ <- loopKV 0 keys0 values0
losses <- readIORef lossesRef
loss <- do
s <- sumValues idRef losses
invn <- valConst idRef (1.0 / fromIntegral n)
vMul idRef invn s
backward loss
let lrT = learningRate * (1.0 - fromIntegral step / fromIntegral numSteps)
mList <- readIORef mBuf
vList <- readIORef vBuf
(mList', vList') <- foldM (\(ms, vs) (i, p) -> do
g <- readGrad p
let mi = beta1 * (ms !! i) + (1.0 - beta1) * g
let vi = beta2 * (vs !! i) + (1.0 - beta2) * (g * g)
let mHat = mi / (1.0 - beta1 ** fromIntegral (step + 1))
let vHat = vi / (1.0 - beta2 ** fromIntegral (step + 1))
d <- readData p
let d' = d - lrT * mHat / (sqrt vHat + epsAdam)
setData p d'
setGrad p 0.0
let ms' = take i ms ++ [mi] ++ drop (i+1) ms
let vs' = take i vs ++ [vi] ++ drop (i+1) vs
pure (ms', vs')
) (mList, vList) (zip [0..] params)
writeIORef mBuf mList'
writeIORef vBuf vList'
ld <- readData loss
putStrLn ("step " ++ pad 4 (show (step+1)) ++ " / " ++ pad 4 (show numSteps) ++ " | loss " ++ showFF 4 ld)
let temperature = 0.6
putStrLn "\n--- inference ---"
forM_ [1..20::Int] $ \sampleIdx -> do
let keys0 = replicate nLayer []
let values0 = replicate nLayer []
putStr ("sample " ++ show sampleIdx ++ ": ")
    let genLoop pos tokenId keys values
          | pos >= blockSize = putStrLn ""
          | otherwise = do
              -- forward one token; thread the updated KV cache into the next step
              (logits, keys', values') <- gpt idRef sd tokenId pos keys values nLayer nHead headDim nEmbd vocabSize
              tempV <- valConst idRef temperature
              scaled <- forM logits $ \l -> do
                invt <- vPow idRef tempV (-1.0)
                vMul idRef l invt
              probs <- softmax idRef scaled
              ws <- mapM readData probs
              next <- categorical genRef ws
              if next == bos
                then putStrLn ""
                else do
                  let ch = itos M.! next
                  putStr ch
                  genLoop (pos + 1) next keys' values'
    genLoop 0 bos keys0 values0
pad :: Int -> String -> String
pad n s = replicate (n - length s) ' ' ++ s
showFF :: Int -> Double -> String
showFF k x =
let p = 10 ^ k
y = fromIntegral (round (x * fromIntegral p) :: Int) / fromIntegral p
in show y
dropWhileEnd :: (a -> Bool) -> [a] -> [a]
dropWhileEnd f = reverse . dropWhile f . reverse
```
thank you, functional programmer
Thanks for this awesome distillation
recommend running with `uv run --python pypy karpathy.py` for speed and better garbage collection; plain Python gets heavy on the memory.
Let there be order among chaos
Let there be Adam, the blessed optimizer and its buffers
Let there be art, and there was art.
Thank you @karpathy for your elegant work!
Hello, future history books 👋
Kinda weird how the dual vector version was deleted when it's actually a lot simpler.
It's prettier when you call it art.
uv run --python pypy microgpt.py
thanks for this
"final average loss over the document sequence. May yours be low." :)
now everyone will be saying they trained a neural network from scratch haha @karpathy
goat
Beautiful