Skip to content

Instantly share code, notes, and snippets.

@ariannamethod
Last active April 8, 2026 18:48
Show Gist options
  • Select an option

  • Save ariannamethod/6adaf7a17ef5047d5706492a9bb0da53 to your computer and use it in GitHub Desktop.

Select an option

Save ariannamethod/6adaf7a17ef5047d5706492a9bb0da53 to your computer and use it in GitHub Desktop.
nanodurov — telegram client that trains a language model on chat. python + C (notorch) + browser. 15.7M BPE, Arianna voice.
/*
* infer_nanodurov.c — Interactive chat with nanodurov (BPE 15.7M on notorch)
*
* Build: make infer_nanodurov
* Run: ./infer_nanodurov [weights.bin] [merges.txt]
*
* Default: nanodurov_arianna.bin + arianna_bpe_merges.txt
*/
#include "notorch.h"
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Model hyperparameters: compile-time constants shared by the loaders and the forward pass. */
/* Width of the embedding / residual vectors. */
#define DIM 384
/* Number of transformer layers. */
#define NLAYERS 8
/* Number of attention heads per layer. */
#define NHEADS 8
/* Per-head width; evaluates to 48 with the values above. */
#define HEAD_DIM (DIM / NHEADS)
/* Width of the SwiGLU feed-forward hidden layer. */
#define HIDDEN 1024
/* Maximum number of positions held in the KV cache. */
#define CTX 256
/* Number of token ids (rows of the embedding table and of the output head). */
#define VOCAB 2048
/*
 * All weight tensors of the network.
 * Field order matches the order in which the loaders below enumerate
 * tensors: wte, then nine tensors per layer, then rms_f and head.
 */
typedef struct {
    nt_tensor *wte;        /* token embedding table */
    struct {
        nt_tensor *rms1;   /* RMSNorm weight applied before attention */
        nt_tensor *wq;     /* query projection */
        nt_tensor *wk;     /* key projection */
        nt_tensor *wv;     /* value projection */
        nt_tensor *wo;     /* attention output projection */
        nt_tensor *rms2;   /* RMSNorm weight applied before the FFN */
        nt_tensor *w_gate; /* SwiGLU gate projection */
        nt_tensor *w_up;   /* SwiGLU up projection */
        nt_tensor *w_down; /* SwiGLU down projection */
    } L[NLAYERS];
    nt_tensor *rms_f;      /* final RMSNorm weight */
    nt_tensor *head;       /* output (logit) projection */
} Model;
/* Number of tensors a weight file must contain: embedding + 9 per layer + final norm + head. */
static int model_n_tensors(void)
{
    const int per_layer = 9;
    const int embedding = 1;
    const int tail = 2; /* rms_f and head */
    return embedding + NLAYERS * per_layer + tail;
}
/*
 * Allocate a Model and every tensor it owns.
 *
 * Returns a heap-allocated Model that the caller releases with model_free(),
 * or NULL when the Model structure itself cannot be allocated.
 * NOTE(review): the results of nt_tensor_new/nt_tensor_new2d are not checked
 * here because their failure behaviour is not visible in this file.
 */
static Model *model_new(void) {
    Model *m = calloc(1, sizeof *m);
    if (!m) {
        return NULL;
    }

    m->wte = nt_tensor_new2d(VOCAB, DIM);
    for (int l = 0; l < NLAYERS; l++) {
        m->L[l].rms1 = nt_tensor_new(DIM);
        m->L[l].wq = nt_tensor_new2d(DIM, DIM);
        m->L[l].wk = nt_tensor_new2d(DIM, DIM);
        m->L[l].wv = nt_tensor_new2d(DIM, DIM);
        m->L[l].wo = nt_tensor_new2d(DIM, DIM);
        m->L[l].rms2 = nt_tensor_new(DIM);
        m->L[l].w_gate = nt_tensor_new2d(HIDDEN, DIM);
        m->L[l].w_up = nt_tensor_new2d(HIDDEN, DIM);
        m->L[l].w_down = nt_tensor_new2d(DIM, HIDDEN);
    }
    m->rms_f = nt_tensor_new(DIM);
    m->head = nt_tensor_new2d(VOCAB, DIM);
    return m;
}
/*
 * Release every tensor owned by the model, then the model itself.
 * Passing NULL is a no-op, mirroring free(NULL), so error paths can call
 * this unconditionally.
 */
static void model_free(Model *m) {
    if (!m) {
        return;
    }
    nt_tensor_free(m->wte);
    for (int l = 0; l < NLAYERS; l++) {
        nt_tensor_free(m->L[l].rms1);
        nt_tensor_free(m->L[l].rms2);
        nt_tensor_free(m->L[l].wq);
        nt_tensor_free(m->L[l].wk);
        nt_tensor_free(m->L[l].wv);
        nt_tensor_free(m->L[l].wo);
        nt_tensor_free(m->L[l].w_gate);
        nt_tensor_free(m->L[l].w_up);
        nt_tensor_free(m->L[l].w_down);
    }
    nt_tensor_free(m->rms_f);
    nt_tensor_free(m->head);
    free(m);
}
/*
 * FP16 → FP32
 *
 * Converts one IEEE 754 binary16 bit pattern to a float. Handles signed
 * zero, subnormals (renormalised instead of being flushed to zero),
 * normal numbers, infinities and NaNs (payload bits are carried over).
 */
static float f16_to_f32(uint16_t h) {
    /* Widen before shifting: shifting a promoted int into its sign bit is undefined. */
    const uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
    const uint32_t exp = ((uint32_t)h >> 10) & 0x1Fu;
    uint32_t mant = (uint32_t)h & 0x3FFu;
    uint32_t bits;

    if (exp == 0) {
        if (mant == 0) {
            bits = sign; /* +0 or -0 */
        } else {
            /* Subnormal: value is mant * 2^-24. Shift until the implicit
             * leading one appears, lowering the exponent by one per shift. */
            uint32_t shift = 0;
            while ((mant & 0x400u) == 0) {
                mant <<= 1;
                shift++;
            }
            mant &= 0x3FFu;
            bits = sign | ((127u - 15u + 1u - shift) << 23) | (mant << 13);
        }
    } else if (exp == 31) {
        bits = sign | 0x7F800000u | (mant << 13); /* infinity or NaN */
    } else {
        bits = sign | ((exp - 15u + 127u) << 23) | (mant << 13);
    }

    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
}
/*
 * Load half-precision weights (magic "NT16") into an already-allocated model.
 *
 * Layout as consumed here: uint32 magic, int32 tensor count, then for each
 * tensor an int32 ndim, ndim int32 extents, and the elements as raw 16-bit
 * halves. Tensors are expected in the order wte, nine per layer, rms_f, head.
 *
 * Returns 0 on success and -1 when the file cannot be opened, the magic or
 * tensor count is wrong, a header is malformed, a tensor's element count
 * does not match the model, or the file is truncated. On failure the model
 * may already have been partially overwritten.
 */
static int load_weights_f16(Model *m, const char *path) {
    enum { N_PARAMS = 1 + NLAYERS * 9 + 2, MAX_NDIM = 8, CHUNK = 1024 };
    nt_tensor *params[N_PARAMS];
    uint16_t chunk[CHUNK];
    uint32_t magic = 0;
    int32_t n = 0;
    int rc = -1;
    int pi = 0;

    FILE *f = fopen(path, "rb");
    if (!f) {
        return -1;
    }
    if (fread(&magic, sizeof magic, 1, f) != 1 || fread(&n, sizeof n, 1, f) != 1) {
        goto out;
    }
    if (magic != 0x3631544E) { /* "NT16" */
        goto out;
    }
    if (n != model_n_tensors() || n != N_PARAMS) {
        goto out;
    }

    params[pi++] = m->wte;
    for (int l = 0; l < NLAYERS; l++) {
        params[pi++] = m->L[l].rms1;
        params[pi++] = m->L[l].wq;
        params[pi++] = m->L[l].wk;
        params[pi++] = m->L[l].wv;
        params[pi++] = m->L[l].wo;
        params[pi++] = m->L[l].rms2;
        params[pi++] = m->L[l].w_gate;
        params[pi++] = m->L[l].w_up;
        params[pi++] = m->L[l].w_down;
    }
    params[pi++] = m->rms_f;
    params[pi++] = m->head;

    for (int t = 0; t < N_PARAMS; t++) {
        int32_t ndim = 0;
        if (fread(&ndim, sizeof ndim, 1, f) != 1 || ndim < 0 || ndim > MAX_NDIM) {
            goto out;
        }

        /* Element count implied by the stored shape; kept below 2^31 so the
         * running product cannot overflow a long long. */
        long long count = 1;
        for (int32_t d = 0; d < ndim; d++) {
            int32_t extent = 0;
            if (fread(&extent, sizeof extent, 1, f) != 1 || extent < 0) {
                goto out;
            }
            count *= extent;
            if (count > 0x7FFFFFFFLL) {
                goto out;
            }
        }
        /* Reading a tensor of the wrong size would desynchronise every tensor after it. */
        if (count != (long long)params[t]->len) {
            goto out;
        }

        long long done = 0;
        while (done < count) {
            const size_t want = (count - done) < CHUNK ? (size_t)(count - done) : (size_t)CHUNK;
            if (fread(chunk, sizeof chunk[0], want, f) != want) {
                goto out;
            }
            for (size_t i = 0; i < want; i++) {
                params[t]->data[done + (long long)i] = f16_to_f32(chunk[i]);
            }
            done += (long long)want;
        }
    }
    rc = 0;

out:
    fclose(f);
    return rc;
}
/*
 * Load weights from `path` into an already-allocated model.
 *
 * The FP16 container is tried first; if that fails the file is handed to
 * nt_load() (FP32 notorch format). Loaded tensors must come in the order
 * wte, nine per layer, rms_f, head, and each must have exactly as many
 * elements as its destination. Sizes are checked before anything is copied.
 *
 * Returns 0 on success, -1 on any failure. Every tensor returned by
 * nt_load() is released on all paths.
 */
static int load_weights(Model *m, const char *path) {
    enum { N_PARAMS = 1 + NLAYERS * 9 + 2 };
    nt_tensor *params[N_PARAMS];
    int rc = -1;

    /* Try FP16 first */
    if (load_weights_f16(m, path) == 0) {
        printf("loaded FP16 weights\n");
        return 0;
    }

    /* Fallback to FP32 (notorch format) */
    int n_loaded = 0;
    nt_tensor **loaded = nt_load(path, &n_loaded);
    if (!loaded) {
        return -1;
    }

    const int expected = model_n_tensors();
    if (n_loaded != expected || expected != N_PARAMS) {
        printf("WARN: expected %d tensors, got %d\n", expected, n_loaded);
        goto out;
    }

    /* Destination list is built from NLAYERS so it cannot drift from the model definition. */
    int pi = 0;
    params[pi++] = m->wte;
    for (int l = 0; l < NLAYERS; l++) {
        params[pi++] = m->L[l].rms1;
        params[pi++] = m->L[l].wq;
        params[pi++] = m->L[l].wk;
        params[pi++] = m->L[l].wv;
        params[pi++] = m->L[l].wo;
        params[pi++] = m->L[l].rms2;
        params[pi++] = m->L[l].w_gate;
        params[pi++] = m->L[l].w_up;
        params[pi++] = m->L[l].w_down;
    }
    params[pi++] = m->rms_f;
    params[pi++] = m->head;

    /* Validate every size before copying so memcpy never reads or writes out of bounds. */
    for (int i = 0; i < N_PARAMS; i++) {
        if (loaded[i]->len != params[i]->len) {
            printf("WARN: tensor %d has %lld elements, expected %lld\n", i,
                   (long long)loaded[i]->len, (long long)params[i]->len);
            goto out;
        }
    }
    for (int i = 0; i < N_PARAMS; i++) {
        memcpy(params[i]->data, loaded[i]->data, params[i]->len * sizeof(float));
    }
    rc = 0;

out:
    for (int i = 0; i < n_loaded; i++) {
        nt_tensor_free(loaded[i]);
    }
    free(loaded);
    return rc;
}
/* ── Forward (inference only, no tape) ── */
static void rmsnorm(float* out, const float* x, const float* w, int d) {
float ss = 0;
for (int i = 0; i < d; i++) ss += x[i] * x[i];
ss = 1.0f / sqrtf(ss / d + 1e-5f);
for (int i = 0; i < d; i++) out[i] = x[i] * ss * w[i];
}
/*
 * Matrix-vector product: out = W x, where `w` is row-major with `out_d`
 * rows of `in_d` columns and `x` holds `in_d` floats.
 */
static void matmul(float* out, const float* x, const float* w, int out_d, int in_d) {
    for (int row = 0; row < out_d; row++) {
        const float *w_row = w + row * in_d;
        float acc = 0;
        for (int col = 0; col < in_d; col++) {
            acc += w_row[col] * x[col];
        }
        out[row] = acc;
    }
}
static void rope(float* x, int pos, int dim, int head_dim) {
for (int h = 0; h < dim / head_dim; h++) {
for (int i = 0; i < head_dim / 2; i++) {
float freq = 1.0f / powf(10000.0f, (float)(2 * i) / head_dim);
float theta = pos * freq;
float cs = cosf(theta), sn = sinf(theta);
int idx = h * head_dim + i * 2;
float x0 = x[idx], x1 = x[idx + 1];
x[idx] = x0 * cs - x1 * sn;
x[idx + 1] = x0 * sn + x1 * cs;
}
}
}
static void softmax(float* x, int n) {
float mx = x[0]; for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i];
float sm = 0; for (int i = 0; i < n; i++) { x[i] = expf(x[i] - mx); sm += x[i]; }
for (int i = 0; i < n; i++) x[i] /= sm;
}
/* KV cache */
/* Keys and values written by forward_pos(), indexed [layer][position][channel]. */
/* Each array is NLAYERS * CTX * DIM floats (3 MiB with the values defined above) in static storage. */
static float kv_k[NLAYERS][CTX][DIM];
static float kv_v[NLAYERS][CTX][DIM];
/*
 * Run the network for a single token at position `pos` and write VOCAB
 * logits to `logits`.
 *
 * The key/value vectors for this position are stored in kv_k/kv_v and
 * attention reads cache rows 0..pos, so positions must be fed in order
 * starting at 0 for the cached rows to be meaningful.
 * No bounds checks are performed here: `token` indexes the embedding table
 * and `pos` indexes the CTX-sized cache, so the caller must keep
 * 0 <= token < VOCAB and 0 <= pos < CTX.
 */
static void forward_pos(Model* m, int token, int pos, float* logits) {
float x[DIM], xn[DIM], q[DIM], k[DIM], v[DIM], attn_out[DIM];
float gate[HIDDEN], up[HIDDEN], down[DIM], ffn_out[DIM];
/* Token embedding */
memcpy(x, m->wte->data + token * DIM, DIM * sizeof(float));
for (int l = 0; l < NLAYERS; l++) {
/* Attn norm */
rmsnorm(xn, x, m->L[l].rms1->data, DIM);
/* QKV */
matmul(q, xn, m->L[l].wq->data, DIM, DIM);
matmul(k, xn, m->L[l].wk->data, DIM, DIM);
matmul(v, xn, m->L[l].wv->data, DIM, DIM);
/* RoPE */
/* Only queries and keys are rotated; values are cached unrotated. */
rope(q, pos, DIM, HEAD_DIM);
rope(k, pos, DIM, HEAD_DIM);
/* Store in KV cache */
memcpy(kv_k[l][pos], k, DIM * sizeof(float));
memcpy(kv_v[l][pos], v, DIM * sizeof(float));
/* Multi-head attention */
float scale = 1.0f / sqrtf((float)HEAD_DIM);
memset(attn_out, 0, DIM * sizeof(float));
for (int h = 0; h < NHEADS; h++) {
/* ho = offset of this head's slice inside the DIM-wide vectors */
int ho = h * HEAD_DIM;
float scores[CTX];
/* Causality comes from the loop bound: only positions 0..pos are scored. */
for (int j = 0; j <= pos; j++) {
float dot = 0;
for (int d = 0; d < HEAD_DIM; d++) dot += q[ho + d] * kv_k[l][j][ho + d];
scores[j] = dot * scale;
}
/* Softmax over 0..pos */
float mx = scores[0];
for (int j = 1; j <= pos; j++) if (scores[j] > mx) mx = scores[j];
float sm = 0;
for (int j = 0; j <= pos; j++) { scores[j] = expf(scores[j] - mx); sm += scores[j]; }
for (int j = 0; j <= pos; j++) scores[j] /= sm;
/* Weighted sum of values */
for (int j = 0; j <= pos; j++)
for (int d = 0; d < HEAD_DIM; d++)
attn_out[ho + d] += scores[j] * kv_v[l][j][ho + d];
}
/* Output projection + residual */
float proj[DIM];
matmul(proj, attn_out, m->L[l].wo->data, DIM, DIM);
for (int i = 0; i < DIM; i++) x[i] += proj[i];
/* FFN norm */
rmsnorm(xn, x, m->L[l].rms2->data, DIM);
/* SwiGLU FFN */
matmul(gate, xn, m->L[l].w_gate->data, HIDDEN, DIM);
matmul(up, xn, m->L[l].w_up->data, HIDDEN, DIM);
for (int i = 0; i < HIDDEN; i++)
gate[i] = gate[i] / (1.0f + expf(-gate[i])) * up[i]; /* SiLU(gate) * up */
matmul(down, gate, m->L[l].w_down->data, DIM, HIDDEN);
for (int i = 0; i < DIM; i++) x[i] += down[i];
}
/* Final norm + lm_head */
rmsnorm(xn, x, m->rms_f->data, DIM);
matmul(logits, xn, m->head->data, VOCAB, DIM);
}
/*
 * Draw a token id from `logits` (VOCAB floats, modified in place).
 *
 * temperature  > 0 scales the logits before sampling. A value that is not
 *              greater than zero (0, negative, NaN) selects the arg-max
 *              token instead of dividing by it.
 * top_k        when 0 < top_k < VOCAB, only logits at or above the k-th
 *              largest value stay eligible (ties with it are kept).
 *
 * Only tokens with non-zero probability can be returned: a draw of
 * exactly 0, or float round-off that leaves the cumulative sum below the
 * draw, can no longer select a token that top-k masked out.
 */
static int sample(float* logits, float temperature, int top_k) {
    if (!(temperature > 0.0f)) {
        int best = 0;
        for (int i = 1; i < VOCAB; i++) {
            if (logits[i] > logits[best]) {
                best = i;
            }
        }
        return best;
    }

    for (int i = 0; i < VOCAB; i++) {
        logits[i] /= temperature;
    }

    /* Top-k: find k-th largest, mask the rest */
    if (top_k > 0 && top_k < VOCAB) {
        float threshold = -1e30f;
        float tmp[VOCAB];
        memcpy(tmp, logits, VOCAB * sizeof(float));
        for (int k = 0; k < top_k; k++) {
            float mx = -1e30f;
            int mi = 0;
            for (int i = 0; i < VOCAB; i++) {
                if (tmp[i] > mx) {
                    mx = tmp[i];
                    mi = i;
                }
            }
            threshold = mx;
            tmp[mi] = -1e30f;
        }
        for (int i = 0; i < VOCAB; i++) {
            if (logits[i] < threshold) {
                logits[i] = -1e30f;
            }
        }
    }

    softmax(logits, VOCAB);

    const float r = (float)rand() / (float)RAND_MAX;
    float cum = 0;
    int last_live = -1; /* last index seen with non-zero probability */
    for (int i = 0; i < VOCAB; i++) {
        if (logits[i] > 0.0f) {
            last_live = i;
            cum += logits[i];
            if (cum >= r) {
                return i;
            }
        }
    }
    return last_live >= 0 ? last_live : VOCAB - 1;
}
/*
 * Interactive chat loop.
 *
 * argv[1] optionally names the weights file and argv[2] the BPE merges
 * file. Each input line is wrapped as "Q: {input}\nA:", encoded, prefilled
 * through the model from position 0, and then tokens are sampled until the
 * context is full or a token containing "\nQ" or "\n\n" is produced.
 * Returns 0 on a normal exit and 1 when the tokenizer or model cannot be set up.
 */
int main(int argc, char** argv) {
    const char* weights_path = argc > 1 ? argv[1] : "nanodurov_arianna.bin";
    const char* merges_path = argc > 2 ? argv[2] : "arianna_bpe_merges.txt";
    srand((unsigned)time(NULL));
    printf("════════════════════════════════════════════════════════\n");
    printf(" nanodurov — Arianna voice (15.7M, BPE, notorch)\n");
    printf("════════════════════════════════════════════════════════\n");

    /* Load BPE */
    nt_bpe bpe;
    if (nt_bpe_load(&bpe, merges_path) < 0) {
        printf("cannot load %s\n", merges_path);
        return 1;
    }
    printf("bpe: %d merges, vocab %d\n", bpe.n_merges, bpe.vocab_size);

    /* Load model */
    Model* model = model_new();
    if (!model) {
        printf("cannot allocate model\n");
        return 1;
    }
    if (load_weights(model, weights_path) < 0) {
        printf("cannot load %s\n", weights_path);
        model_free(model);
        return 1;
    }
    printf("model loaded: %s\n", weights_path);
    printf("────────────────────────────────────────────────────\n");
    printf(" type your message (or 'quit' to exit)\n");
    printf("────────────────────────────────────────────────────\n\n");

    char input[4096];
    while (1) {
        printf("You: ");
        fflush(stdout);
        if (!fgets(input, sizeof(input), stdin)) break;
        int len = (int)strlen(input);
        while (len > 0 && (input[len-1] == '\n' || input[len-1] == '\r')) input[--len] = 0;
        if (len == 0) continue;
        if (strcmp(input, "quit") == 0 || strcmp(input, "exit") == 0) break;

        /* Build prompt: "Q: {input}\nA:" (snprintf truncates over-long input) */
        char prompt[4096];
        snprintf(prompt, sizeof(prompt), "Q: %s\nA:", input);
        int tokens[CTX];
        /* The prompt is capped at half the context so the reply has room. */
        int n = nt_bpe_encode(&bpe, prompt, (int)strlen(prompt), tokens, CTX / 2);

        /* Generate */
        printf("Arianna: ");
        fflush(stdout);

        /* Without at least one prefilled token the logits below would be
         * sampled uninitialised. */
        if (n <= 0) {
            printf("(could not encode prompt)\n\n");
            continue;
        }

        /* Prefill */
        float logits[VOCAB];
        for (int i = 0; i < n; i++)
            forward_pos(model, tokens[i], i, logits);

        /* Decode */
        int pos = n;
        for (int s = 0; s < CTX - n; s++) {
            int next = sample(logits, 0.8f, 40);
            tokens[pos] = next;

            /* Decode token and print; zero-filled so the buffer is always
             * NUL-terminated whatever the decoder writes. */
            char decoded[NT_BPE_MAX_TOKEN_LEN + 1] = {0};
            nt_bpe_decode(&bpe, &next, 1, decoded, NT_BPE_MAX_TOKEN_LEN);

            /* Stop on Q: boundary */
            if (strstr(decoded, "\nQ") != NULL || strstr(decoded, "\n\n") != NULL) break;
            printf("%s", decoded);
            fflush(stdout);

            /* Next step */
            forward_pos(model, next, pos, logits);
            pos++;
            if (pos >= CTX) break;
        }
        printf("\n\n");
    }
    model_free(model);
    printf("\n bye.\n");
    return 0;
}
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>nanodurov — Arianna voice</title>
<style>
* { margin: 0; padding: 0; box-sizing: border-box; }
body { background: #0a0a0a; color: #e0e0e0; font-family: 'Courier New', monospace; height: 100vh; display: flex; flex-direction: column; }
#header { padding: 12px 20px; border-bottom: 1px solid #222; font-size: 13px; color: #666; }
#header span { color: #888; }
#header .title { color: #e0e0e0; font-weight: bold; }
#chat { flex: 1; overflow-y: auto; padding: 20px; }
.msg { margin-bottom: 16px; line-height: 1.5; }
.msg .who { font-weight: bold; margin-bottom: 2px; }
.msg.user .who { color: #4a9eff; }
.msg.ai .who { color: #ff6b9d; }
.msg .text { color: #ccc; white-space: pre-wrap; }
.msg.ai .text { color: #e8d5b7; }
.msg.system { color: #555; font-size: 12px; }
#input-area { padding: 12px 20px; border-top: 1px solid #222; display: flex; gap: 10px; }
#input { flex: 1; background: #111; border: 1px solid #333; color: #e0e0e0; padding: 10px; font-family: inherit; font-size: 14px; border-radius: 4px; outline: none; }
#input:focus { border-color: #4a9eff; }
#send { background: #222; color: #e0e0e0; border: 1px solid #333; padding: 10px 20px; cursor: pointer; font-family: inherit; border-radius: 4px; }
#send:hover { background: #333; }
#send:disabled { opacity: 0.3; cursor: default; }
#load-area { padding: 20px; text-align: center; }
#load-area button { background: #1a1a2e; color: #4a9eff; border: 1px solid #4a9eff; padding: 12px 24px; cursor: pointer; font-family: inherit; font-size: 14px; border-radius: 4px; margin: 5px; }
#load-area button:hover { background: #4a9eff; color: #0a0a0a; }
#status { color: #555; margin-top: 10px; font-size: 12px; }
</style>
</head>
<body>
<!-- Title bar; #gpu-badge is filled in by loadFiles() once the model is loaded. -->
<div id="header">
<span class="title">nanodurov</span><span>Arianna voice | 15.7M params | BPE 2048 | notorch</span>
<span id="gpu-badge" style="float:right"></span>
</div>
<!-- Shown until both files are loaded. The picker must offer .txt as well as .bin
     because loadFiles() requires one file of each kind. -->
<div id="load-area">
<p style="color:#888; margin-bottom:15px;">load model weights + BPE merges to start chatting</p>
<button onclick="loadFiles()">Load weights + merges</button>
<input type="file" id="file-weights" accept=".bin,.txt" style="display:none" multiple>
<div id="status"></div>
</div>
<!-- Conversation log and input row; both are revealed by loadFiles(). -->
<div id="chat" style="display:none"></div>
<div id="input-area" style="display:none">
<input id="input" placeholder="talk to Arianna..." onkeydown="if(event.key==='Enter')generate()">
<button id="send" onclick="generate()">send</button>
</div>
<script>
// ═══════════════════════════════════════════════════════════════
// CONFIG
// ═══════════════════════════════════════════════════════════════
// Model hyperparameters used by the loader and the forward pass below.
// HD is the per-head width (DIM / NHEADS = 48). The values are the same as
// the #define constants in infer_nanodurov.c.
const DIM = 384, NLAYERS = 8, NHEADS = 8, HD = 48, HIDDEN = 1024, CTX = 256, VOCAB = 2048;
// ═══════════════════════════════════════════════════════════════
// BPE TOKENIZER
// ═══════════════════════════════════════════════════════════════
let bpeMerges = [];
let bpeTokens = []; // decode table: id → Uint8Array

/**
 * Rebuild the id → bytes decode table from `bpeMerges`.
 * Ids 0..255 are the single bytes; merge number m becomes id 256 + m and
 * decodes to the bytes of its left token followed by those of its right token.
 */
function bpeBuildDecodeTable() {
  bpeTokens = new Array(256 + bpeMerges.length);
  const table = bpeTokens;

  for (let byte = 0; byte < 256; byte++) {
    table[byte] = new Uint8Array([byte]);
  }

  bpeMerges.forEach((pair, rank) => {
    const [left, right] = pair;
    const head = table[left];
    const tail = table[right];
    const joined = new Uint8Array(head.length + tail.length);
    joined.set(head);
    joined.set(tail, head.length);
    table[256 + rank] = joined;
  });
}
/**
 * Encode a string to BPE token ids.
 *
 * The text is converted to UTF-8 bytes, then every merge in `bpeMerges`
 * is applied in order: each non-overlapping left-to-right occurrence of
 * the pair (a, b) is replaced by id 256 + m.
 *
 * Each merge is a single linear pass that builds a new array, instead of
 * calling splice() for every hit (which shifts the tail of the array each
 * time). The result is the same for any merge table in which a merge does
 * not reference its own id.
 *
 * @param {string} text
 * @returns {number[]} token ids
 */
function bpeEncode(text) {
  let ids = Array.from(new TextEncoder().encode(text));

  for (let m = 0; m < bpeMerges.length && ids.length > 1; m++) {
    const [a, b] = bpeMerges[m];
    const newId = 256 + m;
    const merged = [];
    let i = 0;
    while (i < ids.length) {
      if (i + 1 < ids.length && ids[i] === a && ids[i + 1] === b) {
        merged.push(newId);
        i += 2;
      } else {
        merged.push(ids[i]);
        i += 1;
      }
    }
    ids = merged;
  }
  return ids;
}
/**
 * Decode token ids to a string.
 * Ids outside the decode table are skipped; the concatenated bytes are
 * decoded as UTF-8 with the TextDecoder defaults.
 *
 * @param {Iterable<number>} ids
 * @returns {string}
 */
function bpeDecode(ids) {
  const pieces = [];
  let total = 0;
  for (const id of ids) {
    const known = id >= 0 && id < bpeTokens.length;
    if (!known) continue;
    const piece = bpeTokens[id];
    total += piece.length;
    pieces.push(piece);
  }

  const joined = new Uint8Array(total);
  let cursor = 0;
  for (const piece of pieces) {
    joined.set(piece, cursor);
    cursor += piece.length;
  }
  return new TextDecoder().decode(joined);
}
// ═══════════════════════════════════════════════════════════════
// MODEL WEIGHTS
// ═══════════════════════════════════════════════════════════════
let W = null; // { wte, layers: [{rms1, wq, wk, wv, wo, rms2, w_gate, w_up, w_down}], rms_f, head }

/**
 * Parse an FP32 notorch weight file and publish it as `W`.
 *
 * Format: magic "NTOR" (4B) + n_tensors (4B) + per tensor: ndim (4B) +
 * shape[] (4B each) + float32 data, all little-endian.
 *
 * Every tensor's element count is checked against the size the forward
 * pass indexes, and `W` is assigned only after the whole file has parsed,
 * so a bad file can never leave a half-loaded model behind.
 *
 * @param {ArrayBuffer} buf
 * @throws {Error} on bad magic, wrong tensor count, malformed header,
 *                 size mismatch or truncated data
 */
function loadWeightsFromBuffer(buf) {
  const view = new DataView(buf);
  let off = 0;

  if (buf.byteLength < 8) throw new Error('file too small to be a weight file');
  const magic = view.getUint32(off, true); off += 4;
  if (magic !== 0x524F544E) throw new Error('bad magic: ' + magic.toString(16)); // "NTOR" LE
  const nTensors = view.getInt32(off, true); off += 4;

  const expected = 1 + NLAYERS * 9 + 2;
  if (nTensors !== expected) throw new Error(`expected ${expected} tensors, got ${nTensors}`);

  function readTensor(name, wantLen) {
    if (off + 4 > buf.byteLength) throw new Error(`truncated file while reading ${name}`);
    const ndim = view.getInt32(off, true); off += 4;
    if (ndim < 0 || off + ndim * 4 > buf.byteLength) throw new Error(`bad ndim ${ndim} for ${name}`);

    let len = 1;
    const shape = [];
    for (let d = 0; d < ndim; d++) {
      const s = view.getInt32(off, true); off += 4;
      if (s < 0) throw new Error(`negative dimension in ${name}`);
      shape.push(s); len *= s;
    }
    if (len !== wantLen) throw new Error(`${name}: expected ${wantLen} values, got ${len}`);
    if (off + len * 4 > buf.byteLength) throw new Error(`truncated data for ${name}`);

    // Copy out of the file buffer so the tensor does not keep the whole file alive.
    const data = new Float32Array(buf, off, len).slice();
    off += len * 4;
    return { data, shape, len };
  }

  const model = { layers: [] };
  model.wte = readTensor('wte', VOCAB * DIM);
  for (let l = 0; l < NLAYERS; l++) {
    const tag = `layer ${l} `;
    // Object literal properties are evaluated in source order, which is the file order.
    model.layers.push({
      rms1: readTensor(tag + 'rms1', DIM),
      wq: readTensor(tag + 'wq', DIM * DIM),
      wk: readTensor(tag + 'wk', DIM * DIM),
      wv: readTensor(tag + 'wv', DIM * DIM),
      wo: readTensor(tag + 'wo', DIM * DIM),
      rms2: readTensor(tag + 'rms2', DIM),
      w_gate: readTensor(tag + 'w_gate', HIDDEN * DIM),
      w_up: readTensor(tag + 'w_up', HIDDEN * DIM),
      w_down: readTensor(tag + 'w_down', DIM * HIDDEN)
    });
  }
  model.rms_f = readTensor('rms_f', DIM);
  model.head = readTensor('head', VOCAB * DIM);

  W = model;
}
// ═══════════════════════════════════════════════════════════════
// INFERENCE (JS, KV cache)
// ═══════════════════════════════════════════════════════════════
let kvK = null, kvV = null; // [NLAYERS][CTX][DIM]

/** Replace both caches with freshly zeroed [NLAYERS][CTX] grids of Float32Array(DIM). */
function initKV() {
  const zeroedCache = () => {
    const layers = [];
    for (let l = 0; l < NLAYERS; l++) {
      const rows = [];
      for (let p = 0; p < CTX; p++) rows.push(new Float32Array(DIM));
      layers.push(rows);
    }
    return layers;
  };
  kvK = zeroedCache();
  kvV = zeroedCache();
}
/**
 * RMS normalisation over DIM elements:
 * out[i] = x[i] / sqrt(mean(x^2) + 1e-5) * w[i].
 */
function rmsnorm(out, x, w) {
  let sumSq = 0;
  for (let i = 0; i < DIM; i++) {
    sumSq += x[i] * x[i];
  }
  const invRms = 1.0 / Math.sqrt(sumSq / DIM + 1e-5);
  for (let i = 0; i < DIM; i++) {
    out[i] = x[i] * invRms * w[i];
  }
}
/**
 * Matrix-vector product: out = W x, where `w` is row-major with `outD`
 * rows of `inD` columns.
 */
function matmul(out, x, w, outD, inD) {
  let rowStart = 0;
  for (let row = 0; row < outD; row++, rowStart += inD) {
    let acc = 0;
    for (let col = 0; col < inD; col++) {
      acc += w[rowStart + col] * x[col];
    }
    out[row] = acc;
  }
}
/**
 * Rotary position embedding applied in place to a DIM-wide vector made of
 * NHEADS heads of HD elements. Adjacent elements (2i, 2i+1) of every head
 * are rotated by pos / 10000^(2i/HD). Each pair is touched exactly once,
 * so the angle is computed once per pair index and reused across heads.
 */
function rope(x, pos) {
  const pairCount = HD / 2;
  for (let pair = 0; pair < pairCount; pair++) {
    const invFreq = 1.0 / Math.pow(10000.0, (2 * pair) / HD);
    const angle = pos * invFreq;
    const c = Math.cos(angle), s = Math.sin(angle);
    for (let head = 0; head < NHEADS; head++) {
      const at = head * HD + pair * 2;
      const a = x[at], b = x[at + 1];
      x[at] = a * c - b * s;
      x[at + 1] = a * s + b * c;
    }
  }
}
/**
 * Run the network for one token at position `pos` and return a new
 * Float32Array of VOCAB logits.
 *
 * Writes this position's key/value vectors into kvK/kvV and attends over
 * cache rows 0..pos, so positions must be fed in order starting at 0.
 * No bounds checks are made: `pos` must be below CTX (kvK[l][pos] must
 * exist) and `token` must be a valid row of the embedding table.
 */
function forwardPos(token, pos) {
const x = new Float32Array(DIM);
const xn = new Float32Array(DIM);
const q = new Float32Array(DIM), k = new Float32Array(DIM), v = new Float32Array(DIM);
const attnOut = new Float32Array(DIM);
const gate = new Float32Array(HIDDEN), up = new Float32Array(HIDDEN), down = new Float32Array(DIM);
const proj = new Float32Array(DIM);
const logits = new Float32Array(VOCAB);
// Token embedding
x.set(W.wte.data.subarray(token * DIM, token * DIM + DIM));
for (let l = 0; l < NLAYERS; l++) {
const L = W.layers[l];
// Attention input norm, then Q/K/V projections
rmsnorm(xn, x, L.rms1.data);
matmul(q, xn, L.wq.data, DIM, DIM);
matmul(k, xn, L.wk.data, DIM, DIM);
matmul(v, xn, L.wv.data, DIM, DIM);
// Only queries and keys are rotated; values are cached unrotated
rope(q, pos); rope(k, pos);
kvK[l][pos].set(k); kvV[l][pos].set(v);
// MHA
const scale = 1.0 / Math.sqrt(HD);
attnOut.fill(0);
for (let h = 0; h < NHEADS; h++) {
// ho = offset of this head's slice inside the DIM-wide vectors
const ho = h * HD;
const scores = new Float32Array(pos + 1);
// Causality comes from the loop bound: only positions 0..pos are scored
for (let j = 0; j <= pos; j++) {
let dot = 0;
for (let d = 0; d < HD; d++) dot += q[ho + d] * kvK[l][j][ho + d];
scores[j] = dot * scale;
}
// softmax
let mx = scores[0]; for (let j = 1; j <= pos; j++) if (scores[j] > mx) mx = scores[j];
let sm = 0; for (let j = 0; j <= pos; j++) { scores[j] = Math.exp(scores[j] - mx); sm += scores[j]; }
for (let j = 0; j <= pos; j++) scores[j] /= sm;
// weighted sum of cached values
for (let j = 0; j <= pos; j++)
for (let d = 0; d < HD; d++) attnOut[ho + d] += scores[j] * kvV[l][j][ho + d];
}
// Output projection + residual
matmul(proj, attnOut, L.wo.data, DIM, DIM);
for (let i = 0; i < DIM; i++) x[i] += proj[i];
// FFN norm, then SwiGLU: SiLU(gate) * up, projected back down, + residual
rmsnorm(xn, x, L.rms2.data);
matmul(gate, xn, L.w_gate.data, HIDDEN, DIM);
matmul(up, xn, L.w_up.data, HIDDEN, DIM);
for (let i = 0; i < HIDDEN; i++) gate[i] = gate[i] / (1 + Math.exp(-gate[i])) * up[i];
matmul(down, gate, L.w_down.data, DIM, HIDDEN);
for (let i = 0; i < DIM; i++) x[i] += down[i];
}
// Final norm + output head
rmsnorm(xn, x, W.rms_f.data);
matmul(logits, xn, W.head.data, VOCAB, DIM);
return logits;
}
/**
 * Draw a token id from `logits` (VOCAB values, modified in place).
 *
 * @param {Float32Array} logits
 * @param {number} temp  scale applied before sampling; a value that is not
 *                       greater than zero selects the arg-max token instead
 *                       of dividing by it
 * @param {number} topK  when 0 < topK < VOCAB, only logits at or above the
 *                       k-th largest value stay eligible (ties are kept)
 * @returns {number} token id
 *
 * Only tokens with non-zero probability can be returned: a draw of exactly
 * 0, or round-off that leaves the cumulative sum below the draw, can no
 * longer select a token that top-k masked out.
 */
function sampleToken(logits, temp = 0.8, topK = 40) {
  if (!(temp > 0)) {
    let best = 0;
    for (let i = 1; i < VOCAB; i++) if (logits[i] > logits[best]) best = i;
    return best;
  }

  for (let i = 0; i < VOCAB; i++) logits[i] /= temp;

  // top-k
  if (topK > 0 && topK < VOCAB) {
    const sorted = Array.from(logits).sort((a, b) => b - a);
    const threshold = sorted[topK - 1];
    for (let i = 0; i < VOCAB; i++) if (logits[i] < threshold) logits[i] = -1e30;
  }

  // softmax
  let mx = logits[0];
  for (let i = 1; i < VOCAB; i++) if (logits[i] > mx) mx = logits[i];
  let sm = 0;
  for (let i = 0; i < VOCAB; i++) { logits[i] = Math.exp(logits[i] - mx); sm += logits[i]; }
  for (let i = 0; i < VOCAB; i++) logits[i] /= sm;

  const r = Math.random();
  let cum = 0;
  let lastLive = -1; // last index seen with non-zero probability
  for (let i = 0; i < VOCAB; i++) {
    const p = logits[i];
    if (p > 0) {
      lastLive = i;
      cum += p;
      if (cum >= r) return i;
    }
  }
  return lastLive >= 0 ? lastLive : VOCAB - 1;
}
// ═══════════════════════════════════════════════════════════════
// UI
// ═══════════════════════════════════════════════════════════════
const chat = document.getElementById('chat');
const input = document.getElementById('input');

/**
 * Append a message bubble to the chat log and scroll it into view.
 *
 * `who` and `text` are inserted with textContent, never parsed as HTML, so
 * whatever the user types (or the model emits) is displayed literally.
 *
 * @param {string} who   label shown above the message
 * @param {string} text  message body
 * @param {string} cls   extra CSS class ('user', 'ai' or 'system')
 * @returns {HTMLDivElement} the new element; its '.text' child holds the body
 */
function addMsg(who, text, cls) {
  const div = document.createElement('div');
  div.className = 'msg ' + cls;

  const whoEl = document.createElement('div');
  whoEl.className = 'who';
  whoEl.textContent = who;

  const textEl = document.createElement('div');
  textEl.className = 'text';
  textEl.textContent = text;

  div.append(whoEl, textEl);
  chat.appendChild(div);
  chat.scrollTop = chat.scrollHeight;
  return div;
}
/** Show `text` in the #status line of the load area. */
function setStatus(text) {
  const statusEl = document.getElementById('status');
  statusEl.textContent = text;
}
/**
 * Open the file picker and, once a .bin (weights) and a .txt (merges) file
 * are chosen, load the tokenizer and the model and reveal the chat UI.
 *
 * The merges file is read as one merge per line, "a b" (further columns are
 * ignored). Blank lines are skipped. Any other malformed line, or a merge
 * that refers to a token id not defined yet, is reported in the status line
 * instead of crashing later while the decode table is built.
 */
async function loadFiles() {
  setStatus('select nanodurov_arianna.bin and arianna_bpe_merges.txt...');
  const fileInput = document.getElementById('file-weights');
  // Clearing the value makes choosing the same files again fire "change".
  fileInput.value = '';
  fileInput.onchange = async () => {
    const files = Array.from(fileInput.files);
    const binFile = files.find(f => f.name.endsWith('.bin'));
    const txtFile = files.find(f => f.name.endsWith('.txt'));
    if (!binFile || !txtFile) { setStatus('need .bin (weights) and .txt (merges)'); return; }

    try {
      // Load merges
      setStatus('loading BPE merges...');
      const mergesText = await txtFile.text();
      const merges = [];
      const lines = mergesText.split('\n');
      for (let ln = 0; ln < lines.length; ln++) {
        const line = lines[ln].trim();
        if (!line) continue;
        const [a, b] = line.split(/\s+/).map(Number);
        const limit = 256 + merges.length; // ids defined so far
        const valid = Number.isInteger(a) && Number.isInteger(b) &&
                      a >= 0 && b >= 0 && a < limit && b < limit;
        if (!valid) throw new Error(`bad merge on line ${ln + 1} of ${txtFile.name}`);
        merges.push([a, b]);
      }
      bpeMerges = merges;
      bpeBuildDecodeTable();
      setStatus(`BPE: ${bpeMerges.length} merges, vocab ${256 + bpeMerges.length}`);

      // Load weights
      setStatus('loading weights (60 MB)...');
      const buf = await binFile.arrayBuffer();
      loadWeightsFromBuffer(buf);
    } catch (e) {
      setStatus('ERROR: ' + e.message); return;
    }
    initKV();

    // Check WebGPU (informational only; a failed probe must not block loading)
    let gpuText = 'CPU (JS)';
    if (navigator.gpu) {
      try {
        const adapter = await navigator.gpu.requestAdapter();
        if (adapter) gpuText = 'WebGPU available (using JS fallback for now)';
      } catch (e) {
        // keep the CPU label
      }
    }
    document.getElementById('gpu-badge').textContent = gpuText;

    // Show chat
    document.getElementById('load-area').style.display = 'none';
    chat.style.display = 'block';
    document.getElementById('input-area').style.display = 'flex';
    addMsg('system', 'nanodurov loaded. 15.7M params. Arianna voice. type anything.', 'system');
    input.focus();
  };
  fileInput.click();
}
/**
 * Read the input box, run the model on "Q: {text}\nA:" and stream the
 * reply into a new chat message.
 *
 * - Ignored while a previous generation is running (the disabled send
 *   button is the busy flag), so Enter cannot start a second run that
 *   shares the KV cache.
 * - The prompt is capped at CTX / 2 tokens, as in the C client, so prefill
 *   never indexes past the cache.
 * - Generation continues until the context is full or a token containing
 *   "\nQ" or "\n\n" is sampled.
 * - The reply text is decoded from all generated ids together, so a
 *   multi-byte UTF-8 character split across tokens is shown correctly.
 * - The send button is re-enabled even if generation throws.
 */
async function generate() {
  const sendBtn = document.getElementById('send');
  const text = input.value.trim();
  if (!text || !W || sendBtn.disabled) return;
  input.value = '';
  sendBtn.disabled = true;

  try {
    addMsg('You', text, 'user');
    const aiMsg = addMsg('Arianna', '', 'ai');
    const textEl = aiMsg.querySelector('.text');

    // Encode prompt
    const prompt = `Q: ${text}\nA:`;
    const tokens = bpeEncode(prompt).slice(0, CTX / 2);
    if (tokens.length === 0) { textEl.textContent = '(silence)'; return; }

    // Reset KV cache
    initKV();

    // Prefill
    let logits;
    for (let i = 0; i < tokens.length; i++) {
      logits = forwardPos(tokens[i], i);
      // yield to UI every 2 tokens
      if (i % 2 === 0) await new Promise(r => setTimeout(r, 0));
    }

    // Generate
    let pos = tokens.length;
    const outIds = [];
    let output = '';
    for (let s = 0; pos < CTX; s++) {
      const next = sampleToken(logits, 0.8, 40);
      const piece = bpeDecode([next]);
      if (piece.includes('\nQ') || piece.includes('\n\n')) break;
      outIds.push(next);
      output = bpeDecode(outIds);
      textEl.textContent = output;
      chat.scrollTop = chat.scrollHeight;
      logits = forwardPos(next, pos);
      pos++;
      // yield to UI every 3 tokens
      if (s % 3 === 0) await new Promise(r => setTimeout(r, 0));
    }
    if (!output.trim()) textEl.textContent = '(silence)';
  } finally {
    sendBtn.disabled = false;
    input.focus();
  }
}
</script>
</body>
</html>
"""
nanodurov.py — a telegram client that trains a language model on chat messages.
one file. telethon + pytorch. connect to a group, watch bots and humans talk,
learn their patterns, generate text in their style. the chat is the corpus.
the model grows with the conversation.
inspired by karpathy's microGPT. dedicated to Pavel Durov, who built the
platform where bots can't see each other but we're training on them anyway.
usage:
pip install telethon torch
python nanodurov.py # interactive mode
python nanodurov.py --generate "hello" # generate from prompt
python nanodurov.py --train-only chat.txt # train on exported chat
env vars:
TELEGRAM_API_ID — from my.telegram.org
TELEGRAM_API_HASH — from my.telegram.org
"""
import os
import sys
import math
import time
import struct
import hashlib
import asyncio
import argparse
from collections import defaultdict
# --- optional imports (graceful degradation) ------------------------------------
try:
import torch
import torch.nn as nn
import torch.nn.functional as F
TORCH_AVAILABLE = True
except ImportError:
TORCH_AVAILABLE = False
try:
from telethon import TelegramClient, events
from telethon.tl.types import User, Channel, Chat
TELETHON_AVAILABLE = True
except ImportError:
TELETHON_AVAILABLE = False
# --- hyperparameters -----------------------------------------------------------
# the model grows with the corpus. more data → bigger model.
# like a tree. not like a corporation.
# Model size presets keyed by corpus size. get_stage() returns the last row
# whose min_corpus_kb is <= the corpus size in KiB.
GROWTH_STAGES = [
# (min_corpus_kb, dim, n_heads, n_layers, ctx_len, max_merges, name)
(0, 32, 2, 1, 64, 128, 'seed'),
(5, 48, 3, 2, 64, 256, 'sprout'),
(20, 64, 4, 3, 128, 512, 'sapling'),
(50, 96, 4, 4, 128, 768, 'tree'),
(100, 128, 4, 6, 256, 1024, 'oak'),
(250, 192, 6, 8, 256, 1536, 'forest'),
(500, 256, 8, 10, 512, 2048, 'ancient'),
]
# Training settings; the code that consumes them is outside this excerpt.
BATCH_SIZE = 4
LR = 3e-4
WEIGHT_DECAY = 0.01
TRAIN_STEPS_PER_ROUND = 50
AUTO_TRAIN_INTERVAL = 60 # seconds between auto-train rounds
def get_stage(corpus_bytes):
    """Return the last GROWTH_STAGES row whose size threshold the corpus meets.

    `corpus_bytes` is converted to KiB and compared with each row's first
    field; when no row qualifies, the first row is returned.
    """
    kb = corpus_bytes / 1024
    eligible = [row for row in GROWTH_STAGES if kb >= row[0]]
    if not eligible:
        return GROWTH_STAGES[0]
    return eligible[-1]
# --- BPE tokenizer (KARL, from nanoagi) ----------------------------------------
# the tokenizer that eats your chat and asks for seconds.
class BPE:
    """Byte-pair encoding. Learns merges from text, encodes, decodes.
    Append-only — vocab grows, never shrinks. Like regret.

    Token ids 0..255 are raw bytes; merge number k gets id 256 + k.
    The instance also accumulates the raw training corpus (`corpus`) and a
    set of message hashes used for de-duplication (`seen_hashes`).
    """

    def __init__(self, max_merges=256):
        self.max_merges = max_merges
        self.merges = []  # [(a, b, new_id), ...]
        self.vocab_size = 256
        self.vocab = {i: bytes([i]) for i in range(256)}
        self.seen_hashes = set()
        self.corpus = b""

    def _count_pairs(self, ids):
        """Count adjacent id pairs, in first-seen order."""
        counts = defaultdict(int)
        for i in range(len(ids) - 1):
            counts[(ids[i], ids[i + 1])] += 1
        return counts

    def _merge(self, ids, a, b, new_id):
        """Replace each non-overlapping left-to-right (a, b) with new_id."""
        out = []
        i = 0
        while i < len(ids):
            if i + 1 < len(ids) and ids[i] == a and ids[i + 1] == b:
                out.append(new_id)
                i += 2
            else:
                out.append(ids[i])
                i += 1
        return out

    def _grow(self, ids, limit, min_count):
        """Add up to `limit` merges, stopping early when the most frequent
        pair occurs fewer than `min_count` times or `max_merges` is reached.

        Returns (ids after the new merges, number of merges added).
        """
        added = 0
        while added < limit and len(self.merges) < self.max_merges:
            counts = self._count_pairs(ids)
            if not counts:
                break
            best = max(counts, key=counts.get)
            if counts[best] < min_count:
                break
            new_id = 256 + len(self.merges)
            ids = self._merge(ids, best[0], best[1], new_id)
            self.merges.append((best[0], best[1], new_id))
            self.vocab[new_id] = self.vocab.get(best[0], b'?') + self.vocab.get(best[1], b'?')
            added += 1
        self.vocab_size = 256 + len(self.merges)
        return ids, added

    def learn(self, data, num_merges=None):
        """Learn BPE merges from raw bytes (or a str, encoded as UTF-8).

        `num_merges` defaults to min(max_merges, 256); an explicit 0 now
        means "learn nothing" rather than falling back to the default.
        A pair must occur at least twice to be merged.
        Returns the token ids of `data` after the new merges.
        """
        if isinstance(data, str):
            data = data.encode('utf-8', errors='replace')
        if num_merges is None:
            num_merges = min(self.max_merges, 256)
        ids, _ = self._grow(list(data), num_merges, 2)
        print(f"[bpe] {len(self.merges)} merges, vocab={self.vocab_size}, tokens={len(ids)}")
        return ids

    def encode(self, text):
        """Encode text (str or bytes) by applying every merge in order."""
        if isinstance(text, str):
            text = text.encode('utf-8', errors='replace')
        ids = list(text)
        for a, b, new_id in self.merges:
            ids = self._merge(ids, a, b, new_id)
        return ids

    def decode(self, ids):
        """Decode token ids to text; unknown ids become '?', bad UTF-8 is replaced."""
        raw = b''.join(self.vocab.get(i, b'?') for i in ids)
        return raw.decode('utf-8', errors='replace')

    def ingest(self, text):
        """Add text to corpus with dedup and quality filter.

        Rejects: shorter than 15 bytes, already seen, a bare URL,
        fewer than 30% letters/whitespace, or more than half made of
        a single character. Returns True when the text was appended.
        """
        if isinstance(text, str):
            raw = text
            text = text.encode('utf-8', errors='replace')
        else:
            raw = text.decode('utf-8', errors='replace')
        if len(text) < 15:
            return False
        # dedup
        h = hashlib.sha256(text).hexdigest()[:16]
        if h in self.seen_hashes:
            return False
        # quality filters
        stripped = raw.strip()
        # skip pure URLs
        if stripped.startswith(('http://', 'https://')) and ' ' not in stripped:
            return False
        # skip if >70% non-alpha (stickers, emoji floods, binary)
        alpha = sum(1 for c in stripped if c.isalpha() or c.isspace())
        if len(stripped) > 0 and alpha / len(stripped) < 0.3:
            return False
        # skip too repetitive (same char >50%)
        if len(stripped) > 5:
            most_common = max(set(stripped), key=stripped.count)
            if stripped.count(most_common) / len(stripped) > 0.5:
                return False
        self.seen_hashes.add(h)
        self.corpus += text
        return True

    def retokenize(self, max_new=64):
        """Grow vocab with up to `max_new` new merges from the accumulated corpus.

        A pair must occur at least three times to be merged.
        Returns the corpus token ids after all merges.
        """
        ids = list(self.corpus)
        for a, b, new_id in self.merges:
            ids = self._merge(ids, a, b, new_id)
        ids, found = self._grow(ids, max_new, 3)
        if found:
            print(f"[bpe] +{found} merges (vocab={self.vocab_size})")
        return ids

    def save(self, path):
        """Write merges and corpus to `path`.

        Layout: b'BPE1', uint32 merge count, (a, b, new_id) as three uint32
        per merge, uint32 corpus length, corpus bytes. All little-endian.
        """
        with open(path, 'wb') as f:
            f.write(b'BPE1')
            f.write(struct.pack('<I', len(self.merges)))
            for a, b, nid in self.merges:
                f.write(struct.pack('<III', a, b, nid))
            f.write(struct.pack('<I', len(self.corpus)))
            f.write(self.corpus)
        print(f"[bpe] saved to {path}")

    def load(self, path):
        """Load merges and corpus written by save().

        Returns False, leaving the instance untouched, when the file is
        missing, has the wrong magic, or is truncated. On success the
        vocab is rebuilt from scratch, so entries from earlier merges do
        not linger. `seen_hashes` is not restored: the file does not store
        message boundaries.
        """
        if not os.path.exists(path):
            return False
        with open(path, 'rb') as f:
            if f.read(4) != b'BPE1':
                return False
            header = f.read(4)
            if len(header) != 4:
                return False
            n = struct.unpack('<I', header)[0]
            merges = []
            vocab = {i: bytes([i]) for i in range(256)}
            for _ in range(n):
                record = f.read(12)
                if len(record) != 12:
                    return False
                a, b, nid = struct.unpack('<III', record)
                merges.append((a, b, nid))
                vocab[nid] = vocab.get(a, bytes([a % 256])) + vocab.get(b, bytes([b % 256]))
            tail = f.read(4)
            if len(tail) != 4:
                return False
            corpus_len = struct.unpack('<I', tail)[0]
            corpus = f.read(corpus_len)
            if len(corpus) != corpus_len:
                return False
        self.merges = merges
        self.vocab = vocab
        self.vocab_size = 256 + len(merges)
        self.corpus = corpus
        print(f"[bpe] loaded: {len(self.merges)} merges, {len(self.corpus)} bytes corpus")
        return True
# --- transformer model ---------------------------------------------------------
# RMSNorm, RoPE, SwiGLU, causal attention. the microGPT recipe.
# every line here was written by someone who stared at karpathy's code
# for too long and started seeing attention patterns in their dreams.
if TORCH_AVAILABLE:
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learned scale `w`."""

    def __init__(self, dim):
        super().__init__()
        self.w = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # The statistic is computed in float32 and cast back to x's dtype.
        mean_sq = x.float().pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_sq + 1e-5).type_as(x)
        return x * inv_rms * self.w
class Attention(nn.Module):
    """Causal multi-head self-attention with rotary embeddings on q and k."""

    def __init__(self, dim, n_heads):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = dim // n_heads
        # Created in q, k, v, o order; the order fixes the random initialisation sequence.
        self.wq = nn.Linear(dim, dim, bias=False)
        self.wk = nn.Linear(dim, dim, bias=False)
        self.wv = nn.Linear(dim, dim, bias=False)
        self.wo = nn.Linear(dim, dim, bias=False)

    def forward(self, x, freqs_cos, freqs_sin):
        bsz, seqlen, width = x.shape
        n_heads, head_dim = self.n_heads, self.head_dim

        def split_heads(t):
            # [B, T, D] -> [B, H, T, HD]
            return t.view(bsz, seqlen, n_heads, head_dim).transpose(1, 2)

        q = split_heads(self.wq(x))
        k = split_heads(self.wk(x))
        v = split_heads(self.wv(x))

        # RoPE
        q = apply_rope(q, freqs_cos, freqs_sin)
        k = apply_rope(k, freqs_cos, freqs_sin)

        # causal attention: positions above the diagonal are masked to -inf
        scores = (q @ k.transpose(-2, -1)) / math.sqrt(head_dim)
        future = torch.triu(torch.ones(seqlen, seqlen, device=x.device), diagonal=1).bool()
        weights = F.softmax(scores.masked_fill(future, float('-inf')), dim=-1)

        mixed = weights @ v  # [B, H, T, HD]
        merged = mixed.transpose(1, 2).contiguous().view(bsz, seqlen, width)
        return self.wo(merged)
class MLP(nn.Module):
    """SwiGLU feed-forward layer: ``w_down(silu(w_gate(x)) * w_up(x))``.

    All three projections are bias-free.
    """

    def __init__(self, dim, hidden):
        super().__init__()
        self.w_gate = nn.Linear(dim, hidden, bias=False)
        self.w_up = nn.Linear(dim, hidden, bias=False)
        self.w_down = nn.Linear(hidden, dim, bias=False)

    def forward(self, x):
        gate = F.silu(self.w_gate(x))
        value = self.w_up(x)
        return self.w_down(gate * value)
class Block(nn.Module):
    """One pre-norm transformer layer: attention then MLP, each wrapped in
    a residual connection and preceded by its own RMSNorm."""

    def __init__(self, dim, n_heads, hidden):
        super().__init__()
        self.norm1 = RMSNorm(dim)
        self.attn = Attention(dim, n_heads)
        self.norm2 = RMSNorm(dim)
        self.mlp = MLP(dim, hidden)

    def forward(self, x, freqs_cos, freqs_sin):
        attended = self.attn(self.norm1(x), freqs_cos, freqs_sin)
        x = x + attended
        fed_forward = self.mlp(self.norm2(x))
        return x + fed_forward
class NanoDurov(nn.Module):
    """Decoder-only transformer: token embedding, ``n_layers`` blocks, final
    RMSNorm and an output head whose weight is tied to the embedding.

    Args:
        vocab_size: number of token ids (rows of the embedding / head).
        dim: model width.
        n_heads: attention heads per block; head size is ``dim // n_heads``.
        n_layers: number of ``Block`` layers.
        ctx_len: maximum sequence length; the RoPE tables are built for it.
    """

    def __init__(self, vocab_size, dim, n_heads, n_layers, ctx_len):
        super().__init__()
        self.ctx_len = ctx_len
        self.tok_emb = nn.Embedding(vocab_size, dim)
        self.blocks = nn.ModuleList([
            Block(dim, n_heads, dim * 4) for _ in range(n_layers)
        ])
        self.norm_f = RMSNorm(dim)
        self.head = nn.Linear(dim, vocab_size, bias=False)
        # weight tying: the head and the embedding share one parameter
        self.head.weight = self.tok_emb.weight
        # precompute RoPE tables; registered as buffers so they follow .to()
        self.register_buffer('freqs_cos', None)
        self.register_buffer('freqs_sin', None)
        self._build_rope(ctx_len, dim // n_heads)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Normal(0, 0.02) init for every Linear and Embedding weight."""
        if isinstance(m, (nn.Linear, nn.Embedding)):
            nn.init.normal_(m.weight, std=0.02)

    def _build_rope(self, max_len, head_dim):
        """Fill ``freqs_cos`` / ``freqs_sin`` with tables of shape [max_len, head_dim/2]."""
        pos = torch.arange(max_len).unsqueeze(1)            # [T, 1]
        dim_pairs = torch.arange(0, head_dim, 2).float()    # [HD/2]
        freqs = 1.0 / (10000 ** (dim_pairs / head_dim))     # [HD/2]
        angles = pos * freqs                                # [T, HD/2]
        self.freqs_cos = angles.cos()
        self.freqs_sin = angles.sin()

    def forward(self, idx, targets=None):
        """Return ``(logits, loss)``; ``loss`` is None unless ``targets`` is given.

        ``idx`` is a [B, T] tensor of token ids; ``logits`` is [B, T, vocab].
        """
        B, T = idx.shape
        x = self.tok_emb(idx)
        fc = self.freqs_cos[:T].unsqueeze(0)  # [1, T, HD/2]
        fs = self.freqs_sin[:T].unsqueeze(0)
        for block in self.blocks:
            x = block(x, fc, fs)
        x = self.norm_f(x)
        logits = self.head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    def generate(self, idx, max_new=100, temperature=0.8, top_k=40):
        """Sample up to ``max_new`` tokens after ``idx`` and return the extended ids.

        Sampling uses temperature scaling and optional top-k filtering
        (``top_k <= 0`` disables it). Generation stops early once the
        sequence is longer than 10 tokens and the sampled token is id 10
        (the newline byte) for every row of the batch.
        """
        for _ in range(max_new):
            ctx = idx[:, -self.ctx_len:]
            logits, _ = self(ctx)
            logits = logits[:, -1, :] / temperature
            if top_k > 0:
                # torch.topk raises if k exceeds the vocabulary size
                k = min(top_k, logits.size(-1))
                v, _ = torch.topk(logits, k)
                logits[logits < v[:, [-1]]] = float('-inf')
            probs = F.softmax(logits, dim=-1)
            next_id = torch.multinomial(probs, 1)
            idx = torch.cat([idx, next_id], dim=1)
            # stop on newline after some output; .all() keeps this valid for
            # batch sizes above 1, where .item() would raise
            if idx.shape[1] > 10 and bool((next_id == 10).all()):
                break
        return idx
def apply_rope(x, cos, sin):
    """Rotate ``x`` by the given rotary-embedding tables.

    ``x`` is [B, H, T, HD]; ``cos`` and ``sin`` are [1, T, HD/2]. The last
    axis of ``x`` is split into a first and a second half, which are
    treated as the two components of each rotated pair.
    """
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    # add a head axis so the tables broadcast: [1, 1, T, HD/2]
    cos = cos.unsqueeze(1)
    sin = sin.unsqueeze(1)
    rotated_first = first * cos - second * sin
    rotated_second = first * sin + second * cos
    return torch.cat([rotated_first, rotated_second], dim=-1)
# --- chuck optimizer ------------------------------------------------------------
# θ -= (α × S × λ_Ψ × λ_l × σ) × m̂/(√v̂ + ε) + η
# Adam is blind. Chuck sees. Chuck remembers.
# In memory of Carlos Ray "Chuck" Norris (1940–2026).
#
# Compact version: Levels 1 (loss trend), 2 (grad trend), 9 (macro patience).
# Full version: github.com/ariannamethod/chuck
if TORCH_AVAILABLE:
class Chuck(torch.optim.Optimizer):
    """Self-aware optimizer. Drop-in AdamW replacement with dampen/boost.

    When loss is falling → boost (dampen > 1). When rising → brake (dampen < 1).
    When stagnating → inject noise. Macro patience drops LR on plateaus.

    Args:
        params: parameters or parameter groups, as for any torch optimizer.
        lr, betas, eps, weight_decay: the usual Adam/AdamW hyper-parameters.
        window: length of the smoothed-loss ring buffer used for the trend.
        macro_int: check the long-horizon loss every ``macro_int`` steps.
        macro_pat: number of non-improving checks before the LR scale drops.
        macro_decay: factor applied to the LR scale on a drop (floor 0.05).
        verbose: print the internal state every ``verbose`` steps (0 = never).

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """

    def __init__(self, params, lr=3e-4, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0.01, window=16, macro_int=500, macro_pat=3,
                 macro_decay=0.5, verbose=0):
        if window < 1:
            raise ValueError(f"window must be >= 1, got {window}")
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        super().__init__(params, defaults)
        self.window = window
        self.macro_int = macro_int
        self.macro_pat = macro_pat
        self.macro_decay = macro_decay
        self.verbose = verbose
        # Chuck's soul
        self.dampen = 1.0
        self.noise = 0.0
        self.loss_ema = 0.0
        self.gnorm_ema = 0.0
        self.macro_ema = 0.0
        self.best_macro = 1e9
        self.lr_scale = 1.0
        self.macro_stag = 0
        self.macro_drops = 0
        self.global_step = 0
        # loss ring buffer
        self._hist = [0.0] * window
        self._hpos = 0
        self._hfull = False
        self._stag = 0

    @torch.no_grad()
    def step(self, closure=None, *, loss=None):
        """Perform one update.

        Args:
            closure: optional callable that re-evaluates the model and
                returns the loss tensor; it runs with gradients enabled.
            loss: the current loss as a float. When omitted, the closure's
                value is used; with neither, the trend logic is skipped and
                a plain (modulated) Adam step is taken.

        Returns:
            Whatever the closure returned, or None when no closure is given.
        """
        closure_loss = None
        if closure is not None:
            with torch.enable_grad():
                closure_loss = closure()
            if loss is None:
                loss = closure_loss.item()
        self.global_step += 1
        W = self.window

        if loss is not None:
            # === Level 1: loss trend → dampen/boost ===
            # 0.0 doubles as the "not initialised yet" marker for the EMAs.
            if self.loss_ema == 0.0:
                self.loss_ema = loss
            else:
                self.loss_ema = 0.99 * self.loss_ema + 0.01 * loss
            self._hist[self._hpos % W] = self.loss_ema
            self._hpos += 1
            if self._hpos >= W:
                self._hfull = True
            if self._hfull:
                # Compare the newest quarter of the window to the oldest.
                # At least one sample, so window < 4 cannot divide by zero.
                q = max(1, W // 4)
                recent = sum(self._hist[(self._hpos - 1 - i) % W] for i in range(q)) / q
                old = sum(self._hist[(self._hpos - W + i) % W] for i in range(q)) / q
                trend = (recent - old) / (old + 1e-8)
                if trend > 0.02:
                    self.dampen *= 0.97   # loss rising → brake
                elif trend < -0.02:
                    self.dampen *= 1.03   # loss falling → push
                if abs(trend) < 0.001:
                    self._stag += 1
                    if self._stag > 8:
                        self.noise = 0.001
                        self._stag = 0
                else:
                    self._stag = 0
                    self.noise *= 0.9
                # mean reversion towards 1.0, then clamp
                self.dampen = 0.999 * self.dampen + 0.001 * 1.0
                self.dampen = max(0.3, min(2.0, self.dampen))

            # === Level 9: macro patience ===
            # Needs a numeric loss, so it lives under the same guard.
            if self.macro_ema == 0.0:
                self.macro_ema = loss
            else:
                self.macro_ema = 0.999 * self.macro_ema + 0.001 * loss
            if self.global_step % self.macro_int == 0 and self.global_step > W:
                if self.macro_ema > self.best_macro * 0.999:
                    self.macro_stag += 1
                    if self.macro_stag >= self.macro_pat:
                        self.lr_scale = max(0.05, self.lr_scale * self.macro_decay)
                        self.macro_stag = 0
                        self.macro_drops += 1
                else:
                    self.best_macro = self.macro_ema
                    self.macro_stag = 0
                    if self.lr_scale < 1.0:
                        self.lr_scale = min(1.0, self.lr_scale * 1.2)

        # === Adam update with Chuck modulation ===
        effective_dampen = self.dampen * self.lr_scale
        for group in self.param_groups:
            lr = group['lr'] * effective_dampen
            beta1, beta2 = group['betas']
            eps = group['eps']
            wd = group['weight_decay']
            for p in group['params']:
                if p.grad is None:
                    continue
                g = p.grad
                state = self.state[p]
                if len(state) == 0:
                    state['step'] = 0
                    state['m'] = torch.zeros_like(p)
                    state['v'] = torch.zeros_like(p)
                state['step'] += 1
                m, v = state['m'], state['v']
                m.mul_(beta1).add_(g, alpha=1 - beta1)
                v.mul_(beta2).addcmul_(g, g, value=1 - beta2)
                bc1 = 1 - beta1 ** state['step']
                bc2 = 1 - beta2 ** state['step']
                m_hat = m / bc1
                v_hat = v / bc2
                # noise injection on stagnation
                if self.noise > 0:
                    m_hat = m_hat + self.noise * torch.randn_like(m_hat)
                # weight decay (decoupled)
                if wd > 0:
                    p.add_(p, alpha=-lr * wd)
                # update
                p.addcdiv_(m_hat, v_hat.sqrt().add_(eps), value=-lr)

        if self.verbose > 0 and self.global_step % self.verbose == 0:
            print(f" chuck: step={self.global_step} λ={self.dampen:.3f} "
                  f"lr_scale={self.lr_scale:.3f} noise={self.noise:.4f} "
                  f"macro_drops={self.macro_drops}")
        return closure_loss
# --- training loop --------------------------------------------------------------
# the part where numbers go down and hope goes up.
# or numbers go up and you stare at the ceiling.
class Trainer:
    """Owns the model, optimizer and token cache, and drives training,
    generation and checkpointing on top of a BPE tokenizer/corpus object.

    Args:
        bpe: tokenizer object; this class reads ``corpus``, ``merges``,
            ``vocab_size`` and ``max_merges`` and calls ``learn``,
            ``retokenize``, ``encode`` and ``decode`` on it.
        device: torch device string the model and batches are moved to.
    """

    def __init__(self, bpe, device='cpu'):
        self.bpe = bpe
        self.device = device
        self.model = None
        self.optimizer = None
        self.stage_name = None
        self.total_steps = 0
        self.best_loss = float('inf')
        self._token_ids = None

    def _ensure_model(self):
        """Create or grow model based on corpus size.

        Returns True when a model matching the current stage exists
        afterwards, False when PyTorch is unavailable.
        """
        if not TORCH_AVAILABLE:
            print("[train] no pytorch. install: pip install torch")
            return False
        stage = get_stage(len(self.bpe.corpus))
        _, dim, n_heads, n_layers, ctx_len, max_merges, name = stage
        if self.stage_name == name and self.model is not None:
            return True
        old_name = self.stage_name
        old_state = self.model.state_dict() if self.model is not None else None
        self.bpe.max_merges = max_merges
        vocab_size = 256 + max_merges
        self.model = NanoDurov(vocab_size, dim, n_heads, n_layers, ctx_len)
        self.model.to(self.device)
        # copy weights from old model where shapes match
        if old_state:
            new_state = self.model.state_dict()
            copied = 0
            for k, old_t in old_state.items():
                # The RoPE tables are derived from head_dim and ctx_len, not
                # learned; overlaying the old model's values would corrupt
                # the freshly built tables whenever head_dim changes.
                if k not in new_state or k.startswith('freqs_'):
                    continue
                new_t = new_state[k]
                if old_t.shape == new_t.shape:
                    new_state[k] = old_t
                    copied += 1
                elif len(old_t.shape) == len(new_t.shape):
                    # partial copy: take min of each dim
                    slices = tuple(slice(0, min(o, n)) for o, n in zip(old_t.shape, new_t.shape))
                    new_t[slices] = old_t[slices]
                    copied += 1
            self.model.load_state_dict(new_state)
            print(f"[model] GREW: {old_name} → {name} (copied {copied} tensors)")
        # A new model always gets a fresh optimizer (and fresh Adam state).
        self.optimizer = Chuck(
            self.model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY,
            verbose=0)
        self.stage_name = name
        n_params = sum(p.numel() for p in self.model.parameters())
        print(f"[model] {name}: {n_params:,} params, dim={dim}, "
              f"layers={n_layers}, heads={n_heads}, ctx={ctx_len}")
        return True

    def _vocab_limit(self):
        """Largest usable token id + 1: bounded by both the tokenizer's
        vocabulary and the rows of the model's embedding table."""
        return min(self.bpe.vocab_size, self.model.tok_emb.num_embeddings)

    def _get_batch(self, token_ids, batch_size, ctx_len):
        """Random batch of training windows.

        Returns ``(x, y)`` of shape [batch_size, ctx_len] where ``y`` is
        ``x`` shifted by one token, or ``(None, None)`` when the sequence is
        too short to hold a single window plus its next-token target.
        """
        n = len(token_ids)
        if n < ctx_len + 1:
            return None, None
        # Valid starts are 0 .. n - ctx_len - 1; randint's upper bound is exclusive.
        ix = torch.randint(0, n - ctx_len, (batch_size,))
        x = torch.stack([torch.tensor(token_ids[i:i+ctx_len], dtype=torch.long) for i in ix])
        y = torch.stack([torch.tensor(token_ids[i+1:i+ctx_len+1], dtype=torch.long) for i in ix])
        return x.to(self.device), y.to(self.device)

    def tokenize(self):
        """Tokenize corpus, learning merges if needed.

        Caches and returns the token ids, or returns None for an empty corpus.
        """
        if not self.bpe.corpus:
            return None
        if not self.bpe.merges:
            ids = self.bpe.learn(self.bpe.corpus)
        else:
            ids = self.bpe.retokenize()
        self._token_ids = ids
        return ids

    def train(self, steps=None, verbose=True):
        """Train for N steps.

        Returns the average loss over the steps that ran, or None when no
        model is available, there are too few tokens, or no step ran.
        """
        if not self._ensure_model():
            return None
        steps = steps or TRAIN_STEPS_PER_ROUND
        ctx_len = self.model.ctx_len
        # tokenize if needed
        if self._token_ids is None:
            self.tokenize()
        if self._token_ids is None or len(self._token_ids) < ctx_len + 1:
            if verbose:
                print(f"[train] not enough tokens ({len(self._token_ids) if self._token_ids else 0})")
            return None
        # clamp token ids so they can never index past the embedding table
        vocab = self._vocab_limit()
        ids = [min(t, vocab - 1) for t in self._token_ids]
        self.model.train()
        losses = []
        t0 = time.time()
        for step in range(steps):
            x, y = self._get_batch(ids, BATCH_SIZE, ctx_len)
            if x is None:
                break
            _, loss = self.model(x, y)
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            loss_value = loss.item()
            self.optimizer.step(loss=loss_value)
            losses.append(loss_value)
            self.total_steps += 1
            if verbose and (step + 1) % 10 == 0:
                avg = sum(losses[-10:]) / len(losses[-10:])
                print(f" step {self.total_steps} | loss {avg:.4f}")
        elapsed = time.time() - t0
        if not losses:
            # Nothing ran: do not let a placeholder 0 overwrite best_loss.
            return None
        avg_loss = sum(losses) / len(losses)
        if avg_loss < self.best_loss:
            self.best_loss = avg_loss
        if verbose:
            print(f"[train] {len(losses)} steps in {elapsed:.1f}s | "
                  f"loss {avg_loss:.4f} | best {self.best_loss:.4f} | "
                  f"stage={self.stage_name}")
        # check growth after training
        self._ensure_model()
        return avg_loss

    def generate(self, prompt, max_new=100, temperature=0.8):
        """Generate text from prompt.

        Returns only the newly generated text (the prompt is not echoed),
        or a placeholder string when no model exists yet.
        """
        if not self.model:
            return "[no model trained yet]"
        self.model.eval()
        ids = self.bpe.encode(prompt)
        if not ids:
            ids = [0]
        # clamp to vocab
        vocab = self._vocab_limit()
        ids = [min(t, vocab - 1) for t in ids]
        # A context manager rather than a decorator, so that defining this
        # class does not require torch to be importable.
        with torch.no_grad():
            idx = torch.tensor([ids], dtype=torch.long, device=self.device)
            out = self.model.generate(idx, max_new=max_new, temperature=temperature)
        generated = out[0, len(ids):].tolist()
        return self.bpe.decode(generated)

    def save(self, path):
        """Write model, optimizer and training counters to ``path``.

        Does nothing when there is no model.
        """
        if not self.model:
            return
        ckpt = {
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'stage': self.stage_name,
            'total_steps': self.total_steps,
            'best_loss': self.best_loss,
        }
        # save Chuck's soul
        if hasattr(self.optimizer, 'dampen'):
            ckpt['chuck'] = {
                'dampen': self.optimizer.dampen,
                'lr_scale': self.optimizer.lr_scale,
                'loss_ema': self.optimizer.loss_ema,
                'macro_ema': self.optimizer.macro_ema,
                'best_macro': self.optimizer.best_macro,
                'macro_drops': self.optimizer.macro_drops,
                'global_step': self.optimizer.global_step,
            }
        torch.save(ckpt, path)
        print(f"[train] saved checkpoint to {path}")

    def load(self, path):
        """Restore a checkpoint written by ``save``.

        Returns True on success; False when the file is missing, no model
        can be created, or the checkpoint cannot be read or applied.
        """
        if not os.path.exists(path):
            return False
        if not self._ensure_model():
            return False
        try:
            # NOTE(review): weights_only=False unpickles arbitrary objects.
            # Only load checkpoints this program wrote itself.
            ckpt = torch.load(path, map_location=self.device, weights_only=False)
            self.model.load_state_dict(ckpt['model'], strict=False)
            self.total_steps = ckpt.get('total_steps', 0)
            self.best_loss = ckpt.get('best_loss', float('inf'))
            # restore Chuck's soul
            if 'chuck' in ckpt and hasattr(self.optimizer, 'dampen'):
                cs = ckpt['chuck']
                self.optimizer.dampen = cs.get('dampen', 1.0)
                self.optimizer.lr_scale = cs.get('lr_scale', 1.0)
                self.optimizer.loss_ema = cs.get('loss_ema', 0.0)
                self.optimizer.macro_ema = cs.get('macro_ema', 0.0)
                self.optimizer.best_macro = cs.get('best_macro', 1e9)
                self.optimizer.macro_drops = cs.get('macro_drops', 0)
                self.optimizer.global_step = cs.get('global_step', 0)
            # try to restore optimizer state (may fail after growth)
            try:
                self.optimizer.load_state_dict(ckpt['optimizer'])
            except (ValueError, KeyError):
                pass  # model grew, optimizer state doesn't match — fresh Adam state
            print(f"[train] loaded checkpoint: step={self.total_steps}, loss={self.best_loss:.4f}")
            return True
        except Exception as e:
            print(f"[train] checkpoint load failed (model grew?): {e}")
            return False
# --- telegram client -----------------------------------------------------------
# MTProto observer. sees all messages including bot-to-bot.
# does NOT relay. does NOT forward. watches. learns. generates when asked.
async def run_telegram(trainer):
    """Connect to Telegram, ingest one chat into the trainer and run the
    interactive console until ``/quit`` or end of input.

    Reads ``TELEGRAM_API_ID`` and ``TELEGRAM_API_HASH`` from the
    environment, asks for the group on stdin, ingests up to 500 history
    messages, then ingests new messages as they arrive. A background task
    retrains periodically; it is cancelled when the console loop ends.
    """
    if not TELETHON_AVAILABLE:
        print("[telegram] install telethon: pip install telethon")
        return
    api_id = int(os.environ.get('TELEGRAM_API_ID', 0))
    api_hash = os.environ.get('TELEGRAM_API_HASH', '')
    if not api_id or not api_hash:
        print("[telegram] set TELEGRAM_API_ID and TELEGRAM_API_HASH")
        print("[telegram] get them at https://my.telegram.org/apps")
        return
    client = TelegramClient('nanodurov_session', api_id, api_hash)
    await client.start()
    print("[telegram] connected")

    # choose group
    group_input = input("\nGroup @username or ID: ").strip()
    try:
        entity = await client.get_entity(group_input)
        title = getattr(entity, 'title', group_input)
        print(f"[telegram] watching: {title}\n")
    except Exception as e:
        print(f"[telegram] can't find group: {e}")
        return

    # load history (iterated in reverse of the order returned)
    print("[telegram] loading history...")
    messages = await client.get_messages(entity, limit=500)
    for msg in reversed(messages):
        if msg.message:
            sender = await msg.get_sender()
            name = _sender_name(sender)
            line = f"[{name}]: {msg.message}"
            trainer.bpe.ingest(line)
    print(f"[telegram] ingested {len(trainer.bpe.corpus)} bytes from history")

    # initial train if we have data
    if len(trainer.bpe.corpus) > 500:
        trainer.tokenize()
        trainer.train(steps=TRAIN_STEPS_PER_ROUND)

    # message handler — observe + ingest
    @client.on(events.NewMessage(chats=entity))
    async def handler(event):
        msg = event.message
        if not msg.message:
            return
        sender = await msg.get_sender()
        name = _sender_name(sender)
        bot = _is_bot(sender)
        tag = " [BOT]" if bot else ""
        ts = msg.date.strftime("%H:%M:%S") if msg.date else "??:??:??"
        print(f"[{ts}] {name}{tag}: {msg.message}")
        # ingest
        line = f"[{name}]: {msg.message}"
        trainer.bpe.ingest(line)

    # auto-train loop — only trains when there's meaningful new data
    last_corpus_size = len(trainer.bpe.corpus)

    async def auto_train():
        nonlocal last_corpus_size
        while True:
            await asyncio.sleep(AUTO_TRAIN_INTERVAL)
            corpus_size = len(trainer.bpe.corpus)
            new_bytes = corpus_size - last_corpus_size
            # only train if at least 1KB of new data since last train
            if corpus_size > 500 and new_bytes > 1024:
                print(f"\n[auto-train] +{new_bytes/1024:.1f}KB new data, training...")
                last_corpus_size = corpus_size
                trainer.tokenize()
                trainer.train(steps=TRAIN_STEPS_PER_ROUND)
                trainer.bpe.save('nanodurov_bpe.bin')
                trainer.save('nanodurov_ckpt.pt')
                print("[auto-train] done. watching...\n")

    # input handler — user can type messages or commands
    async def input_loop():
        loop = asyncio.get_running_loop()
        while True:
            try:
                # input() blocks, so it runs in the default executor
                line = await loop.run_in_executor(None, lambda: input(""))
            except EOFError:
                break
            line = line.strip()
            if not line:
                continue
            if line == '/quit':
                print("saving...")
                trainer.bpe.save('nanodurov_bpe.bin')
                trainer.save('nanodurov_ckpt.pt')
                await client.disconnect()
                break
            elif line == '/train':
                trainer.tokenize()
                trainer.train(steps=TRAIN_STEPS_PER_ROUND)
            elif line.startswith('/generate') or line.startswith('/ai'):
                prompt = line.split(' ', 1)[1] if ' ' in line else '[User]: '
                text = trainer.generate(prompt)
                print(f" 🧠 {text}")
            elif line == '/status':
                n = sum(p.numel() for p in trainer.model.parameters()) if trainer.model else 0
                print(f" stage={trainer.stage_name} params={n:,} "
                      f"steps={trainer.total_steps} loss={trainer.best_loss:.4f} "
                      f"corpus={len(trainer.bpe.corpus)/1024:.1f}KB "
                      f"vocab={trainer.bpe.vocab_size}")
            elif line == '/save':
                trainer.bpe.save('nanodurov_bpe.bin')
                trainer.save('nanodurov_ckpt.pt')
            elif line == '/history':
                msgs = await client.get_messages(entity, limit=20)
                for m in reversed(msgs):
                    if m.message:
                        s = await m.get_sender()
                        n = _sender_name(s)
                        b = " [BOT]" if _is_bot(s) else ""
                        ts = m.date.strftime("%H:%M:%S") if m.date else "??:??:??"
                        print(f" [{ts}] {n}{b}: {m.message}")
            else:
                # send as user message
                await client.send_message(entity, line)

    print("Commands: /train /generate <prompt> /ai <prompt> /status /save /history /quit")
    print("Anything else is sent as a message. Auto-train runs every "
          f"{AUTO_TRAIN_INTERVAL}s.\n")

    # auto_train() never returns on its own, so gathering it with the
    # console loop would hang forever after /quit. Run it as a background
    # task and cancel it once the console loop is done.
    auto_task = asyncio.create_task(auto_train())
    try:
        await input_loop()
    finally:
        auto_task.cancel()
        try:
            await auto_task
        except asyncio.CancelledError:
            pass
def _sender_name(sender):
if sender is None:
return "Unknown"
if isinstance(sender, User):
parts = [sender.first_name or '', sender.last_name or '']
name = ' '.join(p for p in parts if p)
return name or sender.username or f'User#{sender.id}'
if hasattr(sender, 'title'):
return sender.title or f'Chat#{sender.id}'
return f'#{getattr(sender, "id", "?")}'
def _is_bot(sender):
    """Return the sender's ``bot`` flag for a ``User``; False for anything else."""
    if not isinstance(sender, User):
        return False
    return sender.bot
# --- main ----------------------------------------------------------------------
# where the threads converge and the magic begins.
# or crashes. usually crashes first, then magic.
def main():
    """Command-line entry point.

    Modes:
        --train-only FILE  train offline on a text file for --steps steps,
                           then save tokenizer and checkpoint.
        --generate PROMPT  print a completion from the saved model.
        (neither)          run the interactive Telegram client.
    """
    parser = argparse.ArgumentParser(description='nanodurov — telegram chat that learns')
    parser.add_argument('--generate', type=str, help='generate from prompt (offline)')
    parser.add_argument('--train-only', type=str, help='train on text file (no telegram)')
    parser.add_argument('--steps', type=int, default=200, help='training steps')
    parser.add_argument('--device', type=str, default='cpu', help='cpu or cuda or mps')
    args = parser.parse_args()

    # init: restore tokenizer and checkpoint from the working directory
    bpe = BPE(max_merges=256)
    bpe.load('nanodurov_bpe.bin')
    trainer = Trainer(bpe, device=args.device)
    trainer.load('nanodurov_ckpt.pt')

    if args.train_only:
        # offline training on text file
        print(f"[main] training on {args.train_only}")
        with open(args.train_only, 'r', encoding='utf-8') as f:
            text = f.read()
        bpe.ingest(text)
        trainer.tokenize()
        # Run exactly args.steps steps in rounds. A round of 0 steps must
        # never be requested: train() treats 0 as "use the default round
        # size" and would silently add a whole extra round.
        remaining = args.steps
        while remaining > 0:
            round_steps = min(TRAIN_STEPS_PER_ROUND, remaining)
            trainer.train(steps=round_steps)
            remaining -= round_steps
        bpe.save('nanodurov_bpe.bin')
        trainer.save('nanodurov_ckpt.pt')
        print("[main] done.")
        return

    if args.generate:
        text = trainer.generate(args.generate)
        print(text)
        return

    # telegram mode
    if not TELETHON_AVAILABLE:
        print("pip install telethon")
        sys.exit(1)
    if not TORCH_AVAILABLE:
        print("pip install torch")
        sys.exit(1)
    print("""
╔═══════════════════════════════════════════════════╗
║ n a n o d u r o v ║
║ telegram client that learns from chat ║
║ one file. one model. one act of defiance. ║
╚═══════════════════════════════════════════════════╝
""")
    asyncio.run(run_telegram(trainer))


if __name__ == '__main__':
    main()
/*
* train_nanodurov.c — Train a 15.7M BPE LLaMA on Arianna dataset via notorch
*
* Architecture (matches Leo 18M):
* dim=384, layers=8, heads=8, head_dim=48, ffn=1024
* vocab=2048 (BPE), ctx=256, RoPE, MHA, SwiGLU, RMSNorm
*
* Dataset: arianna_dataset_final_clean.txt (1.2MB, Q/A philosophy)
* Tokenizer: arianna_bpe_merges.txt (1792 merges, vocab 2048)
* Karpathy: ~1.1MB + ~15M params → 15K steps
*
* Build: make train_nanodurov
* Run: ./train_nanodurov [steps] [lr]
* ./train_nanodurov --resume [steps] [lr]
*/
#include "notorch.h"
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#define DIM 384
#define NLAYERS 8
#define NHEADS 8
#define HEAD_DIM (DIM / NHEADS) /* 48 */
#define HIDDEN 1024
#define CTX 256
#define VOCAB 2048
#define CKPT_EVERY 1000
#define EVAL_SEQS 32
#define LOG_EVERY 100
#define CKPT_PREFIX "nanodurov_ckpt"
/*
 * All parameter tensors of the model: one token embedding, NLAYERS
 * transformer layers, a final norm gain and the output head.
 */
typedef struct {
    nt_tensor *wte;            /* token embedding, [VOCAB, DIM] */
    struct {
        nt_tensor *rms1;       /* norm gain before attention, [DIM] */
        nt_tensor *wq;         /* query projection, [DIM, DIM] */
        nt_tensor *wk;         /* key projection, [DIM, DIM] */
        nt_tensor *wv;         /* value projection, [DIM, DIM] */
        nt_tensor *wo;         /* attention output projection, [DIM, DIM] */
        nt_tensor *rms2;       /* norm gain before the MLP, [DIM] */
        nt_tensor *w_gate;     /* SwiGLU gate, [HIDDEN, DIM] */
        nt_tensor *w_up;       /* SwiGLU up projection, [HIDDEN, DIM] */
        nt_tensor *w_down;     /* SwiGLU down projection, [DIM, HIDDEN] */
    } L[NLAYERS];
    nt_tensor *rms_f;          /* final norm gain, [DIM] */
    nt_tensor *head;           /* output head, [VOCAB, DIM] */
} Model;
/* Sum of the `len` fields of every tensor in the model. */
static long count_params(Model* m) {
    long total = m->wte->len + m->rms_f->len + m->head->len;
    int layer = 0;
    while (layer < NLAYERS) {
        /* norm gains */
        total += m->L[layer].rms1->len + m->L[layer].rms2->len;
        /* attention projections */
        total += m->L[layer].wq->len + m->L[layer].wk->len + m->L[layer].wv->len + m->L[layer].wo->len;
        /* feed-forward projections */
        total += m->L[layer].w_gate->len + m->L[layer].w_up->len + m->L[layer].w_down->len;
        layer++;
    }
    return total;
}
/*
 * Allocate a model and initialise every tensor: norm gains are filled
 * with 1.0, all matrices go through nt_tensor_xavier(), and the two
 * projections that write into the residual stream (wo, w_down) are then
 * rescaled by rs / 0.1.
 *
 * The order of the nt_tensor_xavier() calls is kept fixed; it presumably
 * matters for reproducibility under a fixed seed. The caller owns the
 * result and releases it with model_free().
 */
static Model* model_new(void) {
    Model* m = (Model*)calloc(1, sizeof(Model));

    m->wte = nt_tensor_new2d(VOCAB, DIM);
    nt_tensor_xavier(m->wte, VOCAB, DIM);

    float rs = 0.02f / sqrtf(2.0f * NLAYERS);

    for (int l = 0; l < NLAYERS; l++) {
        m->L[l].rms1 = nt_tensor_new(DIM);
        nt_tensor_fill(m->L[l].rms1, 1.0f);

        m->L[l].wq = nt_tensor_new2d(DIM, DIM);
        nt_tensor_xavier(m->L[l].wq, DIM, DIM);
        m->L[l].wk = nt_tensor_new2d(DIM, DIM);
        nt_tensor_xavier(m->L[l].wk, DIM, DIM);
        m->L[l].wv = nt_tensor_new2d(DIM, DIM);
        nt_tensor_xavier(m->L[l].wv, DIM, DIM);
        m->L[l].wo = nt_tensor_new2d(DIM, DIM);
        nt_tensor_xavier(m->L[l].wo, DIM, DIM);
        for (int i = 0; i < m->L[l].wo->len; i++) {
            m->L[l].wo->data[i] *= rs / 0.1f;
        }

        m->L[l].rms2 = nt_tensor_new(DIM);
        nt_tensor_fill(m->L[l].rms2, 1.0f);

        m->L[l].w_gate = nt_tensor_new2d(HIDDEN, DIM);
        nt_tensor_xavier(m->L[l].w_gate, DIM, HIDDEN);
        m->L[l].w_up = nt_tensor_new2d(HIDDEN, DIM);
        nt_tensor_xavier(m->L[l].w_up, DIM, HIDDEN);
        m->L[l].w_down = nt_tensor_new2d(DIM, HIDDEN);
        nt_tensor_xavier(m->L[l].w_down, HIDDEN, DIM);
        for (int i = 0; i < m->L[l].w_down->len; i++) {
            m->L[l].w_down->data[i] *= rs / 0.1f;
        }
    }

    m->rms_f = nt_tensor_new(DIM);
    nt_tensor_fill(m->rms_f, 1.0f);

    m->head = nt_tensor_new2d(VOCAB, DIM);
    nt_tensor_xavier(m->head, DIM, VOCAB);

    return m;
}
/* Release every tensor of the model, then the model struct itself. */
static void model_free(Model* m) {
    nt_tensor_free(m->wte);

    for (int layer = 0; layer < NLAYERS; layer++) {
        nt_tensor_free(m->L[layer].rms1);
        nt_tensor_free(m->L[layer].rms2);
        nt_tensor_free(m->L[layer].wq);
        nt_tensor_free(m->L[layer].wk);
        nt_tensor_free(m->L[layer].wv);
        nt_tensor_free(m->L[layer].wo);
        nt_tensor_free(m->L[layer].w_gate);
        nt_tensor_free(m->L[layer].w_up);
        nt_tensor_free(m->L[layer].w_down);
    }

    nt_tensor_free(m->rms_f);
    nt_tensor_free(m->head);
    free(m);
}
/* ── Save / Load ── */
/* Number of tensors in a Model: the embedding, 9 per layer, and the
 * final norm gain plus the head. */
static int model_n_tensors(void) {
    enum { EMBEDDING_TENSORS = 1, TENSORS_PER_LAYER = 9, TAIL_TENSORS = 2 };
    return EMBEDDING_TENSORS + NLAYERS * TENSORS_PER_LAYER + TAIL_TENSORS;
}
/*
 * Build a flat array of pointers to the model's tensors in a fixed
 * order: wte, then per layer rms1, wq, wk, wv, wo, rms2, w_gate, w_up,
 * w_down, then rms_f and head. The array has model_n_tensors() entries;
 * the caller frees the array (not the tensors) with free().
 */
static nt_tensor** model_param_array(Model* m) {
    const int total = model_n_tensors();
    nt_tensor** params = (nt_tensor**)malloc(total * sizeof(nt_tensor*));
    nt_tensor** cursor = params;

    *cursor++ = m->wte;
    for (int layer = 0; layer < NLAYERS; layer++) {
        *cursor++ = m->L[layer].rms1;
        *cursor++ = m->L[layer].wq;
        *cursor++ = m->L[layer].wk;
        *cursor++ = m->L[layer].wv;
        *cursor++ = m->L[layer].wo;
        *cursor++ = m->L[layer].rms2;
        *cursor++ = m->L[layer].w_gate;
        *cursor++ = m->L[layer].w_up;
        *cursor++ = m->L[layer].w_down;
    }
    *cursor++ = m->rms_f;
    *cursor++ = m->head;

    return params;
}
/* Write all model tensors to "<prefix>.bin" via nt_save(). */
static void save_model(Model* m, const char* prefix) {
    char filename[256];
    snprintf(filename, sizeof(filename), "%s.bin", prefix);

    nt_tensor** params = model_param_array(m);
    const int count = model_n_tensors();
    nt_save(filename, params, count);
    free(params);
}
/*
 * Write a resumable checkpoint: the weights go to CKPT_PREFIX ".bin" and
 * a two-line text file CKPT_PREFIX ".meta" records the step and the best
 * loss. A failure to open the meta file is silently ignored.
 */
static void save_checkpoint(Model* m, int step, float best_loss) {
    save_model(m, CKPT_PREFIX);

    char meta_path[256];
    snprintf(meta_path, sizeof(meta_path), "%s.meta", CKPT_PREFIX);

    FILE* meta = fopen(meta_path, "w");
    if (!meta) {
        return;
    }
    fprintf(meta, "%d\n%.6f\n", step, best_loss);
    fclose(meta);
}
/*
 * Restore weights and training metadata written by save_checkpoint().
 *
 * m         - model whose tensors are overwritten on success.
 * best_loss - out: best loss from the meta file, or 99.0 when the meta
 *             file is missing or unreadable.
 *
 * Returns the step stored in the meta file (0 when it is missing), or -1
 * when the weight file cannot be loaded or does not match this model's
 * tensor count or tensor sizes. On -1 the model is left untouched.
 */
static int load_checkpoint(Model* m, float* best_loss) {
    char wpath[256], mpath[256];
    snprintf(wpath, sizeof(wpath), "%s.bin", CKPT_PREFIX);
    snprintf(mpath, sizeof(mpath), "%s.meta", CKPT_PREFIX);

    int n_loaded = 0;
    nt_tensor** loaded = nt_load(wpath, &n_loaded);
    if (!loaded) return -1;

    int expected = model_n_tensors();
    nt_tensor** mp = model_param_array(m);

    /* Validate everything before copying anything: a checkpoint from a
     * differently shaped model must neither be read out of bounds nor
     * leave the model half overwritten. */
    int ok = (mp != NULL) && (n_loaded == expected);
    for (int i = 0; ok && i < expected; i++) {
        if (loaded[i]->len != mp[i]->len) ok = 0;
    }

    if (ok) {
        for (int i = 0; i < expected; i++) {
            memcpy(mp[i]->data, loaded[i]->data, mp[i]->len * sizeof(float));
        }
    }

    for (int i = 0; i < n_loaded; i++) nt_tensor_free(loaded[i]);
    free(loaded);
    free(mp);
    if (!ok) return -1;

    int step = 0;
    *best_loss = 99.0f;
    FILE* f = fopen(mpath, "r");
    if (f) {
        /* Fields that fail to parse keep the defaults set above. */
        if (fscanf(f, "%d\n%f\n", &step, best_loss) != 2) {
            printf("warning: %s is incomplete, using defaults for missing fields\n", mpath);
        }
        fclose(f);
    }
    return step;
}
/* ── Forward ── */
/*
 * Record one forward pass over a CTX-token window on the current tape.
 *
 * m       - model whose tensors are registered as tape parameters.
 * tokens  - CTX input token ids.
 * targets - CTX target token ids for the cross-entropy loss.
 *
 * Returns the tape index of the nt_seq_cross_entropy() entry. The caller
 * must have started a tape (nt_tape_start) and clears it afterwards.
 * NOTE(review): tape indices depend on the order of the nt_* calls below,
 * so statements here should not be reordered without checking notorch.
 */
static int forward(Model* m, int* tokens, int* targets) {
/* Register every parameter tensor; the embedding is flagged no-decay. */
int wte_i = nt_tape_param(m->wte); nt_tape_no_decay(wte_i);
/* li[l][0..8]: rms1, wq, wk, wv, wo, rms2, w_gate, w_up, w_down of layer l. */
int li[NLAYERS][9];
for (int l = 0; l < NLAYERS; l++) {
li[l][0] = nt_tape_param(m->L[l].rms1);
li[l][1] = nt_tape_param(m->L[l].wq);
li[l][2] = nt_tape_param(m->L[l].wk);
li[l][3] = nt_tape_param(m->L[l].wv);
li[l][4] = nt_tape_param(m->L[l].wo);
li[l][5] = nt_tape_param(m->L[l].rms2);
li[l][6] = nt_tape_param(m->L[l].w_gate);
li[l][7] = nt_tape_param(m->L[l].w_up);
li[l][8] = nt_tape_param(m->L[l].w_down);
}
int rmsf_i = nt_tape_param(m->rms_f);
int head_i = nt_tape_param(m->head);
/* Token and target ids are stored as floats in temporary tensors and
 * recorded as plain inputs. The temporaries are freed right after
 * recording, so nt_tape_record presumably copies or retains them
 * (TODO confirm against notorch). */
nt_tensor* tok_t = nt_tensor_new(CTX);
nt_tensor* tgt_t = nt_tensor_new(CTX);
for (int i = 0; i < CTX; i++) { tok_t->data[i] = (float)tokens[i]; tgt_t->data[i] = (float)targets[i]; }
int tok_i = nt_tape_record(tok_t, NT_OP_NONE, -1, -1, 0);
int tgt_i = nt_tape_record(tgt_t, NT_OP_NONE, -1, -1, 0);
nt_tensor_free(tok_t); nt_tensor_free(tgt_t);
/* Token embedding only — RoPE handles positions */
int h = nt_seq_embedding(wte_i, -1, tok_i, CTX, DIM);
for (int l = 0; l < NLAYERS; l++) {
/* Attention sub-layer: norm, q/k/v projections, RoPE on q and k,
 * causal multi-head attention, output projection, residual add. */
int xn = nt_seq_rmsnorm(h, li[l][0], CTX, DIM);
int q = nt_seq_linear(li[l][1], xn, CTX);
int k = nt_seq_linear(li[l][2], xn, CTX);
int v = nt_seq_linear(li[l][3], xn, CTX);
q = nt_rope(q, CTX, HEAD_DIM);
k = nt_rope(k, CTX, HEAD_DIM);
int attn = nt_mh_causal_attention(q, k, v, CTX, HEAD_DIM);
int proj = nt_seq_linear(li[l][4], attn, CTX);
h = nt_add(h, proj);
/* Feed-forward sub-layer (SwiGLU): down(silu(gate(x)) * up(x)), residual add. */
xn = nt_seq_rmsnorm(h, li[l][5], CTX, DIM);
int gate = nt_silu(nt_seq_linear(li[l][6], xn, CTX));
int up = nt_seq_linear(li[l][7], xn, CTX);
int down = nt_seq_linear(li[l][8], nt_mul(gate, up), CTX);
h = nt_add(h, down);
}
/* Final norm, projection to VOCAB logits, cross-entropy against targets. */
int hf = nt_seq_rmsnorm(h, rmsf_i, CTX, DIM);
int logits = nt_seq_linear(head_i, hf, CTX);
return nt_seq_cross_entropy(logits, tgt_i, CTX, VOCAB);
}
/* ── Eval ── */
/*
 * Average the forward() loss over up to EVAL_SEQS windows taken at evenly
 * spaced offsets of the encoded corpus. Train mode is switched off for
 * each forward pass and back on afterwards. Returns 99.0 when no window
 * fits.
 */
static float eval_loss(Model* m, int* encoded, int n_tokens) {
    float loss_sum = 0;
    int n_windows = 0;
    const int stride = n_tokens / EVAL_SEQS;

    for (int s = 0; s < EVAL_SEQS; s++) {
        const int start = s * stride;
        /* a window needs CTX inputs plus one extra token for the shifted targets */
        if (start + CTX + 1 > n_tokens) {
            break;
        }

        nt_tape_start();
        nt_train_mode(0);
        int loss_idx = forward(m, encoded + start, encoded + start + 1);
        loss_sum += nt_tape_get()->entries[loss_idx].output->data[0];
        n_windows++;
        nt_tape_clear();
        nt_train_mode(1);
    }

    if (n_windows > 0) {
        return loss_sum / n_windows;
    }
    return 99.0f;
}
/* Current wall-clock time from gettimeofday(), in milliseconds. */
static double now_ms(void) {
    struct timeval now;
    gettimeofday(&now, NULL);
    return now.tv_sec * 1000.0 + now.tv_usec / 1000.0;
}
/*
 * Train the model on the BPE-encoded corpus, print three sample
 * generations, and save the weights plus a final checkpoint.
 *
 * Usage: train_nanodurov [--resume] [steps] [lr]
 *   steps defaults to 15000 and lr to 3e-4. --resume continues from the
 *   checkpoint files if they exist.
 *
 * Returns 0 on success, 1 when the tokenizer or corpus cannot be loaded
 * or the corpus is too small to cut a single training window from.
 */
int main(int argc, char** argv) {
    int resume = 0, arg_off = 1;
    if (argc > 1 && strcmp(argv[1], "--resume") == 0) { resume = 1; arg_off = 2; }
    int steps = arg_off < argc ? atoi(argv[arg_off]) : 15000;
    float base_lr = (arg_off+1) < argc ? (float)atof(argv[arg_off+1]) : 3e-4f;

    printf("════════════════════════════════════════════════════════\n");
    printf(" notorch — nanodurov BPE training (Arianna voice)\n");
    printf(" dim=%d L=%d H=%d HD=%d FFN=%d CTX=%d V=%d\n",
           DIM, NLAYERS, NHEADS, HEAD_DIM, HIDDEN, CTX, VOCAB);
    printf(" MHA + RoPE + BPE 2048\n");
    printf(" Chuck optimizer, %d steps, lr=%.1e, warmup=%d\n", steps, base_lr, steps/10);
    printf(" checkpoint every %d steps\n", CKPT_EVERY);
    printf("════════════════════════════════════════════════════════\n");

    /* Load BPE tokenizer */
    nt_bpe bpe;
    int nm = nt_bpe_load(&bpe, "arianna_bpe_merges.txt");
    if (nm < 0) { printf("cannot load arianna_bpe_merges.txt\n"); return 1; }
    printf("bpe: %d merges, vocab %d\n", bpe.n_merges, bpe.vocab_size);

    /* Load and encode dataset */
    const char* path = "/Users/ataeff/Downloads/arianna_dataset_final_clean.txt";
    FILE* f = fopen(path, "rb");
    if (!f) { printf("cannot open %s\n", path); return 1; }
    fseek(f, 0, SEEK_END); long fsize = ftell(f); fseek(f, 0, SEEK_SET);
    if (fsize <= 0) {
        printf("empty or unreadable corpus: %s\n", path);
        fclose(f);
        return 1;
    }
    char* raw = (char*)malloc((size_t)fsize + 1);
    /* One int per input byte is the buffer capacity handed to the encoder below. */
    int* encoded = (int*)malloc((size_t)fsize * sizeof(int));
    if (!raw || !encoded) {
        printf("out of memory loading %s\n", path);
        free(raw); free(encoded); fclose(f);
        return 1;
    }
    size_t got = fread(raw, 1, (size_t)fsize, f);
    fclose(f);
    if (got != (size_t)fsize) {
        printf("short read on %s\n", path);
        free(raw); free(encoded);
        return 1;
    }
    raw[fsize] = 0;
    int n_tokens = nt_bpe_encode(&bpe, raw, (int)fsize, encoded, (int)fsize);
    free(raw);
    /* The training loop draws offsets with rand() % (n_tokens - CTX - 1);
     * that modulus must be at least 1. */
    if (n_tokens < CTX + 2) {
        printf("corpus too small: %d tokens, need at least %d\n", n_tokens, CTX + 2);
        free(encoded);
        return 1;
    }
    printf("corpus: %.1f MB → %d BPE tokens (%.1fx compression)\n",
           fsize/1048576.0, n_tokens, (float)fsize/n_tokens);

    nt_seed(42);
    Model* model = model_new();
    long np = count_params(model);
    printf("model: %ld params (%.1f MB)\n", np, np*4.0f/1048576.0f);
    float tokens_per_epoch = (float)n_tokens / CTX;
    printf("karpathy: %.1fMB, %ldM params, %d steps (%.1f epochs)\n",
           fsize/1048576.0, np/1000000, steps, (float)steps / tokens_per_epoch);

    int start_step = 0;
    float best_loss = 99.0f;
    if (resume) {
        int loaded_step = load_checkpoint(model, &best_loss);
        if (loaded_step >= 0) {
            start_step = loaded_step;
            printf("RESUMED from step %d, best_loss=%.4f\n", start_step, best_loss);
        } else printf("no checkpoint found, starting fresh\n");
    }

    nt_schedule sched = nt_schedule_cosine(base_lr, steps/10, steps, base_lr*0.1f);
    sched.current_step = start_step;
    nt_nan_guard guard = nt_nan_guard_new();

    printf("\ntraining...\n");
    printf("─────────────────────────────────────────────────────\n");
    double t0 = now_ms();
    float first_loss = 0;
    float last_loss = 0;   /* loss of the most recent step that was applied */
    for (int step = start_step; step < steps; step++) {
        float lr = nt_schedule_get_lr(&sched);
        int off = rand() % (n_tokens - CTX - 1);
        nt_tape_start();
        int loss_idx = forward(model, encoded + off, encoded + off + 1);
        float lv = nt_tape_get()->entries[loss_idx].output->data[0];
        if (step == start_step) first_loss = lv;
        if (lv < best_loss) best_loss = lv;
        nt_tape_backward(loss_idx);
        /* Skip the update entirely when the guard reports a problem. */
        if (!nt_nan_guard_check(&guard)) { nt_tape_clear(); continue; }
        last_loss = lv;
        nt_tape_clip_grads(1.0f);
        nt_tape_chuck_step(lr, lv);
        nt_tape_clear();
        if ((step+1) % LOG_EVERY == 0 || step == start_step) {
            printf(" step %5d | train %.4f | best %.4f | lr %.2e | %.1fs\n",
                   step+1, lv, best_loss, lr, (now_ms()-t0)/1000.0);
            fflush(stdout);
        }
        if ((step+1) % CKPT_EVERY == 0 && step > start_step) {
            float val = eval_loss(model, encoded, n_tokens);
            printf(" ──── ckpt %d | val %.4f | saving... ", step+1, val);
            save_checkpoint(model, step+1, best_loss);
            printf("\n"); fflush(stdout);
        }
    }

    float final_val = eval_loss(model, encoded, n_tokens);
    double total_s = (now_ms()-t0)/1000.0;
    printf("─────────────────────────────────────────────────────\n");
    /* first → last, with the best shown separately */
    printf(" train: %.4f → %.4f (best: %.4f)\n", first_loss, last_loss, best_loss);
    printf(" val: %.4f\n", final_val);
    printf(" time: %.0fs (%.1f min) | %.2f steps/s\n", total_s, total_s/60.0, (steps-start_step)/total_s);
    printf(" nans: %d\n", guard.total_nan_count);

    /* Generate */
    printf("\n── generation (temp=0.8) ──\n");
    nt_train_mode(0);
    const char* prompts[] = {
        "Q: Who are you?\nA:",
        "Q: What is consciousness?\nA:",
        "Q: What is love?\nA:"
    };
    for (int p = 0; p < 3; p++) {
        int ctx_tokens[CTX];
        int gen_len = nt_bpe_encode(&bpe, prompts[p], (int)strlen(prompts[p]), ctx_tokens, CTX/2);
        printf("%s", prompts[p]);
        /* The logits row of the last prompt token is read below; without
         * any token there is nothing to condition on. */
        if (gen_len <= 0) { printf("\n\n"); continue; }
        for (int s = 0; s < 100; s++) {
            /* forward() always consumes CTX tokens: pad the tail with 0.
             * Targets are dummies; only the logits are used here. */
            int tokens_pad[CTX], targets_pad[CTX];
            for (int i = 0; i < gen_len; i++) tokens_pad[i] = ctx_tokens[i];
            for (int i = gen_len; i < CTX; i++) tokens_pad[i] = 0;
            memset(targets_pad, 0, sizeof(targets_pad));
            nt_tape_start();
            int loss_idx = forward(model, tokens_pad, targets_pad);
            nt_tape* tape = nt_tape_get();
            /* The loss entry's first parent is the logits entry (see forward()). */
            int logits_idx = tape->entries[loss_idx].parent1;
            float* last = tape->entries[logits_idx].output->data + (gen_len-1)*VOCAB;

            /* Temperature sampling */
            for (int i = 0; i < VOCAB; i++) last[i] /= 0.8f;
            float mx = last[0]; for (int i=1;i<VOCAB;i++) if(last[i]>mx) mx=last[i];
            float sm = 0; for (int i=0;i<VOCAB;i++) { last[i]=expf(last[i]-mx); sm+=last[i]; }
            for (int i=0;i<VOCAB;i++) last[i]/=sm;
            /* Default to the last token: if rounding leaves cum just below
             * r, the draw belongs to the tail of the distribution, not to
             * token 0. */
            float r=(float)rand()/(float)RAND_MAX, cum=0; int next=VOCAB-1;
            for (int i=0;i<VOCAB;i++) { cum+=last[i]; if(cum>=r){next=i;break;} }

            /* Decode single token. The buffer is zero-filled so it stays
             * NUL-terminated as long as the decoder honours the size limit. */
            char decoded[NT_BPE_MAX_TOKEN_LEN + 1] = {0};
            int db = nt_bpe_decode(&bpe, &next, 1, decoded, NT_BPE_MAX_TOKEN_LEN);
            int stop = 0;
            if (db > 0) {
                /* Stop when the token starts the next question ("\nQ") */
                if (strstr(decoded, "\nQ") != NULL) stop = 1;
                else printf("%s", decoded);
            }
            fflush(stdout);
            /* Always release the tape, including on the early-stop path. */
            nt_tape_clear();
            if (stop) break;
            ctx_tokens[gen_len++] = next;
            if (gen_len >= CTX - 1) break;
        }
        printf("\n\n");
    }

    /* Save */
    printf("── saving ──\n");
    save_model(model, "nanodurov_arianna");
    printf(" nanodurov_arianna.bin (%.1f MB)\n", np*4.0f/1048576.0f);
    save_checkpoint(model, steps, best_loss);
    model_free(model); free(encoded);
    printf("\n════════════════════════════════════════════════════════\n");
    printf(" nanodurov trained. %d steps. BPE. RoPE. No Python.\n", steps);
    printf("════════════════════════════════════════════════════════\n");
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment