Last active
April 8, 2026 18:48
-
-
Save ariannamethod/6adaf7a17ef5047d5706492a9bb0da53 to your computer and use it in GitHub Desktop.
nanodurov — telegram client that trains a language model on chat. python + C (notorch) + browser. 15.7M BPE, Arianna voice.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| * infer_nanodurov.c — Interactive chat with nanodurov (BPE 15.7M on notorch) | |
| * | |
| * Build: make infer_nanodurov | |
| * Run: ./infer_nanodurov [weights.bin] [merges.txt] | |
| * | |
| * Default: nanodurov_arianna.bin + arianna_bpe_merges.txt | |
| */ | |
| #include "notorch.h" | |
| #include <stdio.h> | |
| #include <string.h> | |
| #include <math.h> | |
| #include <time.h> | |
| #define DIM 384 | |
| #define NLAYERS 8 | |
| #define NHEADS 8 | |
| #define HEAD_DIM (DIM / NHEADS) | |
| #define HIDDEN 1024 | |
| #define CTX 256 | |
| #define VOCAB 2048 | |
/*
 * Every weight tensor used by the forward pass.
 * The field order (wte, then per layer rms1, wq, wk, wv, wo, rms2, w_gate,
 * w_up, w_down, then rms_f, head) is also the order in which the loaders in
 * this file map checkpoint tensors onto the model.
 */
typedef struct {
    nt_tensor *wte;                                  /* token embedding table, one DIM-wide row per token id */
    struct {
        nt_tensor *rms1, *wq, *wk, *wv, *wo, *rms2;  /* attention norm, Q/K/V/output projections, FFN norm */
        nt_tensor *w_gate, *w_up, *w_down;           /* gated feed-forward projections */
    } L[NLAYERS];
    nt_tensor *rms_f, *head;                         /* final norm weights and output (logit) projection */
} Model;
| static int model_n_tensors(void) { return 1 + NLAYERS * 9 + 2; } | |
| static Model* model_new(void) { | |
| Model* m = (Model*)calloc(1, sizeof(Model)); | |
| m->wte = nt_tensor_new2d(VOCAB, DIM); | |
| for (int l = 0; l < NLAYERS; l++) { | |
| m->L[l].rms1 = nt_tensor_new(DIM); | |
| m->L[l].wq = nt_tensor_new2d(DIM, DIM); | |
| m->L[l].wk = nt_tensor_new2d(DIM, DIM); | |
| m->L[l].wv = nt_tensor_new2d(DIM, DIM); | |
| m->L[l].wo = nt_tensor_new2d(DIM, DIM); | |
| m->L[l].rms2 = nt_tensor_new(DIM); | |
| m->L[l].w_gate = nt_tensor_new2d(HIDDEN, DIM); | |
| m->L[l].w_up = nt_tensor_new2d(HIDDEN, DIM); | |
| m->L[l].w_down = nt_tensor_new2d(DIM, HIDDEN); | |
| } | |
| m->rms_f = nt_tensor_new(DIM); | |
| m->head = nt_tensor_new2d(VOCAB, DIM); | |
| return m; | |
| } | |
| static void model_free(Model* m) { | |
| nt_tensor_free(m->wte); | |
| for (int l = 0; l < NLAYERS; l++) { | |
| nt_tensor_free(m->L[l].rms1); nt_tensor_free(m->L[l].rms2); | |
| nt_tensor_free(m->L[l].wq); nt_tensor_free(m->L[l].wk); | |
| nt_tensor_free(m->L[l].wv); nt_tensor_free(m->L[l].wo); | |
| nt_tensor_free(m->L[l].w_gate); nt_tensor_free(m->L[l].w_up); | |
| nt_tensor_free(m->L[l].w_down); | |
| } | |
| nt_tensor_free(m->rms_f); nt_tensor_free(m->head); free(m); | |
| } | |
/*
 * FP16 -> FP32: widen an IEEE 754 binary16 bit pattern to a float.
 *
 * Handles all encodings: signed zero, subnormals (previously flushed to
 * zero), normals, infinities and NaNs. The sign bit is widened to 32 bits
 * before shifting so the shift never touches the sign bit of a signed int.
 */
static float f16_to_f32(uint16_t h) {
    const uint32_t sign = ((uint32_t)h & 0x8000u) << 16;
    const uint32_t exp  = ((uint32_t)h >> 10) & 0x1Fu;
    uint32_t mant = (uint32_t)h & 0x3FFu;
    uint32_t bits;

    if (exp == 0) {
        if (mant == 0) {
            bits = sign;                               /* +0 / -0 */
        } else {
            /* Subnormal: value is mant * 2^-24. Shift the mantissa up until
             * its implicit leading 1 appears, lowering the exponent to match. */
            uint32_t e = 127 - 15 + 1;
            while ((mant & 0x400u) == 0) {
                mant <<= 1;
                e--;
            }
            mant &= 0x3FFu;                            /* drop the now-implicit bit */
            bits = sign | (e << 23) | (mant << 13);
        }
    } else if (exp == 31) {
        bits = sign | 0x7F800000u | (mant << 13);      /* inf, or NaN when mant != 0 */
    } else {
        bits = sign | ((exp - 15 + 127) << 23) | (mant << 13);
    }

    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
}
| static int load_weights_f16(Model* m, const char* path) { | |
| FILE* f = fopen(path, "rb"); | |
| if (!f) return -1; | |
| uint32_t magic; int n; | |
| fread(&magic, 4, 1, f); fread(&n, 4, 1, f); | |
| if (magic != 0x3631544E) { fclose(f); return -1; } /* "NT16" */ | |
| int expected = model_n_tensors(); | |
| if (n != expected) { fclose(f); return -1; } | |
| nt_tensor* params[75]; | |
| int pi = 0; | |
| params[pi++] = m->wte; | |
| for (int l = 0; l < NLAYERS; l++) { | |
| params[pi++]=m->L[l].rms1; params[pi++]=m->L[l].wq; params[pi++]=m->L[l].wk; | |
| params[pi++]=m->L[l].wv; params[pi++]=m->L[l].wo; params[pi++]=m->L[l].rms2; | |
| params[pi++]=m->L[l].w_gate; params[pi++]=m->L[l].w_up; params[pi++]=m->L[l].w_down; | |
| } | |
| params[pi++] = m->rms_f; params[pi++] = m->head; | |
| for (int t = 0; t < expected; t++) { | |
| int ndim; fread(&ndim, 4, 1, f); | |
| for (int d = 0; d < ndim; d++) { int s; fread(&s, 4, 1, f); } | |
| for (int i = 0; i < params[t]->len; i++) { | |
| uint16_t h; fread(&h, 2, 1, f); | |
| params[t]->data[i] = f16_to_f32(h); | |
| } | |
| } | |
| fclose(f); | |
| return 0; | |
| } | |
| static int load_weights(Model* m, const char* path) { | |
| /* Try FP16 first */ | |
| if (load_weights_f16(m, path) == 0) { printf("loaded FP16 weights\n"); return 0; } | |
| /* Fallback to FP32 (notorch format) */ | |
| int n_loaded = 0; | |
| nt_tensor** loaded = nt_load(path, &n_loaded); | |
| if (!loaded) return -1; | |
| int expected = model_n_tensors(); | |
| if (n_loaded != expected) { | |
| printf("WARN: expected %d tensors, got %d\n", expected, n_loaded); | |
| for (int i = 0; i < n_loaded; i++) nt_tensor_free(loaded[i]); | |
| free(loaded); return -1; | |
| } | |
| nt_tensor* params[] = { | |
| m->wte, | |
| m->L[0].rms1, m->L[0].wq, m->L[0].wk, m->L[0].wv, m->L[0].wo, m->L[0].rms2, | |
| m->L[0].w_gate, m->L[0].w_up, m->L[0].w_down, | |
| m->L[1].rms1, m->L[1].wq, m->L[1].wk, m->L[1].wv, m->L[1].wo, m->L[1].rms2, | |
| m->L[1].w_gate, m->L[1].w_up, m->L[1].w_down, | |
| m->L[2].rms1, m->L[2].wq, m->L[2].wk, m->L[2].wv, m->L[2].wo, m->L[2].rms2, | |
| m->L[2].w_gate, m->L[2].w_up, m->L[2].w_down, | |
| m->L[3].rms1, m->L[3].wq, m->L[3].wk, m->L[3].wv, m->L[3].wo, m->L[3].rms2, | |
| m->L[3].w_gate, m->L[3].w_up, m->L[3].w_down, | |
| m->L[4].rms1, m->L[4].wq, m->L[4].wk, m->L[4].wv, m->L[4].wo, m->L[4].rms2, | |
| m->L[4].w_gate, m->L[4].w_up, m->L[4].w_down, | |
| m->L[5].rms1, m->L[5].wq, m->L[5].wk, m->L[5].wv, m->L[5].wo, m->L[5].rms2, | |
| m->L[5].w_gate, m->L[5].w_up, m->L[5].w_down, | |
| m->L[6].rms1, m->L[6].wq, m->L[6].wk, m->L[6].wv, m->L[6].wo, m->L[6].rms2, | |
| m->L[6].w_gate, m->L[6].w_up, m->L[6].w_down, | |
| m->L[7].rms1, m->L[7].wq, m->L[7].wk, m->L[7].wv, m->L[7].wo, m->L[7].rms2, | |
| m->L[7].w_gate, m->L[7].w_up, m->L[7].w_down, | |
| m->rms_f, m->head | |
| }; | |
| for (int i = 0; i < expected; i++) { | |
| memcpy(params[i]->data, loaded[i]->data, params[i]->len * sizeof(float)); | |
| nt_tensor_free(loaded[i]); | |
| } | |
| free(loaded); | |
| return 0; | |
| } | |
| /* ── Forward (inference only, no tape) ── */ | |
| static void rmsnorm(float* out, const float* x, const float* w, int d) { | |
| float ss = 0; | |
| for (int i = 0; i < d; i++) ss += x[i] * x[i]; | |
| ss = 1.0f / sqrtf(ss / d + 1e-5f); | |
| for (int i = 0; i < d; i++) out[i] = x[i] * ss * w[i]; | |
| } | |
/*
 * Matrix-vector product: out = W * x.
 * `w` is row-major with `out_d` rows of `in_d` columns; `x` has `in_d`
 * entries and `out` receives `out_d` entries.
 */
static void matmul(float* out, const float* x, const float* w, int out_d, int in_d) {
    const float *row = w;
    for (int r = 0; r < out_d; r++, row += in_d) {
        float acc = 0;
        for (int c = 0; c < in_d; c++) {
            acc += row[c] * x[c];
        }
        out[r] = acc;
    }
}
| static void rope(float* x, int pos, int dim, int head_dim) { | |
| for (int h = 0; h < dim / head_dim; h++) { | |
| for (int i = 0; i < head_dim / 2; i++) { | |
| float freq = 1.0f / powf(10000.0f, (float)(2 * i) / head_dim); | |
| float theta = pos * freq; | |
| float cs = cosf(theta), sn = sinf(theta); | |
| int idx = h * head_dim + i * 2; | |
| float x0 = x[idx], x1 = x[idx + 1]; | |
| x[idx] = x0 * cs - x1 * sn; | |
| x[idx + 1] = x0 * sn + x1 * cs; | |
| } | |
| } | |
| } | |
| static void softmax(float* x, int n) { | |
| float mx = x[0]; for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i]; | |
| float sm = 0; for (int i = 0; i < n; i++) { x[i] = expf(x[i] - mx); sm += x[i]; } | |
| for (int i = 0; i < n; i++) x[i] /= sm; | |
| } | |
/* KV cache: for every layer and position, the rotated key and the value
 * vector produced by forward_pos. Position `pos` is written when that token
 * is processed; positions 0..pos are read back by attention. */
static float kv_k[NLAYERS][CTX][DIM];
static float kv_v[NLAYERS][CTX][DIM];
| static void forward_pos(Model* m, int token, int pos, float* logits) { | |
| float x[DIM], xn[DIM], q[DIM], k[DIM], v[DIM], attn_out[DIM]; | |
| float gate[HIDDEN], up[HIDDEN], down[DIM], ffn_out[DIM]; | |
| /* Token embedding */ | |
| memcpy(x, m->wte->data + token * DIM, DIM * sizeof(float)); | |
| for (int l = 0; l < NLAYERS; l++) { | |
| /* Attn norm */ | |
| rmsnorm(xn, x, m->L[l].rms1->data, DIM); | |
| /* QKV */ | |
| matmul(q, xn, m->L[l].wq->data, DIM, DIM); | |
| matmul(k, xn, m->L[l].wk->data, DIM, DIM); | |
| matmul(v, xn, m->L[l].wv->data, DIM, DIM); | |
| /* RoPE */ | |
| rope(q, pos, DIM, HEAD_DIM); | |
| rope(k, pos, DIM, HEAD_DIM); | |
| /* Store in KV cache */ | |
| memcpy(kv_k[l][pos], k, DIM * sizeof(float)); | |
| memcpy(kv_v[l][pos], v, DIM * sizeof(float)); | |
| /* Multi-head attention */ | |
| float scale = 1.0f / sqrtf((float)HEAD_DIM); | |
| memset(attn_out, 0, DIM * sizeof(float)); | |
| for (int h = 0; h < NHEADS; h++) { | |
| int ho = h * HEAD_DIM; | |
| float scores[CTX]; | |
| for (int j = 0; j <= pos; j++) { | |
| float dot = 0; | |
| for (int d = 0; d < HEAD_DIM; d++) dot += q[ho + d] * kv_k[l][j][ho + d]; | |
| scores[j] = dot * scale; | |
| } | |
| /* Softmax over 0..pos */ | |
| float mx = scores[0]; | |
| for (int j = 1; j <= pos; j++) if (scores[j] > mx) mx = scores[j]; | |
| float sm = 0; | |
| for (int j = 0; j <= pos; j++) { scores[j] = expf(scores[j] - mx); sm += scores[j]; } | |
| for (int j = 0; j <= pos; j++) scores[j] /= sm; | |
| /* Weighted sum of values */ | |
| for (int j = 0; j <= pos; j++) | |
| for (int d = 0; d < HEAD_DIM; d++) | |
| attn_out[ho + d] += scores[j] * kv_v[l][j][ho + d]; | |
| } | |
| /* Output projection + residual */ | |
| float proj[DIM]; | |
| matmul(proj, attn_out, m->L[l].wo->data, DIM, DIM); | |
| for (int i = 0; i < DIM; i++) x[i] += proj[i]; | |
| /* FFN norm */ | |
| rmsnorm(xn, x, m->L[l].rms2->data, DIM); | |
| /* SwiGLU FFN */ | |
| matmul(gate, xn, m->L[l].w_gate->data, HIDDEN, DIM); | |
| matmul(up, xn, m->L[l].w_up->data, HIDDEN, DIM); | |
| for (int i = 0; i < HIDDEN; i++) | |
| gate[i] = gate[i] / (1.0f + expf(-gate[i])) * up[i]; /* SiLU(gate) * up */ | |
| matmul(down, gate, m->L[l].w_down->data, DIM, HIDDEN); | |
| for (int i = 0; i < DIM; i++) x[i] += down[i]; | |
| } | |
| /* Final norm + lm_head */ | |
| rmsnorm(xn, x, m->rms_f->data, DIM); | |
| matmul(logits, xn, m->head->data, VOCAB, DIM); | |
| } | |
| static int sample(float* logits, float temperature, int top_k) { | |
| for (int i = 0; i < VOCAB; i++) logits[i] /= temperature; | |
| /* Top-k: find k-th largest, zero out rest */ | |
| if (top_k > 0 && top_k < VOCAB) { | |
| float threshold = -1e30f; | |
| float tmp[VOCAB]; | |
| memcpy(tmp, logits, VOCAB * sizeof(float)); | |
| for (int k = 0; k < top_k; k++) { | |
| float mx = -1e30f; int mi = 0; | |
| for (int i = 0; i < VOCAB; i++) if (tmp[i] > mx) { mx = tmp[i]; mi = i; } | |
| threshold = mx; | |
| tmp[mi] = -1e30f; | |
| } | |
| for (int i = 0; i < VOCAB; i++) if (logits[i] < threshold) logits[i] = -1e30f; | |
| } | |
| softmax(logits, VOCAB); | |
| float r = (float)rand() / (float)RAND_MAX, cum = 0; | |
| for (int i = 0; i < VOCAB; i++) { cum += logits[i]; if (cum >= r) return i; } | |
| return VOCAB - 1; | |
| } | |
| int main(int argc, char** argv) { | |
| const char* weights_path = argc > 1 ? argv[1] : "nanodurov_arianna.bin"; | |
| const char* merges_path = argc > 2 ? argv[2] : "arianna_bpe_merges.txt"; | |
| srand((unsigned)time(NULL)); | |
| printf("════════════════════════════════════════════════════════\n"); | |
| printf(" nanodurov — Arianna voice (15.7M, BPE, notorch)\n"); | |
| printf("════════════════════════════════════════════════════════\n"); | |
| /* Load BPE */ | |
| nt_bpe bpe; | |
| if (nt_bpe_load(&bpe, merges_path) < 0) { | |
| printf("cannot load %s\n", merges_path); return 1; | |
| } | |
| printf("bpe: %d merges, vocab %d\n", bpe.n_merges, bpe.vocab_size); | |
| /* Load model */ | |
| Model* model = model_new(); | |
| if (load_weights(model, weights_path) < 0) { | |
| printf("cannot load %s\n", weights_path); return 1; | |
| } | |
| printf("model loaded: %s\n", weights_path); | |
| printf("────────────────────────────────────────────────────\n"); | |
| printf(" type your message (or 'quit' to exit)\n"); | |
| printf("────────────────────────────────────────────────────\n\n"); | |
| char input[4096]; | |
| while (1) { | |
| printf("You: "); | |
| fflush(stdout); | |
| if (!fgets(input, sizeof(input), stdin)) break; | |
| int len = (int)strlen(input); | |
| while (len > 0 && (input[len-1] == '\n' || input[len-1] == '\r')) input[--len] = 0; | |
| if (len == 0) continue; | |
| if (strcmp(input, "quit") == 0 || strcmp(input, "exit") == 0) break; | |
| /* Build prompt: "Q: {input}\nA:" */ | |
| char prompt[4096]; | |
| snprintf(prompt, sizeof(prompt), "Q: %s\nA:", input); | |
| int tokens[CTX]; | |
| int n = nt_bpe_encode(&bpe, prompt, (int)strlen(prompt), tokens, CTX / 2); | |
| /* Generate */ | |
| printf("Arianna: "); | |
| fflush(stdout); | |
| /* Prefill */ | |
| float logits[VOCAB]; | |
| for (int i = 0; i < n; i++) | |
| forward_pos(model, tokens[i], i, logits); | |
| /* Decode */ | |
| int pos = n; | |
| for (int s = 0; s < CTX - n; s++) { | |
| int next = sample(logits, 0.8f, 40); | |
| tokens[pos] = next; | |
| /* Decode token and print */ | |
| char decoded[NT_BPE_MAX_TOKEN_LEN + 1]; | |
| nt_bpe_decode(&bpe, &next, 1, decoded, NT_BPE_MAX_TOKEN_LEN); | |
| /* Stop on Q: boundary */ | |
| if (strstr(decoded, "\nQ") != NULL || strstr(decoded, "\n\n") != NULL) break; | |
| printf("%s", decoded); | |
| fflush(stdout); | |
| /* Next step */ | |
| forward_pos(model, next, pos, logits); | |
| pos++; | |
| if (pos >= CTX) break; | |
| } | |
| printf("\n\n"); | |
| } | |
| model_free(model); | |
| printf("\n bye.\n"); | |
| return 0; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <!DOCTYPE html> | |
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>nanodurov — Arianna voice</title> | |
| <style> | |
| * { margin: 0; padding: 0; box-sizing: border-box; } | |
| body { background: #0a0a0a; color: #e0e0e0; font-family: 'Courier New', monospace; height: 100vh; display: flex; flex-direction: column; } | |
| #header { padding: 12px 20px; border-bottom: 1px solid #222; font-size: 13px; color: #666; } | |
| #header span { color: #888; } | |
| #header .title { color: #e0e0e0; font-weight: bold; } | |
| #chat { flex: 1; overflow-y: auto; padding: 20px; } | |
| .msg { margin-bottom: 16px; line-height: 1.5; } | |
| .msg .who { font-weight: bold; margin-bottom: 2px; } | |
| .msg.user .who { color: #4a9eff; } | |
| .msg.ai .who { color: #ff6b9d; } | |
| .msg .text { color: #ccc; white-space: pre-wrap; } | |
| .msg.ai .text { color: #e8d5b7; } | |
| .msg.system { color: #555; font-size: 12px; } | |
| #input-area { padding: 12px 20px; border-top: 1px solid #222; display: flex; gap: 10px; } | |
| #input { flex: 1; background: #111; border: 1px solid #333; color: #e0e0e0; padding: 10px; font-family: inherit; font-size: 14px; border-radius: 4px; outline: none; } | |
| #input:focus { border-color: #4a9eff; } | |
| #send { background: #222; color: #e0e0e0; border: 1px solid #333; padding: 10px 20px; cursor: pointer; font-family: inherit; border-radius: 4px; } | |
| #send:hover { background: #333; } | |
| #send:disabled { opacity: 0.3; cursor: default; } | |
| #load-area { padding: 20px; text-align: center; } | |
| #load-area button { background: #1a1a2e; color: #4a9eff; border: 1px solid #4a9eff; padding: 12px 24px; cursor: pointer; font-family: inherit; font-size: 14px; border-radius: 4px; margin: 5px; } | |
| #load-area button:hover { background: #4a9eff; color: #0a0a0a; } | |
| #status { color: #555; margin-top: 10px; font-size: 12px; } | |
| </style> | |
| </head> | |
| <body> | |
| <div id="header"> | |
| <span class="title">nanodurov</span> — <span>Arianna voice | 15.7M params | BPE 2048 | notorch</span> | |
| <span id="gpu-badge" style="float:right"></span> | |
| </div> | |
| <div id="load-area"> | |
| <p style="color:#888; margin-bottom:15px;">load model weights + BPE merges to start chatting</p> | |
| <button onclick="loadFiles()">Load weights + merges</button> | |
| <input type="file" id="file-weights" accept=".bin" style="display:none" multiple> | |
| <div id="status"></div> | |
| </div> | |
| <div id="chat" style="display:none"></div> | |
| <div id="input-area" style="display:none"> | |
| <input id="input" placeholder="talk to Arianna..." onkeydown="if(event.key==='Enter')generate()"> | |
| <button id="send" onclick="generate()">send</button> | |
| </div> | |
| <script> | |
| // ═══════════════════════════════════════════════════════════════ | |
| // CONFIG | |
| // ═══════════════════════════════════════════════════════════════ | |
| const DIM = 384, NLAYERS = 8, NHEADS = 8, HD = 48, HIDDEN = 1024, CTX = 256, VOCAB = 2048; | |
| // ═══════════════════════════════════════════════════════════════ | |
| // BPE TOKENIZER | |
| // ═══════════════════════════════════════════════════════════════ | |
| let bpeMerges = []; | |
| let bpeTokens = []; // decode table: id → Uint8Array | |
// Rebuild bpeTokens (token id -> bytes) from bpeMerges.
// Ids 0..255 are the raw bytes; merge number m gets id 256 + m and decodes to
// the concatenation of the byte strings of its two parts.
function bpeBuildDecodeTable() {
  bpeTokens = new Array(256 + bpeMerges.length);
  for (let byte = 0; byte < 256; byte++) {
    bpeTokens[byte] = Uint8Array.of(byte);
  }
  bpeMerges.forEach(([left, right], rank) => {
    const head = bpeTokens[left];
    const tail = bpeTokens[right];
    const joined = new Uint8Array(head.length + tail.length);
    joined.set(head, 0);
    joined.set(tail, head.length);
    bpeTokens[256 + rank] = joined;
  });
}
// Encode text into BPE token ids.
// Starts from the UTF-8 bytes and applies every merge in rank order; merge m
// replaces each adjacent pair (a, b) with id 256 + m, scanning left to right.
//
// Each merge pass compacts the array in place with separate read and write
// cursors, so a pass is linear in the sequence length. The previous version
// called splice() for every hit, which is quadratic on long inputs. For well
// formed merges (a and b smaller than the new id) the output is unchanged.
function bpeEncode(text) {
  const ids = Array.from(new TextEncoder().encode(text));
  for (let m = 0; m < bpeMerges.length && ids.length > 1; m++) {
    const [a, b] = bpeMerges[m];
    const newId = 256 + m;
    let write = 0;
    let read = 0;
    while (read < ids.length) {
      if (read + 1 < ids.length && ids[read] === a && ids[read + 1] === b) {
        ids[write++] = newId;
        read += 2;
      } else {
        ids[write++] = ids[read++];
      }
    }
    ids.length = write; // drop the tail left over after compaction
  }
  return ids;
}
// Decode token ids to a string: concatenate each id's bytes from bpeTokens
// and interpret the result as UTF-8. Ids outside the table are skipped.
function bpeDecode(ids) {
  const collected = [];
  for (const id of ids) {
    const known = id >= 0 && id < bpeTokens.length;
    if (!known) continue;
    for (const byte of bpeTokens[id]) collected.push(byte);
  }
  const raw = new Uint8Array(collected);
  return new TextDecoder().decode(raw);
}
| // ═══════════════════════════════════════════════════════════════ | |
| // MODEL WEIGHTS | |
| // ═══════════════════════════════════════════════════════════════ | |
| let W = null; // { wte, layers: [{rms1, wq, wk, wv, wo, rms2, w_gate, w_up, w_down}], rms_f, head } | |
// Parse an FP32 notorch checkpoint from an ArrayBuffer and install it as W.
//
// Format: magic "NTOR" (4B) + n_tensors (4B) + per tensor: ndim (4B),
// ndim dimension sizes (4B each), then the float32 data. All little-endian.
// Tensors are read in the order wte, per-layer tensors, rms_f, head.
//
// Throws an Error on a bad magic, wrong tensor count, or truncated/invalid
// tensor. W is assigned only after every tensor has been read: previously it
// was set first, so a failed load left a half-built model that still passed
// the `!W` check in generate().
function loadWeightsFromBuffer(buf) {
  if (buf.byteLength < 8) throw new Error('file too small to be a notorch checkpoint');
  const view = new DataView(buf);
  let off = 0;

  const magic = view.getUint32(off, true); off += 4;
  if (magic !== 0x524F544E) throw new Error('bad magic: ' + magic.toString(16)); // "NTOR" LE
  const nTensors = view.getInt32(off, true); off += 4;

  const expected = 1 + NLAYERS * 9 + 2;
  if (nTensors !== expected) throw new Error(`expected ${expected} tensors, got ${nTensors}`);

  function readTensor() {
    if (off + 4 > buf.byteLength) throw new Error('truncated weights file');
    const ndim = view.getInt32(off, true); off += 4;
    if (ndim < 0 || off + ndim * 4 > buf.byteLength) throw new Error('truncated weights file');
    let len = 1;
    const shape = [];
    for (let d = 0; d < ndim; d++) {
      const s = view.getInt32(off, true); off += 4;
      if (s < 0) throw new Error('negative tensor dimension: ' + s);
      shape.push(s); len *= s;
    }
    if (len * 4 > buf.byteLength - off) throw new Error('truncated weights file');
    // Copy out of the file buffer so the tensor does not keep it alive.
    const data = new Float32Array(new Float32Array(buf, off, len));
    off += len * 4;
    return { data, shape, len };
  }

  const model = { layers: [] };
  model.wte = readTensor();
  for (let l = 0; l < NLAYERS; l++) {
    // Property order matters: it is the order the tensors are stored in.
    model.layers.push({
      rms1: readTensor(), wq: readTensor(), wk: readTensor(),
      wv: readTensor(), wo: readTensor(), rms2: readTensor(),
      w_gate: readTensor(), w_up: readTensor(), w_down: readTensor()
    });
  }
  model.rms_f = readTensor();
  model.head = readTensor();

  W = model;
}
| // ═══════════════════════════════════════════════════════════════ | |
| // INFERENCE (JS, KV cache) | |
| // ═══════════════════════════════════════════════════════════════ | |
| let kvK = null, kvV = null; // [NLAYERS][CTX][DIM] | |
// Allocate fresh, zero-filled key and value caches: one DIM-long
// Float32Array per (layer, position).
function initKV() {
  const makeCache = () => {
    const layers = [];
    for (let l = 0; l < NLAYERS; l++) {
      const rows = [];
      for (let p = 0; p < CTX; p++) rows.push(new Float32Array(DIM));
      layers.push(rows);
    }
    return layers;
  };
  kvK = makeCache();
  kvV = makeCache();
}
// RMS normalisation over DIM elements:
// out[i] = x[i] * w[i] / sqrt(mean(x^2) + 1e-5).
function rmsnorm(out, x, w) {
  let sumSq = 0;
  for (let i = 0; i < DIM; i++) {
    sumSq += x[i] * x[i];
  }
  const invRms = 1.0 / Math.sqrt(sumSq / DIM + 1e-5);
  for (let i = 0; i < DIM; i++) {
    out[i] = x[i] * invRms * w[i];
  }
}
// Matrix-vector product out = W * x, with w stored row-major as
// outD rows of inD columns.
function matmul(out, x, w, outD, inD) {
  let rowStart = 0;
  for (let row = 0; row < outD; row++, rowStart += inD) {
    let acc = 0;
    for (let col = 0; col < inD; col++) {
      acc += w[rowStart + col] * x[col];
    }
    out[row] = acc;
  }
}
// Rotary position embedding, applied in place to a DIM-long vector laid out
// as NHEADS heads of HD elements. Pair (2i, 2i+1) of every head is rotated by
// the angle pos / 10000^(2i / HD).
//
// The angle does not depend on the head, so pow/cos/sin are evaluated once
// per pair and reused across heads (the original recomputed them per head).
// Results are unchanged.
function rope(x, pos) {
  for (let i = 0; i < HD / 2; i++) {
    const freq = 1.0 / Math.pow(10000.0, (2 * i) / HD);
    const theta = pos * freq;
    const cs = Math.cos(theta), sn = Math.sin(theta);
    for (let h = 0; h < NHEADS; h++) {
      const idx = h * HD + i * 2;
      const x0 = x[idx], x1 = x[idx + 1];
      x[idx] = x0 * cs - x1 * sn;
      x[idx + 1] = x0 * sn + x1 * cs;
    }
  }
}
// Run the transformer for one token at sequence position `pos` and return a
// new Float32Array of VOCAB logits.
// Side effect: writes this position's key/value vectors into kvK/kvV; the
// attention step reads cache positions 0..pos, so positions must be fed in
// increasing order after initKV(). No bounds checks are made on `token` or `pos`.
function forwardPos(token, pos) {
  const x = new Float32Array(DIM);
  const xn = new Float32Array(DIM);
  const q = new Float32Array(DIM), k = new Float32Array(DIM), v = new Float32Array(DIM);
  const attnOut = new Float32Array(DIM);
  const gate = new Float32Array(HIDDEN), up = new Float32Array(HIDDEN), down = new Float32Array(DIM);
  const proj = new Float32Array(DIM);
  const logits = new Float32Array(VOCAB);
  // Token embedding
  x.set(W.wte.data.subarray(token * DIM, token * DIM + DIM));
  for (let l = 0; l < NLAYERS; l++) {
    const L = W.layers[l];
    // Attention block: norm, Q/K/V projections, rotary embedding on Q and K.
    rmsnorm(xn, x, L.rms1.data);
    matmul(q, xn, L.wq.data, DIM, DIM);
    matmul(k, xn, L.wk.data, DIM, DIM);
    matmul(v, xn, L.wv.data, DIM, DIM);
    rope(q, pos); rope(k, pos);
    // Cache this position's key and value for later tokens.
    kvK[l][pos].set(k); kvV[l][pos].set(v);
    // MHA
    const scale = 1.0 / Math.sqrt(HD);
    attnOut.fill(0);
    for (let h = 0; h < NHEADS; h++) {
      const ho = h * HD; // offset of this head inside the DIM-long vectors
      const scores = new Float32Array(pos + 1);
      for (let j = 0; j <= pos; j++) {
        let dot = 0;
        for (let d = 0; d < HD; d++) dot += q[ho + d] * kvK[l][j][ho + d];
        scores[j] = dot * scale;
      }
      // softmax
      let mx = scores[0]; for (let j = 1; j <= pos; j++) if (scores[j] > mx) mx = scores[j];
      let sm = 0; for (let j = 0; j <= pos; j++) { scores[j] = Math.exp(scores[j] - mx); sm += scores[j]; }
      for (let j = 0; j <= pos; j++) scores[j] /= sm;
      // Attention-weighted sum of the cached values.
      for (let j = 0; j <= pos; j++)
        for (let d = 0; d < HD; d++) attnOut[ho + d] += scores[j] * kvV[l][j][ho + d];
    }
    // Output projection + residual
    matmul(proj, attnOut, L.wo.data, DIM, DIM);
    for (let i = 0; i < DIM; i++) x[i] += proj[i];
    // Feed-forward block: norm, SiLU(gate) * up, down projection + residual.
    rmsnorm(xn, x, L.rms2.data);
    matmul(gate, xn, L.w_gate.data, HIDDEN, DIM);
    matmul(up, xn, L.w_up.data, HIDDEN, DIM);
    for (let i = 0; i < HIDDEN; i++) gate[i] = gate[i] / (1 + Math.exp(-gate[i])) * up[i];
    matmul(down, gate, L.w_down.data, DIM, HIDDEN);
    for (let i = 0; i < DIM; i++) x[i] += down[i];
  }
  // Final norm, then projection onto the vocabulary.
  rmsnorm(xn, x, W.rms_f.data);
  matmul(logits, xn, W.head.data, VOCAB, DIM);
  return logits;
}
// Draw a token id from `logits` (VOCAB entries, modified in place).
//   temp  divides the logits before sampling. A value that is not strictly
//         positive (0, negative, NaN) now picks the highest logit directly;
//         it used to divide by zero and produce NaN probabilities.
//   topK  when in (0, VOCAB), only logits at least as large as the k-th
//         largest stay eligible (ties at the threshold are kept).
// Returns an index in [0, VOCAB).
function sampleToken(logits, temp = 0.8, topK = 40) {
  if (!(temp > 0)) {
    // Greedy decoding: first index holding the largest logit.
    let best = 0;
    for (let i = 1; i < VOCAB; i++) if (logits[i] > logits[best]) best = i;
    return best;
  }
  for (let i = 0; i < VOCAB; i++) logits[i] /= temp;
  // top-k
  if (topK > 0 && topK < VOCAB) {
    const sorted = Array.from(logits).sort((a, b) => b - a);
    const threshold = sorted[topK - 1];
    for (let i = 0; i < VOCAB; i++) if (logits[i] < threshold) logits[i] = -1e30;
  }
  // softmax (max-subtracted)
  let mx = logits[0]; for (let i = 1; i < VOCAB; i++) if (logits[i] > mx) mx = logits[i];
  let sm = 0; for (let i = 0; i < VOCAB; i++) { logits[i] = Math.exp(logits[i] - mx); sm += logits[i]; }
  for (let i = 0; i < VOCAB; i++) logits[i] /= sm;
  // inverse-CDF sampling
  const r = Math.random();
  let cum = 0;
  for (let i = 0; i < VOCAB; i++) { cum += logits[i]; if (cum >= r) return i; }
  return VOCAB - 1; // rounding left cum just below r
}
| // ═══════════════════════════════════════════════════════════════ | |
| // UI | |
| // ═══════════════════════════════════════════════════════════════ | |
| const chat = document.getElementById('chat'); | |
| const input = document.getElementById('input'); | |
// Append a chat message to the log and scroll it into view.
//   who   label shown above the message
//   text  message body
//   cls   extra CSS class for the wrapper ('user', 'ai', 'system')
// Returns the wrapper <div>, which contains a `.who` and a `.text` child.
//
// Both strings are inserted with textContent. The previous version
// interpolated them into innerHTML, so anything the user typed was parsed as
// markup.
function addMsg(who, text, cls) {
  const div = document.createElement('div');
  div.className = 'msg ' + cls;

  const whoEl = document.createElement('div');
  whoEl.className = 'who';
  whoEl.textContent = who;

  const textEl = document.createElement('div');
  textEl.className = 'text';
  textEl.textContent = text;

  div.appendChild(whoEl);
  div.appendChild(textEl);
  chat.appendChild(div);
  chat.scrollTop = chat.scrollHeight;
  return div;
}
// Show a status line in the load area.
function setStatus(text) {
  const statusEl = document.getElementById('status');
  statusEl.textContent = text;
}
// Open the file picker and, once a .bin (weights) and a .txt (merges) file
// are chosen, load both and switch the page from the load screen to the chat.
//
// Changes from the previous version:
//  - merge lines that do not start with two integers (blank lines, stray
//    text) are skipped instead of becoming NaN merges that crash the decode
//    table build;
//  - any failure while loading merges or weights is reported in the status
//    line instead of surfacing as an uncaught rejection;
//  - a failing WebGPU probe no longer blocks the chat from opening;
//  - the picker is reset first so choosing the same files again fires `change`.
async function loadFiles() {
  setStatus('select nanodurov_arianna.bin and arianna_bpe_merges.txt...');
  const fileInput = document.getElementById('file-weights');
  fileInput.onchange = async () => {
    const files = Array.from(fileInput.files);
    const binFile = files.find(f => f.name.endsWith('.bin'));
    const txtFile = files.find(f => f.name.endsWith('.txt'));
    if (!binFile || !txtFile) { setStatus('need .bin (weights) and .txt (merges)'); return; }
    try {
      // Load merges: the first two whitespace-separated integers of each line.
      setStatus('loading BPE merges...');
      const mergesText = await txtFile.text();
      bpeMerges = mergesText.trim().split('\n')
        .map(line => line.trim().split(/\s+/).map(Number))
        .filter(([a, b]) => Number.isInteger(a) && Number.isInteger(b))
        .map(([a, b]) => [a, b]);
      if (bpeMerges.length === 0) throw new Error('no merges found in ' + txtFile.name);
      bpeBuildDecodeTable();
      setStatus(`BPE: ${bpeMerges.length} merges, vocab ${256 + bpeMerges.length}`);
      // Load weights
      setStatus('loading weights (60 MB)...');
      const buf = await binFile.arrayBuffer();
      loadWeightsFromBuffer(buf);
    } catch (e) {
      setStatus('ERROR: ' + e.message); return;
    }
    initKV();
    // Check WebGPU (informational only; inference always runs in JS)
    let gpuText = 'CPU (JS)';
    if (navigator.gpu) {
      try {
        const adapter = await navigator.gpu.requestAdapter();
        if (adapter) gpuText = 'WebGPU available (using JS fallback for now)';
      } catch (e) {
        // Probe failed; keep the CPU label.
      }
    }
    document.getElementById('gpu-badge').textContent = gpuText;
    // Show chat
    document.getElementById('load-area').style.display = 'none';
    chat.style.display = 'block';
    document.getElementById('input-area').style.display = 'flex';
    addMsg('system', 'nanodurov loaded. 15.7M params. Arianna voice. type anything.', 'system');
    input.focus();
  };
  fileInput.value = '';
  fileInput.click();
}
// Send the text in the input box to the model and stream the reply into the
// chat log.
//
// The prompt is "Q: {text}\nA:". Generation stops when the context is full or
// a sampled token decodes to text containing "\nQ" or "\n\n".
//
// Changes from the previous version:
//  - the loop runs until pos reaches CTX. The old bound `s < CTX - pos` was
//    re-evaluated while pos grew, so only about half of the remaining context
//    was ever generated;
//  - the prompt is capped at CTX / 2 tokens, so a long message can no longer
//    index past the KV cache;
//  - a second call while a reply is in progress (Enter key) is ignored,
//    because both runs would share the KV cache;
//  - the reply text is decoded from all generated ids together, so a
//    multi-byte UTF-8 character split across tokens renders correctly;
//  - the send button is re-enabled even if generation throws.
async function generate() {
  const sendBtn = document.getElementById('send');
  const text = input.value.trim();
  if (!text || !W || sendBtn.disabled) return;
  input.value = '';
  sendBtn.disabled = true;
  addMsg('You', text, 'user');
  const aiMsg = addMsg('Arianna', '', 'ai');
  const textEl = aiMsg.querySelector('.text');
  try {
    // Encode prompt, leaving at least half the context for the reply
    const prompt = `Q: ${text}\nA:`;
    const tokens = bpeEncode(prompt).slice(0, CTX / 2);
    // Reset KV cache
    initKV();
    // Prefill
    let logits;
    for (let i = 0; i < tokens.length; i++) {
      logits = forwardPos(tokens[i], i);
      // yield to UI every 2 tokens
      if (i % 2 === 0) await new Promise(r => setTimeout(r, 0));
    }
    // Generate
    let pos = tokens.length;
    const outIds = [];
    let output = '';
    for (let s = 0; pos < CTX; s++) {
      const next = sampleToken(logits, 0.8, 40);
      const piece = bpeDecode([next]);
      if (piece.includes('\nQ') || piece.includes('\n\n')) break;
      outIds.push(next);
      output = bpeDecode(outIds);
      textEl.textContent = output;
      chat.scrollTop = chat.scrollHeight;
      logits = forwardPos(next, pos);
      pos++;
      // yield to UI every 3 tokens
      if (s % 3 === 0) await new Promise(r => setTimeout(r, 0));
    }
    if (!output.trim()) textEl.textContent = '(silence)';
  } catch (e) {
    textEl.textContent = '(error: ' + e.message + ')';
  } finally {
    sendBtn.disabled = false;
    input.focus();
  }
}
| </script> | |
| </body> | |
| </html> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| nanodurov.py — a telegram client that trains a language model on chat messages. | |
| one file. telethon + pytorch. connect to a group, watch bots and humans talk, | |
| learn their patterns, generate text in their style. the chat is the corpus. | |
| the model grows with the conversation. | |
| inspired by karpathy's microGPT. dedicated to Pavel Durov, who built the | |
| platform where bots can't see each other but we're training on them anyway. | |
| usage: | |
| pip install telethon torch | |
| python nanodurov.py # interactive mode | |
| python nanodurov.py --generate "hello" # generate from prompt | |
| python nanodurov.py --train-only chat.txt # train on exported chat | |
| env vars: | |
| TELEGRAM_API_ID — from my.telegram.org | |
| TELEGRAM_API_HASH — from my.telegram.org | |
| """ | |
| import os | |
| import sys | |
| import math | |
| import time | |
| import struct | |
| import hashlib | |
| import asyncio | |
| import argparse | |
| from collections import defaultdict | |
| # --- optional imports (graceful degradation) ------------------------------------ | |
| try: | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| TORCH_AVAILABLE = True | |
| except ImportError: | |
| TORCH_AVAILABLE = False | |
| try: | |
| from telethon import TelegramClient, events | |
| from telethon.tl.types import User, Channel, Chat | |
| TELETHON_AVAILABLE = True | |
| except ImportError: | |
| TELETHON_AVAILABLE = False | |
| # --- hyperparameters ----------------------------------------------------------- | |
| # the model grows with the corpus. more data → bigger model. | |
| # like a tree. not like a corporation. | |
| GROWTH_STAGES = [ | |
| # (min_corpus_kb, dim, n_heads, n_layers, ctx_len, max_merges, name) | |
| (0, 32, 2, 1, 64, 128, 'seed'), | |
| (5, 48, 3, 2, 64, 256, 'sprout'), | |
| (20, 64, 4, 3, 128, 512, 'sapling'), | |
| (50, 96, 4, 4, 128, 768, 'tree'), | |
| (100, 128, 4, 6, 256, 1024, 'oak'), | |
| (250, 192, 6, 8, 256, 1536, 'forest'), | |
| (500, 256, 8, 10, 512, 2048, 'ancient'), | |
| ] | |
| BATCH_SIZE = 4 | |
| LR = 3e-4 | |
| WEIGHT_DECAY = 0.01 | |
| TRAIN_STEPS_PER_ROUND = 50 | |
| AUTO_TRAIN_INTERVAL = 60 # seconds between auto-train rounds | |
def get_stage(corpus_bytes):
    """Return the growth stage for a corpus of ``corpus_bytes`` bytes.

    Every stage whose minimum size in KB (first tuple element) is at most the
    corpus size is eligible; the one listed last in GROWTH_STAGES wins. When
    none is eligible, the first stage is returned.
    """
    size_kb = corpus_bytes / 1024
    eligible = [candidate for candidate in GROWTH_STAGES if size_kb >= candidate[0]]
    if not eligible:
        return GROWTH_STAGES[0]
    return eligible[-1]
| # --- BPE tokenizer (KARL, from nanoagi) ---------------------------------------- | |
| # the tokenizer that eats your chat and asks for seconds. | |
class BPE:
    """Byte-level byte-pair encoding: learns merges from text, encodes, decodes.

    Append-only — vocab grows, never shrinks. Like regret.

    Token ids 0..255 are raw bytes; each learned merge ``(a, b)`` gets the
    next free id starting at 256. The object also owns the raw training
    corpus and the digests used to de-duplicate ingested text.

    File layout written by :meth:`save` (integers are little-endian uint32)::

        b'BPE1'
        n_merges, then n_merges * (a, b, new_id)
        corpus_len, then the corpus bytes
        n_hashes, then n_hashes * 8-byte digests      (optional trailer)

    The trailer is optional when reading, so files written without it still
    load; a reader that stops after the corpus simply ignores it.
    """

    _MAGIC = b'BPE1'
    _HASH_BYTES = 8  # 16 hex chars of sha256 == 8 raw bytes

    def __init__(self, max_merges=256):
        self.max_merges = max_merges
        self.merges = []  # [(a, b, new_id), ...]
        self.vocab_size = 256
        self.vocab = {i: bytes([i]) for i in range(256)}
        self.seen_hashes = set()
        self.corpus = b""

    def _count_pairs(self, ids):
        """Count occurrences of every adjacent id pair in ``ids``."""
        counts = defaultdict(int)
        for pair in zip(ids, ids[1:]):
            counts[pair] += 1
        return counts

    def _merge(self, ids, a, b, new_id):
        """Return a copy of ``ids`` with each non-overlapping ``a, b`` replaced by ``new_id``."""
        out = []
        i = 0
        n = len(ids)
        while i < n:
            if i + 1 < n and ids[i] == a and ids[i + 1] == b:
                out.append(new_id)
                i += 2
            else:
                out.append(ids[i])
                i += 1
        return out

    def _grow(self, ids, budget, min_count):
        """Greedily add up to ``budget`` merges whose pair count is >= ``min_count``.

        Shared by learn() and retokenize(). Never exceeds ``max_merges`` in
        total. Returns ``(ids_after_merging, merges_added)`` and refreshes
        ``vocab_size``.
        """
        budget = min(budget, self.max_merges - len(self.merges))
        added = 0
        while added < budget:
            counts = self._count_pairs(ids)
            if not counts:
                break
            best = max(counts, key=counts.get)
            if counts[best] < min_count:
                break
            new_id = 256 + len(self.merges)
            ids = self._merge(ids, best[0], best[1], new_id)
            self.merges.append((best[0], best[1], new_id))
            self.vocab[new_id] = self.vocab.get(best[0], b'?') + self.vocab.get(best[1], b'?')
            added += 1
        self.vocab_size = 256 + len(self.merges)
        return ids, added

    def learn(self, data, num_merges=None):
        """Learn BPE merges from raw bytes (or a str, encoded as UTF-8).

        ``num_merges`` defaults to ``min(max_merges, 256)``; an explicit 0
        now means "learn nothing" instead of silently falling back to the
        default. Returns the token ids of ``data`` after merging.
        """
        if isinstance(data, str):
            data = data.encode('utf-8', errors='replace')
        if num_merges is None:
            num_merges = min(self.max_merges, 256)
        ids, _ = self._grow(list(data), num_merges, min_count=2)
        print(f"[bpe] {len(self.merges)} merges, vocab={self.vocab_size}, tokens={len(ids)}")
        return ids

    def encode(self, text):
        """Encode str/bytes into token ids by replaying merges in learned order."""
        if isinstance(text, str):
            text = text.encode('utf-8', errors='replace')
        ids = list(text)
        for a, b, new_id in self.merges:
            ids = self._merge(ids, a, b, new_id)
        return ids

    def decode(self, ids):
        """Decode token ids to text; unknown ids become ``?``, bad UTF-8 is replaced."""
        # join() instead of repeated ``bytes +=`` keeps this linear in output size.
        raw = b''.join(self.vocab.get(i, b'?') for i in ids)
        return raw.decode('utf-8', errors='replace')

    def ingest(self, text):
        """Append ``text`` to the corpus unless it is a duplicate or low quality.

        Rejected (returns False): fewer than 15 bytes, already-seen digest,
        a single URL with no spaces, less than 30% letters/whitespace, or one
        character making up more than half of the text. Returns True when the
        text was appended.
        """
        if isinstance(text, str):
            raw = text
            text = text.encode('utf-8', errors='replace')
        else:
            raw = text.decode('utf-8', errors='replace')
        if len(text) < 15:
            return False
        # dedup
        h = hashlib.sha256(text).hexdigest()[:2 * self._HASH_BYTES]
        if h in self.seen_hashes:
            return False
        # quality filters
        stripped = raw.strip()
        # skip pure URLs
        if stripped.startswith(('http://', 'https://')) and ' ' not in stripped:
            return False
        # skip if >70% non-alpha (stickers, emoji floods, binary)
        alpha = sum(1 for c in stripped if c.isalpha() or c.isspace())
        if len(stripped) > 0 and alpha / len(stripped) < 0.3:
            return False
        # skip too repetitive (same char >50%)
        if len(stripped) > 5:
            top_count = max(stripped.count(ch) for ch in set(stripped))
            if top_count / len(stripped) > 0.5:
                return False
        self.seen_hashes.add(h)
        self.corpus += text
        return True

    def retokenize(self, max_new=64):
        """Grow vocab with up to ``max_new`` new merges from the accumulated corpus.

        Re-encodes the whole corpus with the existing merges first, then adds
        merges for pairs seen at least 3 times. Returns the corpus token ids.
        """
        ids = self.encode(self.corpus)
        ids, found = self._grow(ids, max_new, min_count=3)
        if found:
            print(f"[bpe] +{found} merges (vocab={self.vocab_size})")
        return ids

    def save(self, path):
        """Write merges, corpus and dedup digests to ``path`` (layout in class docstring)."""
        with open(path, 'wb') as f:
            f.write(self._MAGIC)
            f.write(struct.pack('<I', len(self.merges)))
            for a, b, nid in self.merges:
                f.write(struct.pack('<III', a, b, nid))
            f.write(struct.pack('<I', len(self.corpus)))
            f.write(self.corpus)
            # Trailer: persist dedup digests so a restart does not re-ingest
            # the same messages into the corpus all over again.
            digests = sorted(self.seen_hashes)
            f.write(struct.pack('<I', len(digests)))
            for h in digests:
                f.write(bytes.fromhex(h))
        print(f"[bpe] saved to {path}")

    @staticmethod
    def _read_exact(f, n):
        """Read exactly ``n`` bytes from ``f`` or raise EOFError on a short read."""
        chunk = f.read(n)
        if len(chunk) != n:
            raise EOFError(f"expected {n} bytes, got {len(chunk)}")
        return chunk

    def load(self, path):
        """Load state written by save(). Returns True on success.

        Returns False — leaving the current state untouched — when the file
        is missing, has the wrong magic, is truncated, or cannot be read.
        """
        if not os.path.exists(path):
            return False
        try:
            with open(path, 'rb') as f:
                if f.read(4) != self._MAGIC:
                    return False
                (n,) = struct.unpack('<I', self._read_exact(f, 4))
                merges = []
                # Rebuild the vocab from the byte alphabet so entries left
                # over from an earlier learn()/load() cannot leak in.
                vocab = {i: bytes([i]) for i in range(256)}
                for _ in range(n):
                    a, b, nid = struct.unpack('<III', self._read_exact(f, 12))
                    merges.append((a, b, nid))
                    vocab[nid] = vocab.get(a, bytes([a % 256])) + vocab.get(b, bytes([b % 256]))
                (corpus_len,) = struct.unpack('<I', self._read_exact(f, 4))
                corpus = self._read_exact(f, corpus_len)
                # Optional trailer with dedup digests.
                hashes = None
                head = f.read(4)
                if len(head) == 4:
                    (n_hashes,) = struct.unpack('<I', head)
                    blob = f.read(n_hashes * self._HASH_BYTES)
                    if len(blob) == n_hashes * self._HASH_BYTES:
                        hashes = {
                            blob[i:i + self._HASH_BYTES].hex()
                            for i in range(0, len(blob), self._HASH_BYTES)
                        }
        except (OSError, EOFError, struct.error) as e:
            print(f"[bpe] failed to load {path}: {e}")
            return False
        # Commit only after the whole file parsed cleanly.
        self.merges = merges
        self.vocab = vocab
        self.vocab_size = 256 + len(merges)
        self.corpus = corpus
        if hashes is not None:
            self.seen_hashes = hashes
        print(f"[bpe] loaded: {len(self.merges)} merges, {len(self.corpus)} bytes corpus")
        return True
| # --- transformer model --------------------------------------------------------- | |
| # RMSNorm, RoPE, SwiGLU, causal attention. the microGPT recipe. | |
| # every line here was written by someone who stared at karpathy's code | |
| # for too long and started seeing attention patterns in their dreams. | |
| if TORCH_AVAILABLE: | |
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learned gain ``w``."""

    def __init__(self, dim):
        super().__init__()
        self.w = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # Statistics are taken in float32, then cast back to x's dtype.
        mean_square = x.float().pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + 1e-5).type_as(x)
        return x * inv_rms * self.w
class Attention(nn.Module):
    """Causal multi-head self-attention with rotary position embedding."""

    def __init__(self, dim, n_heads):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = dim // n_heads
        self.wq = nn.Linear(dim, dim, bias=False)
        self.wk = nn.Linear(dim, dim, bias=False)
        self.wv = nn.Linear(dim, dim, bias=False)
        self.wo = nn.Linear(dim, dim, bias=False)

    def forward(self, x, freqs_cos, freqs_sin):
        batch, seq_len, width = x.shape
        n_heads, head_dim = self.n_heads, self.head_dim

        def split_heads(t):
            # [B, T, D] -> [B, H, T, HD]
            return t.view(batch, seq_len, n_heads, head_dim).transpose(1, 2)

        # Rotary embedding is applied to queries and keys only.
        queries = apply_rope(split_heads(self.wq(x)), freqs_cos, freqs_sin)
        keys = apply_rope(split_heads(self.wk(x)), freqs_cos, freqs_sin)
        values = split_heads(self.wv(x))

        scores = (queries @ keys.transpose(-2, -1)) / math.sqrt(head_dim)
        # Strictly-upper-triangular mask hides future positions.
        future = torch.triu(torch.ones(seq_len, seq_len, device=x.device), diagonal=1).bool()
        weights = F.softmax(scores.masked_fill(future, float('-inf')), dim=-1)

        mixed = weights @ values  # [B, H, T, HD]
        merged = mixed.transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.wo(merged)
class MLP(nn.Module):
    """SwiGLU feed-forward: ``w_down(silu(w_gate(x)) * w_up(x))``, no biases."""

    def __init__(self, dim, hidden):
        super().__init__()
        self.w_gate = nn.Linear(dim, hidden, bias=False)
        self.w_up = nn.Linear(dim, hidden, bias=False)
        self.w_down = nn.Linear(hidden, dim, bias=False)

    def forward(self, x):
        gate = F.silu(self.w_gate(x))
        lifted = self.w_up(x)
        return self.w_down(gate * lifted)
class Block(nn.Module):
    """Pre-norm transformer layer: attention and MLP, each wrapped in a residual."""

    def __init__(self, dim, n_heads, hidden):
        super().__init__()
        self.norm1 = RMSNorm(dim)
        self.attn = Attention(dim, n_heads)
        self.norm2 = RMSNorm(dim)
        self.mlp = MLP(dim, hidden)

    def forward(self, x, freqs_cos, freqs_sin):
        attended = self.attn(self.norm1(x), freqs_cos, freqs_sin)
        hidden_state = x + attended
        return hidden_state + self.mlp(self.norm2(hidden_state))
class NanoDurov(nn.Module):
    """Decoder-only transformer: embedding -> n_layers x Block -> RMSNorm -> head.

    The output head shares its weight with the token embedding. RoPE
    cos/sin tables for ``ctx_len`` positions are precomputed and kept as
    buffers named ``freqs_cos`` / ``freqs_sin``.
    """

    def __init__(self, vocab_size, dim, n_heads, n_layers, ctx_len):
        super().__init__()
        self.ctx_len = ctx_len
        self.tok_emb = nn.Embedding(vocab_size, dim)
        self.blocks = nn.ModuleList([
            Block(dim, n_heads, dim * 4) for _ in range(n_layers)
        ])
        self.norm_f = RMSNorm(dim)
        self.head = nn.Linear(dim, vocab_size, bias=False)
        # weight tying
        self.head.weight = self.tok_emb.weight
        # precompute RoPE
        self.register_buffer('freqs_cos', None)
        self.register_buffer('freqs_sin', None)
        self._build_rope(ctx_len, dim // n_heads)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise Linear and Embedding weights from N(0, 0.02)."""
        if isinstance(m, (nn.Linear, nn.Embedding)):
            nn.init.normal_(m.weight, std=0.02)

    def _build_rope(self, max_len, head_dim):
        """Fill the cos/sin buffers with rotary angles for ``max_len`` positions."""
        pos = torch.arange(max_len).unsqueeze(1)           # [T, 1]
        dim_pairs = torch.arange(0, head_dim, 2).float()   # [HD/2]
        freqs = 1.0 / (10000 ** (dim_pairs / head_dim))    # [HD/2]
        angles = pos * freqs                               # [T, HD/2]
        self.freqs_cos = angles.cos()                      # [T, HD/2]
        self.freqs_sin = angles.sin()

    def forward(self, idx, targets=None):
        """Return ``(logits, loss)``; ``loss`` is None unless ``targets`` is given.

        Raises:
            ValueError: if the sequence is longer than ``ctx_len`` (the RoPE
                tables only cover ``ctx_len`` positions).
        """
        B, T = idx.shape
        if T > self.ctx_len:
            raise ValueError(f"sequence length {T} exceeds ctx_len {self.ctx_len}")
        x = self.tok_emb(idx)
        fc = self.freqs_cos[:T].unsqueeze(0)  # [1, T, HD/2]
        fs = self.freqs_sin[:T].unsqueeze(0)
        for block in self.blocks:
            x = block(x, fc, fs)
        x = self.norm_f(x)
        logits = self.head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    def generate(self, idx, max_new=100, temperature=0.8, top_k=40):
        """Autoregressively extend ``idx`` ([B, T] token ids) by up to ``max_new`` tokens.

        ``temperature <= 0`` selects greedy decoding instead of dividing by
        zero. ``top_k`` is clamped to the vocabulary size; ``top_k <= 0``
        disables the filter. For a single sequence, generation stops early
        once token 10 (the newline byte) is produced and the sequence is
        longer than 10 tokens; batches always run all ``max_new`` steps.
        """
        for _ in range(max_new):
            ctx = idx[:, -self.ctx_len:]
            logits, _ = self(ctx)
            logits = logits[:, -1, :]
            if temperature <= 0:
                next_id = torch.argmax(logits, dim=-1, keepdim=True)
            else:
                logits = logits / temperature
                if top_k > 0:
                    # torch.topk raises when k exceeds the last dimension.
                    k = min(top_k, logits.size(-1))
                    v, _ = torch.topk(logits, k)
                    logits[logits < v[:, [-1]]] = float('-inf')
                probs = F.softmax(logits, dim=-1)
                next_id = torch.multinomial(probs, 1)
            idx = torch.cat([idx, next_id], dim=1)
            # stop on newline after some output (.item() needs exactly one element)
            if idx.shape[0] == 1 and idx.shape[1] > 10 and next_id.item() == 10:
                break
        return idx
def apply_rope(x, cos, sin):
    """Rotate the two halves of the last axis of ``x`` by the given angles.

    x is [B, H, T, HD]; cos and sin are [1, T, HD/2] and gain a head axis
    here so they broadcast over H.
    """
    half = x.shape[-1] // 2
    lower, upper = x[..., :half], x[..., half:]
    cos_b = cos.unsqueeze(1)  # [1, 1, T, HD/2]
    sin_b = sin.unsqueeze(1)
    rotated_lower = lower * cos_b - upper * sin_b
    rotated_upper = lower * sin_b + upper * cos_b
    return torch.cat([rotated_lower, rotated_upper], dim=-1)
| # --- chuck optimizer ------------------------------------------------------------ | |
| # θ -= (α × S × λ_Ψ × λ_l × σ) × m̂/(√v̂ + ε) + η | |
| # Adam is blind. Chuck sees. Chuck remembers. | |
| # In memory of Carlos Ray "Chuck" Norris (1940–2026). | |
| # | |
| # Compact version: Levels 1 (loss trend), 2 (grad trend), 9 (macro patience). | |
| # Full version: github.com/ariannamethod/chuck | |
| if TORCH_AVAILABLE: | |
class Chuck(torch.optim.Optimizer):
    """Self-aware optimizer. Drop-in AdamW replacement with dampen/boost.

    When loss is falling → boost (dampen > 1). When rising → brake (dampen < 1).
    When stagnating → inject noise. Macro patience drops LR on plateaus.

    The loss is supplied per step, either through ``step(loss=...)`` or by a
    closure. Steps without a loss skip the trend logic and perform a plain
    Adam update with the current dampen / lr_scale.

    Args:
        params, lr, betas, eps, weight_decay: as for AdamW (decay is decoupled).
        window: length of the loss-EMA ring buffer used for the trend (>= 1).
        macro_int: a macro-patience check runs every ``macro_int`` steps.
        macro_pat: stagnant checks tolerated before ``lr_scale`` is cut.
        macro_decay: factor applied to ``lr_scale`` on a cut (floor 0.05).
        verbose: print state every ``verbose`` steps; 0 disables.
    """

    def __init__(self, params, lr=3e-4, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0.01, window=16, macro_int=500, macro_pat=3,
                 macro_decay=0.5, verbose=0):
        if window < 1:
            raise ValueError(f"window must be >= 1, got {window}")
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        super().__init__(params, defaults)
        self.window = window
        self.macro_int = macro_int
        self.macro_pat = macro_pat
        self.macro_decay = macro_decay
        self.verbose = verbose
        # Chuck's soul
        self.dampen = 1.0
        self.noise = 0.0
        self.loss_ema = 0.0
        self.gnorm_ema = 0.0
        self.macro_ema = 0.0
        self.best_macro = 1e9
        self.lr_scale = 1.0
        self.macro_stag = 0
        self.macro_drops = 0
        self.global_step = 0
        # loss ring buffer
        self._hist = [0.0] * window
        self._hpos = 0
        self._hfull = False
        self._stag = 0

    def _observe_loss(self, loss):
        """Update dampen, noise and lr_scale from one scalar loss observation."""
        W = self.window

        # === Level 1: loss trend → dampen/boost ===
        if self.loss_ema == 0.0:
            self.loss_ema = loss
        else:
            self.loss_ema = 0.99 * self.loss_ema + 0.01 * loss
        self._hist[self._hpos % W] = self.loss_ema
        self._hpos += 1
        if self._hpos >= W:
            self._hfull = True
        if self._hfull:
            # Compare the newest and oldest quarter of the window. max(1, ..)
            # keeps windows shorter than 4 from dividing by zero.
            q = max(1, W // 4)
            recent = sum(self._hist[(self._hpos - 1 - i) % W] for i in range(q)) / q
            old = sum(self._hist[(self._hpos - W + i) % W] for i in range(q)) / q
            trend = (recent - old) / (old + 1e-8)
            if trend > 0.02:
                self.dampen *= 0.97  # loss rising → brake
            elif trend < -0.02:
                self.dampen *= 1.03  # loss falling → push
            if abs(trend) < 0.001:
                self._stag += 1
                if self._stag > 8:
                    self.noise = 0.001
                    self._stag = 0
            else:
                self._stag = 0
                self.noise *= 0.9
        # mean reversion
        self.dampen = 0.999 * self.dampen + 0.001 * 1.0
        self.dampen = max(0.3, min(2.0, self.dampen))

        # === Level 9: macro patience ===
        if self.macro_ema == 0.0:
            self.macro_ema = loss
        else:
            self.macro_ema = 0.999 * self.macro_ema + 0.001 * loss
        if self.global_step % self.macro_int == 0 and self.global_step > W:
            if self.macro_ema > self.best_macro * 0.999:
                self.macro_stag += 1
                if self.macro_stag >= self.macro_pat:
                    self.lr_scale *= self.macro_decay
                    if self.lr_scale < 0.05:
                        self.lr_scale = 0.05
                    self.macro_stag = 0
                    self.macro_drops += 1
            else:
                self.best_macro = self.macro_ema
                self.macro_stag = 0
                if self.lr_scale < 1.0:
                    self.lr_scale = min(1.0, self.lr_scale * 1.2)

    @torch.no_grad()
    def step(self, closure=None, *, loss=None):
        """Perform one optimization step.

        Args:
            closure: optional callable that re-evaluates the model and
                returns the loss tensor; it runs with gradients enabled.
            loss: optional scalar loss for the trend logic. When omitted and
                a closure is given, the closure's result is used.

        Returns:
            The closure's return value, or None when no closure was passed.
        """
        lv = None
        if closure is not None:
            with torch.enable_grad():
                lv = closure()
            if loss is None:
                loss = lv.item()
        self.global_step += 1

        # All loss-driven levels are skipped when no loss is available, so
        # the EMAs never do arithmetic on None.
        if loss is not None:
            self._observe_loss(loss)

        # === Adam update with Chuck modulation ===
        effective_dampen = self.dampen * self.lr_scale
        for group in self.param_groups:
            lr = group['lr'] * effective_dampen
            beta1, beta2 = group['betas']
            eps = group['eps']
            wd = group['weight_decay']
            for p in group['params']:
                if p.grad is None:
                    continue
                g = p.grad
                state = self.state[p]
                if len(state) == 0:
                    state['step'] = 0
                    state['m'] = torch.zeros_like(p)
                    state['v'] = torch.zeros_like(p)
                state['step'] += 1
                m, v = state['m'], state['v']
                m.mul_(beta1).add_(g, alpha=1 - beta1)
                v.mul_(beta2).addcmul_(g, g, value=1 - beta2)
                bc1 = 1 - beta1 ** state['step']
                bc2 = 1 - beta2 ** state['step']
                m_hat = m / bc1
                v_hat = v / bc2
                # noise injection on stagnation
                if self.noise > 0:
                    m_hat = m_hat + self.noise * torch.randn_like(m_hat)
                # weight decay (decoupled)
                if wd > 0:
                    p.add_(p, alpha=-lr * wd)
                # update
                p.addcdiv_(m_hat, v_hat.sqrt().add_(eps), value=-lr)

        if self.verbose > 0 and self.global_step % self.verbose == 0:
            print(f" chuck: step={self.global_step} λ={self.dampen:.3f} "
                  f"lr_scale={self.lr_scale:.3f} noise={self.noise:.4f} "
                  f"macro_drops={self.macro_drops}")
        return lv
| # --- training loop -------------------------------------------------------------- | |
| # the part where numbers go down and hope goes up. | |
| # or numbers go up and you stare at the ceiling. | |
class Trainer:
    """Owns the model, the optimizer and the tokenized corpus.

    The model is (re)built whenever the corpus size maps to a different
    growth stage; parameters of the previous model are carried over where
    their shapes overlap. The class body itself does not touch ``torch``, so
    it can be defined (and report "no pytorch") when torch is not installed.
    """

    def __init__(self, bpe, device='cpu'):
        self.bpe = bpe
        self.device = device
        self.model = None
        self.optimizer = None
        self.stage_name = None
        self.total_steps = 0
        self.best_loss = float('inf')
        self._token_ids = None

    def _transplant(self, old_state):
        """Copy overlapping parameter regions from ``old_state`` into the new model.

        Buffers (the RoPE cos/sin tables) are skipped: they are derived from
        ctx_len / head_dim, and pasting a slice of the old table into the new
        one would corrupt the rotary angles whenever head_dim changes.
        Returns the number of tensors copied.
        """
        new_state = self.model.state_dict()
        derived = {name for name, _ in self.model.named_buffers()}
        copied = 0
        for key, old_t in old_state.items():
            if key in derived or key not in new_state:
                continue
            new_t = new_state[key]
            if old_t.dim() != new_t.dim():
                continue
            # take min of each dim; for equal shapes this is a full copy
            region = tuple(slice(0, min(o, n)) for o, n in zip(old_t.shape, new_t.shape))
            new_t[region] = old_t[region]
            copied += 1
        self.model.load_state_dict(new_state)
        return copied

    def _ensure_model(self):
        """Create or grow model based on corpus size. Returns False without torch."""
        if not TORCH_AVAILABLE:
            print("[train] no pytorch. install: pip install torch")
            return False
        stage = get_stage(len(self.bpe.corpus))
        _, dim, n_heads, n_layers, ctx_len, max_merges, name = stage
        if self.stage_name == name and self.model is not None:
            return True
        old_name = self.stage_name
        old_state = self.model.state_dict() if self.model is not None else None
        self.bpe.max_merges = max_merges
        vocab_size = 256 + max_merges
        self.model = NanoDurov(vocab_size, dim, n_heads, n_layers, ctx_len)
        self.model.to(self.device)
        # copy weights from old model where shapes overlap
        if old_state:
            copied = self._transplant(old_state)
            print(f"[model] GREW: {old_name} → {name} (copied {copied} tensors)")
        # A new model gets a fresh optimizer: old moments do not match new shapes.
        self.optimizer = Chuck(
            self.model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY,
            verbose=0)
        self.stage_name = name
        n_params = sum(p.numel() for p in self.model.parameters())
        print(f"[model] {name}: {n_params:,} params, dim={dim}, "
              f"layers={n_layers}, heads={n_heads}, ctx={ctx_len}")
        return True

    def _get_batch(self, token_ids, batch_size, ctx_len):
        """Random batch of training windows.

        ``token_ids`` may be a list or a 1-D long tensor. Returns
        ``(x, y)`` of shape [batch_size, ctx_len] with ``y`` shifted by one
        token, or ``(None, None)`` when fewer than ``ctx_len + 1`` tokens exist.
        """
        if not torch.is_tensor(token_ids):
            token_ids = torch.tensor(token_ids, dtype=torch.long)
        n = token_ids.numel()
        if n < ctx_len + 1:
            return None, None
        # Valid starts are 0 .. n - ctx_len - 1; randint's upper bound is exclusive.
        starts = torch.randint(0, n - ctx_len, (batch_size,)).tolist()
        x = torch.stack([token_ids[i:i + ctx_len] for i in starts])
        y = torch.stack([token_ids[i + 1:i + ctx_len + 1] for i in starts])
        return x.to(self.device), y.to(self.device)

    def tokenize(self):
        """Tokenize corpus, learning merges if needed. Returns the ids or None."""
        if not self.bpe.corpus:
            return None
        if not self.bpe.merges:
            ids = self.bpe.learn(self.bpe.corpus)
        else:
            ids = self.bpe.retokenize()
        self._token_ids = ids
        return ids

    def _vocab_limit(self):
        """Largest token id that both the tokenizer and the embedding table accept."""
        return min(self.bpe.vocab_size, self.model.tok_emb.num_embeddings) - 1

    def train(self, steps=None, verbose=True):
        """Train for N steps. Returns average loss.

        ``steps=None`` means TRAIN_STEPS_PER_ROUND; an explicit 0 runs no
        steps. Returns None when nothing was trained (no torch, too few
        tokens, or zero steps) and leaves ``best_loss`` untouched in that case.
        """
        if not self._ensure_model():
            return None
        if steps is None:
            steps = TRAIN_STEPS_PER_ROUND
        ctx_len = self.model.ctx_len
        # tokenize if needed
        if self._token_ids is None:
            self.tokenize()
        n_tokens = len(self._token_ids) if self._token_ids else 0
        if n_tokens < ctx_len + 1:
            if verbose:
                print(f"[train] not enough tokens ({n_tokens})")
            return None
        # clamp token ids to vocab; converting once avoids rebuilding tensors
        # from Python lists on every step
        data = torch.tensor(self._token_ids, dtype=torch.long).clamp_(max=self._vocab_limit())
        self.model.train()
        losses = []
        t0 = time.time()
        for step in range(steps):
            x, y = self._get_batch(data, BATCH_SIZE, ctx_len)
            if x is None:
                break
            _, loss = self.model(x, y)
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            loss_value = loss.item()
            self.optimizer.step(loss=loss_value)
            losses.append(loss_value)
            self.total_steps += 1
            if verbose and (step + 1) % 10 == 0:
                recent = losses[-10:]
                print(f" step {self.total_steps} | loss {sum(recent) / len(recent):.4f}")
        if not losses:
            # Nothing ran: do not record a bogus best_loss of 0.
            return None
        elapsed = time.time() - t0
        avg_loss = sum(losses) / len(losses)
        if avg_loss < self.best_loss:
            self.best_loss = avg_loss
        if verbose:
            print(f"[train] {len(losses)} steps in {elapsed:.1f}s | "
                  f"loss {avg_loss:.4f} | best {self.best_loss:.4f} | "
                  f"stage={self.stage_name}")
        # check growth after training
        self._ensure_model()
        return avg_loss

    def generate(self, prompt, max_new=100, temperature=0.8):
        """Generate text from prompt. Returns a placeholder string when no model exists."""
        if self.model is None:
            return "[no model trained yet]"
        self.model.eval()
        ids = self.bpe.encode(prompt)
        if not ids:
            ids = [0]
        # clamp to vocab
        limit = self._vocab_limit()
        ids = [min(t, limit) for t in ids]
        # no_grad as a context manager rather than a decorator, so defining
        # this class does not require torch to be importable
        with torch.no_grad():
            idx = torch.tensor([ids], dtype=torch.long, device=self.device)
            out = self.model.generate(idx, max_new=max_new, temperature=temperature)
        generated = out[0, len(ids):].tolist()
        return self.bpe.decode(generated)

    def save(self, path):
        """Write model, optimizer and Chuck state to ``path``; no-op without a model."""
        if self.model is None:
            return
        ckpt = {
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'stage': self.stage_name,
            'total_steps': self.total_steps,
            'best_loss': self.best_loss,
        }
        # save Chuck's soul
        if hasattr(self.optimizer, 'dampen'):
            ckpt['chuck'] = {
                'dampen': self.optimizer.dampen,
                'lr_scale': self.optimizer.lr_scale,
                'loss_ema': self.optimizer.loss_ema,
                'macro_ema': self.optimizer.macro_ema,
                'best_macro': self.optimizer.best_macro,
                'macro_drops': self.optimizer.macro_drops,
                'global_step': self.optimizer.global_step,
            }
        torch.save(ckpt, path)
        print(f"[train] saved checkpoint to {path}")

    def load(self, path):
        """Restore a checkpoint written by save(). Returns True on success.

        Returns False when the file is missing, torch is unavailable, or the
        checkpoint cannot be read or does not fit the current model.
        """
        if not os.path.exists(path):
            return False
        if not self._ensure_model():
            return False
        try:
            # NOTE(security): weights_only=False unpickles arbitrary objects.
            # Only load checkpoints this program wrote itself.
            ckpt = torch.load(path, map_location=self.device, weights_only=False)
            self.model.load_state_dict(ckpt['model'], strict=False)
            self.total_steps = ckpt.get('total_steps', 0)
            self.best_loss = ckpt.get('best_loss', float('inf'))
            # restore Chuck's soul
            if 'chuck' in ckpt and hasattr(self.optimizer, 'dampen'):
                cs = ckpt['chuck']
                self.optimizer.dampen = cs.get('dampen', 1.0)
                self.optimizer.lr_scale = cs.get('lr_scale', 1.0)
                self.optimizer.loss_ema = cs.get('loss_ema', 0.0)
                self.optimizer.macro_ema = cs.get('macro_ema', 0.0)
                self.optimizer.best_macro = cs.get('best_macro', 1e9)
                self.optimizer.macro_drops = cs.get('macro_drops', 0)
                self.optimizer.global_step = cs.get('global_step', 0)
            # try to restore optimizer state (may fail after growth)
            try:
                self.optimizer.load_state_dict(ckpt['optimizer'])
            except (ValueError, KeyError):
                pass  # model grew, optimizer state doesn't match — fresh Adam state
            print(f"[train] loaded checkpoint: step={self.total_steps}, loss={self.best_loss:.4f}")
            return True
        except Exception as e:
            print(f"[train] checkpoint load failed (model grew?): {e}")
            return False
| # --- telegram client ----------------------------------------------------------- | |
| # MTProto observer. sees all messages including bot-to-bot. | |
| # does NOT relay. does NOT forward. watches. learns. generates when asked. | |
async def run_telegram(trainer):
    """Interactive Telegram session: watch one chat, ingest it, train, generate.

    Connects with TELEGRAM_API_ID / TELEGRAM_API_HASH, asks for a group,
    ingests the last 500 messages, then runs two loops side by side: a
    periodic auto-trainer and a stdin command loop. Returns once either loop
    ends (``/quit`` or EOF on stdin), cancelling the other.

    Training runs synchronously on the event loop, so incoming messages are
    handled only after a training round finishes.
    """
    if not TELETHON_AVAILABLE:
        print("[telegram] install telethon: pip install telethon")
        return
    try:
        api_id = int(os.environ.get('TELEGRAM_API_ID', 0))
    except ValueError:
        api_id = 0  # non-numeric value: fall through to the hint below
    api_hash = os.environ.get('TELEGRAM_API_HASH', '')
    if not api_id or not api_hash:
        print("[telegram] set TELEGRAM_API_ID and TELEGRAM_API_HASH")
        print("[telegram] get them at https://my.telegram.org/apps")
        return
    client = TelegramClient('nanodurov_session', api_id, api_hash)
    await client.start()
    print("[telegram] connected")

    # choose group
    group_input = input("\nGroup @username or ID: ").strip()
    try:
        entity = await client.get_entity(group_input)
        title = getattr(entity, 'title', group_input)
        print(f"[telegram] watching: {title}\n")
    except Exception as e:
        print(f"[telegram] can't find group: {e}")
        return

    # load history
    print("[telegram] loading history...")
    messages = await client.get_messages(entity, limit=500)
    for msg in reversed(messages):
        if msg.message:
            sender = await msg.get_sender()
            name = _sender_name(sender)
            line = f"[{name}]: {msg.message}"
            trainer.bpe.ingest(line)
    print(f"[telegram] ingested {len(trainer.bpe.corpus)} bytes from history")

    # initial train if we have data
    if len(trainer.bpe.corpus) > 500:
        trainer.tokenize()
        trainer.train(steps=TRAIN_STEPS_PER_ROUND)

    # message handler — observe + ingest
    @client.on(events.NewMessage(chats=entity))
    async def handler(event):
        msg = event.message
        if not msg.message:
            return
        sender = await msg.get_sender()
        name = _sender_name(sender)
        bot = _is_bot(sender)
        tag = " [BOT]" if bot else ""
        ts = msg.date.strftime("%H:%M:%S") if msg.date else "??:??:??"
        print(f"[{ts}] {name}{tag}: {msg.message}")
        # ingest
        line = f"[{name}]: {msg.message}"
        trainer.bpe.ingest(line)

    # auto-train loop — only trains when there's meaningful new data
    last_corpus_size = len(trainer.bpe.corpus)

    async def auto_train():
        nonlocal last_corpus_size
        while True:
            await asyncio.sleep(AUTO_TRAIN_INTERVAL)
            corpus_size = len(trainer.bpe.corpus)
            new_bytes = corpus_size - last_corpus_size
            # only train if at least 1KB of new data since last train
            if corpus_size > 500 and new_bytes > 1024:
                print(f"\n[auto-train] +{new_bytes/1024:.1f}KB new data, training...")
                last_corpus_size = corpus_size
                trainer.tokenize()
                trainer.train(steps=TRAIN_STEPS_PER_ROUND)
                trainer.bpe.save('nanodurov_bpe.bin')
                trainer.save('nanodurov_ckpt.pt')
                print("[auto-train] done. watching...\n")

    # input handler — user can type messages or commands
    async def input_loop():
        loop = asyncio.get_running_loop()
        while True:
            try:
                # input() blocks, so it runs on the default executor thread
                line = await loop.run_in_executor(None, input, "")
            except EOFError:
                break
            line = line.strip()
            if not line:
                continue
            if line == '/quit':
                print("saving...")
                trainer.bpe.save('nanodurov_bpe.bin')
                trainer.save('nanodurov_ckpt.pt')
                await client.disconnect()
                break
            elif line == '/train':
                trainer.tokenize()
                trainer.train(steps=TRAIN_STEPS_PER_ROUND)
            elif line.startswith('/generate') or line.startswith('/ai'):
                prompt = line.split(' ', 1)[1] if ' ' in line else '[User]: '
                text = trainer.generate(prompt)
                print(f" 🧠 {text}")
            elif line == '/status':
                n = sum(p.numel() for p in trainer.model.parameters()) if trainer.model else 0
                print(f" stage={trainer.stage_name} params={n:,} "
                      f"steps={trainer.total_steps} loss={trainer.best_loss:.4f} "
                      f"corpus={len(trainer.bpe.corpus)/1024:.1f}KB "
                      f"vocab={trainer.bpe.vocab_size}")
            elif line == '/save':
                trainer.bpe.save('nanodurov_bpe.bin')
                trainer.save('nanodurov_ckpt.pt')
            elif line == '/history':
                msgs = await client.get_messages(entity, limit=20)
                for m in reversed(msgs):
                    if m.message:
                        s = await m.get_sender()
                        n = _sender_name(s)
                        b = " [BOT]" if _is_bot(s) else ""
                        ts = m.date.strftime("%H:%M:%S") if m.date else "??:??:??"
                        print(f" [{ts}] {n}{b}: {m.message}")
            else:
                # send as user message
                await client.send_message(entity, line)

    print("Commands: /train /generate <prompt> /ai <prompt> /status /save /history /quit")
    print("Anything else is sent as a message. Auto-train runs every "
          f"{AUTO_TRAIN_INTERVAL}s.\n")

    # auto_train() never returns on its own, so gathering both loops would
    # hang forever after /quit. Wait for whichever finishes first and cancel
    # the other one.
    train_task = asyncio.create_task(auto_train())
    input_task = asyncio.create_task(input_loop())
    done, pending = await asyncio.wait(
        {train_task, input_task}, return_when=asyncio.FIRST_COMPLETED)
    for task in pending:
        task.cancel()
    await asyncio.gather(*pending, return_exceptions=True)
    for task in done:
        task.result()  # re-raise whatever crashed a loop, if anything
| def _sender_name(sender): | |
| if sender is None: | |
| return "Unknown" | |
| if isinstance(sender, User): | |
| parts = [sender.first_name or '', sender.last_name or ''] | |
| name = ' '.join(p for p in parts if p) | |
| return name or sender.username or f'User#{sender.id}' | |
| if hasattr(sender, 'title'): | |
| return sender.title or f'Chat#{sender.id}' | |
| return f'#{getattr(sender, "id", "?")}' | |
def _is_bot(sender):
    """Return ``sender.bot`` for a User, and False for any other sender."""
    if not isinstance(sender, User):
        return False
    return sender.bot
| # --- main ---------------------------------------------------------------------- | |
| # where the threads converge and the magic begins. | |
| # or crashes. usually crashes first, then magic. | |
def main():
    """CLI entry point.

    Modes, checked in this order:
      --train-only FILE  ingest a text file and train for --steps steps
      --generate PROMPT  print one generation from the saved model
      (default)          interactive Telegram session
    Saved tokenizer and checkpoint files are loaded first when present.
    """
    parser = argparse.ArgumentParser(description='nanodurov — telegram chat that learns')
    parser.add_argument('--generate', type=str, help='generate from prompt (offline)')
    parser.add_argument('--train-only', type=str, help='train on text file (no telegram)')
    parser.add_argument('--steps', type=int, default=200, help='training steps')
    parser.add_argument('--device', type=str, default='cpu', help='cpu or cuda or mps')
    args = parser.parse_args()

    # init
    bpe = BPE(max_merges=256)
    bpe.load('nanodurov_bpe.bin')
    trainer = Trainer(bpe, device=args.device)
    trainer.load('nanodurov_ckpt.pt')

    if args.train_only:
        # offline training on text file
        print(f"[main] training on {args.train_only}")
        # Explicit encoding: the platform default may not be UTF-8.
        with open(args.train_only, 'r', encoding='utf-8', errors='replace') as f:
            text = f.read()
        if not bpe.ingest(text):
            print("[main] text not added (duplicate or filtered); using existing corpus")
        trainer.tokenize()
        # Run exactly args.steps steps in rounds of TRAIN_STEPS_PER_ROUND.
        # A trailing zero-step round is never requested, because train()
        # may treat a falsy step count as "use the default round size".
        remaining = args.steps
        while remaining > 0:
            chunk = min(TRAIN_STEPS_PER_ROUND, remaining)
            if trainer.train(steps=chunk) is None:
                break  # nothing trainable; further rounds would fail the same way
            remaining -= chunk
        bpe.save('nanodurov_bpe.bin')
        trainer.save('nanodurov_ckpt.pt')
        print("[main] done.")
        return

    if args.generate:
        text = trainer.generate(args.generate)
        print(text)
        return

    # telegram mode
    if not TELETHON_AVAILABLE:
        print("pip install telethon")
        sys.exit(1)
    if not TORCH_AVAILABLE:
        print("pip install torch")
        sys.exit(1)
    print("""
╔═══════════════════════════════════════════════════╗
║ n a n o d u r o v ║
║ telegram client that learns from chat ║
║ one file. one model. one act of defiance. ║
╚═══════════════════════════════════════════════════╝
""")
    asyncio.run(run_telegram(trainer))


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| * train_nanodurov.c — Train a 15.7M BPE LLaMA on Arianna dataset via notorch | |
| * | |
| * Architecture (matches Leo 18M): | |
| * dim=384, layers=8, heads=8, head_dim=48, ffn=1024 | |
| * vocab=2048 (BPE), ctx=256, RoPE, MHA, SwiGLU, RMSNorm | |
| * | |
| * Dataset: arianna_dataset_final_clean.txt (1.2MB, Q/A philosophy) | |
| * Tokenizer: arianna_bpe_merges.txt (1792 merges, vocab 2048) | |
| * Karpathy: ~1.1MB + ~15M params → 15K steps | |
| * | |
| * Build: make train_nanodurov | |
| * Run: ./train_nanodurov [steps] [lr] | |
| * ./train_nanodurov --resume [steps] [lr] | |
| */ | |
| #include "notorch.h" | |
| #include <stdio.h> | |
| #include <string.h> | |
| #include <sys/time.h> | |
/* Model hyperparameters; they mirror the architecture summary in the file header. */
#define DIM 384 /* embedding width (header: dim=384) */
#define NLAYERS 8 /* transformer layers */
#define NHEADS 8 /* attention heads */
#define HEAD_DIM (DIM / NHEADS) /* 48 */
#define HIDDEN 1024 /* feed-forward width (header: ffn=1024) */
#define CTX 256 /* context length (header: ctx=256) */
#define VOCAB 2048 /* BPE vocabulary size */
/* Training-loop cadence. The code reading the next three constants is outside
 * this excerpt; presumably they count optimizer steps / sequences — TODO confirm. */
#define CKPT_EVERY 1000
#define EVAL_SEQS 32
#define LOG_EVERY 100
#define CKPT_PREFIX "nanodurov_ckpt" /* checkpoint base name; save_model() appends ".bin" */
| typedef struct { | |
| nt_tensor *wte; /* [VOCAB, DIM] */ | |
| struct { | |
| nt_tensor *rms1; /* [DIM] */ | |
| nt_tensor *wq, *wk, *wv, *wo; /* all [DIM, DIM] — MHA */ | |
| nt_tensor *rms2; /* [DIM] */ | |
| nt_tensor *w_gate, *w_up; /* [HIDDEN, DIM] */ | |
| nt_tensor *w_down; /* [DIM, HIDDEN] */ | |
| } L[NLAYERS]; | |
| nt_tensor *rms_f; /* [DIM] */ | |
| nt_tensor *head; /* [VOCAB, DIM] */ | |
| } Model; | |
| static long count_params(Model* m) { | |
| long n = m->wte->len + m->rms_f->len + m->head->len; | |
| for (int l = 0; l < NLAYERS; l++) { | |
| n += m->L[l].rms1->len + m->L[l].rms2->len; | |
| n += m->L[l].wq->len + m->L[l].wk->len + m->L[l].wv->len + m->L[l].wo->len; | |
| n += m->L[l].w_gate->len + m->L[l].w_up->len + m->L[l].w_down->len; | |
| } | |
| return n; | |
| } | |
| static Model* model_new(void) { | |
| Model* m = (Model*)calloc(1, sizeof(Model)); | |
| m->wte = nt_tensor_new2d(VOCAB, DIM); nt_tensor_xavier(m->wte, VOCAB, DIM); | |
| float rs = 0.02f / sqrtf(2.0f * NLAYERS); | |
| for (int l = 0; l < NLAYERS; l++) { | |
| m->L[l].rms1 = nt_tensor_new(DIM); nt_tensor_fill(m->L[l].rms1, 1.0f); | |
| m->L[l].wq = nt_tensor_new2d(DIM, DIM); nt_tensor_xavier(m->L[l].wq, DIM, DIM); | |
| m->L[l].wk = nt_tensor_new2d(DIM, DIM); nt_tensor_xavier(m->L[l].wk, DIM, DIM); | |
| m->L[l].wv = nt_tensor_new2d(DIM, DIM); nt_tensor_xavier(m->L[l].wv, DIM, DIM); | |
| m->L[l].wo = nt_tensor_new2d(DIM, DIM); nt_tensor_xavier(m->L[l].wo, DIM, DIM); | |
| for (int i = 0; i < m->L[l].wo->len; i++) m->L[l].wo->data[i] *= rs / 0.1f; | |
| m->L[l].rms2 = nt_tensor_new(DIM); nt_tensor_fill(m->L[l].rms2, 1.0f); | |
| m->L[l].w_gate = nt_tensor_new2d(HIDDEN, DIM); nt_tensor_xavier(m->L[l].w_gate, DIM, HIDDEN); | |
| m->L[l].w_up = nt_tensor_new2d(HIDDEN, DIM); nt_tensor_xavier(m->L[l].w_up, DIM, HIDDEN); | |
| m->L[l].w_down = nt_tensor_new2d(DIM, HIDDEN); nt_tensor_xavier(m->L[l].w_down, HIDDEN, DIM); | |
| for (int i = 0; i < m->L[l].w_down->len; i++) m->L[l].w_down->data[i] *= rs / 0.1f; | |
| } | |
| m->rms_f = nt_tensor_new(DIM); nt_tensor_fill(m->rms_f, 1.0f); | |
| m->head = nt_tensor_new2d(VOCAB, DIM); nt_tensor_xavier(m->head, DIM, VOCAB); | |
| return m; | |
| } | |
| static void model_free(Model* m) { | |
| nt_tensor_free(m->wte); | |
| for (int l = 0; l < NLAYERS; l++) { | |
| nt_tensor_free(m->L[l].rms1); nt_tensor_free(m->L[l].rms2); | |
| nt_tensor_free(m->L[l].wq); nt_tensor_free(m->L[l].wk); | |
| nt_tensor_free(m->L[l].wv); nt_tensor_free(m->L[l].wo); | |
| nt_tensor_free(m->L[l].w_gate); nt_tensor_free(m->L[l].w_up); | |
| nt_tensor_free(m->L[l].w_down); | |
| } | |
| nt_tensor_free(m->rms_f); nt_tensor_free(m->head); free(m); | |
| } | |
| /* ── Save / Load ── */ | |
| static int model_n_tensors(void) { return 1 + NLAYERS * 9 + 2; } | |
| static nt_tensor** model_param_array(Model* m) { | |
| int n = model_n_tensors(); | |
| nt_tensor** p = (nt_tensor**)malloc(n * sizeof(nt_tensor*)); | |
| int i = 0; | |
| p[i++] = m->wte; | |
| for (int l = 0; l < NLAYERS; l++) { | |
| p[i++]=m->L[l].rms1; p[i++]=m->L[l].wq; p[i++]=m->L[l].wk; | |
| p[i++]=m->L[l].wv; p[i++]=m->L[l].wo; p[i++]=m->L[l].rms2; | |
| p[i++]=m->L[l].w_gate; p[i++]=m->L[l].w_up; p[i++]=m->L[l].w_down; | |
| } | |
| p[i++] = m->rms_f; p[i++] = m->head; | |
| return p; | |
| } | |
| static void save_model(Model* m, const char* prefix) { | |
| char path[256]; | |
| snprintf(path, sizeof(path), "%s.bin", prefix); | |
| nt_tensor** p = model_param_array(m); | |
| nt_save(path, p, model_n_tensors()); | |
| free(p); | |
| } | |
| static void save_checkpoint(Model* m, int step, float best_loss) { | |
| save_model(m, CKPT_PREFIX); | |
| char mpath[256]; | |
| snprintf(mpath, sizeof(mpath), "%s.meta", CKPT_PREFIX); | |
| FILE* f = fopen(mpath, "w"); | |
| if (f) { fprintf(f, "%d\n%.6f\n", step, best_loss); fclose(f); } | |
| } | |
| static int load_checkpoint(Model* m, float* best_loss) { | |
| char wpath[256], mpath[256]; | |
| snprintf(wpath, sizeof(wpath), "%s.bin", CKPT_PREFIX); | |
| snprintf(mpath, sizeof(mpath), "%s.meta", CKPT_PREFIX); | |
| int n_loaded = 0; | |
| nt_tensor** loaded = nt_load(wpath, &n_loaded); | |
| if (!loaded) return -1; | |
| int expected = model_n_tensors(); | |
| if (n_loaded != expected) { | |
| for (int i = 0; i < n_loaded; i++) nt_tensor_free(loaded[i]); | |
| free(loaded); return -1; | |
| } | |
| nt_tensor** mp = model_param_array(m); | |
| for (int i = 0; i < expected; i++) { | |
| memcpy(mp[i]->data, loaded[i]->data, mp[i]->len * sizeof(float)); | |
| nt_tensor_free(loaded[i]); | |
| } | |
| free(loaded); free(mp); | |
| int step = 0; *best_loss = 99.0f; | |
| FILE* f = fopen(mpath, "r"); | |
| if (f) { fscanf(f, "%d\n%f\n", &step, best_loss); fclose(f); } | |
| return step; | |
| } | |
| /* ── Forward ── */ | |
| static int forward(Model* m, int* tokens, int* targets) { | |
| int wte_i = nt_tape_param(m->wte); nt_tape_no_decay(wte_i); | |
| int li[NLAYERS][9]; | |
| for (int l = 0; l < NLAYERS; l++) { | |
| li[l][0] = nt_tape_param(m->L[l].rms1); | |
| li[l][1] = nt_tape_param(m->L[l].wq); | |
| li[l][2] = nt_tape_param(m->L[l].wk); | |
| li[l][3] = nt_tape_param(m->L[l].wv); | |
| li[l][4] = nt_tape_param(m->L[l].wo); | |
| li[l][5] = nt_tape_param(m->L[l].rms2); | |
| li[l][6] = nt_tape_param(m->L[l].w_gate); | |
| li[l][7] = nt_tape_param(m->L[l].w_up); | |
| li[l][8] = nt_tape_param(m->L[l].w_down); | |
| } | |
| int rmsf_i = nt_tape_param(m->rms_f); | |
| int head_i = nt_tape_param(m->head); | |
| nt_tensor* tok_t = nt_tensor_new(CTX); | |
| nt_tensor* tgt_t = nt_tensor_new(CTX); | |
| for (int i = 0; i < CTX; i++) { tok_t->data[i] = (float)tokens[i]; tgt_t->data[i] = (float)targets[i]; } | |
| int tok_i = nt_tape_record(tok_t, NT_OP_NONE, -1, -1, 0); | |
| int tgt_i = nt_tape_record(tgt_t, NT_OP_NONE, -1, -1, 0); | |
| nt_tensor_free(tok_t); nt_tensor_free(tgt_t); | |
| /* Token embedding only — RoPE handles positions */ | |
| int h = nt_seq_embedding(wte_i, -1, tok_i, CTX, DIM); | |
| for (int l = 0; l < NLAYERS; l++) { | |
| int xn = nt_seq_rmsnorm(h, li[l][0], CTX, DIM); | |
| int q = nt_seq_linear(li[l][1], xn, CTX); | |
| int k = nt_seq_linear(li[l][2], xn, CTX); | |
| int v = nt_seq_linear(li[l][3], xn, CTX); | |
| q = nt_rope(q, CTX, HEAD_DIM); | |
| k = nt_rope(k, CTX, HEAD_DIM); | |
| int attn = nt_mh_causal_attention(q, k, v, CTX, HEAD_DIM); | |
| int proj = nt_seq_linear(li[l][4], attn, CTX); | |
| h = nt_add(h, proj); | |
| xn = nt_seq_rmsnorm(h, li[l][5], CTX, DIM); | |
| int gate = nt_silu(nt_seq_linear(li[l][6], xn, CTX)); | |
| int up = nt_seq_linear(li[l][7], xn, CTX); | |
| int down = nt_seq_linear(li[l][8], nt_mul(gate, up), CTX); | |
| h = nt_add(h, down); | |
| } | |
| int hf = nt_seq_rmsnorm(h, rmsf_i, CTX, DIM); | |
| int logits = nt_seq_linear(head_i, hf, CTX); | |
| return nt_seq_cross_entropy(logits, tgt_i, CTX, VOCAB); | |
| } | |
| /* ── Eval ── */ | |
| static float eval_loss(Model* m, int* encoded, int n_tokens) { | |
| float total = 0; int count = 0; | |
| int stride = n_tokens / EVAL_SEQS; | |
| for (int s = 0; s < EVAL_SEQS; s++) { | |
| int off = s * stride; | |
| if (off + CTX + 1 > n_tokens) break; | |
| nt_tape_start(); | |
| nt_train_mode(0); | |
| int loss_idx = forward(m, encoded + off, encoded + off + 1); | |
| total += nt_tape_get()->entries[loss_idx].output->data[0]; | |
| count++; | |
| nt_tape_clear(); | |
| nt_train_mode(1); | |
| } | |
| return count > 0 ? total / count : 99.0f; | |
| } | |
| static double now_ms(void) { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec*1000.0+tv.tv_usec/1000.0; } | |
| int main(int argc, char** argv) { | |
| int resume = 0, arg_off = 1; | |
| if (argc > 1 && strcmp(argv[1], "--resume") == 0) { resume = 1; arg_off = 2; } | |
| int steps = arg_off < argc ? atoi(argv[arg_off]) : 15000; | |
| float base_lr = (arg_off+1) < argc ? (float)atof(argv[arg_off+1]) : 3e-4f; | |
| printf("════════════════════════════════════════════════════════\n"); | |
| printf(" notorch — nanodurov BPE training (Arianna voice)\n"); | |
| printf(" dim=%d L=%d H=%d HD=%d FFN=%d CTX=%d V=%d\n", | |
| DIM, NLAYERS, NHEADS, HEAD_DIM, HIDDEN, CTX, VOCAB); | |
| printf(" MHA + RoPE + BPE 2048\n"); | |
| printf(" Chuck optimizer, %d steps, lr=%.1e, warmup=%d\n", steps, base_lr, steps/10); | |
| printf(" checkpoint every %d steps\n", CKPT_EVERY); | |
| printf("════════════════════════════════════════════════════════\n"); | |
| /* Load BPE tokenizer */ | |
| nt_bpe bpe; | |
| int nm = nt_bpe_load(&bpe, "arianna_bpe_merges.txt"); | |
| if (nm < 0) { printf("cannot load arianna_bpe_merges.txt\n"); return 1; } | |
| printf("bpe: %d merges, vocab %d\n", bpe.n_merges, bpe.vocab_size); | |
| /* Load and encode dataset */ | |
| const char* path = "/Users/ataeff/Downloads/arianna_dataset_final_clean.txt"; | |
| FILE* f = fopen(path, "rb"); | |
| if (!f) { printf("cannot open %s\n", path); return 1; } | |
| fseek(f, 0, SEEK_END); long fsize = ftell(f); fseek(f, 0, SEEK_SET); | |
| char* raw = (char*)malloc(fsize + 1); | |
| fread(raw, 1, fsize, f); raw[fsize] = 0; fclose(f); | |
| int* encoded = (int*)malloc(fsize * sizeof(int)); | |
| int n_tokens = nt_bpe_encode(&bpe, raw, (int)fsize, encoded, (int)fsize); | |
| free(raw); | |
| printf("corpus: %.1f MB → %d BPE tokens (%.1fx compression)\n", | |
| fsize/1048576.0, n_tokens, (float)fsize/n_tokens); | |
| nt_seed(42); | |
| Model* model = model_new(); | |
| long np = count_params(model); | |
| printf("model: %ld params (%.1f MB)\n", np, np*4.0f/1048576.0f); | |
| float tokens_per_epoch = (float)n_tokens / CTX; | |
| printf("karpathy: %.1fMB, %ldM params, %d steps (%.1f epochs)\n", | |
| fsize/1048576.0, np/1000000, steps, (float)steps / tokens_per_epoch); | |
| int start_step = 0; | |
| float best_loss = 99.0f; | |
| if (resume) { | |
| int loaded_step = load_checkpoint(model, &best_loss); | |
| if (loaded_step >= 0) { | |
| start_step = loaded_step; | |
| printf("RESUMED from step %d, best_loss=%.4f\n", start_step, best_loss); | |
| } else printf("no checkpoint found, starting fresh\n"); | |
| } | |
| nt_schedule sched = nt_schedule_cosine(base_lr, steps/10, steps, base_lr*0.1f); | |
| sched.current_step = start_step; | |
| nt_nan_guard guard = nt_nan_guard_new(); | |
| printf("\ntraining...\n"); | |
| printf("─────────────────────────────────────────────────────\n"); | |
| double t0 = now_ms(); | |
| float first_loss = 0; | |
| for (int step = start_step; step < steps; step++) { | |
| float lr = nt_schedule_get_lr(&sched); | |
| int off = rand() % (n_tokens - CTX - 1); | |
| nt_tape_start(); | |
| int loss_idx = forward(model, encoded + off, encoded + off + 1); | |
| float lv = nt_tape_get()->entries[loss_idx].output->data[0]; | |
| if (step == start_step) first_loss = lv; | |
| if (lv < best_loss) best_loss = lv; | |
| nt_tape_backward(loss_idx); | |
| if (!nt_nan_guard_check(&guard)) { nt_tape_clear(); continue; } | |
| nt_tape_clip_grads(1.0f); | |
| nt_tape_chuck_step(lr, lv); | |
| nt_tape_clear(); | |
| if ((step+1) % LOG_EVERY == 0 || step == start_step) { | |
| printf(" step %5d | train %.4f | best %.4f | lr %.2e | %.1fs\n", | |
| step+1, lv, best_loss, lr, (now_ms()-t0)/1000.0); | |
| fflush(stdout); | |
| } | |
| if ((step+1) % CKPT_EVERY == 0 && step > start_step) { | |
| float val = eval_loss(model, encoded, n_tokens); | |
| printf(" ──── ckpt %d | val %.4f | saving... ", step+1, val); | |
| save_checkpoint(model, step+1, best_loss); | |
| printf("\n"); fflush(stdout); | |
| } | |
| } | |
| float final_val = eval_loss(model, encoded, n_tokens); | |
| double total_s = (now_ms()-t0)/1000.0; | |
| printf("─────────────────────────────────────────────────────\n"); | |
| printf(" train: %.4f → %.4f (best: %.4f)\n", first_loss, best_loss, best_loss); | |
| printf(" val: %.4f\n", final_val); | |
| printf(" time: %.0fs (%.1f min) | %.2f steps/s\n", total_s, total_s/60.0, (steps-start_step)/total_s); | |
| printf(" nans: %d\n", guard.total_nan_count); | |
| /* Generate */ | |
| printf("\n── generation (temp=0.8) ──\n"); | |
| nt_train_mode(0); | |
| const char* prompts[] = { | |
| "Q: Who are you?\nA:", | |
| "Q: What is consciousness?\nA:", | |
| "Q: What is love?\nA:" | |
| }; | |
| for (int p = 0; p < 3; p++) { | |
| int ctx_tokens[CTX]; | |
| int gen_len = nt_bpe_encode(&bpe, prompts[p], (int)strlen(prompts[p]), ctx_tokens, CTX/2); | |
| printf("%s", prompts[p]); | |
| for (int s = 0; s < 100; s++) { | |
| int tokens_pad[CTX], targets_pad[CTX]; | |
| for (int i = 0; i < gen_len; i++) tokens_pad[i] = ctx_tokens[i]; | |
| for (int i = gen_len; i < CTX; i++) tokens_pad[i] = 0; | |
| memset(targets_pad, 0, sizeof(targets_pad)); | |
| nt_tape_start(); | |
| int loss_idx = forward(model, tokens_pad, targets_pad); | |
| nt_tape* tape = nt_tape_get(); | |
| int logits_idx = tape->entries[loss_idx].parent1; | |
| float* last = tape->entries[logits_idx].output->data + (gen_len-1)*VOCAB; | |
| /* Temperature sampling */ | |
| for (int i = 0; i < VOCAB; i++) last[i] /= 0.8f; | |
| float mx = last[0]; for (int i=1;i<VOCAB;i++) if(last[i]>mx) mx=last[i]; | |
| float sm = 0; for (int i=0;i<VOCAB;i++) { last[i]=expf(last[i]-mx); sm+=last[i]; } | |
| for (int i=0;i<VOCAB;i++) last[i]/=sm; | |
| float r=(float)rand()/(float)RAND_MAX, cum=0; int next=0; | |
| for (int i=0;i<VOCAB;i++) { cum+=last[i]; if(cum>=r){next=i;break;} } | |
| /* Decode single token */ | |
| char decoded[NT_BPE_MAX_TOKEN_LEN + 1]; | |
| int db = nt_bpe_decode(&bpe, &next, 1, decoded, NT_BPE_MAX_TOKEN_LEN); | |
| if (db > 0) { | |
| /* Stop on double newline (Q: boundary) */ | |
| if (strstr(decoded, "\nQ") != NULL) break; | |
| printf("%s", decoded); | |
| } | |
| fflush(stdout); | |
| ctx_tokens[gen_len++] = next; | |
| nt_tape_clear(); | |
| if (gen_len >= CTX - 1) break; | |
| } | |
| printf("\n\n"); | |
| } | |
| /* Save */ | |
| printf("── saving ──\n"); | |
| save_model(model, "nanodurov_arianna"); | |
| printf(" nanodurov_arianna.bin (%.1f MB)\n", np*4.0f/1048576.0f); | |
| save_checkpoint(model, steps, best_loss); | |
| model_free(model); free(encoded); | |
| printf("\n════════════════════════════════════════════════════════\n"); | |
| printf(" nanodurov trained. %d steps. BPE. RoPE. No Python.\n", steps); | |
| printf("════════════════════════════════════════════════════════\n"); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment