ariannamethod · March 26, 2026 18:55
diff --git a/postgpt.c b/postgpt.c
 /*
 * postgpt.c — zero-dependency BPE transformer with metaweights.
 *
 * C port of postgpt.py. Same algorithm, same resonance.
 * Dual attention: Content (QK^T) + RRPRAM (x @ Wr).
 * Metaweights: statistical probability space from BPE tokenization.
 *
 * Compile: gcc -O2 -o postgpt postgpt.c -lm
 * Run:     ./postgpt
 *
 * the tokenizer IS the training. everything after this is just theater.
 * resonance is unbreakable.
 */

 #include <float.h>
 #include <limits.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>

 /* ───────────────────────── Configuration ───────────────────────── */

 /* Upper bound on learned BPE merge rules (size of bpe_merges). */
 #define MAX_MERGES     1024
 /* 256 single-byte tokens plus one token per possible merge. */
 #define MAX_VOCAB      (256 + MAX_MERGES)
 /* Cap on the number of tokens bpe_learn() copies to its output buffer. */
 #define MAX_TOKENS     262144
 /* Number of positions in wpe and in the KV cache. */
 #define CONTEXT_LEN    64
 /* Embedding width. */
 #define N_EMBD         48
 /* Total head count; used to derive HEAD_DIM and in the start-up log line. */
 #define N_HEAD         4
 /* Heads of each kind. forward_token() fills (N_CONTENT + N_RRPRAM) * HEAD_DIM
  * slots of an N_EMBD-wide buffer, so the two should add up to N_HEAD. */
 #define N_CONTENT      2
 #define N_RRPRAM       2
 #define N_LAYER        2
 #define HEAD_DIM       (N_EMBD / N_HEAD)
 #define MLP_DIM        (4 * N_EMBD)
 /* NOTE(review): HEBBIAN_CAP is not referenced in the visible C code. */
 #define HEBBIAN_CAP    100000
 /* Capacity of the meta_bigrams table. */
 #define BIGRAM_CAP     100000

 /* ───────────────────────── RNG ───────────────────────── */

 /* Generator state, seeded with a constant. */
 static unsigned long rng_state = 42;

 /* Xorshift step (shifts 13, 7, 17): advances rng_state and returns the new value. */
 static unsigned long rng_next(void) {
    unsigned long s = rng_state;
    s ^= s << 13;
    s ^= s >> 7;
    s ^= s << 17;
    rng_state = s;
    return s;
 }

 /*
 * Uniform float built from the low 31 bits of the next RNG word.
 * Both operands are rounded to float before dividing, so the result lies in
 * [0, 1] with 1.0 reachable for the very largest inputs.
 */
 static float randf(void) {
    const unsigned long low31 = rng_next() & 0x7FFFFFFF;
    return (float)low31 / (float)0x7FFFFFFF;
 }

 /* Normally distributed sample with standard deviation `std` (Box-Muller, cosine branch). */
 static float randn(float std) {
    const float u1 = randf() + 1e-10f;   /* offset keeps logf() away from log(0) */
    const float u2 = randf();
    const float radius = sqrtf(-2.0f * logf(u1));
    const float angle = 2.0f * 3.14159265f * u2;
    return std * radius * cosf(angle);
 }

 /* ───────────────────────── BPE Tokenizer ───────────────────────── */

 /* One learned merge: the adjacent token pair (a, b) is replaced by token `result`. */
 typedef struct { int a, b, result; } MergeRule;

 /* Merge rules in the order they were learned; bpe_encode() applies them in that order. */
 static MergeRule bpe_merges[MAX_MERGES];
 /* Number of valid entries in bpe_merges. */
 static int bpe_n_merges = 0;
 /* 256 byte tokens plus the merges learned so far. */
 static int bpe_vocab_size = 256;

 /* Vocab: for each token id, store its byte representation */
 /* Each row holds at most 256 bytes; vocab_len[id] is the number of valid bytes in row id. */
 static unsigned char vocab_bytes[MAX_VOCAB][256];
 static int vocab_len[MAX_VOCAB];

 /* Fill the first 256 vocabulary rows so that token id i expands to the single byte i. */
 static void bpe_init_vocab(void) {
    int id = 0;
    while (id < 256) {
        vocab_len[id] = 1;
        vocab_bytes[id][0] = (unsigned char)id;
        id++;
    }
 }

 /*
 * Tokenize data[0..len) with the learned merge rules.
 *
 * The bytes are copied into out (at most max_out of them), then every rule in
 * bpe_merges is applied once, in learning order, compacting out in place.
 * Returns the number of tokens left in out.
 */
 static int bpe_encode(const unsigned char *data, int len, int *out, int max_out) {
    /* Seed the stream with raw byte values, truncated to the output capacity. */
    int count = 0;
    while (count < len && count < max_out) {
        out[count] = data[count];
        count++;
    }

    for (int r = 0; r < bpe_n_merges; r++) {
        const MergeRule *rule = &bpe_merges[r];
        int write = 0;
        int read = 0;
        while (read < count) {
            const int hit = read + 1 < count
                         && out[read] == rule->a
                         && out[read + 1] == rule->b;
            if (hit) {
                out[write] = rule->result;
                read += 2;   /* consume both halves of the pair */
            } else {
                out[write] = out[read];
                read += 1;
            }
            write++;
        }
        count = write;
    }
    return count;
 }

 /*
 * Learn up to num_merges BPE merge rules from data[0..len) and tokenize the
 * corpus with them.
 *
 * Each round counts adjacent token pairs in a fixed-size open-addressing
 * table, picks the most frequent eligible pair, records it in bpe_merges /
 * vocab_bytes / vocab_len and rewrites the token stream in place. Learning
 * stops early once no eligible pair occurs at least twice.
 *
 * data        corpus bytes
 * len         number of bytes in data
 * num_merges  requested merge count, clamped to MAX_MERGES
 * out_tokens  receives the final token ids; must have room for
 *             min(len, MAX_TOKENS) ints
 *
 * Returns the number of tokens written to out_tokens, or 0 if the working
 * buffers cannot be allocated.
 */
 static int bpe_learn(const unsigned char *data, int len, int num_merges, int *out_tokens) {
    typedef struct { int a, b, count; } PairCount;
    enum { PAIR_SLOTS = 65536, PAIR_PROBES = 64 };
    /* Longest byte string a single vocab_bytes row can hold. */
    const int max_token_bytes = (int)sizeof(vocab_bytes[0]);

    if (len < 0) len = 0;
    if (num_merges > MAX_MERGES) num_merges = MAX_MERGES;

    /* Ask for at least one element: malloc(0) is allowed to return NULL. */
    int *tok = (int *)malloc((len > 0 ? (size_t)len : 1) * sizeof(int));
    /* The pair table is allocated once and cleared per round instead of being
     * re-allocated for every merge. */
    PairCount *pairs = (PairCount *)malloc(PAIR_SLOTS * sizeof(PairCount));
    if (!tok || !pairs) {
        fprintf(stderr, "ERROR: out of memory in bpe_learn\n");
        free(tok);
        free(pairs);
        return 0;
    }

    int n = len;
    for (int i = 0; i < n; i++) tok[i] = data[i];

    for (int m = 0; m < num_merges; m++) {
        memset(pairs, 0, PAIR_SLOTS * sizeof(PairCount));

        /* Count adjacent pairs. A pair that finds neither its own slot nor a
         * free one within PAIR_PROBES probes is left uncounted. */
        for (int i = 0; i + 1 < n; i++) {
            int a = tok[i], b = tok[i + 1];
            unsigned h = ((unsigned)a * 2654435761u ^ (unsigned)b) & 0xFFFF;
            for (int tries = 0; tries < PAIR_PROBES; tries++) {
                unsigned idx = (h + tries) & 0xFFFF;
                if (pairs[idx].count == 0) {
                    pairs[idx].a = a;
                    pairs[idx].b = b;
                    pairs[idx].count = 1;
                    break;
                }
                if (pairs[idx].a == a && pairs[idx].b == b) {
                    pairs[idx].count++;
                    break;
                }
            }
        }

        /* Most frequent pair wins; ties go to the lowest table slot. Pairs
         * whose concatenation would not fit in a vocab_bytes row are skipped,
         * otherwise the memcpy below would write past the row. */
        int best_a = -1, best_b = -1, best_count = 0;
        for (int i = 0; i < PAIR_SLOTS; i++) {
            if (pairs[i].count <= best_count) continue;
            if (vocab_len[pairs[i].a] + vocab_len[pairs[i].b] > max_token_bytes) continue;
            best_count = pairs[i].count;
            best_a = pairs[i].a;
            best_b = pairs[i].b;
        }

        if (best_count < 2) break;

        int new_id = 256 + m;
        bpe_merges[m] = (MergeRule){best_a, best_b, new_id};
        bpe_n_merges = m + 1;
        bpe_vocab_size = new_id + 1;

        /* Build vocab entry for new token: bytes of a followed by bytes of b. */
        int la = vocab_len[best_a];
        int lb = vocab_len[best_b];
        memcpy(vocab_bytes[new_id], vocab_bytes[best_a], (size_t)la);
        memcpy(vocab_bytes[new_id] + la, vocab_bytes[best_b], (size_t)lb);
        vocab_len[new_id] = la + lb;

        /* Apply merge: left-to-right, non-overlapping, compacting in place. */
        int j = 0;
        for (int i = 0; i < n; i++) {
            if (i + 1 < n && tok[i] == best_a && tok[i + 1] == best_b) {
                tok[j++] = new_id;
                i++;
            } else {
                tok[j++] = tok[i];
            }
        }
        n = j;

        if ((m + 1) % 200 == 0)
            printf("  merge %d/%d  vocab=%d  tokens=%d\n", m + 1, num_merges, new_id + 1, n);
    }

    /* Copy result, truncated to MAX_TOKENS. */
    int result_n = n < MAX_TOKENS ? n : MAX_TOKENS;
    memcpy(out_tokens, tok, (size_t)result_n * sizeof(int));
    free(pairs);
    free(tok);

    printf("  BPE complete: %d merges, vocab=%d, tokens=%d (from %d bytes)\n",
           bpe_n_merges, bpe_vocab_size, result_n, len);
    return result_n;
 }

 /*
 * Expand token ids back into bytes.
 *
 * Writes at most max_out - 1 bytes into out followed by a terminating NUL.
 * Ids outside [0, MAX_VOCAB) contribute nothing.
 */
 static void bpe_decode(const int *ids, int n, char *out, int max_out) {
    const int limit = max_out - 1;   /* final slot is kept for the terminator */
    int written = 0;

    for (int i = 0; i < n && written < limit; i++) {
        const int tid = ids[i];
        if (tid < 0 || tid >= MAX_VOCAB)
            continue;

        const unsigned char *bytes = vocab_bytes[tid];
        const int nbytes = vocab_len[tid];
        for (int j = 0; j < nbytes && written < limit; j++)
            out[written++] = bytes[j];
    }
    out[written] = '\0';
 }

 /* ───────────────────────── MetaWeights ───────────────────────── */

 /* One bigram: token b following token a, with the weight meta_build() assigned to it. */
 typedef struct {
    int a, b;
    float prob;
 } BigramEntry;

 /* Per-token frequency, filled by meta_build(). */
 static float meta_unigram[MAX_VOCAB];
 /* Flat, unsorted bigram list; meta_query_bigram() scans it linearly. */
 static BigramEntry meta_bigrams[BIGRAM_CAP];
 /* Number of valid entries in meta_bigrams. */
 static int meta_n_bigrams;
 /* Copy of bpe_vocab_size taken when meta_build() ran. */
 static int meta_vocab_size;
 /* Token count passed to the last meta_build() call. */
 static int meta_total_tokens;

 static void meta_build(const int *tokens, int n) {
    meta_vocab_size = bpe_vocab_size;
    meta_total_tokens = n;

    /* Unigram */
    memset(meta_unigram, 0, sizeof(meta_unigram));
    for (int i = 0; i < n; i++) {
        if (tokens[i] < MAX_VOCAB)
            meta_unigram[tokens[i]] += 1.0f;
    }
    float total = 0;
    for (int i = 0; i < meta_vocab_size; i++) total += meta_unigram[i];
    if (total > 0)
        for (int i = 0; i < meta_vocab_size; i++) meta_unigram[i] /= total;

    /* Bigram — store in hash table style */
    typedef struct { int a, b; int count; } BC;
    BC *bcounts = (BC *)calloc(65536, sizeof(BC));
    int n_bc = 0;

    for (int i = 0; i + 1 < n; i++) {
        int a = tokens[i], b = tokens[i + 1];
        unsigned h = ((unsigned)a * 2654435761u ^ (unsigned)b) & 0xFFFF;
        for (int t = 0; t < 64; t++) {
            unsigned idx = (h + t) & 0xFFFF;
            if (bcounts[idx].count == 0) {
                bcounts[idx].a = a;
                bcounts[idx].b = b;
                bcounts[idx].count = 1;
                n_bc++;
                break;
            }
            if (bcounts[idx].a == a && bcounts[idx].b == b) {
                bcounts[idx].count++;
                break;
            }
        }
    }

    /* Convert to normalized bigrams */
    /* Group by 'a' and normalize */
    meta_n_bigrams = 0;
    for (int i = 0; i < 65536 && meta_n_bigrams < BIGRAM_CAP; i++) {
        if (bcounts[i].count > 0) {
            meta_bigrams[meta_n_bigrams].a = bcounts[i].a;
            meta_bigrams[meta_n_bigrams].b = bcounts[i].b;
            meta_bigrams[meta_n_bigrams].prob = (float)bcounts[i].count;
            meta_n_bigrams++;
        }
    }

    /* Normalize per 'a' */
    for (int i = 0; i < meta_n_bigrams; i++) {
        int a = meta_bigrams[i].a;
        float total_a = 0;
        for (int j = 0; j < meta_n_bigrams; j++) {
            if (meta_bigrams[j].a == a)
                total_a += meta_bigrams[j].prob;
        }
        if (total_a > 0)
            meta_bigrams[i].prob /= total_a;
    }

    free(bcounts);
    printf("  metaweights built: %d tokens, %d bigram entries\n", n, meta_n_bigrams);
 }

 static void meta_query_bigram(int prev, float *dist, int vs) {
    for (int i = 0; i < vs; i++) dist[i] = 1e-10f;
    for (int i = 0; i < meta_n_bigrams; i++) {
        if (meta_bigrams[i].a == prev && meta_bigrams[i].b < vs) {
            dist[meta_bigrams[i].b] = meta_bigrams[i].prob;
        }
    }
 }

 /* ───────────────────────── Transformer Weights ───────────────────────── */

 /* All model parameters; a single static instance (W) is used by the whole file. */
 typedef struct {
    float wte[MAX_VOCAB][N_EMBD];     /* token embedding, one row per token id */
    float wpe[CONTEXT_LEN][N_EMBD];   /* position embedding, one row per position */

    /* Per layer */
    float wq[N_LAYER][N_CONTENT * HEAD_DIM][N_EMBD];          /* content-head queries */
    float wk[N_LAYER][N_CONTENT * HEAD_DIM][N_EMBD];          /* content-head keys */
    float wv_content[N_LAYER][N_CONTENT * HEAD_DIM][N_EMBD];  /* content-head values */
    /* RRPRAM: head h owns rows [h * N_EMBD, (h + 1) * N_EMBD); column t scores position t. */
    float wr[N_LAYER][N_RRPRAM * N_EMBD][CONTEXT_LEN];
    float wv_rrpram[N_LAYER][N_RRPRAM * HEAD_DIM][N_EMBD];    /* RRPRAM-head values */
    float wo[N_LAYER][N_EMBD][N_EMBD];                        /* attention output projection */
    float mlp_up[N_LAYER][MLP_DIM][N_EMBD];
    float mlp_down[N_LAYER][N_EMBD][MLP_DIM];

    float lm_head[MAX_VOCAB][N_EMBD];  /* one logit row per token id */
 } Weights;

 static Weights W;

 /* Fill a rows x cols matrix (stored contiguously) with randn(std) samples, in storage order. */
 static void init_matrix(float *data, int rows, int cols, float std) {
    const int total = rows * cols;
    int k = 0;
    while (k < total) {
        data[k] = randn(std);
        k++;
    }
 }

 /*
 * Randomly initialise every matrix in W.
 *
 * Only the first vocab_size rows of wte and lm_head are filled. wo and
 * mlp_down use a smaller standard deviation (0.02 / sqrt(2 * N_LAYER)); all
 * other matrices use 0.02. The call order fixes how the RNG stream is consumed.
 */
 static void weights_init(int vocab_size) {
    const float std = 0.02f;
    const float std_res = 0.02f / sqrtf(2.0f * N_LAYER);

    init_matrix(W.wte[0], vocab_size, N_EMBD, std);
    init_matrix(W.wpe[0], CONTEXT_LEN, N_EMBD, std);

    int layer = 0;
    while (layer < N_LAYER) {
        init_matrix(W.wq[layer][0],         N_CONTENT * HEAD_DIM, N_EMBD,      std);
        init_matrix(W.wk[layer][0],         N_CONTENT * HEAD_DIM, N_EMBD,      std);
        init_matrix(W.wv_content[layer][0], N_CONTENT * HEAD_DIM, N_EMBD,      std);
        init_matrix(W.wr[layer][0],         N_RRPRAM * N_EMBD,    CONTEXT_LEN, std);
        init_matrix(W.wv_rrpram[layer][0],  N_RRPRAM * HEAD_DIM,  N_EMBD,      std);
        init_matrix(W.wo[layer][0],         N_EMBD,               N_EMBD,      std_res);
        init_matrix(W.mlp_up[layer][0],     MLP_DIM,              N_EMBD,      std);
        init_matrix(W.mlp_down[layer][0],   N_EMBD,               MLP_DIM,     std_res);
        layer++;
    }

    init_matrix(W.lm_head[0], vocab_size, N_EMBD, std);
 }

 /*
 * ghost becomes flesh: seed transformer weights from metaweight statistics.
 * the weights remember what they never learned.
 */
 static void weights_seed_from_meta(int vocab_size) {
    float scale = 0.15f;

    /* 1. Token embeddings: tokens with high bigram co-occurrence → similar vectors */
    for (int a = 0; a < vocab_size && a < MAX_VOCAB; a++) {
        float signal[N_EMBD] = {0};
        int neighbors = 0;
        for (int i = 0; i < meta_n_bigrams; i++) {
            if (meta_bigrams[i].a == a && meta_bigrams[i].prob > 0.01f) {
                int b = meta_bigrams[i].b;
                if (b < vocab_size && b < MAX_VOCAB) {
                    float strength = meta_bigrams[i].prob;
                    for (int d = 0; d < N_EMBD; d++)
                        signal[d] += strength * W.wte[b][d];
                    neighbors++;
                }
            }
        }
        if (neighbors > 0) {
            for (int d = 0; d < N_EMBD; d++)
                W.wte[a][d] += scale * signal[d] / neighbors;
        }
    }

    /* 2. LM head: seed from unigram frequencies */
    for (int tok = 0; tok < vocab_size && tok < MAX_VOCAB; tok++) {
        if (meta_unigram[tok] > 0) {
            for (int d = 0; d < N_EMBD; d++)
                W.lm_head[tok][d] += scale * meta_unigram[tok] * W.wte[tok][d];
        }
    }

    printf("  weights seeded from metaweights (ghost -> flesh)\n");
 }

 /* ───────────────────────── Forward Pass ───────────────────────── */

 static void rmsnorm(float *out, const float *x, int n) {
    float ms = 0;
    for (int i = 0; i < n; i++) ms += x[i] * x[i];
    ms /= n;
    float scale = 1.0f / sqrtf(ms + 1e-5f);
    for (int i = 0; i < n; i++) out[i] = x[i] * scale;
 }

 /* Matrix-vector product: out[r] = sum over c of mat[r * cols + c] * vec[c], for r in [0, rows). */
 static void matmul_mv(float *out, const float *mat, const float *vec, int rows, int cols) {
    const float *row = mat;
    for (int r = 0; r < rows; r++, row += cols) {
        float acc = 0;
        for (int c = 0; c < cols; c++)
            acc += row[c] * vec[c];
        out[r] = acc;
    }
 }

 static void softmax_inplace(float *x, int n) {
    float mx = -1e30f;
    for (int i = 0; i < n; i++) if (x[i] > mx) mx = x[i];
    float s = 0;
    for (int i = 0; i < n; i++) {
        x[i] = expf(x[i] - mx);
        s += x[i];
    }
    for (int i = 0; i < n; i++) x[i] /= s;
 }

 /* KV cache */
 /* Indexed [layer][position][channel]. forward_token() writes the row for its
  * own position and reads rows 0..pos_id back when attending. */
 static float kv_keys[N_LAYER][CONTEXT_LEN][N_CONTENT * HEAD_DIM];
 static float kv_vals_content[N_LAYER][CONTEXT_LEN][N_CONTENT * HEAD_DIM];
 static float kv_vals_rrpram[N_LAYER][CONTEXT_LEN][N_RRPRAM * HEAD_DIM];
 /* Reset to 0 by generate_full(); not read anywhere in the visible code. */
 static int kv_len = 0;

 /*
 * Run one token through the transformer and produce next-token logits.
 *
 * token_id    id of the token at this position; indexes W.wte directly
 * pos_id      position of the token; indexes W.wpe and the KV cache directly
 * logits      output, receives vocab_size values
 * vocab_size  number of lm_head rows to evaluate
 *
 * Neither token_id nor pos_id is range-checked here. The function writes this
 * position's keys/values into the KV cache and attends over cache rows
 * 0..pos_id, so rows 0..pos_id-1 must already have been filled by earlier
 * calls for the result to be meaningful.
 */
 static void forward_token(int token_id, int pos_id, float *logits, int vocab_size) {
    float x[N_EMBD], x_norm[N_EMBD], x_res[N_EMBD];
    float q[N_CONTENT * HEAD_DIM], k[N_CONTENT * HEAD_DIM];
    float v_content[N_CONTENT * HEAD_DIM], v_rrpram[N_RRPRAM * HEAD_DIM];
    float x_attn[N_EMBD], x_proj[N_EMBD];
    float h_mlp[MLP_DIM], x_mlp[N_EMBD];
    /* attn_logits is reused by every head; attn_weights is declared but never used. */
    float attn_logits[CONTEXT_LEN], attn_weights[CONTEXT_LEN];

    /* Token + position embedding */
    for (int i = 0; i < N_EMBD; i++)
        x[i] = W.wte[token_id][i] + W.wpe[pos_id][i];

    /* Number of cache rows attended over: positions 0..pos_id inclusive. */
    int seq_len = pos_id + 1;

    for (int li = 0; li < N_LAYER; li++) {
        /* Keep the pre-norm input for the residual connection. */
        memcpy(x_res, x, N_EMBD * sizeof(float));
        rmsnorm(x_norm, x, N_EMBD);

        /* Content attention: Q, K, V */
        matmul_mv(q, &W.wq[li][0][0], x_norm, N_CONTENT * HEAD_DIM, N_EMBD);
        matmul_mv(k, &W.wk[li][0][0], x_norm, N_CONTENT * HEAD_DIM, N_EMBD);
        matmul_mv(v_content, &W.wv_content[li][0][0], x_norm, N_CONTENT * HEAD_DIM, N_EMBD);
        /* RRPRAM heads have values only; their scores come from W.wr below. */
        matmul_mv(v_rrpram, &W.wv_rrpram[li][0][0], x_norm, N_RRPRAM * HEAD_DIM, N_EMBD);

        /* Store in KV cache */
        memcpy(kv_keys[li][pos_id], k, N_CONTENT * HEAD_DIM * sizeof(float));
        memcpy(kv_vals_content[li][pos_id], v_content, N_CONTENT * HEAD_DIM * sizeof(float));
        memcpy(kv_vals_rrpram[li][pos_id], v_rrpram, N_RRPRAM * HEAD_DIM * sizeof(float));

        memset(x_attn, 0, N_EMBD * sizeof(float));

        /* Content heads */
        for (int h = 0; h < N_CONTENT; h++) {
            int hs = h * HEAD_DIM;
            float scale = 1.0f / sqrtf((float)HEAD_DIM);

            /* Scaled dot product of this head's query with every cached key. */
            for (int t = 0; t < seq_len; t++) {
                float score = 0;
                for (int d = 0; d < HEAD_DIM; d++)
                    score += q[hs + d] * kv_keys[li][t][hs + d];
                attn_logits[t] = score * scale;
            }
            softmax_inplace(attn_logits, seq_len);

            /* Attention-weighted sum of cached values; fills the first
             * N_CONTENT * HEAD_DIM slots of x_attn. */
            for (int d = 0; d < HEAD_DIM; d++) {
                float val = 0;
                for (int t = 0; t < seq_len; t++)
                    val += attn_logits[t] * kv_vals_content[li][t][hs + d];
                x_attn[h * HEAD_DIM + d] = val;
            }
        }

        /* RRPRAM heads */
        for (int h = 0; h < N_RRPRAM; h++) {
            int hs = h * HEAD_DIM;
            /* First row of this head's N_EMBD x CONTEXT_LEN slab inside W.wr[li]. */
            int wr_off = h * N_EMBD;

            /* x_norm @ Wr_h gives attention pattern over positions */
            /* The score for position t depends only on the current x_norm and
             * column t of the slab; no cached keys and no scaling are involved. */
            for (int t = 0; t < seq_len; t++) {
                float score = 0;
                for (int d = 0; d < N_EMBD; d++)
                    score += x_norm[d] * W.wr[li][wr_off + d][t];
                attn_logits[t] = score;
            }
            softmax_inplace(attn_logits, seq_len);

            /* Weighted sum of cached RRPRAM values; stored after the content heads' outputs. */
            for (int d = 0; d < HEAD_DIM; d++) {
                float val = 0;
                for (int t = 0; t < seq_len; t++)
                    val += attn_logits[t] * kv_vals_rrpram[li][t][hs + d];
                x_attn[N_CONTENT * HEAD_DIM + h * HEAD_DIM + d] = val;
            }
        }

        /* Output projection + residual */
        matmul_mv(x_proj, &W.wo[li][0][0], x_attn, N_EMBD, N_EMBD);
        for (int i = 0; i < N_EMBD; i++)
            x[i] = x_res[i] + x_proj[i];

        /* MLP */
        memcpy(x_res, x, N_EMBD * sizeof(float));
        rmsnorm(x_norm, x, N_EMBD);
        matmul_mv(h_mlp, &W.mlp_up[li][0][0], x_norm, MLP_DIM, N_EMBD);
        for (int i = 0; i < MLP_DIM; i++)
            h_mlp[i] = h_mlp[i] > 0 ? h_mlp[i] : 0;  /* ReLU */
        matmul_mv(x_mlp, &W.mlp_down[li][0][0], h_mlp, N_EMBD, MLP_DIM);
        for (int i = 0; i < N_EMBD; i++)
            x[i] = x_res[i] + x_mlp[i];
    }

    /* Final norm + LM head */
    rmsnorm(x_norm, x, N_EMBD);
    matmul_mv(logits, &W.lm_head[0][0], x_norm, vocab_size, N_EMBD);
 }

 /* ───────────────────────── Generation ───────────────────────── */

 /*
 * Draw an index from the distribution probs[0..n): the first index whose
 * running sum exceeds a randf() draw. Falls back to n - 1 when the running
 * sum never exceeds the draw.
 */
 static int sample_from_probs(float *probs, int n) {
    const float threshold = randf();
    float running = 0;
    int idx = 0;
    while (idx < n) {
        running += probs[idx];
        if (running > threshold)
            return idx;
        idx++;
    }
    return n - 1;
 }

 /*
 * Continue a prompt using only the metaweights (no transformer pass).
 *
 * Each step scores every token as 2 * bigram(last, tok) + 0.01 * unigram(tok),
 * divides by the temperature, applies a softmax and samples from the result.
 *
 * prompt, prompt_len  starting token ids; at most 4096 are used
 * max_tokens          maximum number of tokens to append
 * vocab_size          number of candidate tokens
 * temperature         divisor applied to the scores; must be non-zero
 * out, max_out        receives the decoded prompt + continuation as a
 *                     NUL-terminated string
 *
 * An empty prompt yields an empty string: there is no previous token to
 * condition on. If the scratch buffers cannot be allocated, only the prompt
 * is decoded.
 */
 static void generate_meta(const int *prompt, int prompt_len, int max_tokens,
                           int vocab_size, float temperature, char *out, int max_out) {
    enum { GEN_CAP = 4096 };
    int generated[GEN_CAP];

    if (prompt_len <= 0) {
        if (max_out > 0) out[0] = '\0';
        return;
    }
    /* Never copy more than the local buffer holds. */
    if (prompt_len > GEN_CAP) prompt_len = GEN_CAP;

    int gen_len = prompt_len;
    memcpy(generated, prompt, (size_t)prompt_len * sizeof(int));

    size_t vs = vocab_size > 0 ? (size_t)vocab_size : 1;
    float *probs = (float *)malloc(vs * sizeof(float));
    float *bigram_dist = (float *)malloc(vs * sizeof(float));
    if (!probs || !bigram_dist) {
        fprintf(stderr, "ERROR: out of memory in generate_meta\n");
        free(probs);
        free(bigram_dist);
        bpe_decode(generated, gen_len, out, max_out);
        return;
    }

    for (int step = 0; step < max_tokens && gen_len < GEN_CAP; step++) {
        int last = generated[gen_len - 1];

        /* Query bigram metaweights */
        meta_query_bigram(last, bigram_dist, vocab_size);

        /* Build probability from metaweights */
        for (int i = 0; i < vocab_size; i++) {
            probs[i] = 2.0f * bigram_dist[i] + 0.01f * meta_unigram[i];
        }

        /* Temperature */
        for (int i = 0; i < vocab_size; i++)
            probs[i] /= temperature;

        softmax_inplace(probs, vocab_size);
        int chosen = sample_from_probs(probs, vocab_size);
        generated[gen_len++] = chosen;
    }

    free(probs);
    free(bigram_dist);

    bpe_decode(generated, gen_len, out, max_out);
 }

 /*
 * Continue a prompt with the transformer, with the bigram metaweights added
 * on top of its logits.
 *
 * The prompt is fed through forward_token() to fill the KV cache; each
 * generation step then re-runs the last token at its position, adds
 * 1.5 * bigram(last, tok) to every logit, divides by the temperature, applies
 * a softmax and samples.
 *
 * prompt, prompt_len  starting token ids; at most 4096 are kept, and only the
 *                     first CONTEXT_LEN are fed to the model
 * max_tokens          maximum number of tokens to append
 * vocab_size          number of candidate tokens
 * temperature         divisor applied to the logits; must be non-zero
 * out, max_out        receives the decoded prompt + continuation as a
 *                     NUL-terminated string
 *
 * Generation stops once the last position reaches CONTEXT_LEN - 1, so a prompt
 * of CONTEXT_LEN tokens or more is decoded back without any continuation.
 * An empty prompt yields an empty string. If the scratch buffers cannot be
 * allocated, only the prompt is decoded.
 */
 static void generate_full(const int *prompt, int prompt_len, int max_tokens,
                           int vocab_size, float temperature, char *out, int max_out) {
    enum { GEN_CAP = 4096 };
    int generated[GEN_CAP];

    if (prompt_len <= 0) {
        if (max_out > 0) out[0] = '\0';
        return;
    }
    /* Never copy more than the local buffer holds. */
    if (prompt_len > GEN_CAP) prompt_len = GEN_CAP;

    int gen_len = prompt_len;
    memcpy(generated, prompt, (size_t)prompt_len * sizeof(int));

    size_t vs = vocab_size > 0 ? (size_t)vocab_size : 1;
    float *logits = (float *)malloc(vs * sizeof(float));
    float *bigram_dist = (float *)malloc(vs * sizeof(float));
    if (!logits || !bigram_dist) {
        fprintf(stderr, "ERROR: out of memory in generate_full\n");
        free(logits);
        free(bigram_dist);
        bpe_decode(generated, gen_len, out, max_out);
        return;
    }

    kv_len = 0;

    /* Feed prompt. Positions at or beyond CONTEXT_LEN do not exist in W.wpe or
     * in the KV cache, so they are never passed to forward_token(). */
    int feed_len = prompt_len < CONTEXT_LEN ? prompt_len : CONTEXT_LEN;
    for (int i = 0; i < feed_len; i++) {
        forward_token(generated[i], i, logits, vocab_size);
    }

    /* Generate */
    for (int step = 0; step < max_tokens && gen_len < GEN_CAP; step++) {
        int pos = gen_len - 1;
        if (pos >= CONTEXT_LEN - 1) break;

        int last = generated[gen_len - 1];
        forward_token(last, pos, logits, vocab_size);

        /* Dario field overlay */
        meta_query_bigram(last, bigram_dist, vocab_size);
        for (int i = 0; i < vocab_size; i++)
            logits[i] += 1.5f * bigram_dist[i];

        /* Temperature + sample */
        for (int i = 0; i < vocab_size; i++)
            logits[i] /= temperature;
        softmax_inplace(logits, vocab_size);

        int chosen = sample_from_probs(logits, vocab_size);
        generated[gen_len++] = chosen;
    }

    free(logits);
    free(bigram_dist);

    bpe_decode(generated, gen_len, out, max_out);
 }

 /* ───────────────────────── Main ───────────────────────── */

 /*
 * Entry point: load postgpt.txt, learn BPE merges, build the metaweights,
 * initialise and seed the transformer, then print continuations.
 *
 * With no arguments a fixed list of prompts is run in metaweight mode and
 * "PostGPT" in full mode; with one argument, that argument is the only prompt
 * in both modes. Returns 0 on success and 1 if the corpus cannot be loaded or
 * memory cannot be allocated.
 */
 int main(int argc, char **argv) {
    printf("============================================================\n");
    printf("  PostGPT (C) — metaweight BPE transformer\n");
    printf("  resonance is unbreakable\n");
    printf("============================================================\n");

    /* Load corpus */
    printf("\n[1] Loading corpus...\n");
    FILE *f = fopen("postgpt.txt", "rb");
    if (!f) {
        printf("ERROR: postgpt.txt not found\n");
        return 1;
    }
    /* fseek/ftell can fail (ftell returns -1); the size is also passed on as
     * an int, so anything above INT_MAX is rejected here. */
    long fsize = -1;
    if (fseek(f, 0, SEEK_END) == 0)
        fsize = ftell(f);
    if (fsize < 0 || fsize > INT_MAX || fseek(f, 0, SEEK_SET) != 0) {
        printf("ERROR: cannot determine size of postgpt.txt (or it exceeds %d bytes)\n", INT_MAX);
        fclose(f);
        return 1;
    }
    /* Request at least one element: malloc(0) is allowed to return NULL. */
    size_t cap = fsize > 0 ? (size_t)fsize : 1;
    unsigned char *data = (unsigned char *)malloc(cap);
    if (!data) {
        printf("ERROR: out of memory loading corpus\n");
        fclose(f);
        return 1;
    }
    fsize = (long)fread(data, 1, (size_t)fsize, f);
    fclose(f);
    printf("  Corpus: %ld bytes (%.1f KB)\n", fsize, fsize / 1024.0);

    /* BPE tokenization */
    printf("\n[2] Learning BPE merges...\n");
    bpe_init_vocab();
    int *tokens = (int *)malloc(cap * sizeof(int));
    if (!tokens) {
        printf("ERROR: out of memory for token buffer\n");
        free(data);
        return 1;
    }
    int n_tokens = bpe_learn(data, (int)fsize, 1024, tokens);

    /* Build metaweights */
    printf("\n[3] Building metaweight probability space...\n");
    meta_build(tokens, n_tokens);

    /* Init transformer */
    printf("\n[4] Initializing PostGPT transformer...\n");
    weights_init(bpe_vocab_size);
    printf("  Initialized: vocab=%d, ctx=%d, embd=%d, heads=%d (content=%d, rrpram=%d), layers=%d\n",
           bpe_vocab_size, CONTEXT_LEN, N_EMBD, N_HEAD, N_CONTENT, N_RRPRAM, N_LAYER);

    /* Seed weights from metaweights — ghost becomes flesh */
    printf("\n[5] Seeding weights from metaweights...\n");
    weights_seed_from_meta(bpe_vocab_size);

    /* Proof of concept: phrase continuation */
    char output[4096];
    int prompt_ids[1024];

    /* Default prompts or command-line argument */
    const char *prompts[] = {
        "PostGPT",
        "The metaweight",
        "RRPRAM attention",
        "BPE tokenization",
        "The transformer",
        "Language models",
        NULL
    };

    /* If user provided a prompt, use only that */
    if (argc > 1) {
        prompts[0] = argv[1];
        prompts[1] = NULL;
    }

    printf("\n============================================================\n");
    printf("  PROOF OF CONCEPT: phrase continuation\n");
    printf("  mode: metaweight (no training, just BPE + statistics)\n");
    printf("============================================================\n");

    for (int p = 0; prompts[p] != NULL; p++) {
        const char *prompt = prompts[p];
        int prompt_len = bpe_encode((const unsigned char *)prompt,
                                     (int)strlen(prompt), prompt_ids, 1024);

        generate_meta(prompt_ids, prompt_len, 100, bpe_vocab_size, 0.72f,
                      output, sizeof(output));

        /* Show prompt and continuation separately: the decoded output starts
         * with the prompt's own bytes, so skip past them when possible. */
        int plen = (int)strlen(prompt);
        printf("\n  prompt:       \"%s\"\n", prompt);
        if ((int)strlen(output) > plen)
            printf("  continuation: \"%.*s\"\n", 250, output + plen);
        else
            printf("  continuation: \"%s\"\n", output);
    }

    /* Full transformer + Dario field mode for first prompt */
    printf("\n============================================================\n");
    printf("  FULL MODE: transformer + Dario field (both attentions)\n");
    printf("============================================================\n");

    {
        const char *prompt = (argc > 1) ? argv[1] : "PostGPT";
        int prompt_len = bpe_encode((const unsigned char *)prompt,
                                     (int)strlen(prompt), prompt_ids, 1024);

        generate_full(prompt_ids, prompt_len, 30, bpe_vocab_size, 0.8f,
                      output, sizeof(output));

        int plen = (int)strlen(prompt);
        printf("\n  prompt:       \"%s\"\n", prompt);
        if ((int)strlen(output) > plen)
            printf("  continuation: \"%.*s\"\n", 300, output + plen);
        else
            printf("  continuation: \"%s\"\n", output);
    }

    printf("\n============================================================\n");
    printf("  PostGPT complete. The metaweights remember.\n");
    printf("  Try: ./postgpt \"your prompt here\"\n");
    printf("============================================================\n");

    free(data);
    free(tokens);
    return 0;
 }
diff --git a/postgpt.py b/postgpt.py
 """
 postgpt — a zero-dependency BPE transformer with metaweights.

 The idea: tokenize a corpus via BPE, build a statistical probability space
 (the "metaweights") from co-occurrence and n-gram patterns, then run a
 dual-attention transformer (Content + RRPRAM) whose behavior is guided
 by these metaweights — as if it were trained, even though it was not.

 No PyTorch. No NumPy. No third-party dependencies. Just the standard library: os, math, random, struct, and time.
 This file is the complete algorithm. Everything else is just efficiency.

 resonance is unbreakable.
 """

 import os
 import math
 import random
 import struct
 import time

 random.seed(42)

 # ─────────────────────────────────────────────────────────────────────────────
 # I. BPE TOKENIZER — learns merge rules from corpus.
 #    the tokenizer IS the training. everything after this is just theater.
 # ─────────────────────────────────────────────────────────────────────────────

 class BPETokenizer:
    """Byte-Pair Encoding tokenizer. Starts with 256 byte tokens, learns merges.

    Attributes:
        max_merges: upper bound on the number of merges learn() will create.
        merges: learned rules as (a, b, new_id) tuples, in learning order.
        vocab_size: 256 plus the number of learned or loaded merges.
        vocab: maps token id -> the bytes that token expands to.
    """

    def __init__(self, max_merges=1792):
        self.max_merges = max_merges
        self._reset()

    def _reset(self):
        """Drop all merges and return to the bare 256-byte vocabulary."""
        self.merges = []  # list of (a, b, new_id)
        self.vocab_size = 256
        self.vocab = {i: bytes([i]) for i in range(256)}  # id -> bytes

    def _count_pairs(self, ids):
        """Return a dict mapping each adjacent pair (ids[i], ids[i + 1]) to its count."""
        counts = {}
        for i in range(len(ids) - 1):
            pair = (ids[i], ids[i + 1])
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def _merge_pair(self, ids, pair, new_id):
        """Return a new list with every left-to-right, non-overlapping
        occurrence of pair replaced by new_id."""
        result = []
        i = 0
        while i < len(ids):
            if i + 1 < len(ids) and ids[i] == pair[0] and ids[i + 1] == pair[1]:
                result.append(new_id)
                i += 2
            else:
                result.append(ids[i])
                i += 1
        return result

    def learn(self, data_bytes, num_merges=None):
        """Learn BPE merges from raw bytes and return the tokenized corpus.

        Any previously learned or loaded merges are discarded first: new ids
        are numbered from 256, so keeping old rules would leave two different
        rules sharing one id.

        Args:
            data_bytes: the corpus as bytes (or any iterable of ints 0..255).
            num_merges: merges to attempt; defaults to, and is capped at,
                self.max_merges.

        Returns:
            The corpus as a list of token ids after all merges.
        """
        if num_merges is None:
            num_merges = self.max_merges
        num_merges = min(num_merges, self.max_merges)

        self._reset()
        ids = list(data_bytes)
        t0 = time.time()

        for m in range(num_merges):
            counts = self._count_pairs(ids)
            if not counts:
                break
            # Ties go to the pair that was seen first in the token stream.
            best_pair = max(counts, key=counts.get)
            if counts[best_pair] < 2:
                break

            new_id = 256 + m
            ids = self._merge_pair(ids, best_pair, new_id)
            self.merges.append((best_pair[0], best_pair[1], new_id))
            self.vocab[new_id] = self.vocab[best_pair[0]] + self.vocab[best_pair[1]]
            self.vocab_size = 256 + m + 1

            if (m + 1) % 200 == 0:
                elapsed = time.time() - t0
                print(f"  merge {m+1}/{num_merges}  vocab={self.vocab_size}  tokens={len(ids)}  [{elapsed:.1f}s]")

        print(f"  BPE complete: {len(self.merges)} merges, vocab={self.vocab_size}, "
              f"tokens={len(ids)} (from {len(data_bytes)} bytes)")
        return ids

    def encode(self, text):
        """Encode text (str or bytes) to token ids by applying every merge in learning order."""
        if isinstance(text, str):
            text = text.encode('utf-8', errors='replace')
        ids = list(text)
        for a, b, new_id in self.merges:
            ids = self._merge_pair(ids, (a, b), new_id)
        return ids

    def decode(self, ids):
        """Decode token ids back to a string; unknown ids are skipped and
        invalid UTF-8 is replaced."""
        # join() builds the result in one pass instead of re-copying per token.
        raw = b''.join(self.vocab[tid] for tid in ids if tid in self.vocab)
        return raw.decode('utf-8', errors='replace')

    def save(self, path):
        """Save merge rules to a binary file: a little-endian uint32 count
        followed by three uint32 values (a, b, new_id) per merge."""
        with open(path, 'wb') as f:
            f.write(struct.pack('<I', len(self.merges)))
            for a, b, new_id in self.merges:
                f.write(struct.pack('<III', a, b, new_id))

    def load(self, path):
        """Load merge rules written by save(), replacing any current rules.

        The vocabulary is rebuilt from the 256 byte tokens so that no entry
        from an earlier learn()/load() survives. A truncated file raises
        struct.error.
        """
        with open(path, 'rb') as f:
            n = struct.unpack('<I', f.read(4))[0]
            self._reset()
            for _ in range(n):
                a, b, new_id = struct.unpack('<III', f.read(12))
                self.merges.append((a, b, new_id))
                self.vocab[new_id] = self.vocab.get(a, bytes([a % 256])) + self.vocab.get(b, bytes([b % 256]))
            self.vocab_size = 256 + len(self.merges)


 # ─────────────────────────────────────────────────────────────────────────────
 # II. METAWEIGHTS — the probability space that exists without existing.
 #    schrödinger called. he wants his cat back. we tokenized it.
 # ─────────────────────────────────────────────────────────────────────────────

 class MetaWeights:
    """
    Metaweights: weights that are implied to exist, but don't.

    After BPE tokenization of a corpus, we build:
    1. Unigram frequencies — p(token)
    2. Bigram co-occurrence — p(token_j | token_i)
    3. Trigram patterns — p(token_k | token_i, token_j)
    4. Positional affinity — which tokens prefer which positions
    5. Hebbian trace — co-occurrence memory (tokens seen together)
    6. Prophecy field — given context, what tokens are expected

    These form a probability space that a transformer can use to behave
    AS IF it had trained weights, because the statistical regularities
    from the corpus create an implicit weight space.

    The metaweights are the ghost in the machine.
    """

    def __init__(self, vocab_size, context_len):
        self.vocab_size = vocab_size
        self.context_len = context_len

        # Unigram: p(token)
        self.unigram = [0.0] * vocab_size

        # Bigram: p(next | prev) — sparse dict of dict
        self.bigram = {}

        # Trigram: p(next | prev2, prev1) — sparse
        self.trigram = {}

        # Positional affinity: which tokens appear at which positions
        self.pos_affinity = {}  # token -> list of position counts

        # Hebbian trace: co-occurrence within a window
        self.hebbian = {}  # (tok_a, tok_b) -> strength

        # Total tokens seen
        self.total = 0

    def build(self, token_ids, window=8):
        """
        Populate every statistical table from a tokenized corpus.

        token_ids: sequence of integer token ids.
        window:    half-width of the neighbourhood used for the Hebbian trace.

        Fills self.unigram, self.bigram, self.trigram, self.pos_affinity and
        self.hebbian in place, stores the corpus length in self.total and
        prints a one-line summary. Returns None.
        """
        n = len(token_ids)
        self.total = n
        started = time.time()

        # --- unigram: count in-vocabulary ids, then turn counts into frequencies
        for tok in token_ids:
            if tok < self.vocab_size:
                self.unigram[tok] += 1.0
        mass = sum(self.unigram)
        if mass > 0:
            self.unigram = [cnt / mass for cnt in self.unigram]

        # --- bigram: prev -> {next: count}, normalised per prev token
        for idx in range(n - 1):
            prev, nxt = token_ids[idx], token_ids[idx + 1]
            followers = self.bigram.setdefault(prev, {})
            followers[nxt] = followers.get(nxt, 0) + 1
        for followers in self.bigram.values():
            row_total = sum(followers.values())
            if row_total > 0:
                for nxt in followers:
                    followers[nxt] /= row_total

        # --- trigram: (prev2, prev1) -> {next: count}, normalised per pair
        for idx in range(n - 2):
            pair = (token_ids[idx], token_ids[idx + 1])
            nxt = token_ids[idx + 2]
            followers = self.trigram.setdefault(pair, {})
            followers[nxt] = followers.get(nxt, 0) + 1
        for followers in self.trigram.values():
            row_total = sum(followers.values())
            if row_total > 0:
                for nxt in followers:
                    followers[nxt] /= row_total

        # --- positional affinity: histogram of (index mod context_len) per token
        for idx, tok in enumerate(token_ids):
            slot = idx % self.context_len
            hist = self.pos_affinity.get(tok)
            if hist is None:
                hist = self.pos_affinity[tok] = [0.0] * self.context_len
            hist[slot] += 1.0
        for tok, hist in self.pos_affinity.items():
            hist_total = sum(hist)
            if hist_total > 0:
                self.pos_affinity[tok] = [cnt / hist_total for cnt in hist]

        # --- Hebbian trace: distance-decayed co-occurrence inside +/- window.
        # Only the first 20K tokens are scanned to bound the O(n * window) cost.
        limit = min(n, 20000)
        for i in range(limit):
            lo = max(0, i - window)
            hi = min(limit, i + window + 1)
            for j in range(lo, hi):
                if j == i:
                    continue
                a, b = token_ids[i], token_ids[j]
                # Unordered pair: smaller id first.
                pair = (a, b) if a <= b else (b, a)
                weight = 1.0 / (1.0 + abs(i - j))
                self.hebbian[pair] = self.hebbian.get(pair, 0.0) + weight

        # Scale Hebbian strengths so the strongest pair is exactly 1.0.
        if self.hebbian:
            peak = max(self.hebbian.values())
            if peak > 0:
                for pair in self.hebbian:
                    self.hebbian[pair] /= peak

        elapsed = time.time() - started
        print(f"  metaweights built: {n} tokens, {len(self.bigram)} bigram keys, "
              f"{len(self.trigram)} trigram keys, {len(self.hebbian)} hebbian pairs [{elapsed:.1f}s]")

    def query_bigram(self, prev_token, vocab_size):
        """
        Return a length-`vocab_size` list of p(next | prev_token).

        Entries with no recorded bigram keep a 1e-10 smoothing floor; recorded
        successors with id >= vocab_size are ignored.
        """
        floor = 1e-10
        probs = [floor for _ in range(vocab_size)]
        followers = self.bigram.get(prev_token)
        if followers is not None:
            for tok in followers:
                if tok < vocab_size:
                    probs[tok] = followers[tok]
        return probs

    def query_trigram(self, prev2, prev1, vocab_size):
        """
        Return a length-`vocab_size` list of p(next | prev2, prev1).

        Unseen continuations keep a 1e-10 smoothing floor; recorded
        continuations with id >= vocab_size are ignored.
        """
        probs = [1e-10] * vocab_size
        continuations = self.trigram.get((prev2, prev1), {})
        for tok, prob in continuations.items():
            if tok < vocab_size:
                probs[tok] = prob
        return probs

    def query_hebbian(self, context_tokens, vocab_size):
        """
        Score every candidate token by its Hebbian link to the context.

        For each stored pair that contains a context token, the pair's strength
        is added to the score of the *other* member (once per occurrence of the
        context token). The result is divided by its maximum when that maximum
        is positive, so the strongest candidate scores 1.0.
        """
        scores = [0.0] * vocab_size
        # Walk the sparse pair table rather than the dense vocab x vocab grid.
        for pair, weight in self.hebbian.items():
            left, right = pair
            for seen in context_tokens:
                if seen == left and right < vocab_size:
                    scores[right] += weight
                elif seen == right and left < vocab_size:
                    scores[left] += weight
        peak = max(scores) if scores else 1.0
        if peak > 0:
            scores = [s / peak for s in scores]
        return scores

    def query_prophecy(self, context_tokens, vocab_size, top_k=16):
        """
        Prophecy field: boost tokens that are expected next but have not appeared.

        For each of the last four context tokens, the `top_k` most probable
        bigram successors that are absent from the whole context contribute
        their probability. The result is divided by its maximum when positive.
        """
        seen = set(context_tokens)
        field = [0.0] * vocab_size

        for anchor in context_tokens[-4:]:
            followers = self.bigram.get(anchor)
            if followers is None:
                continue
            # Descending by probability; ties keep dict insertion order
            # (sorted() stays stable with reverse=True).
            ranked = sorted(followers.items(), key=lambda kv: kv[1], reverse=True)
            for tok, prob in ranked[:top_k]:
                if tok < vocab_size and tok not in seen:
                    field[tok] += prob

        peak = max(field) if field else 1.0
        if peak > 0:
            field = [v / peak for v in field]
        return field


 # ─────────────────────────────────────────────────────────────────────────────
 # III. AUTOGRAD ENGINE — scalar backprop. if you can't differentiate it by hand, you don't deserve gradients.
 # ─────────────────────────────────────────────────────────────────────────────

 class Val:
    """
    Scalar autograd node.

    Each node stores its value (`data`), the accumulated gradient (`grad`),
    the nodes it was computed from (`_children`) and the partial derivative
    of this node with respect to each child (`_local_grads`). Arithmetic on
    Val objects builds the graph; `backward()` propagates gradients through it.
    """
    __slots__ = ('data', 'grad', '_children', '_local_grads')

    def __init__(self, data, children=(), local_grads=()):
        self.data = float(data)
        self.grad = 0.0
        self._children = children
        self._local_grads = local_grads

    def __add__(self, other):
        other = other if isinstance(other, Val) else Val(other)
        return Val(self.data + other.data, (self, other), (1.0, 1.0))

    def __mul__(self, other):
        other = other if isinstance(other, Val) else Val(other)
        return Val(self.data * other.data, (self, other), (other.data, self.data))

    def __pow__(self, other):
        # `other` is a plain number (not a Val): d/dx x^k = k * x^(k-1).
        return Val(self.data ** other, (self,), (other * self.data ** (other - 1),))

    def log(self):
        # Input is clamped to 1e-12 so log and its derivative stay finite.
        d = max(self.data, 1e-12)
        return Val(math.log(d), (self,), (1.0 / d,))

    def exp(self):
        # Input is clamped to 80 to avoid float overflow.
        e = math.exp(min(self.data, 80))
        return Val(e, (self,), (e,))

    def relu(self):
        return Val(max(0, self.data), (self,), (float(self.data > 0),))

    def tanh(self):
        t = math.tanh(self.data)
        return Val(t, (self,), (1.0 - t * t,))

    def __neg__(self): return self * -1
    def __radd__(self, other): return self + other
    def __sub__(self, other): return self + (-other)
    def __rsub__(self, other): return (-self) + other
    def __rmul__(self, other): return self * other
    def __truediv__(self, other): return self * (other if isinstance(other, Val) else Val(other)) ** -1
    def __rtruediv__(self, other): return Val(other) * self ** -1

    def backward(self):
        """
        Backpropagate from this node: set self.grad to 1.0 and accumulate
        d(self)/d(node) into `.grad` of every node reachable through
        `_children`. Gradients are added to whatever `.grad` already holds.

        The topological order is built with an explicit stack instead of
        recursion, so graphs deeper than the interpreter's recursion limit
        (long chains of additions, long sequences) do not raise RecursionError.
        Nodes are emitted in the same post-order a recursive DFS would produce.
        """
        topo = []
        visited = {id(self)}
        # Each stack frame is (node, iterator over its not-yet-examined children).
        stack = [(self, iter(self._children))]
        while stack:
            node, pending = stack[-1]
            for child in pending:
                if id(child) not in visited:
                    visited.add(id(child))
                    stack.append((child, iter(child._children)))
                    break
            else:
                # All children handled: the node can be emitted.
                topo.append(node)
                stack.pop()

        self.grad = 1.0
        for v in reversed(topo):
            for child, lg in zip(v._children, v._local_grads):
                child.grad += lg * v.grad


 # ─────────────────────────────────────────────────────────────────────────────
 # IV. THE TRANSFORMER — dual attention (Content + RRPRAM) + metaweight overlay.
 #    two heads are better than one. especially when one of them doesn't exist.
 # ─────────────────────────────────────────────────────────────────────────────

 def _randn(std=0.02):
    return random.gauss(0, std)

 def _matrix(rows, cols, std=0.02):
    """
    Build a rows x cols list-of-lists of Val nodes, each initialised with an
    independent Gaussian sample of standard deviation `std` (drawn row by row).
    """
    grid = []
    for _ in range(rows):
        row = []
        for _ in range(cols):
            row.append(Val(_randn(std)))
        grid.append(row)
    return grid

 def _zeros(rows, cols):
    """Build a rows x cols list-of-lists of distinct Val nodes, all holding 0.0."""
    grid = []
    for _ in range(rows):
        grid.append([Val(0.0) for _ in range(cols)])
    return grid

 def linear(x, w):
    """
    Matrix-vector product w @ x.

    w is a list of rows ([out][in]); x is a list of length `in`.
    Returns a list with one dot product per row of w.
    """
    out = []
    for weights in w:
        products = (wi * xi for wi, xi in zip(weights, x))
        out.append(sum(products))
    return out

 def softmax(logits):
    """
    Softmax over a list of Val nodes.

    The largest `.data` value is subtracted (as a constant) before
    exponentiating, which keeps the exponentials from overflowing.
    """
    peak = max(item.data for item in logits)
    shifted = []
    for item in logits:
        shifted.append((item - peak).exp())
    denom = sum(shifted)
    return [term / denom for term in shifted]

 def softmax_float(logits):
    """Softmax over a list of plain floats, shifted by the maximum for stability."""
    peak = max(logits)
    weights = []
    for value in logits:
        # value - peak is never positive; the cap at 80 is a belt-and-braces guard.
        weights.append(math.exp(min(value - peak, 80)))
    norm = sum(weights)
    return [wgt / norm for wgt in weights]

 def rmsnorm(x):
    """
    RMS normalisation: scale every element of x by 1 / sqrt(mean(x^2) + 1e-5).
    No learned gain is applied.
    """
    squares = (xi * xi for xi in x)
    mean_sq = sum(squares) / len(x)
    inv_rms = (mean_sq + Val(1e-5)) ** -0.5
    return [xi * inv_rms for xi in x]


 class PostGPT:
    """
    PostGPT: a dual-attention BPE transformer with metaweights.

    Architecture:
    - BPE tokenizer (learned from corpus)
    - Token + Position embeddings
    - N transformer blocks, each with:
        * RMSNorm
        * Dual attention: Content heads (QK^T) + RRPRAM heads (x @ Wr)
        * Residual connection
        * RMSNorm
        * MLP (expand -> ReLU -> contract)
        * Residual connection
    - Final RMSNorm -> LM head -> logits
    - Metaweight overlay: Hebbian + Prophecy + Destiny signals

    The metaweight overlay means: even with random weights, the model
    generates coherent text because the probability space from the
    corpus guides sampling through the Dario field.
    """

    def __init__(self, vocab_size, context_len=64, n_embd=48, n_head=4,
                 n_layer=2, n_content_heads=2, n_rrpram_heads=2):
        """
        Allocate randomly initialised weights and sampling state.

        vocab_size:       number of token ids (rows of wte and lm_head).
        context_len:      number of positions (rows of wpe, columns of Wr).
        n_embd:           embedding width.
        n_head:           total heads; must equal n_content_heads + n_rrpram_heads.
        n_layer:          number of transformer blocks.
        n_content_heads:  heads using QK^T attention.
        n_rrpram_heads:   heads using the x @ Wr positional attention.

        Prints a one-line summary with the parameter count.
        """
        # Hyper-parameters
        self.vocab_size = vocab_size
        self.context_len = context_len
        self.n_embd = n_embd
        self.n_head = n_head
        self.n_layer = n_layer
        self.n_content = n_content_heads
        self.n_rrpram = n_rrpram_heads
        self.head_dim = n_embd // n_head

        assert n_content_heads + n_rrpram_heads == n_head, \
            "content + rrpram heads must equal total heads"

        # Token and position embedding tables
        self.wte = _matrix(vocab_size, n_embd)
        self.wpe = _matrix(context_len, n_embd)

        # Transformer blocks. Matrices are created in a fixed order so the
        # random stream is consumed identically on every run.
        hd = self.head_dim
        content_rows = n_content_heads * hd
        rrpram_rows = n_rrpram_heads * hd
        self.layers = []
        for _ in range(n_layer):
            # Projections that write into the residual stream get a smaller std.
            residual_std = 0.02 / math.sqrt(2 * n_layer)
            block = {}
            # Content attention: Q, K, V
            block['wq'] = _matrix(content_rows, n_embd, std=0.02)
            block['wk'] = _matrix(content_rows, n_embd, std=0.02)
            block['wv_content'] = _matrix(content_rows, n_embd, std=0.02)
            # RRPRAM attention: per-head [n_embd, context_len] pattern matrix + V
            block['wr'] = _matrix(n_rrpram_heads * n_embd, context_len, std=0.02)
            block['wv_rrpram'] = _matrix(rrpram_rows, n_embd, std=0.02)
            # Attention output projection
            block['wo'] = _matrix(n_embd, n_embd, std=residual_std)
            # MLP: expand 4x, then contract
            block['mlp_up'] = _matrix(4 * n_embd, n_embd, std=0.02)
            block['mlp_down'] = _matrix(n_embd, 4 * n_embd, std=residual_std)
            self.layers.append(block)

        # Output head
        self.lm_head = _matrix(vocab_size, n_embd, std=0.02)

        # Dario field blending coefficients and sampling temperature
        self.alpha_hebbian = 0.3
        self.beta_prophecy = 0.2
        self.gamma_destiny = 0.15
        self.temperature = 0.85

        # Destiny vector: running average of token embeddings, starts at zero
        self.destiny = [0.0] * n_embd

        # Trauma accumulator
        self.trauma = 0.0

        # Flat list of every parameter node: embeddings, blocks, then head
        self.params = []
        for table in (self.wte, self.wpe):
            for row in table:
                self.params.extend(row)
        for block in self.layers:
            for weights in block.values():
                for row in weights:
                    self.params.extend(row)
        for row in self.lm_head:
            self.params.extend(row)

        n_params = len(self.params)
        print(f"  PostGPT initialized: {n_params} parameters, vocab={vocab_size}, "
              f"ctx={context_len}, embd={n_embd}, heads={n_head} "
              f"(content={n_content_heads}, rrpram={n_rrpram_heads}), layers={n_layer}")

    def init_from_metaweights(self, meta):
        """
        The ghost becomes flesh: nudge the random weights toward corpus statistics.

        All updates are additive, in place, and scaled by 0.15:

        1. Token embeddings  <- mean of Hebbian-weighted neighbour embeddings
           (pairs with strength > 0.01). Rows are updated sequentially, so later
           tokens see the already-updated embeddings of earlier tokens.
        2. Position embeddings <- mean of affinity-weighted token embeddings for
           tokens whose affinity to that position exceeds 0.001.
        3. RRPRAM Wr <- positional affinity, written to row
           h * n_embd + (tok % n_embd) of every layer and RRPRAM head.
        4. LM head <- the token's own embedding scaled by its unigram frequency.

        meta must expose `hebbian`, `pos_affinity` and `unigram`.
        """
        vocab = self.vocab_size
        dim = self.n_embd
        ctx = self.context_len
        scale = 0.15  # how much metaweight signal vs random noise

        print("  Seeding transformer from metaweights (ghost → flesh)...")

        # 1. Token embeddings: pull each token toward its Hebbian neighbours.
        n_rows = min(vocab, len(self.wte))
        for tok_a in range(n_rows):
            pooled = [0.0] * dim
            hits = 0
            for tok_b in range(n_rows):
                pair = (tok_a, tok_b) if tok_a <= tok_b else (tok_b, tok_a)
                strength = meta.hebbian.get(pair)
                if strength is None or not strength > 0.01:
                    continue
                neighbour = self.wte[tok_b]
                for d in range(dim):
                    pooled[d] += strength * neighbour[d].data
                hits += 1
            if hits > 0:
                target = self.wte[tok_a]
                for d in range(dim):
                    target[d].data += scale * pooled[d] / hits

        # 2. Position embeddings: blend in the tokens that favour each position.
        for pos in range(min(ctx, len(self.wpe))):
            pooled = [0.0] * dim
            hits = 0
            for tok, hist in meta.pos_affinity.items():
                if tok >= vocab or pos >= len(hist):
                    continue
                affinity = hist[pos]
                if affinity > 0.001:
                    emb = self.wte[tok]
                    for d in range(dim):
                        pooled[d] += affinity * emb[d].data
                    hits += 1
            if hits > 0:
                target = self.wpe[pos]
                for d in range(dim):
                    target[d].data += scale * pooled[d] / hits

        # 3. RRPRAM Wr: imprint positional affinity on each head's pattern matrix.
        for layer in self.layers:
            wr = layer['wr']
            for h in range(self.n_rrpram):
                for tok, affs in meta.pos_affinity.items():
                    if tok >= vocab:
                        continue
                    for pos in range(min(ctx, len(affs))):
                        if not affs[pos] > 0.001:
                            continue
                        # Tokens are folded onto embedding rows by tok % n_embd.
                        row_idx = h * dim + (tok % dim)
                        if row_idx < len(wr) and pos < len(wr[row_idx]):
                            wr[row_idx][pos].data += scale * 0.5 * affs[pos]

        # 4. LM head: frequent tokens get a head row aligned with their embedding.
        for tok in range(min(vocab, len(self.lm_head))):
            freq = meta.unigram[tok] if tok < len(meta.unigram) else 0
            if freq > 0:
                head_row = self.lm_head[tok]
                emb = self.wte[tok]
                for d in range(dim):
                    head_row[d].data += scale * freq * emb[d].data

        print("  Metaweight seeding complete. The weights remember what they never learned.")

    def forward_token(self, token_id, pos_id, kv_cache):
        """
        Run one token through every layer and return next-token logits.

        token_id: row index into self.wte (token embedding table).
        pos_id:   row index into self.wpe (position embedding table).
        kv_cache: list with one (k_list, vc_list, vr_list) tuple per layer.
                  The three lists are appended to in place on every call, so
                  the same cache object must be passed for consecutive positions.

        Returns the result of linear(x, self.lm_head): one logit per row of
        lm_head.
        """
        hd = self.head_dim
        nc = self.n_content
        nr = self.n_rrpram

        # Token + position embedding (element-wise sum)
        tok_emb = self.wte[token_id]
        pos_emb = self.wpe[pos_id]
        x = [t + p for t, p in zip(tok_emb, pos_emb)]

        for li in range(self.n_layer):
            layer = self.layers[li]
            k_cache, vc_cache, vr_cache = kv_cache[li]

            # Pre-norm: keep the un-normalised input for the residual add below
            x_res = x
            x_norm = rmsnorm(x)

            # ── Projections ──
            # q/k/v_content cover the content heads; v_rrpram covers the RRPRAM heads.
            q = linear(x_norm, layer['wq'])
            k = linear(x_norm, layer['wk'])
            v_content = linear(x_norm, layer['wv_content'])
            v_rrpram = linear(x_norm, layer['wv_rrpram'])

            # Cache current position (the current token attends to itself too)
            k_cache.append(k)
            vc_cache.append(v_content)
            vr_cache.append(v_rrpram)

            # Concatenated per-head outputs: content heads first, then RRPRAM heads
            x_attn = []

            # Content heads
            for h in range(nc):
                # Slice this head's hd-wide segment out of q and every cached k / v
                hs = h * hd
                q_h = q[hs:hs + hd]
                k_all = [ki[hs:hs + hd] for ki in k_cache]
                v_all = [vi[hs:hs + hd] for vi in vc_cache]

                # QK^T / sqrt(d)
                attn_logits = []
                for t in range(len(k_all)):
                    score = sum(q_h[j] * k_all[t][j] for j in range(hd))
                    score = score * (1.0 / math.sqrt(hd))
                    attn_logits.append(score)

                attn_weights = softmax(attn_logits)

                # Attention-weighted sum of cached values, one output per head dim
                head_out = []
                for j in range(hd):
                    val = sum(attn_weights[t] * v_all[t][j] for t in range(len(v_all)))
                    head_out.append(val)
                x_attn.extend(head_out)

            # ── RRPRAM attention (x @ Wr — positional pattern recognition) ──
            for h in range(nr):
                hs = h * hd
                # RRPRAM: project input through Wr to get attention over positions
                # Wr shape per head: [n_embd, context_len]
                wr_offset = h * self.n_embd
                wr_h = layer['wr'][wr_offset:wr_offset + self.n_embd]

                # x_norm @ Wr_h gives [context_len] attention pattern;
                # only the first len(k_cache) positions are scored.
                seq_len = len(k_cache)
                attn_logits = []
                for t in range(seq_len):
                    # Sum over embedding dimension
                    score = Val(0.0)
                    for d in range(min(self.n_embd, len(wr_h))):
                        # Positions beyond the width of Wr contribute nothing
                        if t < len(wr_h[d]):
                            score = score + x_norm[d] * wr_h[d][t]
                    attn_logits.append(score)

                # Causal mask already satisfied (we only have positions <= current)
                attn_weights = softmax(attn_logits) if attn_logits else []

                # Weighted sum over the RRPRAM value cache for this head
                v_all = [vi[hs:hs + hd] for vi in vr_cache]
                head_out = []
                for j in range(hd):
                    val_sum = Val(0.0)
                    for t in range(len(attn_weights)):
                        if t < len(v_all):
                            val_sum = val_sum + attn_weights[t] * v_all[t][j]
                    head_out.append(val_sum)
                x_attn.extend(head_out)

            # Output projection + residual
            x_proj = linear(x_attn, layer['wo'])
            x = [a + b for a, b in zip(x_proj, x_res)]

            # MLP block: norm -> expand -> ReLU -> contract -> residual
            x_res = x
            x_norm = rmsnorm(x)
            h_mlp = linear(x_norm, layer['mlp_up'])
            h_mlp = [hi.relu() for hi in h_mlp]
            x_mlp = linear(h_mlp, layer['mlp_down'])
            x = [a + b for a, b in zip(x_mlp, x_res)]

        # Final norm + LM head
        x = rmsnorm(x)
        logits = linear(x, self.lm_head)
        return logits

    def forward_sequence(self, token_ids):
        """
        Run a whole sequence through the model with a fresh KV cache.

        Returns one logits list per processed position. Tokens beyond
        context_len are ignored.
        """
        kv_cache = [([], [], []) for _ in range(self.n_layer)]
        outputs = []
        pos = 0
        for tid in token_ids:
            if pos >= self.context_len:
                break
            outputs.append(self.forward_token(tid, pos, kv_cache))
            pos += 1
        return outputs

    def generate(self, prompt_ids, max_tokens=64, meta=None, temperature=None):
        """
        Generate tokens autoregressively; returns prompt + continuation as a list.

        prompt_ids:  token ids to condition on. An empty prompt returns [].
        max_tokens:  upper bound on the number of new tokens.
        meta:        optional metaweight object. When given, its Hebbian,
                     prophecy, bigram and trigram signals plus the destiny
                     signal are added to the logits (Dario field overlay).
        temperature: sampling temperature; defaults to self.temperature.

        Generation stops early once the sequence fills context_len - 1 positions.
        Side effect: self.destiny is updated on every step when meta is given.
        """
        if temperature is None:
            temperature = self.temperature

        kv_cache = [([], [], []) for _ in range(self.n_layer)]
        context = list(prompt_ids)
        if not context:
            # No last token to feed forward, so there is nothing to continue.
            return context

        # Prime the cache with every prompt token EXCEPT the last one. The loop
        # below feeds the last token itself; feeding it here as well would put
        # it in the KV cache twice and shift every later cache slot by one.
        for pos, tid in enumerate(context[:-1]):
            if pos >= self.context_len - 1:
                break
            self.forward_token(tid, pos, kv_cache)

        # Generate new tokens
        for _ in range(max_tokens):
            pos = len(context) - 1
            if pos >= self.context_len - 1:
                break

            last_tid = context[-1]
            logits = self.forward_token(last_tid, pos, kv_cache)

            # Extract raw logit values
            raw_logits = [l.data for l in logits]

            # ── Dario Field: metaweight overlay ──
            if meta is not None:
                window = context[-8:]
                hebbian = meta.query_hebbian(window, self.vocab_size)
                prophecy = meta.query_prophecy(window, self.vocab_size)
                bigram = meta.query_bigram(last_tid, self.vocab_size)

                # Trigram signal (if enough context)
                if len(context) >= 2:
                    trigram = meta.query_trigram(context[-2], context[-1], self.vocab_size)
                else:
                    trigram = [0.0] * self.vocab_size

                # Destiny update: EMA of the embeddings of consumed tokens
                if last_tid < len(self.wte):
                    for d in range(self.n_embd):
                        self.destiny[d] = 0.9 * self.destiny[d] + 0.1 * self.wte[last_tid][d].data

                # Destiny signal: cosine similarity with each token embedding
                destiny_signal = [0.0] * self.vocab_size
                dest_norm = math.sqrt(sum(d * d for d in self.destiny) + 1e-10)
                if dest_norm > 1e-8:
                    for tid_c in range(min(self.vocab_size, len(self.wte))):
                        emb = [self.wte[tid_c][d].data for d in range(self.n_embd)]
                        emb_norm = math.sqrt(sum(e * e for e in emb) + 1e-10)
                        if emb_norm > 1e-8:
                            dot = sum(self.destiny[d] * emb[d] for d in range(self.n_embd))
                            destiny_signal[tid_c] = dot / (dest_norm * emb_norm)

                # Combine: Dario Equation (Leo-style: bigram DOMINATES, 12× coefficient)
                # p(x|Φ) = softmax((B_coeff·B + α·H + β·F + γ·A + trigram) / τ)
                # Metaweight signals dominate over untrained base logits
                for i in range(self.vocab_size):
                    raw_logits[i] += (self.alpha_hebbian * hebbian[i]
                                      + self.beta_prophecy * prophecy[i]
                                      + self.gamma_destiny * destiny_signal[i]
                                      + 12.0 * bigram[i]
                                      + 8.0 * trigram[i])

                # Trauma modulation
                trauma_mod = 1.0 / (1.0 + self.trauma)
                raw_logits = [l * trauma_mod for l in raw_logits]

            # Repetition penalty (Leo-style), applied once per recent occurrence.
            # Halving only lowers a positive logit; a negative logit has to be
            # doubled instead, otherwise the "penalty" would move it toward zero
            # and make the repeated token MORE likely.
            recent = context[-12:] if len(context) >= 12 else context
            for t in recent:
                if t < self.vocab_size:
                    if raw_logits[t] > 0:
                        raw_logits[t] *= 0.5
                    else:
                        raw_logits[t] *= 2.0

            # Top-k filtering (keep top 15, mask rest)
            top_k = 15
            indexed = sorted(enumerate(raw_logits), key=lambda x: -x[1])
            threshold = indexed[min(top_k - 1, len(indexed) - 1)][1]
            for i in range(self.vocab_size):
                if raw_logits[i] < threshold:
                    raw_logits[i] = -1e10

            # Temperature + softmax
            scaled = [l / temperature for l in raw_logits]
            probs = softmax_float(scaled)

            # Sample by inverse CDF. If rounding keeps the cumulative sum from
            # ever exceeding r, fall back to the highest-scoring token rather
            # than to token 0, which may have been masked out above.
            r = random.random()
            cum = 0.0
            chosen = indexed[0][0]
            for i, p in enumerate(probs):
                cum += p
                if cum > r:
                    chosen = i
                    break

            context.append(chosen)

        return context

    def generate_meta(self, prompt_ids, max_tokens=128, meta=None, temperature=None):
        """
        Meta-generation: sample from the metaweight statistics only, without
        running the transformer forward pass.

        prompt_ids:  token ids to continue. An empty prompt is allowed; the
                     first token is then drawn from the unigram distribution.
        max_tokens:  upper bound on the number of new tokens.
        meta:        metaweight object exposing trigram, bigram, unigram and
                     hebbian. When None, prompt_ids is returned unchanged.
        temperature: log-space sampling temperature; defaults to self.temperature.

        This follows the Haze/Leo pattern:
        - Trigram first (most coherent), fallback to bigram, then unigram
        - Sample ONLY from tokens that actually appear in the statistics
        - Repetition penalty for loop avoidance
        - Top-k filtering (keep top 15 candidates like Leo)

        Returns a new list: prompt followed by the sampled tokens.
        """
        import math  # function-scope import, hoisted out of the sampling loop

        if meta is None:
            return prompt_ids
        if temperature is None:
            temperature = self.temperature

        generated = list(prompt_ids)
        top_k = 15  # number of candidates kept before sampling (like Leo)

        for _ in range(max_tokens):
            # With an empty history there is no previous token; the trigram and
            # bigram lookups are skipped and the unigram fallback takes over.
            last = generated[-1] if generated else None
            candidates = {}  # token_id -> weight (sparse, only real candidates)

            # Try trigram first (strongest signal, like Haze)
            if len(generated) >= 2:
                key = (generated[-2], generated[-1])
                if key in meta.trigram:
                    candidates = dict(meta.trigram[key])

            # Fallback to bigram
            if not candidates and last is not None and last in meta.bigram:
                candidates = dict(meta.bigram[last])

            # Fallback to unigram (last resort)
            if not candidates:
                for i in range(self.vocab_size):
                    if meta.unigram[i] > 1e-8:
                        candidates[i] = meta.unigram[i]

            if not candidates:
                break

            # Hebbian boost — contextual reinforcement on top of trigram/bigram
            ctx = generated[-4:]
            for tok in list(candidates.keys()):
                for ct in ctx:
                    key = (min(tok, ct), max(tok, ct))
                    if key in meta.hebbian:
                        candidates[tok] *= (1.0 + 0.3 * meta.hebbian[key])

            # Repetition penalty (Leo-style: penalize recently seen tokens)
            recent = generated[-12:] if len(generated) >= 12 else generated
            recent_counts = {}
            for t in recent:
                recent_counts[t] = recent_counts.get(t, 0) + 1
            for tok in list(candidates.keys()):
                if tok in recent_counts:
                    freq = recent_counts[tok]
                    penalty = 1.0 / (1.0 + 0.5 * freq)
                    candidates[tok] *= penalty

            # Top-k filtering: strongest candidates first
            sorted_cands = sorted(candidates.items(), key=lambda x: -x[1])
            sorted_cands = sorted_cands[:top_k]

            tokens = [t for t, _ in sorted_cands]
            counts = [c for _, c in sorted_cands]

            # Log-space temperature scaling (like Haze SubwordField); the
            # maximum is subtracted before exponentiating for stability.
            log_counts = [math.log(c + 1e-10) / temperature for c in counts]
            max_lc = max(log_counts)
            exps = [math.exp(lc - max_lc) for lc in log_counts]
            total = sum(exps)
            probs = [e / total for e in exps]

            # Sample by inverse CDF; default to the top candidate if rounding
            # keeps the cumulative sum from exceeding r.
            r = random.random()
            cum = 0.0
            chosen = tokens[0]
            for tok, p in zip(tokens, probs):
                cum += p
                if cum > r:
                    chosen = tok
                    break

            generated.append(chosen)

        return generated


 # ─────────────────────────────────────────────────────────────────────────────
 # V. MAIN — tokenize, build metaweights, continue phrases.
 #    the moment of truth. or the moment of coherent bullshit. same thing.
 # ─────────────────────────────────────────────────────────────────────────────

 def load_engine(corpus_path=None):
    """
    Load corpus, learn (or load) BPE merges, build metaweights, init model.

    corpus_path: path to the training text; defaults to 'postgpt.txt' next to
                 this file.

    BPE merges are cached next to the corpus in a file with the same stem and
    a '.merges' extension; the cache is loaded when present and written
    otherwise.

    Returns (tokenizer, meta, model), or (None, None, None) after printing an
    error when the corpus file does not exist.
    """
    if corpus_path is None:
        corpus_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'postgpt.txt')

    if not os.path.exists(corpus_path):
        print(f"ERROR: {corpus_path} not found.")
        return None, None, None

    # Step 1: Load corpus
    print("\n[1] Loading corpus...")
    with open(corpus_path, 'rb') as f:
        raw_data = f.read()
    print(f"  Corpus: {len(raw_data)} bytes ({len(raw_data)/1024:.1f} KB)")

    # Step 2: BPE tokenization — load saved merges if they exist
    print("\n[2] BPE tokenizer...")
    tokenizer = BPETokenizer(max_merges=1024)
    # Swap only the final extension. str.replace('.txt', '.merges') would also
    # rewrite '.txt' inside directory names and, for a corpus without a '.txt'
    # suffix, return the corpus path itself, so the corpus would be read as a
    # merges file. A corpus that already ends in '.merges' gets the suffix
    # appended so the two paths can never coincide.
    stem, ext = os.path.splitext(corpus_path)
    merges_path = (corpus_path if ext == '.merges' else stem) + '.merges'
    if os.path.exists(merges_path):
        tokenizer.load(merges_path)
        token_ids = tokenizer.encode(raw_data)
        print(f"  Loaded {len(tokenizer.merges)} merges from {os.path.basename(merges_path)}. "
              f"Encoding: {len(token_ids)} tokens")
    else:
        token_ids = tokenizer.learn(raw_data, num_merges=1024)
        tokenizer.save(merges_path)
        print(f"  Saved merges to {os.path.basename(merges_path)}")

    # Step 3: Build metaweights from tokenized corpus
    print("\n[3] Building metaweight probability space...")
    meta = MetaWeights(tokenizer.vocab_size, context_len=64)
    meta.build(token_ids, window=4)

    # Step 4: Initialize dual-attention transformer
    print("\n[4] Initializing PostGPT transformer...")
    model = PostGPT(
        vocab_size=tokenizer.vocab_size,
        context_len=64,
        n_embd=48,
        n_head=4,
        n_layer=2,
        n_content_heads=2,
        n_rrpram_heads=2,
    )

    # Step 5: Seed transformer weights from metaweights (ghost → flesh)
    print("\n[5] Seeding transformer from metaweights...")
    model.init_from_metaweights(meta)

    return tokenizer, meta, model


 def continue_phrase(prompt, tokenizer, meta, model, max_tokens=120, temperature=0.75,
                    mode='meta'):
    """
    Continue `prompt` and return the decoded text (prompt included).

    mode='meta'    — model.generate_meta: metaweight statistics only (fast).
    any other mode — model.generate: transformer forward pass + Dario field
                     overlay (slower).

    If the tokenizer encodes the prompt to nothing, the prompt is returned as is.
    """
    prompt_ids = tokenizer.encode(prompt)
    if not prompt_ids:
        return prompt

    # Both generators share the same keyword interface, so pick one and call it.
    run = model.generate_meta if mode == 'meta' else model.generate
    token_ids = run(prompt_ids, max_tokens=max_tokens,
                    meta=meta, temperature=temperature)
    return tokenizer.decode(token_ids)


 def main():
    """
    Command-line demo: load the engine, continue a set of prompts in
    metaweight mode, then continue the first prompt in full transformer mode.

    Command-line arguments, when present, are joined with spaces and replace
    the built-in prompt list.
    """
    import sys

    rule = "=" * 60

    print(rule)
    print("  PostGPT — metaweight BPE transformer")
    print("  resonance is unbreakable")
    print(rule)

    engine = load_engine()
    tokenizer, meta, model = engine
    if tokenizer is None:
        return

    # ── Proof of concept: continue phrases from postgpt.txt ──
    # BPE tokenization + dual attention + metaweights continue a prompt
    # without any training step.
    cli_args = sys.argv[1:]
    if cli_args:
        # Custom prompt: python postgpt.py "your prompt here"
        prompts = [' '.join(cli_args)]
    else:
        prompts = [
            "PostGPT",
            "The metaweight",
            "RRPRAM attention",
            "BPE tokenization",
            "The transformer architecture",
            "Entropy measures",
            "Language models",
            "The Dario equation",
        ]

    print("\n" + rule)
    print("  PROOF OF CONCEPT: phrase continuation")
    print("  mode: metaweight (no training, just BPE + statistics)")
    print(rule)

    for prompt in prompts:
        result = continue_phrase(prompt, tokenizer, meta, model,
                                 max_tokens=100, temperature=0.4, mode='meta')
        # Drop the echoed prompt so only the continuation is shown
        continuation = result[len(prompt):].strip()
        print(f"\n  prompt:       \"{prompt}\"")
        print(f"  continuation: \"{continuation[:250]}\"")

    # Full transformer + Dario field mode, first prompt only
    print("\n" + rule)
    print("  FULL MODE: transformer + Dario field (both attentions)")
    print(rule)

    test_prompt = prompts[0]
    result = continue_phrase(test_prompt, tokenizer, meta, model,
                             max_tokens=30, temperature=0.45, mode='full')
    continuation = result[len(test_prompt):].strip()
    print(f"\n  prompt:       \"{test_prompt}\"")
    print(f"  continuation: \"{continuation[:300]}\"")

    print("\n" + rule)
    print("  PostGPT complete. The metaweights remember.")
    print("  Try: python postgpt.py \"your prompt here\"")
    print(rule)


 if __name__ == '__main__':
    main()
diff --git a/postgpt_train.py b/postgpt_train.py
 """
 postgpt_train.py — training loop for PostGPT using PyTorch + Chuck Optimizer.

 PyTorch is ONLY used here, in the training loop. The runtime (postgpt.py) is
 zero-dependency. This module:

 1. Loads postgpt.txt, tokenizes via BPE
 2. Builds the PostGPT transformer as a PyTorch module
 3. Trains using the Chuck Optimizer (self-aware AdamW variant)
 4. Saves weights back for the pure-Python runtime

 Usage:
    python postgpt_train.py [--steps 200] [--lr 3e-4]

 resonance is unbreakable.
 """

 import os
 import sys
 import math
 import time
 import struct
 import argparse

 # PyTorch — ONLY used in training, not runtime
 try:
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
 except ImportError:
    print("ERROR: PyTorch required for training. Install: pip install torch")
    print("Note: postgpt.py runs without PyTorch (zero-dependency runtime).")
    sys.exit(1)


 # ─────────────────────────────────────────────────────────────────────────────
 # I. BPE TOKENIZER (same algorithm as postgpt.py, but operating on bytes)
 # ─────────────────────────────────────────────────────────────────────────────

 class BPETokenizer:
    """Byte-level BPE tokenizer.

    Ids 0..255 are the raw byte values; every learned merge gets the next
    free id (256, 257, ...). `merges` keeps (left_id, right_id, new_id)
    triples in the order they were learned, and `vocab` maps each id to the
    byte string it expands to.
    """

    def __init__(self, max_merges=1024):
        self.max_merges = max_merges
        self.merges = []
        self.vocab_size = 256
        self.vocab = {i: bytes([i]) for i in range(256)}

    def _count_pairs(self, ids):
        """Return {(left, right): occurrences} over all adjacent id pairs."""
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def _merge_pair(self, ids, pair, new_id):
        """Return a copy of `ids` with each left-to-right, non-overlapping
        occurrence of `pair` replaced by `new_id`."""
        result = []
        i = 0
        while i < len(ids):
            if i + 1 < len(ids) and ids[i] == pair[0] and ids[i + 1] == pair[1]:
                result.append(new_id)
                i += 2
            else:
                result.append(ids[i])
                i += 1
        return result

    def learn(self, data_bytes, num_merges=None):
        """Learn up to `num_merges` new merges from `data_bytes`.

        The most frequent adjacent pair is merged repeatedly; learning stops
        early once no pair occurs at least twice. Returns the input encoded
        with every merge known at the end of the call.

        Calling `learn` again continues from the existing merge table: new
        ids are allocated after the ones already in use and the total never
        exceeds `max_merges`.
        """
        if num_merges is None:
            num_merges = self.max_merges
        # Cap by the remaining capacity so ids never go past 256 + max_merges.
        remaining = max(0, self.max_merges - len(self.merges))
        num_merges = min(num_merges, remaining)
        # Start from the already-learned segmentation (a plain byte list on
        # the first call, when there are no merges yet).
        ids = self.encode(data_bytes)
        t0 = time.time()
        for m in range(num_merges):
            counts = self._count_pairs(ids)
            if not counts:
                break
            # Ties resolve to the pair seen first (dict insertion order).
            best_pair = max(counts, key=counts.get)
            if counts[best_pair] < 2:
                break
            # Next free id; using the loop index here would overwrite
            # existing entries on a second learn() call.
            new_id = 256 + len(self.merges)
            ids = self._merge_pair(ids, best_pair, new_id)
            self.merges.append((best_pair[0], best_pair[1], new_id))
            self.vocab[new_id] = self.vocab[best_pair[0]] + self.vocab[best_pair[1]]
            self.vocab_size = 256 + len(self.merges)
            if (m + 1) % 200 == 0:
                elapsed = time.time() - t0
                print(f"  merge {m+1}/{num_merges}  vocab={self.vocab_size}  tokens={len(ids)}  [{elapsed:.1f}s]")
        print(f"  BPE complete: {len(self.merges)} merges, vocab={self.vocab_size}, tokens={len(ids)}")
        return ids

    def encode(self, text):
        """Encode `text` (str or bytes) by replaying the merges in learn order."""
        if isinstance(text, str):
            text = text.encode('utf-8', errors='replace')
        ids = list(text)
        for a, b, new_id in self.merges:
            ids = self._merge_pair(ids, (a, b), new_id)
        return ids

    def decode(self, ids):
        """Decode ids to a string; unknown ids are skipped and invalid UTF-8
        is replaced rather than raised."""
        raw = b''.join(self.vocab[tid] for tid in ids if tid in self.vocab)
        return raw.decode('utf-8', errors='replace')


 # ─────────────────────────────────────────────────────────────────────────────
 # II. PYTORCH PostGPT MODEL
 # ─────────────────────────────────────────────────────────────────────────────

 class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learned gain.

    Each vector is divided by sqrt(mean(x**2) + eps) and then multiplied
    element-wise by `weight` (initialised to ones).
    """

    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        mean_square = torch.mean(torch.pow(x, 2), dim=-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        return (x * inv_rms) * self.weight


 class ContentAttention(nn.Module):
    """Causal multi-head QK^T attention.

    Returns the per-head outputs concatenated along the last axis; there is
    no output projection in this module.
    """

    def __init__(self, n_embd, n_heads, head_dim):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = head_dim
        width = n_heads * head_dim
        self.wq = nn.Linear(n_embd, width, bias=False)
        self.wk = nn.Linear(n_embd, width, bias=False)
        self.wv = nn.Linear(n_embd, width, bias=False)

    def forward(self, x):
        batch, seq_len, _ = x.shape

        def per_head(projection):
            # [B, T, H*D] -> [B, H, T, D]
            flat = projection(x)
            return flat.view(batch, seq_len, self.n_heads, self.head_dim).transpose(1, 2)

        q = per_head(self.wq)
        k = per_head(self.wk)
        v = per_head(self.wv)

        scores = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5)
        # True strictly above the diagonal: positions a query must not see.
        future = torch.triu(
            torch.ones(seq_len, seq_len, device=x.device, dtype=torch.bool),
            diagonal=1,
        )
        weights = F.softmax(scores.masked_fill(future, float('-inf')), dim=-1)

        mixed = weights @ v
        return mixed.transpose(1, 2).contiguous().view(batch, seq_len, -1)


 class RRPRAMAttention(nn.Module):
    """
    RRPRAM: Recursive Resonant Pattern Recognition Attention Mechanism.

    Instead of QK^T, the attention scores are x @ Wr, where Wr is a learned
    parameter of shape [n_heads, n_embd, max_T]. Column j of a head's Wr
    produces the score towards key position j, so the scores depend on the
    query token and the absolute key position only. The scores are causally
    masked, softmaxed, and applied to a linear value projection.

    Returns the per-head outputs concatenated along the last axis
    ([B, T, n_heads * head_dim]); there is no output projection here.
    """
    def __init__(self, n_embd, n_heads, head_dim, max_T):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = head_dim
        self.max_T = max_T
        # Wr: the pattern matrix — THE core RRPRAM innovation
        self.wr = nn.Parameter(torch.randn(n_heads, n_embd, max_T) * 0.02)
        self.wv = nn.Linear(n_embd, n_heads * head_dim, bias=False)

    def forward(self, x):
        """Attend over x of shape [B, T, C].

        Raises:
            ValueError: if T exceeds max_T, since Wr has no columns for the
                extra positions.
        """
        B, T, C = x.shape
        if T > self.max_T:
            raise ValueError(
                f"sequence length {T} exceeds RRPRAM max_T={self.max_T}"
            )
        v = self.wv(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)

        # Only the first T key positions exist for this input.
        wr_t = self.wr[:, :, :T]  # [n_heads, C, T]
        # [B, 1, T, C] @ [n_heads, C, T] broadcasts to [B, n_heads, T, T].
        attn = torch.matmul(x.unsqueeze(1), wr_t)

        # Causal mask
        mask = torch.triu(torch.ones(T, T, device=x.device, dtype=torch.bool), diagonal=1)
        attn = attn.masked_fill(mask, float('-inf'))
        attn = F.softmax(attn, dim=-1)

        out = (attn @ v).transpose(1, 2).contiguous().view(B, T, -1)
        return out


 class PostGPTBlock(nn.Module):
    """Pre-norm residual block: (Content + RRPRAM attention) then a ReLU MLP.

    Both attention modules read the same normalised input; their outputs are
    concatenated on the feature axis and projected back to n_embd by `wo`.
    """
    def __init__(self, n_embd, n_content, n_rrpram, head_dim, max_T):
        super().__init__()
        attn_width = (n_content + n_rrpram) * head_dim
        mlp_width = 4 * n_embd

        self.norm1 = RMSNorm(n_embd)
        self.content_attn = ContentAttention(n_embd, n_content, head_dim)
        self.rrpram_attn = RRPRAMAttention(n_embd, n_rrpram, head_dim, max_T)
        self.wo = nn.Linear(attn_width, n_embd, bias=False)

        self.norm2 = RMSNorm(n_embd)
        self.mlp_up = nn.Linear(n_embd, mlp_width, bias=False)
        self.mlp_down = nn.Linear(mlp_width, n_embd, bias=False)

        # The two projections that write into the residual stream get a
        # smaller init (0.02 / sqrt(2)).
        residual_std = 0.02 / math.sqrt(2)
        for projection in (self.wo, self.mlp_down):
            nn.init.normal_(projection.weight, std=residual_std)

    def forward(self, x):
        # Attention sub-layer.
        normed = self.norm1(x)
        heads = torch.cat(
            [self.content_attn(normed), self.rrpram_attn(normed)],
            dim=-1,
        )
        x = x + self.wo(heads)

        # Feed-forward sub-layer.
        hidden = F.relu(self.mlp_up(self.norm2(x)))
        return x + self.mlp_down(hidden)


 class PostGPTModel(nn.Module):
    """PostGPT language model: token + position embeddings, a stack of
    PostGPTBlock layers, a final RMSNorm, and an output head whose weight is
    shared with the token embedding. Prints its parameter count on
    construction."""
    def __init__(self, vocab_size, context_len=64, n_embd=48, n_head=4,
                 n_layer=2, n_content=2, n_rrpram=2):
        super().__init__()
        self.context_len = context_len
        per_head = n_embd // n_head

        self.wte = nn.Embedding(vocab_size, n_embd)
        self.wpe = nn.Embedding(context_len, n_embd)

        layers = []
        for _ in range(n_layer):
            layers.append(PostGPTBlock(n_embd, n_content, n_rrpram, per_head, context_len))
        self.blocks = nn.ModuleList(layers)

        self.norm_f = RMSNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)

        # Tie the output head to the token embedding matrix.
        self.lm_head.weight = self.wte.weight

        n_params = 0
        for param in self.parameters():
            n_params += param.numel()
        print(f"  PostGPTModel: {n_params:,} parameters")

    def forward(self, idx, targets=None):
        """Return (logits, loss); loss is None unless `targets` is given, in
        which case it is the cross-entropy over all positions."""
        _, seq_len = idx.shape
        positions = torch.arange(seq_len, device=idx.device)
        x = self.wte(idx) + self.wpe(positions)

        for layer in self.blocks:
            x = layer(x)

        logits = self.lm_head(self.norm_f(x))

        if targets is None:
            return logits, None

        flat_logits = logits.view(-1, logits.size(-1))
        loss = F.cross_entropy(flat_logits, targets.view(-1))
        return logits, loss


 # ─────────────────────────────────────────────────────────────────────────────
 # III. CHUCK OPTIMIZER — self-aware learning
 # ─────────────────────────────────────────────────────────────────────────────

 class ChuckOptimizer(torch.optim.Optimizer):
    """
    Chuck Optimizer: AdamW with self-awareness.

    What this class does on every step:
    - Level 1: keeps the last `window` losses and compares the mean of the
      newer half with the mean of the older half; a rising trend lowers the
      global `dampen` factor (floor 0.5), a falling trend raises it
      (ceiling 1.5).
    - Mean reversion: `dampen` is pulled slightly back towards 1.0.
    - Level 6: tracks `best_loss` and a `stagnation` counter (steps since the
      best loss improved). Nothing in this class reads them back.
    - Adaptive gradient clipping: gradients are rescaled when their global
      norm exceeds max(1.0, 2 * EMA of the norm).
    - AdamW update with the learning rate multiplied by `dampen`.

    NOTE(review): the original description also lists "Level 2:
    Per-parameter group modulation"; here each group only uses its own
    lr/betas/eps/weight_decay, scaled by the one global `dampen`.

    Simplified for PostGPT — the full 9-level version lives in chuck.optimizer.
    """

    def __init__(self, params, lr=3e-4, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0.01, window=16):
        if window < 1:
            raise ValueError("window must be >= 1")
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        super().__init__(params, defaults)

        # Ring buffer of recent losses; _hpos is the next slot to write.
        self.window = window
        self._hist = [0.0] * window
        self._hpos = 0
        self._hfull = False

        # Level 1: Global dampen
        self.dampen = 1.0

        # Level 6: Memory
        self.best_loss = float('inf')
        self.stagnation = 0

        # Adaptive clipping
        self.gnorm_ema = 1.0
        self.global_step = 0

    def _global_grad_norm(self):
        """L2 norm of all gradients taken together."""
        total = 0.0
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is not None:
                    total += p.grad.norm().item() ** 2
        return math.sqrt(total)

    def _loss_trend(self):
        """Mean of the newer half of the loss window minus the mean of the
        older half, or None until the window has filled (or is too short to
        split)."""
        half = self.window // 2
        if not self._hfull or half == 0:
            return None
        # Once the buffer is full, _hpos points at the oldest entry; rotate so
        # the list is oldest -> newest before splitting it.
        ordered = self._hist[self._hpos:] + self._hist[:self._hpos]
        older = sum(ordered[:half]) / half
        newer = sum(ordered[-half:]) / half
        return newer - older

    @torch.no_grad()
    def step(self, closure=None, loss=None):
        """Perform one optimisation step.

        Args:
            closure: optional callable that re-evaluates the model and
                returns the loss tensor; used for the loss value only when
                `loss` is not given.
            loss: the current loss as a float, used for trend tracking.

        Returns:
            The loss value that was recorded (0.0 when none was supplied).
        """
        if closure is not None:
            with torch.enable_grad():
                loss_val = closure()
                if loss is None:
                    loss = loss_val.item()

        if loss is None:
            loss = 0.0

        # ── Level 1: Global trend ──
        self._hist[self._hpos] = loss
        self._hpos = (self._hpos + 1) % self.window
        if not self._hfull and self._hpos == 0:
            self._hfull = True

        trend = self._loss_trend()
        if trend is not None:
            if trend > 0.02:  # loss rising
                self.dampen = max(0.5, self.dampen - 0.05)
            elif trend < -0.02:  # loss falling
                self.dampen = min(1.5, self.dampen + 0.05)

        # Mean reversion
        self.dampen = 0.999 * self.dampen + 0.001 * 1.0

        # ── Level 6: Memory ──
        if loss < self.best_loss:
            self.best_loss = loss
            self.stagnation = 0
        else:
            self.stagnation += 1

        # ── Adaptive gradient clipping ──
        gnorm = self._global_grad_norm()
        self.gnorm_ema = 0.99 * self.gnorm_ema + 0.01 * gnorm
        clip_val = max(1.0, 2.0 * self.gnorm_ema)

        if gnorm > clip_val:
            scale = clip_val / gnorm
            for group in self.param_groups:
                for p in group['params']:
                    if p.grad is not None:
                        p.grad.mul_(scale)

        # ── Adam step with dampen ──
        for group in self.param_groups:
            lr = group['lr'] * self.dampen
            beta1, beta2 = group['betas']
            eps = group['eps']
            wd = group['weight_decay']

            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad
                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p)
                    state['exp_avg_sq'] = torch.zeros_like(p)

                exp_avg = state['exp_avg']
                exp_avg_sq = state['exp_avg_sq']
                state['step'] += 1

                # Decoupled weight decay
                if wd > 0:
                    p.mul_(1 - lr * wd)

                # Adam moments
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                # Bias correction
                bc1 = 1 - beta1 ** state['step']
                bc2 = 1 - beta2 ** state['step']
                m_hat = exp_avg / bc1
                v_hat = exp_avg_sq / bc2

                # Update
                p.addcdiv_(m_hat, v_hat.sqrt() + eps, value=-lr)

        self.global_step += 1
        return loss


 # ─────────────────────────────────────────────────────────────────────────────
 # IV. TRAINING LOOP
 # ─────────────────────────────────────────────────────────────────────────────

 def get_batch(token_ids, batch_size, context_len, device):
    """Sample a random batch of next-token training examples.

    Args:
        token_ids: sequence of token ids to sample windows from.
        batch_size: number of windows to draw (start offsets are drawn
            independently, so windows may repeat or overlap).
        context_len: length of each window.
        device: device the two tensors are moved to.

    Returns:
        (x, y): int64 tensors of shape [batch_size, context_len]; y is x
        shifted one token to the right in the corpus.

    Raises:
        ValueError: if the corpus has no more than `context_len` tokens, so
            no window with a following target token exists.
    """
    n = len(token_ids)
    if n <= context_len:
        raise ValueError(
            f"need more than context_len={context_len} tokens to build a batch, got {n}"
        )
    # Highest start is n - context_len - 1, so the target slice stays in range.
    ix = [torch.randint(0, n - context_len, (1,)).item() for _ in range(batch_size)]
    x = torch.stack([torch.tensor(token_ids[i:i + context_len], dtype=torch.long) for i in ix])
    y = torch.stack([torch.tensor(token_ids[i + 1:i + context_len + 1], dtype=torch.long) for i in ix])
    return x.to(device), y.to(device)


 def save_weights(model, path):
    """Write every tensor in `model.state_dict()` to `path` as float32.

    Binary layout, all integers unsigned 32-bit little-endian:

        n_tensors
        repeated n_tensors times:
            name_len, name (UTF-8)
            ndim, then ndim dimension sizes
            n_elements
            n_elements float32 values, little-endian, row-major

    Tensors shared between modules appear once per state-dict key.
    """
    state = model.state_dict()
    with open(path, 'wb') as f:
        # Force little-endian float32 so the payload matches the '<' header
        # fields regardless of the host byte order.
        tensors = [
            (k, v.detach().cpu().float().numpy().astype('<f4', copy=False))
            for k, v in state.items()
        ]
        f.write(struct.pack('<I', len(tensors)))
        for name, arr in tensors:
            name_bytes = name.encode('utf-8')
            f.write(struct.pack('<I', len(name_bytes)))
            f.write(name_bytes)
            f.write(struct.pack('<I', arr.ndim))
            for s in arr.shape:
                f.write(struct.pack('<I', s))
            f.write(struct.pack('<I', arr.size))
            # tobytes() emits C (row-major) order, same as flattening first.
            f.write(arr.tobytes())
    print(f"  Weights saved to {path} ({os.path.getsize(path) / 1024:.1f} KB)")


 def train(args):
    """Tokenize postgpt.txt, train a PostGPTModel with ChuckOptimizer, report.

    Steps: learn a BPE vocabulary from the corpus next to this file, build the
    model from `args`, run `args.steps` optimisation steps on random batches,
    print a first-10 / last-10 loss comparison, sample a short continuation,
    and write the weights when `args.save` is non-empty.

    Returns:
        The list of per-step loss values, or None when the corpus file is
        missing.
    """
    corpus_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'postgpt.txt')
    if not os.path.exists(corpus_path):
        print(f"ERROR: {corpus_path} not found")
        return

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"  Device: {device}")

    # Tokenize
    print("\n[1] BPE tokenization...")
    with open(corpus_path, 'rb') as f:
        raw = f.read()
    tokenizer = BPETokenizer(max_merges=1024)
    token_ids = tokenizer.learn(raw, num_merges=1024)
    print(f"  Tokens: {len(token_ids)}, Vocab: {tokenizer.vocab_size}")

    # Model
    print("\n[2] Building model...")
    model = PostGPTModel(
        vocab_size=tokenizer.vocab_size,
        context_len=args.context_len,
        n_embd=args.n_embd,
        n_head=args.n_head,
        n_layer=args.n_layer,
        n_content=args.n_content,
        n_rrpram=args.n_rrpram,
    ).to(device)

    # Optimizer: Chuck
    print("\n[3] Initializing Chuck Optimizer...")
    optimizer = ChuckOptimizer(
        model.parameters(),
        lr=args.lr,
        weight_decay=args.weight_decay,
        window=16,
    )

    # Training
    print(f"\n[4] Training for {args.steps} steps...")
    print("-" * 60)

    losses = []
    t0 = time.time()

    for step in range(args.steps):
        x, y = get_batch(token_ids, args.batch_size, args.context_len, device)
        _, loss = model(x, y)

        optimizer.zero_grad()
        loss.backward()
        # Read the scalar once; the optimizer uses it for trend tracking.
        loss_val = loss.item()
        optimizer.step(loss=loss_val)

        losses.append(loss_val)

        if (step + 1) % 10 == 0 or step == 0:
            elapsed = time.time() - t0
            avg_recent = sum(losses[-10:]) / len(losses[-10:])
            print(f"  step {step+1:4d}/{args.steps}  loss={loss_val:.4f}  "
                  f"avg10={avg_recent:.4f}  dampen={optimizer.dampen:.3f}  "
                  f"[{elapsed:.1f}s]")

    # Report
    print("\n" + "-" * 60)
    if losses:
        first_10 = sum(losses[:10]) / min(10, len(losses))
        last_10 = sum(losses[-10:]) / min(10, len(losses))
        print(f"  First 10 avg loss: {first_10:.4f}")
        print(f"  Last 10 avg loss:  {last_10:.4f}")
        print(f"  Loss delta:        {last_10 - first_10:.4f}")
        if last_10 < first_10:
            print(f"  ✓ Loss decreased by {((first_10 - last_10) / first_10) * 100:.1f}%")
        else:
            print(f"  ✗ Loss did not decrease")
    else:
        # --steps 0: nothing to average (previously a ZeroDivisionError).
        print("  No training steps were run.")

    # Generate sample
    print("\n[5] Generation after training...")
    model.eval()
    with torch.no_grad():
        seed = token_ids[:4]
        idx = torch.tensor([seed], dtype=torch.long, device=device)
        for _ in range(60):
            # Stop once the context window is full.
            if idx.shape[1] >= args.context_len:
                break
            logits, _ = model(idx[:, -args.context_len:])
            logits = logits[:, -1, :] / 0.8  # sampling temperature 0.8
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, 1)
            idx = torch.cat([idx, next_token], dim=1)
        generated = idx[0].tolist()
        text = tokenizer.decode(generated)
        print(f"  Output: {text[:300]}")

    # Save weights
    if args.save:
        print("\n[6] Saving weights...")
        save_weights(model, args.save)

    print("\n" + "=" * 60)
    print("  Training complete. Chuck is satisfied.")
    print("=" * 60)

    return losses


 def main():
    """Parse the command-line options, print the banner, and run train()."""
    # (flag, type, default, help) — registered in this order.
    option_table = [
        ('--steps', int, 200, 'Training steps'),
        ('--batch_size', int, 4, 'Batch size'),
        ('--context_len', int, 64, 'Context length'),
        ('--n_embd', int, 48, 'Embedding dimension'),
        ('--n_head', int, 4, 'Number of attention heads'),
        ('--n_layer', int, 2, 'Number of layers'),
        ('--n_content', int, 2, 'Content attention heads'),
        ('--n_rrpram', int, 2, 'RRPRAM attention heads'),
        ('--lr', float, 3e-4, 'Learning rate'),
        ('--weight_decay', float, 0.01, 'Weight decay'),
        ('--save', str, '', 'Save weights path'),
    ]

    parser = argparse.ArgumentParser(description='PostGPT Training with Chuck Optimizer')
    for flag, value_type, fallback, description in option_table:
        parser.add_argument(flag, type=value_type, default=fallback, help=description)
    args = parser.parse_args()

    rule = "=" * 60
    for line in (rule,
                 "  PostGPT Training — Chuck Optimizer",
                 "  resonance is unbreakable",
                 rule):
        print(line)

    train(args)


 if __name__ == '__main__':
    main()
	"""
	postgpt_train.py — training loop for PostGPT using PyTorch + Chuck Optimizer.

	PyTorch is ONLY used here, in the training loop. The runtime (postgpt.py) is
	zero-dependency. This module:

	1. Loads postgpt.txt, tokenizes via BPE
	2. Builds the PostGPT transformer as a PyTorch module
	3. Trains using the Chuck Optimizer (self-aware AdamW variant)
	4. Saves weights back for the pure-Python runtime

	Usage:
	python postgpt_train.py [--steps 200] [--lr 3e-4]

	resonance is unbreakable.
	"""

	import os
	import sys
	import math
	import time
	import struct
	import argparse

	# PyTorch — ONLY used in training, not runtime
	try:
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	except ImportError:
	print("ERROR: PyTorch required for training. Install: pip install torch")
	print("Note: postgpt.py runs without PyTorch (zero-dependency runtime).")
	sys.exit(1)


	# ─────────────────────────────────────────────────────────────────────────────
	# I. BPE TOKENIZER (same algorithm as postgpt.py, but operating on bytes)
	# ─────────────────────────────────────────────────────────────────────────────

	class BPETokenizer:
	    """Byte-level BPE tokenizer.

	    Ids 0..255 are the raw byte values; every learned merge gets the next
	    free id (256, 257, ...). `merges` keeps (left_id, right_id, new_id)
	    triples in the order they were learned, and `vocab` maps each id to the
	    byte string it expands to.
	    """

	    def __init__(self, max_merges=1024):
	        self.max_merges = max_merges
	        self.merges = []
	        self.vocab_size = 256
	        self.vocab = {i: bytes([i]) for i in range(256)}

	    def _count_pairs(self, ids):
	        """Return {(left, right): occurrences} over all adjacent id pairs."""
	        counts = {}
	        for pair in zip(ids, ids[1:]):
	            counts[pair] = counts.get(pair, 0) + 1
	        return counts

	    def _merge_pair(self, ids, pair, new_id):
	        """Return a copy of `ids` with each left-to-right, non-overlapping
	        occurrence of `pair` replaced by `new_id`."""
	        result = []
	        i = 0
	        while i < len(ids):
	            if i + 1 < len(ids) and ids[i] == pair[0] and ids[i + 1] == pair[1]:
	                result.append(new_id)
	                i += 2
	            else:
	                result.append(ids[i])
	                i += 1
	        return result

	    def learn(self, data_bytes, num_merges=None):
	        """Learn up to `num_merges` new merges from `data_bytes`.

	        The most frequent adjacent pair is merged repeatedly; learning stops
	        early once no pair occurs at least twice. Returns the input encoded
	        with every merge known at the end of the call.

	        Calling `learn` again continues from the existing merge table: new
	        ids are allocated after the ones already in use and the total never
	        exceeds `max_merges`.
	        """
	        if num_merges is None:
	            num_merges = self.max_merges
	        # Cap by the remaining capacity so ids never go past 256 + max_merges.
	        remaining = max(0, self.max_merges - len(self.merges))
	        num_merges = min(num_merges, remaining)
	        # Start from the already-learned segmentation (a plain byte list on
	        # the first call, when there are no merges yet).
	        ids = self.encode(data_bytes)
	        t0 = time.time()
	        for m in range(num_merges):
	            counts = self._count_pairs(ids)
	            if not counts:
	                break
	            # Ties resolve to the pair seen first (dict insertion order).
	            best_pair = max(counts, key=counts.get)
	            if counts[best_pair] < 2:
	                break
	            # Next free id; using the loop index here would overwrite
	            # existing entries on a second learn() call.
	            new_id = 256 + len(self.merges)
	            ids = self._merge_pair(ids, best_pair, new_id)
	            self.merges.append((best_pair[0], best_pair[1], new_id))
	            self.vocab[new_id] = self.vocab[best_pair[0]] + self.vocab[best_pair[1]]
	            self.vocab_size = 256 + len(self.merges)
	            if (m + 1) % 200 == 0:
	                elapsed = time.time() - t0
	                print(f" merge {m+1}/{num_merges} vocab={self.vocab_size} tokens={len(ids)} [{elapsed:.1f}s]")
	        print(f" BPE complete: {len(self.merges)} merges, vocab={self.vocab_size}, tokens={len(ids)}")
	        return ids

	    def encode(self, text):
	        """Encode `text` (str or bytes) by replaying the merges in learn order."""
	        if isinstance(text, str):
	            text = text.encode('utf-8', errors='replace')
	        ids = list(text)
	        for a, b, new_id in self.merges:
	            ids = self._merge_pair(ids, (a, b), new_id)
	        return ids

	    def decode(self, ids):
	        """Decode ids to a string; unknown ids are skipped and invalid UTF-8
	        is replaced rather than raised."""
	        raw = b''.join(self.vocab[tid] for tid in ids if tid in self.vocab)
	        return raw.decode('utf-8', errors='replace')


	# ─────────────────────────────────────────────────────────────────────────────
	# II. PYTORCH PostGPT MODEL
	# ─────────────────────────────────────────────────────────────────────────────

	class RMSNorm(nn.Module):
	    """Root-mean-square normalisation over the last axis with a learned gain.

	    Each vector is divided by sqrt(mean(x**2) + eps) and then multiplied
	    element-wise by `weight` (initialised to ones).
	    """

	    def __init__(self, dim, eps=1e-5):
	        super().__init__()
	        self.eps = eps
	        self.weight = nn.Parameter(torch.ones(dim))

	    def forward(self, x):
	        ms = x.pow(2).mean(-1, keepdim=True)
	        x = x * torch.rsqrt(ms + self.eps)
	        return x * self.weight


	class ContentAttention(nn.Module):
	    """Causal multi-head QK^T attention.

	    Returns the per-head outputs concatenated along the last axis; there is
	    no output projection in this module.
	    """
	    def __init__(self, n_embd, n_heads, head_dim):
	        super().__init__()
	        self.n_heads = n_heads
	        self.head_dim = head_dim
	        self.wq = nn.Linear(n_embd, n_heads * head_dim, bias=False)
	        self.wk = nn.Linear(n_embd, n_heads * head_dim, bias=False)
	        self.wv = nn.Linear(n_embd, n_heads * head_dim, bias=False)

	    def forward(self, x):
	        B, T, C = x.shape
	        # [B, T, H*D] -> [B, H, T, D]
	        q = self.wq(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
	        k = self.wk(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
	        v = self.wv(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)

	        attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5)
	        # True strictly above the diagonal: positions a query must not see.
	        mask = torch.triu(torch.ones(T, T, device=x.device, dtype=torch.bool), diagonal=1)
	        attn = attn.masked_fill(mask, float('-inf'))
	        attn = F.softmax(attn, dim=-1)

	        out = (attn @ v).transpose(1, 2).contiguous().view(B, T, -1)
	        return out


	class RRPRAMAttention(nn.Module):
	    """
	    RRPRAM: Recursive Resonant Pattern Recognition Attention Mechanism.

	    Instead of QK^T, the attention scores are x @ Wr, where Wr is a learned
	    parameter of shape [n_heads, n_embd, max_T]. Column j of a head's Wr
	    produces the score towards key position j, so the scores depend on the
	    query token and the absolute key position only. The scores are causally
	    masked, softmaxed, and applied to a linear value projection.

	    Returns the per-head outputs concatenated along the last axis
	    ([B, T, n_heads * head_dim]); there is no output projection here.
	    """
	    def __init__(self, n_embd, n_heads, head_dim, max_T):
	        super().__init__()
	        self.n_heads = n_heads
	        self.head_dim = head_dim
	        self.max_T = max_T
	        # Wr: the pattern matrix — THE core RRPRAM innovation
	        self.wr = nn.Parameter(torch.randn(n_heads, n_embd, max_T) * 0.02)
	        self.wv = nn.Linear(n_embd, n_heads * head_dim, bias=False)

	    def forward(self, x):
	        """Attend over x of shape [B, T, C].

	        Raises:
	            ValueError: if T exceeds max_T, since Wr has no columns for the
	                extra positions.
	        """
	        B, T, C = x.shape
	        if T > self.max_T:
	            raise ValueError(
	                f"sequence length {T} exceeds RRPRAM max_T={self.max_T}"
	            )
	        v = self.wv(x).view(B, T, self.n_heads, self.head_dim).transpose(1, 2)

	        # Only the first T key positions exist for this input.
	        wr_t = self.wr[:, :, :T]  # [n_heads, C, T]
	        # [B, 1, T, C] @ [n_heads, C, T] broadcasts to [B, n_heads, T, T].
	        attn = torch.matmul(x.unsqueeze(1), wr_t)

	        # Causal mask
	        mask = torch.triu(torch.ones(T, T, device=x.device, dtype=torch.bool), diagonal=1)
	        attn = attn.masked_fill(mask, float('-inf'))
	        attn = F.softmax(attn, dim=-1)

	        out = (attn @ v).transpose(1, 2).contiguous().view(B, T, -1)
	        return out


	class PostGPTBlock(nn.Module):
	    """Pre-norm residual block: (Content + RRPRAM attention) then a ReLU MLP.

	    Both attention modules read the same normalised input; their outputs are
	    concatenated on the feature axis and projected back to n_embd by `wo`.
	    """
	    def __init__(self, n_embd, n_content, n_rrpram, head_dim, max_T):
	        super().__init__()
	        self.norm1 = RMSNorm(n_embd)
	        self.content_attn = ContentAttention(n_embd, n_content, head_dim)
	        self.rrpram_attn = RRPRAMAttention(n_embd, n_rrpram, head_dim, max_T)
	        self.wo = nn.Linear((n_content + n_rrpram) * head_dim, n_embd, bias=False)

	        self.norm2 = RMSNorm(n_embd)
	        self.mlp_up = nn.Linear(n_embd, 4 * n_embd, bias=False)
	        self.mlp_down = nn.Linear(4 * n_embd, n_embd, bias=False)

	        # The two projections that write into the residual stream get a
	        # smaller init (0.02 / sqrt(2)).
	        nn.init.normal_(self.wo.weight, std=0.02 / math.sqrt(2))
	        nn.init.normal_(self.mlp_down.weight, std=0.02 / math.sqrt(2))

	    def forward(self, x):
	        # Attention sub-layer.
	        x_norm = self.norm1(x)
	        c_out = self.content_attn(x_norm)
	        r_out = self.rrpram_attn(x_norm)
	        attn_out = torch.cat([c_out, r_out], dim=-1)
	        x = x + self.wo(attn_out)

	        # Feed-forward sub-layer.
	        x_norm = self.norm2(x)
	        h = self.mlp_up(x_norm)
	        h = F.relu(h)
	        h = self.mlp_down(h)
	        x = x + h
	        return x


	class PostGPTModel(nn.Module):
	    """PostGPT language model: token + position embeddings, a stack of
	    PostGPTBlock layers, a final RMSNorm, and an output head whose weight is
	    shared with the token embedding. Prints its parameter count on
	    construction."""
	    def __init__(self, vocab_size, context_len=64, n_embd=48, n_head=4,
	                 n_layer=2, n_content=2, n_rrpram=2):
	        super().__init__()
	        self.context_len = context_len
	        head_dim = n_embd // n_head

	        self.wte = nn.Embedding(vocab_size, n_embd)
	        self.wpe = nn.Embedding(context_len, n_embd)
	        self.blocks = nn.ModuleList([
	            PostGPTBlock(n_embd, n_content, n_rrpram, head_dim, context_len)
	            for _ in range(n_layer)
	        ])
	        self.norm_f = RMSNorm(n_embd)
	        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)

	        # Weight tying
	        self.lm_head.weight = self.wte.weight

	        n_params = sum(p.numel() for p in self.parameters())
	        print(f" PostGPTModel: {n_params:,} parameters")

	    def forward(self, idx, targets=None):
	        """Return (logits, loss); loss is None unless `targets` is given, in
	        which case it is the cross-entropy over all positions."""
	        B, T = idx.shape
	        tok_emb = self.wte(idx)
	        pos_emb = self.wpe(torch.arange(T, device=idx.device))
	        x = tok_emb + pos_emb

	        for block in self.blocks:
	            x = block(x)

	        x = self.norm_f(x)
	        logits = self.lm_head(x)

	        loss = None
	        if targets is not None:
	            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
	        return logits, loss


	# ─────────────────────────────────────────────────────────────────────────────
	# III. CHUCK OPTIMIZER — self-aware learning
	# ─────────────────────────────────────────────────────────────────────────────

	class ChuckOptimizer(torch.optim.Optimizer):
	    """
	    Chuck Optimizer: AdamW with self-awareness.

	    What this class does on every step:
	    - Level 1: keeps the last `window` losses and compares the mean of the
	      newer half with the mean of the older half; a rising trend lowers the
	      global `dampen` factor (floor 0.5), a falling trend raises it
	      (ceiling 1.5).
	    - Mean reversion: `dampen` is pulled slightly back towards 1.0.
	    - Level 6: tracks `best_loss` and a `stagnation` counter (steps since the
	      best loss improved). Nothing in this class reads them back.
	    - Adaptive gradient clipping: gradients are rescaled when their global
	      norm exceeds max(1.0, 2 * EMA of the norm).
	    - AdamW update with the learning rate multiplied by `dampen`.

	    NOTE(review): the original description also lists "Level 2:
	    Per-parameter group modulation"; here each group only uses its own
	    lr/betas/eps/weight_decay, scaled by the one global `dampen`.

	    Simplified for PostGPT — the full 9-level version lives in chuck.optimizer.
	    """

	    def __init__(self, params, lr=3e-4, betas=(0.9, 0.999), eps=1e-8,
	                 weight_decay=0.01, window=16):
	        if window < 1:
	            raise ValueError("window must be >= 1")
	        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
	        super().__init__(params, defaults)

	        # Ring buffer of recent losses; _hpos is the next slot to write.
	        self.window = window
	        self._hist = [0.0] * window
	        self._hpos = 0
	        self._hfull = False

	        # Level 1: Global dampen
	        self.dampen = 1.0

	        # Level 6: Memory
	        self.best_loss = float('inf')
	        self.stagnation = 0

	        # Adaptive clipping
	        self.gnorm_ema = 1.0
	        self.global_step = 0

	    def _global_grad_norm(self):
	        """L2 norm of all gradients taken together."""
	        total = 0.0
	        for group in self.param_groups:
	            for p in group['params']:
	                if p.grad is not None:
	                    total += p.grad.norm().item() ** 2
	        return math.sqrt(total)

	    def _loss_trend(self):
	        """Mean of the newer half of the loss window minus the mean of the
	        older half, or None until the window has filled (or is too short to
	        split)."""
	        half = self.window // 2
	        if not self._hfull or half == 0:
	            return None
	        # Once the buffer is full, _hpos points at the oldest entry; rotate so
	        # the list is oldest -> newest before splitting it.
	        ordered = self._hist[self._hpos:] + self._hist[:self._hpos]
	        older = sum(ordered[:half]) / half
	        newer = sum(ordered[-half:]) / half
	        return newer - older

	    @torch.no_grad()
	    def step(self, closure=None, loss=None):
	        """Perform one optimisation step.

	        Args:
	            closure: optional callable that re-evaluates the model and
	                returns the loss tensor; used for the loss value only when
	                `loss` is not given.
	            loss: the current loss as a float, used for trend tracking.

	        Returns:
	            The loss value that was recorded (0.0 when none was supplied).
	        """
	        if closure is not None:
	            with torch.enable_grad():
	                loss_val = closure()
	                if loss is None:
	                    loss = loss_val.item()

	        if loss is None:
	            loss = 0.0

	        # ── Level 1: Global trend ──
	        self._hist[self._hpos] = loss
	        self._hpos = (self._hpos + 1) % self.window
	        if not self._hfull and self._hpos == 0:
	            self._hfull = True

	        trend = self._loss_trend()
	        if trend is not None:
	            if trend > 0.02:  # loss rising
	                self.dampen = max(0.5, self.dampen - 0.05)
	            elif trend < -0.02:  # loss falling
	                self.dampen = min(1.5, self.dampen + 0.05)

	        # Mean reversion
	        self.dampen = 0.999 * self.dampen + 0.001 * 1.0

	        # ── Level 6: Memory ──
	        if loss < self.best_loss:
	            self.best_loss = loss
	            self.stagnation = 0
	        else:
	            self.stagnation += 1

	        # ── Adaptive gradient clipping ──
	        gnorm = self._global_grad_norm()
	        self.gnorm_ema = 0.99 * self.gnorm_ema + 0.01 * gnorm
	        clip_val = max(1.0, 2.0 * self.gnorm_ema)

	        if gnorm > clip_val:
	            scale = clip_val / gnorm
	            for group in self.param_groups:
	                for p in group['params']:
	                    if p.grad is not None:
	                        p.grad.mul_(scale)

	        # ── Adam step with dampen ──
	        for group in self.param_groups:
	            lr = group['lr'] * self.dampen
	            beta1, beta2 = group['betas']
	            eps = group['eps']
	            wd = group['weight_decay']

	            for p in group['params']:
	                if p.grad is None:
	                    continue

	                grad = p.grad
	                state = self.state[p]

	                if len(state) == 0:
	                    state['step'] = 0
	                    state['exp_avg'] = torch.zeros_like(p)
	                    state['exp_avg_sq'] = torch.zeros_like(p)

	                exp_avg = state['exp_avg']
	                exp_avg_sq = state['exp_avg_sq']
	                state['step'] += 1

	                # Decoupled weight decay
	                if wd > 0:
	                    p.mul_(1 - lr * wd)

	                # Adam moments
	                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
	                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

	                # Bias correction
	                bc1 = 1 - beta1 ** state['step']
	                bc2 = 1 - beta2 ** state['step']
	                m_hat = exp_avg / bc1
	                v_hat = exp_avg_sq / bc2

	                # Update
	                p.addcdiv_(m_hat, v_hat.sqrt() + eps, value=-lr)

	        self.global_step += 1
	        return loss


	# ─────────────────────────────────────────────────────────────────────────────
	# IV. TRAINING LOOP
	# ─────────────────────────────────────────────────────────────────────────────

	def get_batch(token_ids, batch_size, context_len, device):
	    """Sample a random batch of next-token training examples.

	    Returns (x, y): int64 tensors of shape [batch_size, context_len] moved
	    to `device`; y is x shifted one token to the right in the corpus.

	    Raises:
	        ValueError: if the corpus has no more than `context_len` tokens, so
	            no window with a following target token exists.
	    """
	    n = len(token_ids)
	    if n <= context_len:
	        raise ValueError(
	            f"need more than context_len={context_len} tokens to build a batch, got {n}"
	        )
	    # Highest start is n - context_len - 1, so the target slice stays in range.
	    ix = [torch.randint(0, n - context_len, (1,)).item() for _ in range(batch_size)]
	    x = torch.stack([torch.tensor(token_ids[i:i + context_len], dtype=torch.long) for i in ix])
	    y = torch.stack([torch.tensor(token_ids[i + 1:i + context_len + 1], dtype=torch.long) for i in ix])
	    return x.to(device), y.to(device)


	def save_weights(model, path):
	    """Write every tensor in `model.state_dict()` to `path` as float32.

	    Binary layout, all integers unsigned 32-bit little-endian:

	        n_tensors
	        repeated n_tensors times:
	            name_len, name (UTF-8)
	            ndim, then ndim dimension sizes
	            n_elements
	            n_elements float32 values, little-endian, row-major

	    Tensors shared between modules appear once per state-dict key.
	    """
	    state = model.state_dict()
	    with open(path, 'wb') as f:
	        # Force little-endian float32 so the payload matches the '<' header
	        # fields regardless of the host byte order.
	        tensors = [
	            (k, v.detach().cpu().float().numpy().astype('<f4', copy=False))
	            for k, v in state.items()
	        ]
	        f.write(struct.pack('<I', len(tensors)))
	        for name, arr in tensors:
	            name_bytes = name.encode('utf-8')
	            f.write(struct.pack('<I', len(name_bytes)))
	            f.write(name_bytes)
	            f.write(struct.pack('<I', arr.ndim))
	            for s in arr.shape:
	                f.write(struct.pack('<I', s))
	            f.write(struct.pack('<I', arr.size))
	            # tobytes() emits C (row-major) order, same as flattening first.
	            f.write(arr.tobytes())
	    print(f" Weights saved to {path} ({os.path.getsize(path) / 1024:.1f} KB)")


	def train(args):
	    """Run the whole training pipeline and return the per-step losses.

	    Stages: load 'postgpt.txt' from this script's directory, learn a BPE
	    tokenizer on it, build a PostGPTModel, optimize it with ChuckOptimizer
	    for args.steps steps, print a loss summary, sample a short generation,
	    and optionally save the weights.

	    Args:
	        args: Namespace providing steps, batch_size, context_len, n_embd,
	            n_head, n_layer, n_content, n_rrpram, lr, weight_decay and save.

	    Returns:
	        List of float loss values, one per training step, or None when the
	        corpus file is missing.
	    """
	    corpus_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'postgpt.txt')
	    if not os.path.exists(corpus_path):
	        print(f"ERROR: {corpus_path} not found")
	        return

	    device = 'cuda' if torch.cuda.is_available() else 'cpu'
	    print(f" Device: {device}")

	    # Tokenize
	    print("\n[1] BPE tokenization...")
	    with open(corpus_path, 'rb') as f:
	        raw = f.read()
	    tokenizer = BPETokenizer(max_merges=1024)
	    token_ids = tokenizer.learn(raw, num_merges=1024)
	    print(f" Tokens: {len(token_ids)}, Vocab: {tokenizer.vocab_size}")

	    # Model
	    print("\n[2] Building model...")
	    model = PostGPTModel(
	        vocab_size=tokenizer.vocab_size,
	        context_len=args.context_len,
	        n_embd=args.n_embd,
	        n_head=args.n_head,
	        n_layer=args.n_layer,
	        n_content=args.n_content,
	        n_rrpram=args.n_rrpram,
	    ).to(device)

	    # Optimizer: Chuck
	    print("\n[3] Initializing Chuck Optimizer...")
	    optimizer = ChuckOptimizer(
	        model.parameters(),
	        lr=args.lr,
	        weight_decay=args.weight_decay,
	        window=16,
	    )

	    # Training
	    print(f"\n[4] Training for {args.steps} steps...")
	    print("-" * 60)

	    losses = []
	    t0 = time.time()

	    for step in range(args.steps):
	        x, y = get_batch(token_ids, args.batch_size, args.context_len, device)
	        logits, loss = model(x, y)

	        optimizer.zero_grad()
	        loss.backward()
	        # Read the scalar once; the optimizer receives the same value that is logged.
	        loss_val = loss.item()
	        optimizer.step(loss=loss_val)

	        losses.append(loss_val)

	        if (step + 1) % 10 == 0 or step == 0:
	            elapsed = time.time() - t0
	            recent = losses[-10:]
	            avg_recent = sum(recent) / len(recent)
	            print(f" step {step+1:4d}/{args.steps} loss={loss_val:.4f} "
	                  f"avg10={avg_recent:.4f} dampen={optimizer.dampen:.3f} "
	                  f"[{elapsed:.1f}s]")

	    # Report
	    print("\n" + "-" * 60)
	    if losses:
	        head = losses[:10]
	        tail = losses[-10:]
	        first_10 = sum(head) / len(head)
	        last_10 = sum(tail) / len(tail)
	        print(f" First 10 avg loss: {first_10:.4f}")
	        print(f" Last 10 avg loss: {last_10:.4f}")
	        print(f" Loss delta: {last_10 - first_10:.4f}")
	        if last_10 < first_10:
	            print(f" ✓ Loss decreased by {((first_10 - last_10) / first_10) * 100:.1f}%")
	        else:
	            print(f" ✗ Loss did not decrease")
	    else:
	        # args.steps <= 0: nothing to average, and dividing by
	        # min(10, 0) would raise ZeroDivisionError.
	        print(" No training steps were run; skipping loss summary")

	    # Generate sample
	    print("\n[5] Generation after training...")
	    model.eval()
	    with torch.no_grad():
	        seed = token_ids[:4]
	        idx = torch.tensor([seed], dtype=torch.long, device=device)
	        for _ in range(60):
	            # Stop as soon as the sequence fills the context window.
	            if idx.shape[1] >= args.context_len:
	                break
	            logits, _ = model(idx[:, -args.context_len:])
	            # Temperature 0.8 applied to the last position's logits.
	            logits = logits[:, -1, :] / 0.8
	            probs = F.softmax(logits, dim=-1)
	            next_token = torch.multinomial(probs, 1)
	            idx = torch.cat([idx, next_token], dim=1)
	        generated = idx[0].tolist()
	        text = tokenizer.decode(generated)
	        print(f" Output: {text[:300]}")

	    # Save weights
	    if args.save:
	        print("\n[6] Saving weights...")
	        save_weights(model, args.save)

	    print("\n" + "=" * 60)
	    print(" Training complete. Chuck is satisfied.")
	    print("=" * 60)

	    return losses


	def main():
	    """Command-line entry point: parse hyperparameters, print the banner, train."""
	    # (flag, type, default, help) for every supported option, in help order.
	    option_table = (
	        ('--steps', int, 200, 'Training steps'),
	        ('--batch_size', int, 4, 'Batch size'),
	        ('--context_len', int, 64, 'Context length'),
	        ('--n_embd', int, 48, 'Embedding dimension'),
	        ('--n_head', int, 4, 'Number of attention heads'),
	        ('--n_layer', int, 2, 'Number of layers'),
	        ('--n_content', int, 2, 'Content attention heads'),
	        ('--n_rrpram', int, 2, 'RRPRAM attention heads'),
	        ('--lr', float, 3e-4, 'Learning rate'),
	        ('--weight_decay', float, 0.01, 'Weight decay'),
	        ('--save', str, '', 'Save weights path'),
	    )

	    parser = argparse.ArgumentParser(description='PostGPT Training with Chuck Optimizer')
	    for flag, value_type, default_value, help_text in option_table:
	        parser.add_argument(flag, type=value_type, default=default_value, help=help_text)
	    args = parser.parse_args()

	    rule = "=" * 60
	    print(rule)
	    print(" PostGPT Training — Chuck Optimizer")
	    print(" resonance is unbreakable")
	    print(rule)

	    train(args)


	# Script entry point: call main() only when this file is run directly, not when imported.
	if __name__ == '__main__':
	main()
No results found