### **The Core Concept**

The main innovation of mHC is solving the instability of "Hyper-Connections" (HC). Standard HC expands the residual stream width (from $C$ to $n \times C$) but allows the signal to explode or vanish because the connection matrices are unconstrained.

**mHC fixes this by enforcing two constraints (written out symbolically after the list):**

1. **Residual Mapping ($\mathcal{H}^{res}$):** Projected onto the **Birkhoff polytope** (doubly stochastic matrices) using the Sinkhorn-Knopp algorithm.
2. **Input/Output Mappings ($\mathcal{H}^{pre}, \mathcal{H}^{post}$):** Constrained to be non-negative (using Sigmoid) to prevent signal cancellation.
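In symbols (a paraphrase of the constraints the code below enforces, with $\sigma$ denoting the sigmoid and the $2\sigma$ form taken from the output-mapping code further down):

$$
\mathcal{H}^{res} \in \mathcal{B}_n = \{ M \in \mathbb{R}^{n \times n} : M_{ij} \ge 0,\; M\mathbf{1} = \mathbf{1},\; M^\top \mathbf{1} = \mathbf{1} \}, \qquad \mathcal{H}^{pre} = \sigma(\tilde{\mathcal{H}}^{pre}), \qquad \mathcal{H}^{post} = 2\,\sigma(\tilde{\mathcal{H}}^{post})
$$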
### **C++ Implementation of mHC**

The following C++ code demonstrates the two critical components: the **Sinkhorn-Knopp projection** (to enforce stability) and the **Layer Forward Pass**.

#### **1. The Manifold Projection (Sinkhorn-Knopp)**

The paper uses the Sinkhorn-Knopp algorithm to ensure the residual matrix $\mathcal{H}^{res}$ is doubly stochastic (rows and columns sum to 1).
```c++
#include <vector>
#include <cmath>
#include <iostream>

// Configuration based on the paper
const int N_STREAMS = 4;   // Expansion rate n (typically 4) [cite: 373]
const int ITERATIONS = 20; // Sinkhorn iterations (t_max) [cite: 276]

// Simple Matrix structure for demonstration
using Matrix = std::vector<std::vector<float>>;

/**
 * Projects a matrix onto the Birkhoff polytope (Doubly Stochastic Manifold).
 * This implements Equation (9): M(t) = T_c(T_r(M(t-1)))
 * Source: [cite: 271, 272]
 */
Matrix SinkhornKnoppProjection(Matrix H_res_raw) {
    int n = H_res_raw.size();
    Matrix M = H_res_raw;

    // Step 1: Ensure strict positivity using exp() [cite: 271]
    // This serves as the starting point M(0)
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            M[i][j] = std::exp(M[i][j]);
        }
    }

    // Step 2: Iterative Normalization [cite: 272]
    for (int t = 0; t < ITERATIONS; ++t) {
        // Row Normalization (T_r)
        for (int i = 0; i < n; ++i) {
            float row_sum = 0.0f;
            for (int j = 0; j < n; ++j) row_sum += M[i][j];
            for (int j = 0; j < n; ++j) M[i][j] /= row_sum;
        }
        // Column Normalization (T_c)
        for (int j = 0; j < n; ++j) {
            float col_sum = 0.0f;
            for (int i = 0; i < n; ++i) col_sum += M[i][j];
            for (int i = 0; i < n; ++i) M[i][j] /= col_sum;
        }
    }

    // Result: M is now approximately doubly stochastic
    // (every row and column sum converges to 1).
    return M;
}
```
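As a quick sanity check (a minimal sketch, not from the paper; it reuses the `Matrix` type and `SinkhornKnoppProjection` defined above), one can verify that the output is numerically doubly stochastic:

```c++
#include <algorithm> // for the initializer-list std::max overload

// Sanity check: feed arbitrary logits through the projection and measure
// the worst-case deviation of any row or column sum from 1.
int main() {
    Matrix raw = {
        { 0.5f, -1.2f,  0.1f, -0.5f},
        {-0.2f,  0.8f, -0.9f,  0.2f},
        { 0.0f,  0.0f,  0.5f, -0.1f},
        { 1.2f, -0.5f,  0.3f, -2.0f}
    };
    Matrix M = SinkhornKnoppProjection(raw);

    float worst = 0.0f;
    for (int i = 0; i < N_STREAMS; ++i) {
        float row_sum = 0.0f, col_sum = 0.0f;
        for (int j = 0; j < N_STREAMS; ++j) {
            row_sum += M[i][j];
            col_sum += M[j][i];
        }
        worst = std::max({worst, std::fabs(row_sum - 1.0f), std::fabs(col_sum - 1.0f)});
    }
    // After 20 sweeps the deviation should be near float epsilon.
    std::cout << "max |sum - 1| = " << worst << std::endl;
    return 0;
}
```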
#### **2. The mHC Layer Forward Pass**

This function shows how the constrained matrices are applied to update the residual stream, implementing Equation (3) with the constraints from Equation (8).
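For reference, the update the function below implements (transcribed from its own comment; the paper's notation may differ in detail) is

$$
x_{l+1} = \mathcal{H}^{res} x_l + (\mathcal{H}^{post})^\top \mathcal{F}(\mathcal{H}^{pre} x_l),
$$

with $x_l \in \mathbb{R}^{n \times C}$, $\mathcal{H}^{res} \in \mathbb{R}^{n \times n}$, and $\mathcal{H}^{pre}, \mathcal{H}^{post} \in \mathbb{R}^{1 \times n}$.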
```c++
// Helper: Sigmoid function for H_pre and H_post constraints [cite: 266]
float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

/**
 * Performs the mHC layer update.
 * Logic: x_{l+1} = H_res * x_l + H_post^T * F(H_pre * x_l)
 * Source: [cite: 59]
 */
void mHCLayerForward(
    const Matrix& x_l,                    // Input residual stream (n x C) [cite: 118]
    const Matrix& raw_H_res,              // Raw learned residual parameters (n x n)
    const std::vector<float>& raw_H_pre,  // Raw input mapping (1 x n)
    const std::vector<float>& raw_H_post, // Raw output mapping (1 x n)
    Matrix& x_l_next                      // Output residual stream (pre-allocated, n x C)
) {
    int n = x_l.size();    // Expansion factor (streams)
    int C = x_l[0].size(); // Hidden dimension

    // --- 1. Apply Manifold Constraints [cite: 266-269] ---
    // Constraint A: H_res must be doubly stochastic [cite: 269]
    Matrix H_res = SinkhornKnoppProjection(raw_H_res);

    // Constraint B: H_pre must be positive (Sigmoid) [cite: 267]
    std::vector<float> H_pre(n);
    for (int i = 0; i < n; ++i) H_pre[i] = sigmoid(raw_H_pre[i]);

    // Constraint C: H_post must be positive (2 * Sigmoid) [cite: 268]
    std::vector<float> H_post(n);
    for (int i = 0; i < n; ++i) H_post[i] = 2.0f * sigmoid(raw_H_post[i]);

    // --- 2. Aggregate Input for the Layer Function F (H_pre * x_l) ---
    // Aggregates n streams into 1 input vector of size C.
    std::vector<float> layer_input(C, 0.0f);
    for (int k = 0; k < n; ++k) {
        float weight = H_pre[k];
        for (int dim = 0; dim < C; ++dim) {
            layer_input[dim] += weight * x_l[k][dim];
        }
    }

    // --- 3. Compute Layer Function F ---
    // F represents Attention or MLP. In mHC, the dimension is C.
    // std::vector<float> layer_output = F(layer_input, weights);
    // (Placeholder for actual transformer block)
    std::vector<float> layer_output = layer_input; // Identity for demo

    // --- 4. Update Residual Stream (Equation 3) ---
    // x_{l+1} = (H_res * x_l) + (H_post * F(...))
    for (int i = 0; i < n; ++i) { // For each stream i
        for (int dim = 0; dim < C; ++dim) {
            // Term 1: Mixing existing streams (H_res * x_l)
            float mixing_term = 0.0f;
            for (int j = 0; j < n; ++j) {
                mixing_term += H_res[i][j] * x_l[j][dim];
            }
            // Term 2: Injecting new features (H_post * layer_output)
            float injection_term = H_post[i] * layer_output[dim];
            // Final Sum
            x_l_next[i][dim] = mixing_term + injection_term;
        }
    }
}
```
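A minimal driver for `mHCLayerForward` (hypothetical values, identity `F` as in the demo above) makes the arithmetic easy to check by hand:

```c++
// Hypothetical driver: all-ones streams, zero raw logits everywhere.
// Zero logits give uniform 1/n mixing (exp(0) = 1 -> Sinkhorn -> 1/n),
// H_pre = sigmoid(0) = 0.5, H_post = 2 * sigmoid(0) = 1.0.
int main() {
    const int C = 8; // small hidden dimension for the demo
    Matrix x_l(N_STREAMS, std::vector<float>(C, 1.0f));
    Matrix raw_H_res(N_STREAMS, std::vector<float>(N_STREAMS, 0.0f));
    std::vector<float> raw_H_pre(N_STREAMS, 0.0f);
    std::vector<float> raw_H_post(N_STREAMS, 0.0f);
    Matrix x_next(N_STREAMS, std::vector<float>(C, 0.0f));

    mHCLayerForward(x_l, raw_H_res, raw_H_pre, raw_H_post, x_next);

    // Expected: mixing term = mean of the four all-ones streams = 1.0;
    // layer input = 0.5 * 4 = 2.0 per dim; injection = 1.0 * 2.0 = 2.0;
    // so every entry of x_next should be 3.0.
    std::cout << "x_next[0][0] = " << x_next[0][0] << std::endl;
    return 0;
}
```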
### **Summary of Benefits (Why code it this way?)**

1. **Stability via Sinkhorn:** By ensuring `H_res` is doubly stochastic, the code guarantees the signal norm does not grow uncontrollably: the matrix acts as a convex combination of the input streams, preserving the feature mean (see the short derivation after this list).
2. **Efficiency:** While the code above illustrates the logic, the authors implement this using **kernel fusion** in CUDA (fusing the scans and updates) and **recomputing** intermediate states to handle the memory overhead of the $n \times C$ stream.
3. **Scalability:** The expansion factor `N_STREAMS` ($n$) lets the model scale capacity without a linear increase in the compute cost of the central layer `F`, while the constraints ensure this scaling does not break training dynamics.
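To see why item 1 holds, fix one coordinate of the hidden dimension and sum the mixed streams:

$$
\sum_{i=1}^{n} \big(\mathcal{H}^{res} x\big)_i = \sum_{i}\sum_{j} \mathcal{H}^{res}_{ij}\, x_j = \sum_{j} x_j \Big(\sum_{i} \mathcal{H}^{res}_{ij}\Big) = \sum_{j} x_j,
$$

since each column sums to 1, so the total (and hence the mean) across the $n$ streams is preserved exactly; the row constraint additionally makes each output stream a convex combination of the inputs.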
### **The Logical Flow**

The paper specifies this pipeline for the residual matrix $\mathcal{H}^{res}$ (restated compactly after the list):

1. **Raw Calculation:** Compute raw logits ($\tilde{\mathcal{H}}$) using learnable weights and the input.
2. **Algorithm Initialization:** Apply `exp()` to create strictly positive values ($M^{(0)}$).
3. **Manifold Projection:** Run Sinkhorn iterations to force rows/columns to sum to 1.
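Compactly, matching Equation (9) as cited in the code comments:

$$
M^{(0)} = \exp\!\big(\tilde{\mathcal{H}}^{res}\big), \qquad M^{(t)} = \mathcal{T}_c\big(\mathcal{T}_r\big(M^{(t-1)}\big)\big), \qquad \mathcal{H}^{res} = M^{(t_{max})},
$$

where $\mathcal{T}_r$ and $\mathcal{T}_c$ rescale rows and columns, respectively, to sum to 1.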
### **C++ Integration Code Example**

This example demonstrates a function `compute_constrained_H_res` that would likely sit inside your custom C++ layer. Notice the explicit step where the initial value $M^{(0)}$ is generated.
```c++
#include <vector>
#include <cmath>
#include <iostream>
#include <iomanip>

// Dimensions for the example (Expansion rate n=4)
const int N = 4;
const int ITERATIONS = 20;

// A flattened 1D vector representing an N x N matrix
using FlatMatrix = std::vector<float>;

/**
 * INTEGRATION COMPONENT:
 * This function represents the "Projection" step inside the mHC layer.
 * It takes the "Raw Logits" (computed by the linear layer) and
 * transforms them into the valid Doubly Stochastic matrix.
 */
FlatMatrix compute_constrained_H_res(const FlatMatrix& raw_logits) {
    FlatMatrix M = raw_logits; // Working copy

    // ---------------------------------------------------------
    // STEP 1: THE INITIAL VALUE (M^(0))
    // ---------------------------------------------------------
    // The "initial value" for Sinkhorn is NOT zeros.
    // It is the element-wise exponential of the raw logits.
    // This ensures all values are strictly positive.
    // Source: Equation (9) -> M^(0) = exp(H_tilde)
    // ---------------------------------------------------------
    for (int i = 0; i < N * N; ++i) {
        // If raw_logit is 0.0, the initial value becomes 1.0.
        // If raw_logit is -10.0, it becomes ~0.000045 (close to 0 but positive).
        M[i] = std::exp(M[i]);
    }

    // ---------------------------------------------------------
    // STEP 2: SINKHORN-KNOPP ITERATIONS
    // ---------------------------------------------------------
    // Now we iteratively normalize rows and columns.
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        // A. Row Normalization
        for (int r = 0; r < N; ++r) {
            float row_sum = 0.0f;
            for (int c = 0; c < N; ++c) row_sum += M[r * N + c];
            // Normalize the row (safeguard against div/0)
            float scale = (row_sum > 1e-8f) ? (1.0f / row_sum) : 0.0f;
            for (int c = 0; c < N; ++c) M[r * N + c] *= scale;
        }
        // B. Column Normalization
        for (int c = 0; c < N; ++c) {
            float col_sum = 0.0f;
            for (int r = 0; r < N; ++r) col_sum += M[r * N + c];
            // Normalize the column (same safeguard)
            float scale = (col_sum > 1e-8f) ? (1.0f / col_sum) : 0.0f;
            for (int r = 0; r < N; ++r) M[r * N + c] *= scale;
        }
    }
    return M;
}

// Mock simulation of the "Linear" part to show where data comes from
int main() {
    // Imagine this comes from: alpha * tanh(theta * x) + b
    // These are the "Raw Logits" before any constraints.
    // Notice they can be negative, zero, or positive.
    FlatMatrix raw_logits = {
         0.5f, -1.2f,  0.1f, -0.5f,
        -0.2f,  0.8f, -0.9f,  0.2f,
         0.0f,  0.0f,  0.5f, -0.1f, // Zeros here become 1.0 after exp()
         1.2f, -0.5f,  0.3f, -2.0f
    };

    std::cout << "--- Raw Input (Logits) ---" << std::endl;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j)
            std::cout << std::setw(8) << std::fixed << std::setprecision(3) << raw_logits[i * N + j];
        std::cout << std::endl;
    }

    // Execute the mHC projection
    FlatMatrix final_H = compute_constrained_H_res(raw_logits);

    std::cout << "\n--- Final Doubly Stochastic Matrix (after exp + sinkhorn) ---" << std::endl;
    for (int i = 0; i < N; ++i) {
        float row_check = 0.0f;
        for (int j = 0; j < N; ++j) {
            std::cout << std::setw(8) << std::fixed << std::setprecision(3) << final_H[i * N + j];
            row_check += final_H[i * N + j];
        }
        std::cout << " | Row Sum: " << row_check << std::endl;
    }
    return 0;
}
```
### **Why this specific "Initial Value"?**

In the code above, the line `M[i] = std::exp(M[i]);` is crucial. If you passed the raw logits (which contain negatives) directly into the Sinkhorn normalization loop:

1. **Negative values:** You cannot normalize a row containing negative numbers to sum to 1 while maintaining the probability interpretation (the Birkhoff polytope requires non-negativity).
2. **Zeros:** If you initialized with zeros, the matrix would stay zero forever (since $0 \times \text{scale} = 0$).

By using `exp()`, we map the raw parameter space $(-\infty, \infty)$ to the positive reals $(0, \infty)$, which is the valid initial-value domain for the Sinkhorn algorithm.
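As a worked example using the first row of the mock logits above: $\exp$ maps $(0.5, -1.2, 0.1, -0.5)$ to roughly $(1.649, 0.301, 1.105, 0.607)$, whose sum is $\approx 3.662$; one row normalization then yields $\approx (0.450, 0.082, 0.302, 0.166)$, already a valid probability row before the column passes begin.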