@d0rc
Created January 2, 2026 14:09
### **The Core Concept**
The main innovation of mHC (manifold-constrained Hyper-Connections) is that it resolves the instability of standard "Hyper-Connections" (HC).
Standard HC expands the residual stream width (from $C$ to $n \times C$) but allows the signal to explode or vanish because the connection matrices are unconstrained.
**mHC fixes this by enforcing constraints:**
1. **Residual Mapping ($\mathcal{H}^{res}$):** Projected onto the **Birkhoff polytope** (doubly stochastic matrices) using the Sinkhorn-Knopp algorithm.
2. **Input/Output Mappings ($\mathcal{H}^{pre}, \mathcal{H}^{post}$):** Constrained to be non-negative (using Sigmoid) to prevent signal cancellation.
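Before the implementation, the constraint in point 1 can be stated as a checker: a matrix lies on the Birkhoff polytope when every entry is non-negative and every row and column sums to 1. The helper below is a hypothetical sketch (the name `IsDoublyStochastic` and the tolerance are not from the paper).
```c++
#include <vector>
#include <cmath>

// Hypothetical checker for the constraint in point 1: approximately doubly
// stochastic means non-negative entries and row/column sums close to 1.
bool IsDoublyStochastic(const std::vector<std::vector<float>>& M, float tol = 1e-4f) {
    const int n = static_cast<int>(M.size());
    for (int i = 0; i < n; ++i) {
        float row_sum = 0.0f, col_sum = 0.0f;
        for (int j = 0; j < n; ++j) {
            if (M[i][j] < 0.0f) return false;  // entries must be non-negative
            row_sum += M[i][j];                // sum of row i
            col_sum += M[j][i];                // sum of column i
        }
        if (std::fabs(row_sum - 1.0f) > tol) return false;
        if (std::fabs(col_sum - 1.0f) > tol) return false;
    }
    return true;
}
```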
### **C++ Implementation of mHC**
The following C++ code demonstrates the two critical components: the **Sinkhorn-Knopp projection** (to enforce stability) and the **Layer Forward Pass**.
#### **1. The Manifold Projection (Sinkhorn-Knopp)**
The paper uses the Sinkhorn-Knopp algorithm to ensure the residual matrix $\mathcal{H}^{res}$ is doubly stochastic (rows and columns sum to 1).
```c++
#include <vector>
#include <cmath>

// Configuration based on the paper
const int N_STREAMS = 4;   // Expansion rate n (typically 4) [cite: 373]
const int ITERATIONS = 20; // Sinkhorn iterations (t_max) [cite: 276]

// Simple matrix type for demonstration
using Matrix = std::vector<std::vector<float>>;

/**
 * Projects a matrix onto the Birkhoff polytope (doubly stochastic manifold).
 * Implements Equation (9): M(t) = T_c(T_r(M(t-1)))
 * Source: [cite: 271, 272]
 */
Matrix SinkhornKnoppProjection(Matrix H_res_raw) {
    int n = H_res_raw.size();
    Matrix M = H_res_raw;

    // Step 1: Ensure strict positivity using exp() [cite: 271]
    // This serves as the starting point M(0).
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            M[i][j] = std::exp(M[i][j]);
        }
    }

    // Step 2: Iterative normalization [cite: 272]
    for (int t = 0; t < ITERATIONS; ++t) {
        // Row normalization (T_r)
        for (int i = 0; i < n; ++i) {
            float row_sum = 0.0f;
            for (int j = 0; j < n; ++j) row_sum += M[i][j];
            for (int j = 0; j < n; ++j) M[i][j] /= row_sum;
        }
        // Column normalization (T_c)
        for (int j = 0; j < n; ++j) {
            float col_sum = 0.0f;
            for (int i = 0; i < n; ++i) col_sum += M[i][j];
            for (int i = 0; i < n; ++i) M[i][j] /= col_sum;
        }
    }

    // M is now approximately doubly stochastic: every row and column sums to ~1.
    return M;
}
```
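A minimal usage sketch (not from the paper; it reuses `Matrix`, `N_STREAMS`, and `SinkhornKnoppProjection` from above, and the driver name is hypothetical): project a raw 4x4 parameter matrix, then print the row and column sums, which should all be close to 1.
```c++
#include <cstdio>

// Hypothetical driver: project arbitrary raw parameters and verify the result.
int sinkhorn_demo() {
    Matrix raw = {
        { 0.5f, -1.2f,  0.1f, -0.5f},
        {-0.2f,  0.8f, -0.9f,  0.2f},
        { 0.0f,  0.0f,  0.5f, -0.1f},
        { 1.2f, -0.5f,  0.3f, -2.0f}
    };
    Matrix H_res = SinkhornKnoppProjection(raw);
    for (int i = 0; i < N_STREAMS; ++i) {
        float row_sum = 0.0f, col_sum = 0.0f;
        for (int j = 0; j < N_STREAMS; ++j) {
            row_sum += H_res[i][j];   // sum of row i
            col_sum += H_res[j][i];   // sum of column i
        }
        std::printf("row %d sum = %.4f, col %d sum = %.4f\n", i, row_sum, i, col_sum);
    }
    return 0;
}
```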
#### **2. The mHC Layer Forward Pass**
This function shows how the constrained matrices are applied to update the residual stream, implementing Equation (3) with the constraints from Equation (8).
```c++
// Helper: sigmoid for the H_pre and H_post constraints [cite: 266]
float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

/**
 * Performs the mHC layer update.
 * Logic: x_{l+1} = H_res * x_l + H_post^T * F(H_pre * x_l)
 * Source: [cite: 59]
 */
void mHCLayerForward(
    const Matrix& x_l,                     // Input residual stream (n x C) [cite: 118]
    const Matrix& raw_H_res,               // Raw learned residual parameters (n x n)
    const std::vector<float>& raw_H_pre,   // Raw input mapping (1 x n)
    const std::vector<float>& raw_H_post,  // Raw output mapping (1 x n)
    Matrix& x_l_next                       // Output residual stream (n x C), pre-sized by the caller
) {
    int n = x_l.size();     // Expansion factor (number of streams)
    int C = x_l[0].size();  // Hidden dimension

    // --- 1. Apply manifold constraints [cite: 266-269] ---
    // Constraint A: H_res must be doubly stochastic [cite: 269]
    Matrix H_res = SinkhornKnoppProjection(raw_H_res);

    // Constraint B: H_pre must be positive (sigmoid) [cite: 267]
    std::vector<float> H_pre(n);
    for (int i = 0; i < n; ++i) H_pre[i] = sigmoid(raw_H_pre[i]);

    // Constraint C: H_post must be positive (2 * sigmoid) [cite: 268]
    std::vector<float> H_post(n);
    for (int i = 0; i < n; ++i) H_post[i] = 2.0f * sigmoid(raw_H_post[i]);

    // --- 2. Aggregate input for the layer function F (H_pre * x_l) ---
    // Collapses the n streams into a single input vector of size C.
    std::vector<float> layer_input(C, 0.0f);
    for (int k = 0; k < n; ++k) {
        float weight = H_pre[k];
        for (int dim = 0; dim < C; ++dim) {
            layer_input[dim] += weight * x_l[k][dim];
        }
    }

    // --- 3. Compute the layer function F ---
    // F represents Attention or an MLP; in mHC its working dimension is still C.
    // std::vector<float> layer_output = F(layer_input, weights);
    // (Placeholder for the actual transformer block)
    std::vector<float> layer_output = layer_input; // Identity for the demo

    // --- 4. Update the residual stream (Equation 3) ---
    // x_{l+1} = (H_res * x_l) + (H_post^T * F(...))
    for (int i = 0; i < n; ++i) {          // For each stream i
        for (int dim = 0; dim < C; ++dim) {
            // Term 1: mix the existing streams (H_res * x_l)
            float mixing_term = 0.0f;
            for (int j = 0; j < n; ++j) {
                mixing_term += H_res[i][j] * x_l[j][dim];
            }
            // Term 2: inject the new features (H_post * layer_output)
            float injection_term = H_post[i] * layer_output[dim];
            // Final sum
            x_l_next[i][dim] = mixing_term + injection_term;
        }
    }
}
```
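A hypothetical driver (not from the paper) showing how `mHCLayerForward` might be called; the parameter values are arbitrary, and `F` is still the identity placeholder, so the expected output can be worked out by hand.
```c++
// Hypothetical driver: one mHC update on a tiny n = 4, C = 3 stream.
int mhc_layer_demo() {
    const int C = 3;
    Matrix x_l(N_STREAMS, std::vector<float>(C, 1.0f));               // all-ones input streams
    Matrix raw_H_res(N_STREAMS, std::vector<float>(N_STREAMS, 0.0f)); // zero logits -> uniform mixing
    std::vector<float> raw_H_pre(N_STREAMS, 0.0f);                    // sigmoid(0) = 0.5 per stream
    std::vector<float> raw_H_post(N_STREAMS, 0.0f);                   // 2 * sigmoid(0) = 1.0 per stream
    Matrix x_l_next(N_STREAMS, std::vector<float>(C, 0.0f));          // pre-sized output buffer

    mHCLayerForward(x_l, raw_H_res, raw_H_pre, raw_H_post, x_l_next);

    // With the identity placeholder for F, every output entry is
    // mixing (1.0) + injection (1.0 * 0.5 * 4 * 1.0 = 2.0) = 3.0.
    return 0;
}
```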
### **Summary of Benefits (Why code it this way?)**
1. **Stability via Sinkhorn:** By ensuring `H_res` is doubly stochastic, the code keeps the signal norm from growing uncontrollably: each output stream is a convex combination of the input streams, which preserves the feature mean (see the sketch after this list).
2. **Efficiency:** The code above illustrates the logic; the authors implement it with **kernel fusion** in CUDA (fusing the scans and updates) and **recompute** intermediate states to handle the memory overhead of the $n \times C$ stream.
3. **Scalability:** The expansion factor `N_STREAMS` ($n$) widens the residual stream without a proportional increase in the compute cost of the central layer `F`, while the constraints keep training dynamics stable as $n$ grows.
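To make the convex-combination claim in point 1 concrete, here is a minimal check (a sketch, not from the paper; it reuses `Matrix` and `SinkhornKnoppProjection` from above, and the numbers are arbitrary). Because every column of a doubly stochastic `H_res` sums to 1, the sum, and hence the mean, of a feature across the $n$ streams is unchanged by the mixing term.
```c++
#include <cassert>
#include <cmath>

// Sketch: a doubly stochastic H_res preserves the sum over streams.
void check_mean_preservation() {
    Matrix raw = {{ 0.3f, -0.7f,  1.1f,  0.0f},
                  {-1.0f,  0.2f,  0.4f,  0.9f},
                  { 0.5f,  0.5f, -0.3f, -0.8f},
                  { 0.1f, -0.2f,  0.6f,  1.5f}};
    Matrix H_res = SinkhornKnoppProjection(raw);

    // One feature dimension of the 4 streams, before and after mixing.
    std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f};
    std::vector<float> y(4, 0.0f);
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
            y[i] += H_res[i][j] * x[j];

    float sum_x = x[0] + x[1] + x[2] + x[3];  // 10
    float sum_y = y[0] + y[1] + y[2] + y[3];
    // Column sums of H_res are ~1, so the stream sum is (approximately) unchanged.
    assert(std::fabs(sum_x - sum_y) < 1e-3f);
}
```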
### **The Logical Flow**
The paper specifies this pipeline for the residual matrix $\mathcal{H}^{res}$:
1. **Raw Calculation:** Compute raw logits ($\tilde{\mathcal{H}}$) from learnable weights and the layer input (a hedged sketch of this step follows the list).
2. **Algorithm Initialization:** Apply `exp()` to create strictly positive values ($M^{(0)}$).
3. **Manifold Projection:** Run Sinkhorn iterations to force rows/cols to sum to 1.
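The integration example below covers steps 2 and 3 only. The sketch here fills in step 1 under an assumption taken from the demo's own comment (`alpha * tanh(theta * x) + b`); the function name, signature, and parameter shapes are hypothetical, not the paper's API.
```c++
#include <vector>
#include <cmath>
#include <cstddef>

// Hypothetical Step 1 (assumption, mirroring the comment in the demo below):
// raw_logit[k] = alpha[k] * tanh(theta[k] . x) + b[k], one entry per cell of
// the n x n residual matrix. The output feeds the exp() + Sinkhorn projection.
std::vector<float> compute_raw_logits(
    const std::vector<float>& x,                   // layer input, size C
    const std::vector<std::vector<float>>& theta,  // n*n weight vectors, each of size C
    const std::vector<float>& alpha,               // per-entry scale, size n*n
    const std::vector<float>& b)                   // per-entry bias, size n*n
{
    std::vector<float> logits(alpha.size(), 0.0f);
    for (std::size_t k = 0; k < logits.size(); ++k) {
        float dot = 0.0f;
        for (std::size_t d = 0; d < x.size(); ++d) dot += theta[k][d] * x[d];
        logits[k] = alpha[k] * std::tanh(dot) + b[k];  // can be negative, zero, or positive
    }
    return logits;
}
```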
### **C++ Integration Code Example**
This example demonstrates a function `compute_constrained_H_res` that would likely sit inside your custom C++ layer. Notice the explicit step where the initial value $M^{(0)}$ is generated.
```c++
#include <vector>
#include <cmath>
#include <iostream>
#include <iomanip>

// Dimensions for the example (expansion rate n = 4)
const int N = 4;
const int ITERATIONS = 20;

// A flattened 1D vector representing an N x N matrix
using FlatMatrix = std::vector<float>;

/**
 * INTEGRATION COMPONENT:
 * This function represents the "Projection" step inside the mHC layer.
 * It takes the "Raw Logits" (computed by the linear layer) and
 * transforms them into a valid doubly stochastic matrix.
 */
FlatMatrix compute_constrained_H_res(const FlatMatrix& raw_logits) {
    FlatMatrix M = raw_logits; // Working copy

    // ---------------------------------------------------------
    // STEP 1: THE INITIAL VALUE (M^(0))
    // ---------------------------------------------------------
    // The "initial value" for Sinkhorn is NOT zeros.
    // It is the element-wise exponential of the raw logits,
    // which ensures all values are strictly positive.
    // Source: Equation (9) -> M^(0) = exp(H_tilde)
    // ---------------------------------------------------------
    for (int i = 0; i < N * N; ++i) {
        // If raw_logit is 0.0, the initial value becomes 1.0.
        // If raw_logit is -10.0, it becomes ~0.000045 (close to 0 but positive).
        M[i] = std::exp(M[i]);
    }

    // ---------------------------------------------------------
    // STEP 2: SINKHORN-KNOPP ITERATIONS
    // ---------------------------------------------------------
    // Iteratively normalize rows and columns.
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        // A. Row normalization
        for (int r = 0; r < N; ++r) {
            float row_sum = 0.0f;
            for (int c = 0; c < N; ++c) row_sum += M[r * N + c];
            // Normalize the row (safeguard against division by zero)
            float scale = (row_sum > 1e-8f) ? (1.0f / row_sum) : 0.0f;
            for (int c = 0; c < N; ++c) M[r * N + c] *= scale;
        }
        // B. Column normalization
        for (int c = 0; c < N; ++c) {
            float col_sum = 0.0f;
            for (int r = 0; r < N; ++r) col_sum += M[r * N + c];
            float scale = (col_sum > 1e-8f) ? (1.0f / col_sum) : 0.0f;
            for (int r = 0; r < N; ++r) M[r * N + c] *= scale;
        }
    }
    return M;
}

// Mock simulation of the "Linear" part to show where the data comes from
int main() {
    // Imagine this comes from: alpha * tanh(theta * x) + b
    // These are the "Raw Logits" before any constraints.
    // Notice they can be negative, zero, or positive.
    FlatMatrix raw_logits = {
         0.5f, -1.2f,  0.1f, -0.5f,
        -0.2f,  0.8f, -0.9f,  0.2f,
         0.0f,  0.0f,  0.5f, -0.1f, // Zeros here become 1.0 after exp()
         1.2f, -0.5f,  0.3f, -2.0f
    };

    std::cout << "--- Raw Input (Logits) ---" << std::endl;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j)
            std::cout << std::setw(8) << std::fixed << std::setprecision(3) << raw_logits[i * N + j];
        std::cout << std::endl;
    }

    // Execute the mHC projection
    FlatMatrix final_H = compute_constrained_H_res(raw_logits);

    std::cout << "\n--- Final Doubly Stochastic Matrix (after exp + sinkhorn) ---" << std::endl;
    for (int i = 0; i < N; ++i) {
        float row_check = 0.0f;
        for (int j = 0; j < N; ++j) {
            std::cout << std::setw(8) << std::fixed << std::setprecision(3) << final_H[i * N + j];
            row_check += final_H[i * N + j];
        }
        std::cout << " | Row Sum: " << row_check << std::endl;
    }
    return 0;
}
```
### **Why this specific "Initial Value"?**
In the code above, the line `M[i] = std::exp(M[i]);` is crucial.
If you passed the raw logits (which contain negatives) directly into the Sinkhorn normalization loop:
1. **Negative Values:** You cannot normalize a row with negative numbers to sum to 1 while maintaining the probability interpretation (Birkhoff polytope requires non-negativity).
2. **Zeros:** If you tried to initialize with zeros, the matrix would stay zero forever (since $0 \times \text{scale} = 0$).
By using `exp()`, we map the raw parameter space $(-\infty, \infty)$ to the strictly positive reals $(0, \infty)$. This is the valid "initial value" domain for the Sinkhorn algorithm.
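A tiny, hypothetical sanity check (reusing `FlatMatrix`, `N`, and `compute_constrained_H_res` from the example above) makes both points visible: zero logits are harmless once `exp()` is applied, while normalizing raw logits directly can leave negative "weights" that fall outside the Birkhoff polytope.
```c++
// Hypothetical check of the two failure modes discussed above.
void why_exp_matters() {
    // Zero logits are fine: exp(0) = 1, and Sinkhorn converges to the uniform
    // matrix (every entry ~1/N), not to zeros.
    FlatMatrix zeros(N * N, 0.0f);
    FlatMatrix uniform = compute_constrained_H_res(zeros);
    (void)uniform;  // every entry is ~0.25 for N = 4

    // Skipping exp() and row-normalizing raw logits directly keeps negative
    // entries, so the row is not a valid convex combination.
    float row[4] = {0.5f, -1.2f, 0.1f, -0.5f};  // sums to -1.1
    float row_sum = row[0] + row[1] + row[2] + row[3];
    for (float& v : row) v /= row_sum;          // e.g. 0.5 / -1.1 = -0.4545...
}
```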