am17an · June 30, 2025 14:30
diff --git a/bench.cpp b/bench.cpp
 #include "ggml.h"
 #include "ggml-cpu.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
 #include <string.h>
 #include <algorithm>
 #include <vector>

 // Function to print a tensor in readable format
 static void print_tensor(const char* name, struct ggml_tensor * tensor) {
    printf("\n%s shape: [%ld, %ld, %ld, %ld]\n", name,
           tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
    
    float * data = (float*)tensor->data;
    int64_t W = tensor->ne[0];
    int64_t H = tensor->ne[1]; 
    int64_t C = tensor->ne[2];
    int64_t N = tensor->ne[3];
    
    for (int64_t n = 0; n < N; n++) {
        for (int64_t c = 0; c < C; c++) {
            printf("%s[%ld,%ld]:\n", name, n, c);
            for (int64_t h = 0; h < H; h++) {
                printf("  ");
                for (int64_t w = 0; w < W; w++) {
                    int64_t idx = n * (C * H * W) + c * (H * W) + h * W + w;
                    printf("%7.3f ", data[idx]);
                }
                printf("\n");
            }
        }
    }
 }

 // Function to compare two tensors with tolerance
 static bool tensors_equal(struct ggml_tensor * a, struct ggml_tensor * b, float tolerance) {
    if (ggml_nelements(a) != ggml_nelements(b)) {
        printf("Different number of elements: %zu vs %zu\n", ggml_nelements(a), ggml_nelements(b));
        return false;
    }
    
    for (int i = 0; i < 4; i++) {
        if (a->ne[i] != b->ne[i]) {
            printf("Shape mismatch at dim %d: %ld vs %ld\n", i, a->ne[i], b->ne[i]);
            return false;
        }
    }
    
    float * data_a = (float*)a->data;
    float * data_b = (float*)b->data;
    
    float max_diff = 0.0f;
    int diff_count = 0;
    
    for (int i = 0; i < ggml_nelements(a); i++) {
        float diff = fabsf(data_a[i] - data_b[i]);
        if (diff > tolerance) {
            diff_count++;
            if (diff_count <= 10) { // Show first 10 differences
                printf("Diff at index %zu: %.6f vs %.6f (diff: %.6f)\n", 
                       i, data_a[i], data_b[i], diff);
            }
        }
        if (diff > max_diff) {
            max_diff = diff;
        }
    }
    
    printf("Max difference: %.6f, Elements above tolerance: %d/%zu\n", 
           max_diff, diff_count, ggml_nelements(a));
    
    return diff_count == 0;
 }


 // One benchmark record: the convolution configuration plus the timings
 // measured for it.
 typedef struct {
    // Input extent (width, height, channels) and batch size.
    int IW;
    int IH;
    int IC;
    int N;
    // Kernel extent (width, height) and number of output channels.
    int KW;
    int KH;
    int OC;
    // Convolution hyper-parameters.
    int stride;
    int padding;
    int dilation;
    // Wall-clock time of each implementation, in milliseconds.
    double im2col_time_ms;
    double simd_time_ms;
    // Speed ratio between the two implementations.
    double speedup;
    // Whether both implementations produced matching outputs.
    bool results_match;
    // Total operations for this configuration.
    int64_t total_ops;
 } benchmark_result_t;

 // Count the multiply-accumulate steps of one 2D convolution.
 // Each output axis has (in + 2*padding - kernel) / stride + 1 positions
 // (integer division); every output element costs IC * KH * KW steps.
 static int64_t calculate_conv_ops(int IW, int IH, int IC, int N, int KW, int KH, int OC, int stride, int padding) {
    const int out_w = (IW + 2 * padding - KW) / stride + 1;
    const int out_h = (IH + 2 * padding - KH) / stride + 1;

    // Accumulate in 64 bits from the start so the product cannot overflow int.
    int64_t ops = (int64_t)N;
    ops *= OC;
    ops *= out_h;
    ops *= out_w;
    ops *= IC;
    ops *= KH;
    ops *= KW;
    return ops;
 }

 // Milliseconds elapsed since the first call to this function.
 // The first call records the current ggml_time_us() value as the origin and
 // therefore returns 0.
 static double get_time_ms() {
    static int64_t origin_us = 0;

    const int64_t now_us = ggml_time_us();
    if (origin_us == 0) {
        origin_us = now_us;
    }

    const int64_t elapsed_us = now_us - origin_us;
    return elapsed_us / 1000.0;
 }

 // Simple performance test function
 static void test_performance(int IW, int IH, int IC, int N, int KW, int KH, int OC, 
                     int stride, int padding, int dilation) {
    // Create contexts
    struct ggml_init_params params = {
        .mem_size = 128*1024*1024*200,
        .mem_buffer = NULL,
        .no_alloc = false,
    };
    
    struct ggml_context * ctx1 = ggml_init(params);
    struct ggml_context * ctx2 = ggml_init(params);
    
    if (!ctx1 || !ctx2) return;
    
    // Create tensors
    struct ggml_tensor * input1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, IW, IH, IC, N);
    struct ggml_tensor * input2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F32, IW, IH, IC, N);
    struct ggml_tensor * kernel1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, KW, KH, IC, OC);
    struct ggml_tensor * kernel2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F32, KW, KH, IC, OC);
    
    // Initialize data
    srand(42);
    float * input1_data = (float*)input1->data;
    float * input2_data = (float*)input2->data;
    float * kernel1_data = (float*)kernel1->data;
    float * kernel2_data = (float*)kernel2->data;
    
    for (int i = 0; i < ggml_nelements(input1); i++) {
        float val = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
        input1_data[i] = val;
        input2_data[i] = val;
    }
    
    for (int i = 0; i < ggml_nelements(kernel1); i++) {
        float val = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
        kernel1_data[i] = val;
        kernel2_data[i] = val;
    }
    
    // Create operations
    struct ggml_tensor * result_im2col = ggml_conv_2d(ctx1, kernel1, input1, 
                                                      stride, stride, padding, padding, 
                                                      dilation, dilation);
    struct ggml_tensor * result_simd = ggml_conv_2d_direct(ctx2, kernel2, input2,
                                                           stride, stride, padding, padding,
                                                           dilation, dilation);
    
    if (!result_im2col || !result_simd) {
        ggml_free(ctx1);
        ggml_free(ctx2);
        return;
    }
    
    // Build graphs
    struct ggml_cgraph * graph1 = ggml_new_graph(ctx1);
    struct ggml_cgraph * graph2 = ggml_new_graph(ctx2);
    ggml_build_forward_expand(graph1, result_im2col);
    ggml_build_forward_expand(graph2, result_simd);
    
    ggml_backend_t backend = ggml_backend_cpu_init();
    if (!backend) {
        ggml_free(ctx1);
        ggml_free(ctx2);
        return;
    }
    
    // Time IM2COL
    int64_t start = ggml_time_us();
    ggml_backend_graph_compute(backend, graph1);
    int64_t im2col_time = ggml_time_us() - start;
    
    // Time SIMD
    start = ggml_time_us();
    ggml_backend_graph_compute(backend, graph2);
    int64_t simd_time = ggml_time_us() - start;
    
    // Compare results
    bool match = true;
    std::vector<float> diffs;
    if (ggml_nelements(result_im2col) == ggml_nelements(result_simd)) {
        float * data_a = (float*)result_im2col->data;
        float * data_b = (float*)result_simd->data;
        for (size_t i = 0; i < ggml_nelements(result_im2col); i++) {
            if (fabsf(data_a[i] - data_b[i]) > 1e-4f) {
                diffs.push_back(fabsf(data_a[i] - data_b[i]));
                match = false;
            }
        }
    } else {
        match = false;
    }
    
    // Calculate speedup
    double im2col_ms = im2col_time / 1000.0;
    double simd_ms = simd_time / 1000.0;
    double speedup = im2col_ms / simd_ms;
    const char* faster = speedup > 1.0 ? "SIMD" : "IM2COL";
    if (speedup < 1.0) speedup = 1.0 / speedup;
    
    // Print table row
    printf("| %dx%dx%d | %dx%dx%d→%d | s%d p%d | %6.3f | %6.3f | %5.2fx %s | %s | F32 |\n",
           IW, IH, IC, KW, KH, IC, OC, stride, padding,
           im2col_ms, simd_ms, speedup, faster,
           match ? "✅" : "❌");

    if(!match) {
        printf("NumDiffers: %ld, MaxDifference: %f\n", diffs.size(), *std::max_element(diffs.begin(), diffs.end()));
    }
    
    ggml_backend_free(backend);
    ggml_free(ctx1);
    ggml_free(ctx2);
 }

 // FP16 performance test function - compare new fp16 implementation with old fp32 baseline
 static void test_performance_fp16(int IW, int IH, int IC, int N, int KW, int KH, int OC, 
                          int stride, int padding, int dilation) {
    // Create contexts
    struct ggml_init_params params = {
        .mem_size = 128*1024*1024*10,
        .mem_buffer = NULL,
        .no_alloc = false,
    };
    
    struct ggml_context * ctx1 = ggml_init(params);
    struct ggml_context * ctx2 = ggml_init(params);
    
    if (!ctx1 || !ctx2) return;
    
    // Create tensors - NEW: fp16 kernel, OLD: fp32 kernel
    struct ggml_tensor * input1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, IW, IH, IC, N);
    struct ggml_tensor * kernel1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F16, KW, KH, IC, OC); // NEW: FP16
    
    // Initialize data
    srand(42);
    float * input1_data = (float*)input1->data;
    ggml_fp16_t * kernel1_data = (ggml_fp16_t*)kernel1->data;
    
    for (int i = 0; i < ggml_nelements(input1); i++) {
        float val = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
        input1_data[i] = val;
    }
    
    for (int i = 0; i < ggml_nelements(kernel1); i++) {
        float val = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
        kernel1_data[i] = ggml_fp32_to_fp16(val);
    }
    
    // Create operations
    struct ggml_tensor * result_new = ggml_conv_2d_direct(ctx1, kernel1, input1, 
                                                   stride, stride, padding, padding, 
                                                   dilation, dilation);
    struct ggml_tensor * result_old = ggml_conv_2d(ctx2, kernel1, input1,
                                                   stride, stride, padding, padding,
                                                   dilation, dilation);
    
    if (!result_new || !result_old) {
        ggml_free(ctx1);
        ggml_free(ctx2);
        return;
    }
    
    // Build graphs
    struct ggml_cgraph * graph1 = ggml_new_graph(ctx1);
    struct ggml_cgraph * graph2 = ggml_new_graph(ctx2);
    ggml_build_forward_expand(graph1, result_new);
    ggml_build_forward_expand(graph2, result_old);
    
    ggml_backend_t backend = ggml_backend_cpu_init();
    if (!backend) {
        ggml_free(ctx1);
        ggml_free(ctx2);
        return;
    }
    
    // Time NEW (fp16)
    int64_t start = ggml_time_us();
    ggml_backend_graph_compute(backend, graph1);
    int64_t new_time = ggml_time_us() - start;
    
    // Time OLD (fp32)
    start = ggml_time_us();
    ggml_backend_graph_compute(backend, graph2);
    int64_t old_time = ggml_time_us() - start;
    
    // Compare results
    bool match = true;
    std::vector<float> diffs;
    if (ggml_nelements(result_new) == ggml_nelements(result_old)) {
        float * data_new = (float*)result_new->data;
        float * data_old = (float*)result_old->data;
        for (size_t i = 0; i < ggml_nelements(result_new); i++) {
            if (fabsf(data_new[i] - data_old[i]) > 2e-2f) { // Slightly higher tolerance for fp16
                diffs.push_back(fabsf(data_new[i] - data_old[i]));
                match = false;
            }
        }
    } else {
        match = false;
    }
    
    // Calculate speedup
    double new_ms = new_time / 1000.0;
    double old_ms = old_time / 1000.0;
    double speedup = old_ms / new_ms;
    const char* faster = speedup > 1.0 ? "NEW" : "OLD";
    if (speedup < 1.0) speedup = 1.0 / speedup;
    
    // Print table row
    printf("| %dx%dx%d | %dx%dx%d→%d | s%d p%d | %6.3f | %6.3f | %5.2fx %s | %s | F16 |\n",
           IW, IH, IC, KW, KH, IC, OC, stride, padding,
           old_ms, new_ms, speedup, faster,
           match ? "✅" : "❌");

    if(!match) {
        printf("NumDiffers: %ld, MaxDifference: %f\n", diffs.size(), *std::max_element(diffs.begin(), diffs.end()));
    }
    
    ggml_backend_free(backend);
    ggml_free(ctx1);
    ggml_free(ctx2);
 }

 // Demonstrate how a 2x2x2x1 F32 tensor is laid out in memory.
 //
 // Prints the tensor's shape (ne) and byte strides (nb), writes the pattern
 // c*100 + h*10 + w through a hand-computed linear index, then reads every
 // element back through the byte strides so the two addressing schemes can be
 // compared in the output.
 void debug_tensor_layout() {
    struct ggml_init_params params = {
        .mem_size = 128*1024*1024,
        .mem_buffer = NULL,
        .no_alloc = false,
    };

    struct ggml_context * ctx = ggml_init(params);
    if (!ctx) {
        fprintf(stderr, "debug_tensor_layout: ggml_init failed\n");
        return;
    }

    // Create a simple 2x2x2x1 tensor (W=2, H=2, C=2, N=1)
    struct ggml_tensor * tensor = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 2, 2, 2, 1);

    // ne[] is int64_t and nb[] is size_t: neither is portably printable with
    // %ld, so cast / use %zu respectively.
    printf("Tensor shape: [%lld, %lld, %lld, %lld]\n", 
           (long long)tensor->ne[0], (long long)tensor->ne[1],
           (long long)tensor->ne[2], (long long)tensor->ne[3]);
    printf("Tensor strides (bytes): [%zu, %zu, %zu, %zu]\n", 
           tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[3]);

    // Fill with known pattern: channel*100 + h*10 + w
    float * data = (float*)tensor->data;
    for (int n = 0; n < 1; n++) {
        for (int c = 0; c < 2; c++) {
            for (int h = 0; h < 2; h++) {
                for (int w = 0; w < 2; w++) {
                    const float value = c * 100 + h * 10 + w;
                    // Linear index with w varying fastest, then h, c, n.
                    const int idx_linear = n * (2*2*2) + c * (2*2) + h * 2 + w;
                    data[idx_linear] = value;
                    printf("Set [n=%d,c=%d,h=%d,w=%d] = %.0f at linear index %d\n", n, c, h, w, value, idx_linear);
                }
            }
        }
    }

    printf("\nReading back using stride calculation:\n");
    const char * base = (const char*)tensor->data;
    for (int n = 0; n < 1; n++) {
        for (int c = 0; c < 2; c++) {
            for (int h = 0; h < 2; h++) {
                for (int w = 0; w < 2; w++) {
                    // Calculate the element address from the byte strides
                    const float * ptr = (const float*)(base + n * tensor->nb[3] + c * tensor->nb[2] + h * tensor->nb[1] + w * tensor->nb[0]);
                    printf("Read [n=%d,c=%d,h=%d,w=%d] = %.0f using strides\n", n, c, h, w, *ptr);
                }
            }
        }
    }

    ggml_free(ctx);
 }

 void test_simple_conv2d() {
    printf("=== Simple Conv2D Test (2x2x2 -> 2x2x1) ===\n");
    
    struct ggml_init_params params = {
        .mem_size = 128*1024*1024,
        .mem_buffer = NULL,
        .no_alloc = false,
    };
    
    struct ggml_context * ctx1 = ggml_init(params);
    struct ggml_context * ctx2 = ggml_init(params);
    
    // Very simple: 2x2 input, 2 channels, 1x1 kernel, no padding/stride
    // Input: 2x2x2x1 (W=2, H=2, C=2, N=1)
    // Kernel: 1x1x2x1 (KW=1, KH=1, IC=2, OC=1)  
    // Output: 2x2x1x1
    struct ggml_tensor * input1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, 2, 2, 2, 1);
    struct ggml_tensor * kernel1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, 1, 1, 2, 1);
    struct ggml_tensor * input2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F32, 2, 2, 2, 1);
    struct ggml_tensor * kernel2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F32, 1, 1, 2, 1);
    
    // Fill input with simple pattern (same data for both)
    printf("Input tensor (WHCN layout):\n");
    for (int c = 0; c < 2; c++) {
        for (int h = 0; h < 2; h++) {
            for (int w = 0; w < 2; w++) {
                float val = c * 10 + h * 2 + w + 1; // channel*10 + position + 1
                
                // Fill input1
                float * ptr1 = (float*)((char*)input1->data + w * input1->nb[0] + h * input1->nb[1] + c * input1->nb[2]);
                *ptr1 = val;
                
                // Fill input2 with same data
                float * ptr2 = (float*)((char*)input2->data + w * input2->nb[0] + h * input2->nb[1] + c * input2->nb[2]);
                *ptr2 = val;
                
                printf("  input[w=%d,h=%d,c=%d] = %.0f\n", w, h, c, val);
            }
        }
    }
    
    // Fill kernel: [1, 1] (so output = 1*ch0 + 1*ch1) - same for both
    float * kernel1_data = (float*)kernel1->data;
    float * kernel2_data = (float*)kernel2->data;
    kernel1_data[0] = kernel2_data[0] = 1.0f; // weight for channel 0
    kernel1_data[1] = kernel2_data[1] = 1.0f; // weight for channel 1
    printf("\nKernel weights: [%.0f, %.0f]\n", kernel1_data[0], kernel1_data[1]);
    
    // Expected output calculation by hand:
    printf("\nExpected output (manual calculation):\n");
    for (int h = 0; h < 2; h++) {
        for (int w = 0; w < 2; w++) {
            float ch0_val = 0 * 10 + h * 2 + w + 1;  // channel 0 value
            float ch1_val = 1 * 10 + h * 2 + w + 1;  // channel 1 value  
            float expected = ch0_val * 1.0f + ch1_val * 1.0f;
            printf("  expected[w=%d,h=%d] = %.0f*1 + %.0f*1 = %.0f\n", w, h, ch0_val, ch1_val, expected);
        }
    }
    
    // Create operations: im2col+gemm vs direct
    struct ggml_tensor * result_im2col = ggml_conv_2d(ctx1, kernel1, input1, 1, 1, 0, 0, 1, 1);
    struct ggml_tensor * result_direct = ggml_conv_2d_direct(ctx2, kernel2, input2, 1, 1, 0, 0, 1, 1);
    
    // Build and compute graphs
    struct ggml_cgraph * graph1 = ggml_new_graph(ctx1);
    struct ggml_cgraph * graph2 = ggml_new_graph(ctx2);
    ggml_build_forward_expand(graph1, result_im2col);
    ggml_build_forward_expand(graph2, result_direct);
    
    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_graph_compute(backend, graph1);
    ggml_backend_graph_compute(backend, graph2);
    
    // Check actual outputs
    float * result_im2col_data = (float*)result_im2col->data;
    float * result_direct_data = (float*)result_direct->data;
    
    printf("\nIM2COL+GEMM output:\n");
    for (int h = 0; h < 2; h++) {
        for (int w = 0; w < 2; w++) {
            int idx = h * 2 + w;
            printf("  im2col[w=%d,h=%d] = %.0f\n", w, h, result_im2col_data[idx]);
        }
    }
    
    printf("\nDirect/Tiled output:\n");
    for (int h = 0; h < 2; h++) {
        for (int w = 0; w < 2; w++) {
            int idx = h * 2 + w;
            printf("  direct[w=%d,h=%d] = %.0f\n", w, h, result_direct_data[idx]);
        }
    }
    
    // Compare them
    bool match = true;
    for (int i = 0; i < 4; i++) {
        if (fabs(result_im2col_data[i] - result_direct_data[i]) > 1e-4) {
            match = false;
            break;
        }
    }
    
    printf("\nComparison: %s\n", match ? "✅ MATCH" : "❌ MISMATCH");
    
    ggml_backend_free(backend);
    ggml_free(ctx1);
    ggml_free(ctx2);
    printf("\n");
 }

 void test_larger_conv2d() {
    printf("=== Larger Conv2D Test (3x3x3 -> 2x2x1) ===\n");
    
    struct ggml_init_params params = {
        .mem_size = 128*1024*1024,
        .mem_buffer = NULL,
        .no_alloc = false,
    };
    
    struct ggml_context * ctx = ggml_init(params);
    
    // Larger test: 3x3 input, 3 channels, 2x2 kernel, no padding/stride
    // Input: 3x3x3x1 (W=3, H=3, C=3, N=1)
    // Kernel: 2x2x3x1 (KW=2, KH=2, IC=3, OC=1)  
    // Output: 2x2x1x1 (since (3-2)/1+1 = 2)
    struct ggml_tensor * input = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 3, 1);
    struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 2, 2, 3, 1);
    
    // Fill input with simple incremental pattern
    float * input_data = (float*)input->data;
    printf("Input tensor (WHCN layout):\n");
    float input_val = 1.0f;
    for (int c = 0; c < 3; c++) {
        printf("  Channel %d:\n", c);
        for (int h = 0; h < 3; h++) {
            printf("    ");
            for (int w = 0; w < 3; w++) {
                float * ptr = (float*)((char*)input_data + w * input->nb[0] + h * input->nb[1] + c * input->nb[2]);
                *ptr = input_val++;
                printf("%.0f ", *ptr);
            }
            printf("\n");
        }
    }
    
    // Fill kernel with different weights per channel to make it interesting
    float * kernel_data = (float*)kernel->data;
    printf("\nKernel weights (2x2 per channel):\n");
    float kernel_weights[3][2][2] = {
        {{1, 2}, {3, 4}},   // Channel 0: weights 1,2,3,4
        {{0.5, 1.5}, {2.5, 3.5}},  // Channel 1: weights 0.5,1.5,2.5,3.5  
        {{2, 1}, {0.5, 1.5}}  // Channel 2: weights 2,1,0.5,1.5
    };
    
    int idx = 0;
    for (int c = 0; c < 3; c++) {
        printf("  Channel %d:\n", c);
        for (int h = 0; h < 2; h++) {
            printf("    ");
            for (int w = 0; w < 2; w++) {
                kernel_data[idx] = kernel_weights[c][h][w];
                printf("%.1f ", kernel_data[idx]);
                idx++;
            }
            printf("\n");
        }
    }
    
    // Manual calculation of expected output
    printf("\nExpected output (manual calculation):\n");
    float expected[2][2];
    for (int out_h = 0; out_h < 2; out_h++) {
        for (int out_w = 0; out_w < 2; out_w++) {
            float sum = 0.0f;
            printf("  Output[%d,%d]: ", out_w, out_h);
            
            // For each channel
            for (int c = 0; c < 3; c++) {
                printf("(");
                // For each kernel position
                for (int kh = 0; kh < 2; kh++) {
                    for (int kw = 0; kw < 2; kw++) {
                        int input_h = out_h + kh;
                        int input_w = out_w + kw;
                        float input_val = c * 9 + input_h * 3 + input_w + 1;
                        float kernel_val = kernel_weights[c][kh][kw];
                        sum += input_val * kernel_val;
                        printf("%.0f*%.1f", input_val, kernel_val);
                        if (!(kh == 1 && kw == 1)) printf("+");
                    }
                }
                printf(")");
                if (c < 2) printf("+");
            }
            expected[out_h][out_w] = sum;
            printf(" = %.1f\n", sum);
        }
    }
    
    // Create conv2d operation
    struct ggml_tensor * result = ggml_conv_2d(ctx, kernel, input, 1, 1, 0, 0, 1, 1);
    
    // Build and compute graph
    struct ggml_cgraph * graph = ggml_new_graph(ctx);
    ggml_build_forward_expand(graph, result);
    
    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_graph_compute(backend, graph);
    
    // Check actual output
    float * result_data = (float*)result->data;
    printf("\nActual output:\n");
    bool all_match = true;
    for (int h = 0; h < 2; h++) {
        for (int w = 0; w < 2; w++) {
            int idx = h * 2 + w;
            printf("  actual[%d,%d] = %.1f\n", w, h, result_data[idx]);
            if (fabs(result_data[idx] - expected[h][w]) > 1e-4) {
                all_match = false;
            }
        }
    }
    
    if (all_match) {
        printf("✅ PASS: Large test matches expected result\n");
    } else {
        printf("❌ FAIL: Some outputs don't match expected values\n");
    }
    
    ggml_backend_free(backend);
    ggml_free(ctx);
    printf("\n");
 }

 void test_mixed_precision_conv2d() {
    printf("=== Mixed Precision Conv2D Test (F16 kernel, F32 input -> F16 output) ===\n");
    
    struct ggml_init_params params = {
        .mem_size = 128*1024*1024,
        .mem_buffer = NULL,
        .no_alloc = false,
    };
    
    struct ggml_context * ctx1 = ggml_init(params);
    struct ggml_context * ctx2 = ggml_init(params);
    
    // Mixed precision: F32 input, F16 kernel -> F16 output
    // Input: 2x2x2x1 (W=2, H=2, C=2, N=1) - F32
    // Kernel: 1x1x2x1 (KW=1, KH=1, IC=2, OC=1) - F16
    // Output: 2x2x1x1 - F16
    struct ggml_tensor * input1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, 2, 2, 2, 1);
    struct ggml_tensor * kernel1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F16, 1, 1, 2, 1);
    struct ggml_tensor * input2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F32, 2, 2, 2, 1);
    struct ggml_tensor * kernel2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F16, 1, 1, 2, 1);
    
    // Fill F32 input with simple pattern (same data for both)
    printf("Input tensor (WHCN layout, F32):\n");
    for (int c = 0; c < 2; c++) {
        for (int h = 0; h < 2; h++) {
            for (int w = 0; w < 2; w++) {
                float val = c * 10 + h * 2 + w + 1; // channel*10 + position + 1
                
                // Fill input1
                float * ptr1 = (float*)((char*)input1->data + w * input1->nb[0] + h * input1->nb[1] + c * input1->nb[2]);
                *ptr1 = val;
                
                // Fill input2 with same data
                float * ptr2 = (float*)((char*)input2->data + w * input2->nb[0] + h * input2->nb[1] + c * input2->nb[2]);
                *ptr2 = val;
                
                printf("  input[w=%d,h=%d,c=%d] = %.0f (F32)\n", w, h, c, val);
            }
        }
    }
    
    // Fill F16 kernel: [1, 1] (so output = 1*ch0 + 1*ch1) - same for both
    ggml_fp16_t * kernel1_data = (ggml_fp16_t*)kernel1->data;
    ggml_fp16_t * kernel2_data = (ggml_fp16_t*)kernel2->data;
    kernel1_data[0] = kernel2_data[0] = ggml_fp32_to_fp16(1.0f); // weight for channel 0
    kernel1_data[1] = kernel2_data[1] = ggml_fp32_to_fp16(1.0f); // weight for channel 1
    printf("\nKernel weights (F16): [%.0f, %.0f]\n", 
           ggml_fp16_to_fp32(kernel1_data[0]), ggml_fp16_to_fp32(kernel1_data[1]));
    
    // Expected output calculation by hand:
    printf("\nExpected output (manual calculation):\n");
    for (int h = 0; h < 2; h++) {
        for (int w = 0; w < 2; w++) {
            float ch0_val = 0 * 10 + h * 2 + w + 1;  // channel 0 value
            float ch1_val = 1 * 10 + h * 2 + w + 1;  // channel 1 value  
            float expected = ch0_val * 1.0f + ch1_val * 1.0f;
            printf("  expected[w=%d,h=%d] = %.0f*1 + %.0f*1 = %.0f\n", w, h, ch0_val, ch1_val, expected);
        }
    }
    
    // Create operations: im2col+gemm vs direct
    struct ggml_tensor * result_im2col = ggml_conv_2d(ctx1, kernel1, input1, 1, 1, 0, 0, 1, 1);
    struct ggml_tensor * result_direct = ggml_conv_2d_direct(ctx2, kernel2, input2, 1, 1, 0, 0, 1, 1);
    
    // Build and compute graphs
    struct ggml_cgraph * graph1 = ggml_new_graph(ctx1);
    struct ggml_cgraph * graph2 = ggml_new_graph(ctx2);
    ggml_build_forward_expand(graph1, result_im2col);
    ggml_build_forward_expand(graph2, result_direct);
    
    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_graph_compute(backend, graph1);
    ggml_backend_graph_compute(backend, graph2);
    
    // Check actual outputs (mixed precision produces F16 outputs)
    ggml_fp16_t * result_im2col_data = (ggml_fp16_t*)result_im2col->data;
    ggml_fp16_t * result_direct_data = (ggml_fp16_t*)result_direct->data;
    
    printf("\nIM2COL+GEMM output (mixed precision -> F16):\n");
    for (int h = 0; h < 2; h++) {
        for (int w = 0; w < 2; w++) {
            int idx = h * 2 + w;
            printf("  im2col[w=%d,h=%d] = %.1f\n", w, h, ggml_fp16_to_fp32(result_im2col_data[idx]));
        }
    }
    
    printf("\nDirect/Tiled output (mixed precision -> F16):\n");
    for (int h = 0; h < 2; h++) {
        for (int w = 0; w < 2; w++) {
            int idx = h * 2 + w;
            printf("  direct[w=%d,h=%d] = %.1f\n", w, h, ggml_fp16_to_fp32(result_direct_data[idx]));
        }
    }
    
    // Compare them (with tolerance for mixed precision)
    bool match = true;
    float tolerance = 1e-3f; // Tolerance for mixed precision
    for (int i = 0; i < 4; i++) {
        float val_im2col = ggml_fp16_to_fp32(result_im2col_data[i]);
        float val_direct = ggml_fp16_to_fp32(result_direct_data[i]);
        if (fabs(val_im2col - val_direct) > tolerance) {
            match = false;
            break;
        }
    }
    
    printf("\nMixed Precision Comparison: %s\n", match ? "✅ MATCH" : "❌ MISMATCH");
    printf("(F16 kernel + F32 input -> F16 output)\n");
    
    ggml_backend_free(backend);
    ggml_free(ctx1);
    ggml_free(ctx2);
    printf("\n");
 }

 int main() {
    ggml_time_init(); // Initialize GGML timing
    
    printf("=== Debug Tensor Layout ===\n");
    debug_tensor_layout();
    printf("\n");
    
    test_simple_conv2d();
    test_larger_conv2d();
    
    // Mixed precision test
    printf("=== Multi-Output Test (2x2x2 -> 2x2x2) ===\n");
    
    struct ggml_init_params params2 = {
        .mem_size = 128*1024*1024,
        .mem_buffer = NULL,
        .no_alloc = false,
    };
    
    struct ggml_context * ctx3 = ggml_init(params2);
    struct ggml_context * ctx4 = ggml_init(params2);
    
    // Test: 2x2 input, 2 channels, 1x1 kernel, 2 output channels
    // Input: 2x2x2x1 (W=2, H=2, C=2, N=1)
    // Kernel: 1x1x2x2 (KW=1, KH=1, IC=2, OC=2)  
    // Output: 2x2x2x1
    struct ggml_tensor * input3 = ggml_new_tensor_4d(ctx3, GGML_TYPE_F32, 2, 2, 2, 1);
    struct ggml_tensor * kernel3 = ggml_new_tensor_4d(ctx3, GGML_TYPE_F32, 1, 1, 2, 2);
    struct ggml_tensor * input4 = ggml_new_tensor_4d(ctx4, GGML_TYPE_F32, 2, 2, 2, 1);
    struct ggml_tensor * kernel4 = ggml_new_tensor_4d(ctx4, GGML_TYPE_F32, 1, 1, 2, 2);
    
    // Fill input with same pattern as before
    printf("Input tensor (WHCN layout):\n");
    for (int c = 0; c < 2; c++) {
        for (int h = 0; h < 2; h++) {
            for (int w = 0; w < 2; w++) {
                float val = c * 10 + h * 2 + w + 1;
                
                float * ptr3 = (float*)((char*)input3->data + w * input3->nb[0] + h * input3->nb[1] + c * input3->nb[2]);
                float * ptr4 = (float*)((char*)input4->data + w * input4->nb[0] + h * input4->nb[1] + c * input4->nb[2]);
                *ptr3 = *ptr4 = val;
                
                printf("  input[w=%d,h=%d,c=%d] = %.0f\n", w, h, c, val);
            }
        }
    }
    
    // Fill kernel with different weights for each output channel
    float * kernel3_data = (float*)kernel3->data;
    float * kernel4_data = (float*)kernel4->data;
    
    // Output channel 0: weights [1, 2] (for input channels 0,1)
    // Output channel 1: weights [0.5, 1.5] (for input channels 0,1)
    kernel3_data[0] = kernel4_data[0] = 1.0f;   // OC=0, IC=0
    kernel3_data[1] = kernel4_data[1] = 2.0f;   // OC=0, IC=1
    kernel3_data[2] = kernel4_data[2] = 0.5f;   // OC=1, IC=0
    kernel3_data[3] = kernel4_data[3] = 1.5f;   // OC=1, IC=1
    
    printf("\nKernel weights:\n");
    printf("  Output channel 0: [%.1f, %.1f]\n", kernel3_data[0], kernel3_data[1]);
    printf("  Output channel 1: [%.1f, %.1f]\n", kernel3_data[2], kernel3_data[3]);
    
    // Expected output calculation
    printf("\nExpected output (manual calculation):\n");
    for (int oc = 0; oc < 2; oc++) {
        printf("  Output channel %d:\n", oc);
        for (int h = 0; h < 2; h++) {
            for (int w = 0; w < 2; w++) {
                float ch0_val = 0 * 10 + h * 2 + w + 1;  // input channel 0
                float ch1_val = 1 * 10 + h * 2 + w + 1;  // input channel 1
                float weight0 = kernel3_data[oc * 2 + 0];
                float weight1 = kernel3_data[oc * 2 + 1];
                float expected = ch0_val * weight0 + ch1_val * weight1;
                printf("    expected[w=%d,h=%d] = %.0f*%.1f + %.0f*%.1f = %.1f\n", 
                       w, h, ch0_val, weight0, ch1_val, weight1, expected);
            }
        }
    }
    
    // Create operations
    struct ggml_tensor * result3 = ggml_conv_2d(ctx3, kernel3, input3, 1, 1, 0, 0, 1, 1);
    struct ggml_tensor * result4 = ggml_conv_2d_direct(ctx4, kernel4, input4, 1, 1, 0, 0, 1, 1);
    
    // Build and compute graphs
    struct ggml_cgraph * graph3 = ggml_new_graph(ctx3);
    struct ggml_cgraph * graph4 = ggml_new_graph(ctx4);
    ggml_build_forward_expand(graph3, result3);
    ggml_build_forward_expand(graph4, result4);
    
    ggml_backend_t backend2 = ggml_backend_cpu_init();
    ggml_backend_graph_compute(backend2, graph3);
    ggml_backend_graph_compute(backend2, graph4);
    
    // Check outputs
    float * result3_data = (float*)result3->data;
    float * result4_data = (float*)result4->data;
    
    printf("\nIM2COL+GEMM output:\n");
    for (int oc = 0; oc < 2; oc++) {
        printf("  Output channel %d:\n", oc);
        for (int h = 0; h < 2; h++) {
            for (int w = 0; w < 2; w++) {
                int idx = oc * 4 + h * 2 + w;  // Assuming WHCN layout for output
                printf("    im2col[w=%d,h=%d] = %.1f\n", w, h, result3_data[idx]);
            }
        }
    }
    
    printf("\nDirect/Tiled output:\n");
    for (int oc = 0; oc < 2; oc++) {
        printf("  Output channel %d:\n", oc);
        for (int h = 0; h < 2; h++) {
            for (int w = 0; w < 2; w++) {
                int idx = oc * 4 + h * 2 + w;  // Assuming WHCN layout for output
                printf("    direct[w=%d,h=%d] = %.1f\n", w, h, result4_data[idx]);
            }
        }
    }
    
    // Compare
    bool match2 = true;
    for (int i = 0; i < 8; i++) {
        if (fabs(result3_data[i] - result4_data[i]) > 1e-4) {
            match2 = false;
            break;
        }
    }
    
    printf("\nMulti-output comparison: %s\n", match2 ? "✅ MATCH" : "❌ MISMATCH");
    
    ggml_backend_free(backend2);
    ggml_free(ctx3);
    ggml_free(ctx4);
    
    printf("Done with focused tests.\n\n");
    
    printf("# Conv2D Implementation Performance Comparison\n\n");
    // === FP16 vs FP32 Performance Tests ===

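    test_performance_fp16(112, 112, 32, 1, 2, 2, 64, 1, 0, 1); // 2x2 kernel, 32 -> 64 channels (NOTE: not pointwise as written; a pointwise conv would use a 1x1 kernel - confirm intent)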
    test_performance_fp16(56, 56, 64, 1, 1, 1, 128, 1, 0, 1);  // pointwise
    test_performance_fp16(112, 112, 32, 1, 3, 3, 32, 1, 1, 1); // depthwise (groups=32)
    test_performance_fp16(512, 512, 3, 8, 7, 7, 64, 2, 3, 1);   // Large ResNet50 first layer
   
    printf("\n# IM2COL vs SIMD Performance Tests (FP32)\n");
    printf("| Input Size | Kernel | Config | IM2COL (ms) | SIMD (ms) | Speedup | Match | Type |\n");
    printf("|------------|--------|--------|-------------|-----------|---------|-------|------|\n");
    
    // === ResNet-style convolutions ===
    // ResNet50 first conv layer (ImageNet input)
    test_performance(224, 224, 3, 1, 7, 7, 64, 2, 3, 1);
    
    // ResNet50 conv2_x layers  
    test_performance(56, 56, 64, 1, 1, 1, 64, 1, 0, 1);   // 1x1 bottleneck
    test_performance(56, 56, 64, 1, 3, 3, 64, 1, 1, 1);   // 3x3 conv
    test_performance(56, 56, 64, 1, 1, 1, 256, 1, 0, 1);  // 1x1 expansion
    
    // ResNet50 conv3_x layers
    test_performance(28, 28, 256, 1, 1, 1, 128, 1, 0, 1); // 1x1 bottleneck
    test_performance(28, 28, 128, 1, 3, 3, 128, 1, 1, 1); // 3x3 conv
    test_performance(28, 28, 128, 1, 1, 1, 512, 1, 0, 1); // 1x1 expansion
    
    // ResNet50 conv4_x layers  
    test_performance(14, 14, 512, 1, 1, 1, 256, 1, 0, 1); // 1x1 bottleneck
    test_performance(14, 14, 256, 1, 3, 3, 256, 1, 1, 1); // 3x3 conv
    test_performance(14, 14, 256, 1, 1, 1, 1024, 1, 0, 1);// 1x1 expansion
    
    // ResNet50 conv5_x layers
    test_performance(7, 7, 1024, 1, 1, 1, 512, 1, 0, 1);  // 1x1 bottleneck
    test_performance(7, 7, 512, 1, 3, 3, 512, 1, 1, 1);   // 3x3 conv
    test_performance(7, 7, 512, 1, 1, 1, 2048, 1, 0, 1);  // 1x1 expansion
    
    // === VGG-style convolutions ===
    // VGG16 early layers
    test_performance(224, 224, 3, 1, 3, 3, 64, 1, 1, 1);  // conv1_1
    test_performance(224, 224, 16, 1, 3, 3, 64, 1, 1, 1); // conv1_2
    test_performance(112, 112, 64, 1, 3, 3, 128, 1, 1, 1);// conv2_1
    test_performance(112, 112, 16, 1, 3, 3, 16, 1, 1, 1);// conv2_2
    
    // VGG16 middle layers  
    test_performance(56, 56, 128, 1, 3, 3, 256, 1, 1, 1); // conv3_1
    test_performance(56, 56, 256, 1, 3, 3, 256, 1, 1, 1); // conv3_2,3_3
    test_performance(28, 28, 256, 1, 3, 3, 512, 1, 1, 1); // conv4_1
    test_performance(28, 28, 512, 1, 3, 3, 512, 1, 1, 1); // conv4_2,4_3
    
    // === MobileNet-style convolutions ===
    // MobileNetV1 depthwise separable convs
    test_performance(112, 112, 32, 1, 3, 3, 32, 1, 1, 1); // depthwise (groups=32)
    test_performance(112, 112, 32, 1, 1, 1, 64, 1, 0, 1); // pointwise
    test_performance(56, 56, 64, 1, 1, 1, 128, 1, 0, 1);  // pointwise
    test_performance(28, 28, 128, 1, 1, 1, 256, 1, 0, 1); // pointwise
    
    // === EfficientNet-style convolutions ===
    // EfficientNet-B0 layers
    test_performance(224, 224, 3, 1, 3, 3, 32, 2, 1, 1);  // stem conv
    test_performance(112, 112, 16, 1, 3, 3, 96, 1, 1, 1); // expand conv
    test_performance(112, 112, 24, 1, 1, 1, 14, 1, 0, 1);// expand conv
    test_performance(56, 56, 40, 1, 1, 1, 24, 1, 0, 1);  // expand conv
    
    // === Modern CNN typical layers ===
    // ConvNeXt-style convolutions
    test_performance(56, 56, 96, 1, 7, 7, 96, 1, 3, 1);   // large kernel depthwise
    test_performance(28, 28, 192, 1, 7, 7, 192, 1, 3, 1); // large kernel depthwise
    test_performance(14, 14, 384, 1, 7, 7, 384, 1, 3, 1); // large kernel depthwise
    
    // === Batch processing scenarios ===
    // Inference with batch size 4 (common for edge devices)
    test_performance(224, 224, 3, 4, 3, 3, 64, 1, 1, 1);  // batch input processing
    test_performance(56, 56, 64, 4, 3, 3, 64, 1, 1, 1);   // batch feature processing
    test_performance(28, 28, 128, 4, 1, 1, 64, 1, 0, 1); // batch pointwise

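    test_performance(512, 512, 3, 1, 3, 3, 1, 1, 0, 1); // 3x3 conv, batch 1, single output channel, no padding
    test_performance(256, 512, 3, 1, 3, 3, 9, 1, 0, 1); // 3x3 conv, batch 1, non-square input, no padding
    test_performance(896, 896, 1, 1, 3, 3, 1, 1, 0, 1); // 3x3 conv, batch 1, single input/output channel, no padding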

    // CLIP-style test
    test_performance(224, 224, 3, 1, 3, 3, 768, 1, 1, 2); // 3x3 conv, batch 1, 3 -> 768 channels; last argument is 2 here (1 in every other call)
    
    printf("\n");
    
    printf("\n# Large Conv2D Performance Tests (Realistic Neural Network Sizes)\n");
    printf("| Input Size | Kernel | Config | IM2COL (ms) | SIMD (ms) | Speedup | Match | Type |\n");
    printf("|------------|--------|--------|-------------|-----------|---------|-------|------|\n");
    
    // === Large ResNet-style convolutions ===
    test_performance(512, 512, 3, 8, 7, 7, 64, 2, 3, 1);   // Large input ResNet50 first layer (8 batch)
    
    printf("\n# FP16 Large Conv2D Performance Tests\n");
    printf("| Input Size | Kernel | Config | FP32 (ms) | FP16 (ms) | Speedup | Match | Type |\n");
    printf("|------------|--------|--------|-----------|-----------|---------|-------|------|\n");
    test_performance_fp16(512, 512, 3, 8, 7, 7, 64, 2, 3, 1);   // Large input ResNet50 first layer (8 batch)
    
    // === Large FP16 vs FP32 tests ===
    
    return 0;
 }
No results found