Skip to content

Instantly share code, notes, and snippets.

@am17an
Last active June 30, 2025 14:30
Show Gist options
  • Select an option

  • Save am17an/38713d4d2ac1517d7d315321cb745db2 to your computer and use it in GitHub Desktop.

Select an option

Save am17an/38713d4d2ac1517d7d315321cb745db2 to your computer and use it in GitHub Desktop.
Vibe-coded performance benchmark for ggml conv2d
#include "ggml.h"
#include "ggml-cpu.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <algorithm>
#include <vector>
// Print every element of a 4-D tensor, one [n, c] plane at a time.
//
//   name   - label printed in the shape header and before each plane
//   tensor - tensor to dump; its data is read as float, so it must hold F32
//
// Elements are located through the tensor's byte strides (nb[]), so the dump
// does not depend on the data being densely packed.
static void print_tensor(const char* name, struct ggml_tensor * tensor) {
    // int64_t is `long` on some ABIs and `long long` on others; cast so the
    // conversion specifier matches the argument type everywhere.
    printf("\n%s shape: [%lld, %lld, %lld, %lld]\n", name,
           (long long)tensor->ne[0], (long long)tensor->ne[1],
           (long long)tensor->ne[2], (long long)tensor->ne[3]);

    const char * base = (const char *)tensor->data;
    const int64_t W = tensor->ne[0];
    const int64_t H = tensor->ne[1];
    const int64_t C = tensor->ne[2];
    const int64_t N = tensor->ne[3];

    for (int64_t n = 0; n < N; n++) {
        for (int64_t c = 0; c < C; c++) {
            printf("%s[%lld,%lld]:\n", name, (long long)n, (long long)c);
            for (int64_t h = 0; h < H; h++) {
                printf(" ");
                for (int64_t w = 0; w < W; w++) {
                    const float * elem = (const float *)(base
                        + n * tensor->nb[3]
                        + c * tensor->nb[2]
                        + h * tensor->nb[1]
                        + w * tensor->nb[0]);
                    printf("%7.3f ", *elem);
                }
                printf("\n");
            }
        }
    }
}
// Compare two tensors element by element within an absolute tolerance.
//
//   a, b      - tensors to compare; their data is read as float (F32)
//   tolerance - largest absolute difference still treated as equal
//
// Returns true when both tensors have the same shape and no element differs by
// more than `tolerance`. Prints the first 10 offending elements and a summary
// line with the largest difference seen.
static bool tensors_equal(struct ggml_tensor * a, struct ggml_tensor * b, float tolerance) {
    const int64_t n_a = ggml_nelements(a);
    const int64_t n_b = ggml_nelements(b);

    // Counts and indices are 64-bit signed values; print them with %lld and an
    // explicit cast instead of %zu, which expects size_t.
    if (n_a != n_b) {
        printf("Different number of elements: %lld vs %lld\n", (long long)n_a, (long long)n_b);
        return false;
    }

    for (int i = 0; i < 4; i++) {
        if (a->ne[i] != b->ne[i]) {
            printf("Shape mismatch at dim %d: %lld vs %lld\n", i,
                   (long long)a->ne[i], (long long)b->ne[i]);
            return false;
        }
    }

    const float * data_a = (const float *)a->data;
    const float * data_b = (const float *)b->data;

    float   max_diff   = 0.0f;
    int64_t diff_count = 0;

    // 64-bit index: an int counter could overflow before reaching n_a.
    for (int64_t i = 0; i < n_a; i++) {
        const float diff = fabsf(data_a[i] - data_b[i]);
        if (diff > tolerance) {
            diff_count++;
            if (diff_count <= 10) { // Show first 10 differences
                printf("Diff at index %lld: %.6f vs %.6f (diff: %.6f)\n",
                       (long long)i, data_a[i], data_b[i], diff);
            }
        }
        if (diff > max_diff) {
            max_diff = diff;
        }
    }

    printf("Max difference: %.6f, Elements above tolerance: %lld/%lld\n",
           max_diff, (long long)diff_count, (long long)n_a);
    return diff_count == 0;
}
// Record describing one benchmarked convolution configuration and its outcome.
typedef struct {
    // Problem dimensions (field names follow the IW/IH/IC/N/KW/KH/OC
    // parameters used by the benchmark functions in this file).
    int IW;
    int IH;
    int IC;
    int N;
    int KW;
    int KH;
    int OC;

    // Convolution hyper-parameters.
    int stride;
    int padding;
    int dilation;

    // Measured wall-clock times, in milliseconds, and their ratio.
    double im2col_time_ms;
    double simd_time_ms;
    double speedup;

    // Whether the two implementations produced matching outputs.
    bool results_match;

    // Total operations for this configuration.
    int64_t total_ops;
} benchmark_result_t;
// Count the multiply-accumulate steps of a 2-D convolution.
//
// The output extent along each axis is (in + 2*padding - kernel) / stride + 1
// using integer division; dilation is not part of this formula. The result is
// N * OC * OH * OW * IC * KH * KW, accumulated in 64-bit arithmetic.
static int64_t calculate_conv_ops(int IW, int IH, int IC, int N, int KW, int KH, int OC, int stride, int padding) {
    const int padded_w = IW + 2 * padding;
    const int padded_h = IH + 2 * padding;
    const int out_w    = (padded_w - KW) / stride + 1;
    const int out_h    = (padded_h - KH) / stride + 1;

    // Widen first, then multiply factor by factor so no step runs in int.
    int64_t ops = (int64_t)N;
    ops *= OC;
    ops *= out_h;
    ops *= out_w;
    ops *= IC;
    ops *= KH;
    ops *= KW;
    return ops;
}
// Milliseconds elapsed since the first call to this function.
//
// The first call records the current ggml_time_us() value as the origin (a
// stored origin of 0 means "not recorded yet") and therefore returns 0.0.
static double get_time_ms() {
    static int64_t origin_us = 0;

    const int64_t current_us = ggml_time_us();
    if (origin_us == 0) {
        origin_us = current_us;
    }

    const int64_t elapsed_us = current_us - origin_us;
    return elapsed_us / 1000.0;
}
// Benchmark one F32 convolution configuration and print it as a table row.
//
// The same random input and kernel are run through ggml_conv_2d (reported in
// the "IM2COL" column) and ggml_conv_2d_direct (reported in the "SIMD"
// column); the two outputs are then compared with an absolute tolerance of
// 1e-4.
//
//   IW, IH, IC, N  - input width, height, channels and batch size
//   KW, KH, OC     - kernel width, height and number of output channels
//   stride, padding, dilation - passed unchanged for both spatial axes
//
// Each graph is computed exactly once, so the timings include any first-run
// cost. The function returns without printing a row if a context, an op or
// the backend cannot be created.
static void test_performance(int IW, int IH, int IC, int N, int KW, int KH, int OC,
                             int stride, int padding, int dilation) {
    // Create contexts.
    // The earlier `128*1024*1024*200` was evaluated in `int` and overflowed.
    // With 32-bit wrap-around that expression comes to 1 GiB, which is spelled
    // out here in size_t arithmetic instead.
    struct ggml_init_params params = {
        .mem_size   = (size_t)1024 * 1024 * 1024,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx1 = ggml_init(params);
    struct ggml_context * ctx2 = ggml_init(params);
    if (!ctx1 || !ctx2) {
        // Release whichever context was created so it is not leaked.
        if (ctx1) ggml_free(ctx1);
        if (ctx2) ggml_free(ctx2);
        return;
    }

    // Create tensors
    struct ggml_tensor * input1  = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, IW, IH, IC, N);
    struct ggml_tensor * input2  = ggml_new_tensor_4d(ctx2, GGML_TYPE_F32, IW, IH, IC, N);
    struct ggml_tensor * kernel1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, KW, KH, IC, OC);
    struct ggml_tensor * kernel2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F32, KW, KH, IC, OC);

    // Initialize data: fixed seed, so every run sees the same values.
    srand(42);
    float * input1_data  = (float*)input1->data;
    float * input2_data  = (float*)input2->data;
    float * kernel1_data = (float*)kernel1->data;
    float * kernel2_data = (float*)kernel2->data;

    const int64_t n_input = ggml_nelements(input1);
    for (int64_t i = 0; i < n_input; i++) {
        const float val = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
        input1_data[i] = val;
        input2_data[i] = val;
    }
    const int64_t n_kernel = ggml_nelements(kernel1);
    for (int64_t i = 0; i < n_kernel; i++) {
        const float val = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
        kernel1_data[i] = val;
        kernel2_data[i] = val;
    }

    // Create operations
    struct ggml_tensor * result_im2col = ggml_conv_2d(ctx1, kernel1, input1,
                                                      stride, stride, padding, padding,
                                                      dilation, dilation);
    struct ggml_tensor * result_simd = ggml_conv_2d_direct(ctx2, kernel2, input2,
                                                           stride, stride, padding, padding,
                                                           dilation, dilation);
    if (!result_im2col || !result_simd) {
        ggml_free(ctx1);
        ggml_free(ctx2);
        return;
    }

    // Build graphs
    struct ggml_cgraph * graph1 = ggml_new_graph(ctx1);
    struct ggml_cgraph * graph2 = ggml_new_graph(ctx2);
    ggml_build_forward_expand(graph1, result_im2col);
    ggml_build_forward_expand(graph2, result_simd);

    ggml_backend_t backend = ggml_backend_cpu_init();
    if (!backend) {
        ggml_free(ctx1);
        ggml_free(ctx2);
        return;
    }

    // Time IM2COL
    int64_t start = ggml_time_us();
    ggml_backend_graph_compute(backend, graph1);
    const int64_t im2col_time = ggml_time_us() - start;

    // Time SIMD
    start = ggml_time_us();
    ggml_backend_graph_compute(backend, graph2);
    const int64_t simd_time = ggml_time_us() - start;

    // Compare results. Only the count and the maximum of the differences are
    // reported, so they are tracked directly; this also avoids taking the
    // maximum of an empty collection when the element counts differ.
    const int64_t n_a = ggml_nelements(result_im2col);
    const int64_t n_b = ggml_nelements(result_simd);
    bool   match     = (n_a == n_b);
    size_t num_diffs = 0;
    float  max_diff  = 0.0f;
    if (n_a == n_b) {
        const float * data_a = (const float*)result_im2col->data;
        const float * data_b = (const float*)result_simd->data;
        for (int64_t i = 0; i < n_a; i++) {
            const float diff = fabsf(data_a[i] - data_b[i]);
            if (diff > 1e-4f) {
                num_diffs++;
                if (diff > max_diff) {
                    max_diff = diff;
                }
                match = false;
            }
        }
    }

    // Calculate speedup; always reported as a factor >= 1 for the faster side.
    const double im2col_ms = im2col_time / 1000.0;
    const double simd_ms   = simd_time / 1000.0;
    double speedup = im2col_ms / simd_ms;
    const char* faster = speedup > 1.0 ? "SIMD" : "IM2COL";
    if (speedup < 1.0) speedup = 1.0 / speedup;

    // Print table row. The Match column previously printed an empty string for
    // both outcomes; use the same markers as the other tests in this file.
    printf("| %dx%dx%d | %dx%dx%d→%d | s%d p%d | %6.3f | %6.3f | %5.2fx %s | %s | F32 |\n",
           IW, IH, IC, KW, KH, IC, OC, stride, padding,
           im2col_ms, simd_ms, speedup, faster,
           match ? "✅" : "❌");
    if (!match) {
        if (n_a != n_b) {
            printf("Output element counts differ: %lld vs %lld\n", (long long)n_a, (long long)n_b);
        } else {
            printf("NumDiffers: %zu, MaxDifference: %f\n", num_diffs, max_diff);
        }
    }

    ggml_backend_free(backend);
    ggml_free(ctx1);
    ggml_free(ctx2);
}
// Benchmark one convolution configuration with an F16 kernel and an F32 input
// and print it as a table row.
//
// "NEW" is ggml_conv_2d_direct and "OLD" is ggml_conv_2d. Both operate on the
// very same kernel and input tensors (allocated in ctx1); ctx2 only holds the
// nodes created for the OLD path. The outputs are read as float and compared
// with an absolute tolerance of 2e-2.
//
//   IW, IH, IC, N  - input width, height, channels and batch size
//   KW, KH, OC     - kernel width, height and number of output channels
//   stride, padding, dilation - passed unchanged for both spatial axes
//
// The row lists the OLD time first and the NEW time second. Each graph is
// computed exactly once. The function returns without printing a row if a
// context, an op or the backend cannot be created.
static void test_performance_fp16(int IW, int IH, int IC, int N, int KW, int KH, int OC,
                                  int stride, int padding, int dilation) {
    // Create contexts (size computed in size_t, not int)
    struct ggml_init_params params = {
        .mem_size   = (size_t)128 * 1024 * 1024 * 10,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx1 = ggml_init(params);
    struct ggml_context * ctx2 = ggml_init(params);
    if (!ctx1 || !ctx2) {
        // Release whichever context was created so it is not leaked.
        if (ctx1) ggml_free(ctx1);
        if (ctx2) ggml_free(ctx2);
        return;
    }

    // Create tensors: F32 input, F16 kernel (shared by both code paths)
    struct ggml_tensor * input1  = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, IW, IH, IC, N);
    struct ggml_tensor * kernel1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F16, KW, KH, IC, OC);

    // Initialize data: fixed seed, so every run sees the same values.
    srand(42);
    float *       input1_data  = (float*)input1->data;
    ggml_fp16_t * kernel1_data = (ggml_fp16_t*)kernel1->data;

    const int64_t n_input = ggml_nelements(input1);
    for (int64_t i = 0; i < n_input; i++) {
        const float val = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
        input1_data[i] = val;
    }
    const int64_t n_kernel = ggml_nelements(kernel1);
    for (int64_t i = 0; i < n_kernel; i++) {
        const float val = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
        kernel1_data[i] = ggml_fp32_to_fp16(val);
    }

    // Create operations
    struct ggml_tensor * result_new = ggml_conv_2d_direct(ctx1, kernel1, input1,
                                                          stride, stride, padding, padding,
                                                          dilation, dilation);
    struct ggml_tensor * result_old = ggml_conv_2d(ctx2, kernel1, input1,
                                                   stride, stride, padding, padding,
                                                   dilation, dilation);
    if (!result_new || !result_old) {
        ggml_free(ctx1);
        ggml_free(ctx2);
        return;
    }

    // Build graphs
    struct ggml_cgraph * graph1 = ggml_new_graph(ctx1);
    struct ggml_cgraph * graph2 = ggml_new_graph(ctx2);
    ggml_build_forward_expand(graph1, result_new);
    ggml_build_forward_expand(graph2, result_old);

    ggml_backend_t backend = ggml_backend_cpu_init();
    if (!backend) {
        ggml_free(ctx1);
        ggml_free(ctx2);
        return;
    }

    // Time NEW (direct)
    int64_t start = ggml_time_us();
    ggml_backend_graph_compute(backend, graph1);
    const int64_t new_time = ggml_time_us() - start;

    // Time OLD (ggml_conv_2d)
    start = ggml_time_us();
    ggml_backend_graph_compute(backend, graph2);
    const int64_t old_time = ggml_time_us() - start;

    // Compare results. Only the count and the maximum of the differences are
    // reported, so they are tracked directly; this also avoids taking the
    // maximum of an empty collection when the element counts differ.
    const int64_t n_new = ggml_nelements(result_new);
    const int64_t n_old = ggml_nelements(result_old);
    bool   match     = (n_new == n_old);
    size_t num_diffs = 0;
    float  max_diff  = 0.0f;
    if (n_new == n_old) {
        const float * data_new = (const float*)result_new->data;
        const float * data_old = (const float*)result_old->data;
        for (int64_t i = 0; i < n_new; i++) {
            const float diff = fabsf(data_new[i] - data_old[i]);
            if (diff > 2e-2f) { // Slightly higher tolerance for fp16
                num_diffs++;
                if (diff > max_diff) {
                    max_diff = diff;
                }
                match = false;
            }
        }
    }

    // Calculate speedup; always reported as a factor >= 1 for the faster side.
    const double new_ms = new_time / 1000.0;
    const double old_ms = old_time / 1000.0;
    double speedup = old_ms / new_ms;
    const char* faster = speedup > 1.0 ? "NEW" : "OLD";
    if (speedup < 1.0) speedup = 1.0 / speedup;

    // Print table row. The Match column previously printed an empty string for
    // both outcomes; use the same markers as the other tests in this file.
    printf("| %dx%dx%d | %dx%dx%d→%d | s%d p%d | %6.3f | %6.3f | %5.2fx %s | %s | F16 |\n",
           IW, IH, IC, KW, KH, IC, OC, stride, padding,
           old_ms, new_ms, speedup, faster,
           match ? "✅" : "❌");
    if (!match) {
        if (n_new != n_old) {
            printf("Output element counts differ: %lld vs %lld\n", (long long)n_new, (long long)n_old);
        } else {
            printf("NumDiffers: %zu, MaxDifference: %f\n", num_diffs, max_diff);
        }
    }

    ggml_backend_free(backend);
    ggml_free(ctx1);
    ggml_free(ctx2);
}
// Show how a small 2x2x2x1 F32 tensor is laid out in memory.
//
// The tensor is filled through a hand-computed linear index
// (n*8 + c*4 + h*2 + w) with the value c*100 + h*10 + w, then read back
// through the tensor's byte strides (nb[]). If both addressing schemes agree,
// each "Read" line prints the value of the matching "Set" line.
void debug_tensor_layout() {
    struct ggml_init_params params = {
        .mem_size   = 128*1024*1024,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);
    if (!ctx) {
        fprintf(stderr, "debug_tensor_layout: failed to create ggml context\n");
        return;
    }

    // Create a simple 2x2x2x1 tensor (W=2, H=2, C=2, N=1)
    struct ggml_tensor * tensor = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 2, 2, 2, 1);

    // ne[] holds int64_t and nb[] holds size_t; use matching specifiers.
    printf("Tensor shape: [%lld, %lld, %lld, %lld]\n",
           (long long)tensor->ne[0], (long long)tensor->ne[1],
           (long long)tensor->ne[2], (long long)tensor->ne[3]);
    printf("Tensor strides (bytes): [%zu, %zu, %zu, %zu]\n",
           (size_t)tensor->nb[0], (size_t)tensor->nb[1],
           (size_t)tensor->nb[2], (size_t)tensor->nb[3]);

    // Fill with known pattern: channel*100 + h*10 + w
    float * data = (float*)tensor->data;
    for (int n = 0; n < 1; n++) {
        for (int c = 0; c < 2; c++) {
            for (int h = 0; h < 2; h++) {
                for (int w = 0; w < 2; w++) {
                    const float value = c * 100 + h * 10 + w;
                    // Dense row-major index with w varying fastest
                    const int idx_linear = n * (2*2*2) + c * (2*2) + h * 2 + w;
                    data[idx_linear] = value;
                    printf("Set [n=%d,c=%d,h=%d,w=%d] = %.0f at linear index %d\n", n, c, h, w, value, idx_linear);
                }
            }
        }
    }

    printf("\nReading back using stride calculation:\n");
    const char * base = (const char*)tensor->data;
    for (int n = 0; n < 1; n++) {
        for (int c = 0; c < 2; c++) {
            for (int h = 0; h < 2; h++) {
                for (int w = 0; w < 2; w++) {
                    // Calculate the element address using byte strides
                    const float * ptr = (const float*)(base + n * tensor->nb[3] + c * tensor->nb[2] + h * tensor->nb[1] + w * tensor->nb[0]);
                    printf("Read [n=%d,c=%d,h=%d,w=%d] = %.0f using strides\n", n, c, h, w, *ptr);
                }
            }
        }
    }

    ggml_free(ctx);
}
void test_simple_conv2d() {
printf("=== Simple Conv2D Test (2x2x2 -> 2x2x1) ===\n");
struct ggml_init_params params = {
.mem_size = 128*1024*1024,
.mem_buffer = NULL,
.no_alloc = false,
};
struct ggml_context * ctx1 = ggml_init(params);
struct ggml_context * ctx2 = ggml_init(params);
// Very simple: 2x2 input, 2 channels, 1x1 kernel, no padding/stride
// Input: 2x2x2x1 (W=2, H=2, C=2, N=1)
// Kernel: 1x1x2x1 (KW=1, KH=1, IC=2, OC=1)
// Output: 2x2x1x1
struct ggml_tensor * input1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, 2, 2, 2, 1);
struct ggml_tensor * kernel1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, 1, 1, 2, 1);
struct ggml_tensor * input2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F32, 2, 2, 2, 1);
struct ggml_tensor * kernel2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F32, 1, 1, 2, 1);
// Fill input with simple pattern (same data for both)
printf("Input tensor (WHCN layout):\n");
for (int c = 0; c < 2; c++) {
for (int h = 0; h < 2; h++) {
for (int w = 0; w < 2; w++) {
float val = c * 10 + h * 2 + w + 1; // channel*10 + position + 1
// Fill input1
float * ptr1 = (float*)((char*)input1->data + w * input1->nb[0] + h * input1->nb[1] + c * input1->nb[2]);
*ptr1 = val;
// Fill input2 with same data
float * ptr2 = (float*)((char*)input2->data + w * input2->nb[0] + h * input2->nb[1] + c * input2->nb[2]);
*ptr2 = val;
printf(" input[w=%d,h=%d,c=%d] = %.0f\n", w, h, c, val);
}
}
}
// Fill kernel: [1, 1] (so output = 1*ch0 + 1*ch1) - same for both
float * kernel1_data = (float*)kernel1->data;
float * kernel2_data = (float*)kernel2->data;
kernel1_data[0] = kernel2_data[0] = 1.0f; // weight for channel 0
kernel1_data[1] = kernel2_data[1] = 1.0f; // weight for channel 1
printf("\nKernel weights: [%.0f, %.0f]\n", kernel1_data[0], kernel1_data[1]);
// Expected output calculation by hand:
printf("\nExpected output (manual calculation):\n");
for (int h = 0; h < 2; h++) {
for (int w = 0; w < 2; w++) {
float ch0_val = 0 * 10 + h * 2 + w + 1; // channel 0 value
float ch1_val = 1 * 10 + h * 2 + w + 1; // channel 1 value
float expected = ch0_val * 1.0f + ch1_val * 1.0f;
printf(" expected[w=%d,h=%d] = %.0f*1 + %.0f*1 = %.0f\n", w, h, ch0_val, ch1_val, expected);
}
}
// Create operations: im2col+gemm vs direct
struct ggml_tensor * result_im2col = ggml_conv_2d(ctx1, kernel1, input1, 1, 1, 0, 0, 1, 1);
struct ggml_tensor * result_direct = ggml_conv_2d_direct(ctx2, kernel2, input2, 1, 1, 0, 0, 1, 1);
// Build and compute graphs
struct ggml_cgraph * graph1 = ggml_new_graph(ctx1);
struct ggml_cgraph * graph2 = ggml_new_graph(ctx2);
ggml_build_forward_expand(graph1, result_im2col);
ggml_build_forward_expand(graph2, result_direct);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_graph_compute(backend, graph1);
ggml_backend_graph_compute(backend, graph2);
// Check actual outputs
float * result_im2col_data = (float*)result_im2col->data;
float * result_direct_data = (float*)result_direct->data;
printf("\nIM2COL+GEMM output:\n");
for (int h = 0; h < 2; h++) {
for (int w = 0; w < 2; w++) {
int idx = h * 2 + w;
printf(" im2col[w=%d,h=%d] = %.0f\n", w, h, result_im2col_data[idx]);
}
}
printf("\nDirect/Tiled output:\n");
for (int h = 0; h < 2; h++) {
for (int w = 0; w < 2; w++) {
int idx = h * 2 + w;
printf(" direct[w=%d,h=%d] = %.0f\n", w, h, result_direct_data[idx]);
}
}
// Compare them
bool match = true;
for (int i = 0; i < 4; i++) {
if (fabs(result_im2col_data[i] - result_direct_data[i]) > 1e-4) {
match = false;
break;
}
}
printf("\nComparison: %s\n", match ? "✅ MATCH" : "❌ MISMATCH");
ggml_backend_free(backend);
ggml_free(ctx1);
ggml_free(ctx2);
printf("\n");
}
// Check ggml_conv_2d against a hand-computed reference on a 3x3 input with
// three channels and a 2x2 kernel (stride 1, no padding, dilation 1).
//
// The input holds the values 1..27 in (c, h, w) order and every channel has
// its own 2x2 weight patch. The reference output is computed with plain loops,
// printed term by term, and compared with the actual 2x2 result using
// tolerance 1e-4; a PASS/FAIL line summarises the outcome.
void test_larger_conv2d() {
    printf("=== Larger Conv2D Test (3x3x3 -> 2x2x1) ===\n");
    struct ggml_init_params params = {
        .mem_size   = 128*1024*1024,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);
    if (!ctx) {
        fprintf(stderr, "test_larger_conv2d: failed to create ggml context\n");
        return;
    }

    // Larger test: 3x3 input, 3 channels, 2x2 kernel, no padding/stride
    // Input:  3x3x3x1 (W=3, H=3, C=3, N=1)
    // Kernel: 2x2x3x1 (KW=2, KH=2, IC=3, OC=1)
    // Output: 2x2x1x1 (since (3-2)/1+1 = 2)
    struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 3, 1);
    struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 2, 2, 3, 1);

    // Fill input with simple incremental pattern
    float * input_data = (float*)input->data;
    printf("Input tensor (WHCN layout):\n");
    float next_input = 1.0f;
    for (int c = 0; c < 3; c++) {
        printf(" Channel %d:\n", c);
        for (int h = 0; h < 3; h++) {
            printf(" ");
            for (int w = 0; w < 3; w++) {
                float * ptr = (float*)((char*)input_data + w * input->nb[0] + h * input->nb[1] + c * input->nb[2]);
                *ptr = next_input++;
                printf("%.0f ", *ptr);
            }
            printf("\n");
        }
    }

    // Fill kernel with different weights per channel to make it interesting
    float * kernel_data = (float*)kernel->data;
    printf("\nKernel weights (2x2 per channel):\n");
    float kernel_weights[3][2][2] = {
        {{1, 2}, {3, 4}},         // Channel 0: weights 1,2,3,4
        {{0.5, 1.5}, {2.5, 3.5}}, // Channel 1: weights 0.5,1.5,2.5,3.5
        {{2, 1}, {0.5, 1.5}}      // Channel 2: weights 2,1,0.5,1.5
    };
    // Weights are written sequentially in (c, h, w) order.
    int kernel_idx = 0;
    for (int c = 0; c < 3; c++) {
        printf(" Channel %d:\n", c);
        for (int h = 0; h < 2; h++) {
            printf(" ");
            for (int w = 0; w < 2; w++) {
                kernel_data[kernel_idx] = kernel_weights[c][h][w];
                printf("%.1f ", kernel_data[kernel_idx]);
                kernel_idx++;
            }
            printf("\n");
        }
    }

    // Manual calculation of expected output
    printf("\nExpected output (manual calculation):\n");
    float expected[2][2];
    for (int out_h = 0; out_h < 2; out_h++) {
        for (int out_w = 0; out_w < 2; out_w++) {
            float sum = 0.0f;
            printf(" Output[%d,%d]: ", out_w, out_h);
            // For each channel
            for (int c = 0; c < 3; c++) {
                printf("(");
                // For each kernel position
                for (int kh = 0; kh < 2; kh++) {
                    for (int kw = 0; kw < 2; kw++) {
                        const int in_h = out_h + kh;
                        const int in_w = out_w + kw;
                        // Same value the fill loop above stored at (c, in_h, in_w)
                        const float in_val = c * 9 + in_h * 3 + in_w + 1;
                        const float k_val  = kernel_weights[c][kh][kw];
                        sum += in_val * k_val;
                        printf("%.0f*%.1f", in_val, k_val);
                        if (!(kh == 1 && kw == 1)) printf("+");
                    }
                }
                printf(")");
                if (c < 2) printf("+");
            }
            expected[out_h][out_w] = sum;
            printf(" = %.1f\n", sum);
        }
    }

    // Create conv2d operation
    struct ggml_tensor * result = ggml_conv_2d(ctx, kernel, input, 1, 1, 0, 0, 1, 1);
    if (!result) {
        fprintf(stderr, "test_larger_conv2d: failed to create conv2d op\n");
        ggml_free(ctx);
        return;
    }

    // Build and compute graph
    struct ggml_cgraph * graph = ggml_new_graph(ctx);
    ggml_build_forward_expand(graph, result);

    ggml_backend_t backend = ggml_backend_cpu_init();
    if (!backend) {
        fprintf(stderr, "test_larger_conv2d: failed to create CPU backend\n");
        ggml_free(ctx);
        return;
    }
    ggml_backend_graph_compute(backend, graph);

    // Check actual output
    const float * result_data = (const float*)result->data;
    printf("\nActual output:\n");
    bool all_match = true;
    for (int h = 0; h < 2; h++) {
        for (int w = 0; w < 2; w++) {
            const int out_idx = h * 2 + w;
            printf(" actual[%d,%d] = %.1f\n", w, h, result_data[out_idx]);
            if (fabs(result_data[out_idx] - expected[h][w]) > 1e-4) {
                all_match = false;
            }
        }
    }
    if (all_match) {
        printf("✅ PASS: Large test matches expected result\n");
    } else {
        printf("❌ FAIL: Some outputs don't match expected values\n");
    }

    ggml_backend_free(backend);
    ggml_free(ctx);
    printf("\n");
}
void test_mixed_precision_conv2d() {
printf("=== Mixed Precision Conv2D Test (F16 kernel, F32 input -> F16 output) ===\n");
struct ggml_init_params params = {
.mem_size = 128*1024*1024,
.mem_buffer = NULL,
.no_alloc = false,
};
struct ggml_context * ctx1 = ggml_init(params);
struct ggml_context * ctx2 = ggml_init(params);
// Mixed precision: F32 input, F16 kernel -> F16 output
// Input: 2x2x2x1 (W=2, H=2, C=2, N=1) - F32
// Kernel: 1x1x2x1 (KW=1, KH=1, IC=2, OC=1) - F16
// Output: 2x2x1x1 - F16
struct ggml_tensor * input1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, 2, 2, 2, 1);
struct ggml_tensor * kernel1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F16, 1, 1, 2, 1);
struct ggml_tensor * input2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F32, 2, 2, 2, 1);
struct ggml_tensor * kernel2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F16, 1, 1, 2, 1);
// Fill F32 input with simple pattern (same data for both)
printf("Input tensor (WHCN layout, F32):\n");
for (int c = 0; c < 2; c++) {
for (int h = 0; h < 2; h++) {
for (int w = 0; w < 2; w++) {
float val = c * 10 + h * 2 + w + 1; // channel*10 + position + 1
// Fill input1
float * ptr1 = (float*)((char*)input1->data + w * input1->nb[0] + h * input1->nb[1] + c * input1->nb[2]);
*ptr1 = val;
// Fill input2 with same data
float * ptr2 = (float*)((char*)input2->data + w * input2->nb[0] + h * input2->nb[1] + c * input2->nb[2]);
*ptr2 = val;
printf(" input[w=%d,h=%d,c=%d] = %.0f (F32)\n", w, h, c, val);
}
}
}
// Fill F16 kernel: [1, 1] (so output = 1*ch0 + 1*ch1) - same for both
ggml_fp16_t * kernel1_data = (ggml_fp16_t*)kernel1->data;
ggml_fp16_t * kernel2_data = (ggml_fp16_t*)kernel2->data;
kernel1_data[0] = kernel2_data[0] = ggml_fp32_to_fp16(1.0f); // weight for channel 0
kernel1_data[1] = kernel2_data[1] = ggml_fp32_to_fp16(1.0f); // weight for channel 1
printf("\nKernel weights (F16): [%.0f, %.0f]\n",
ggml_fp16_to_fp32(kernel1_data[0]), ggml_fp16_to_fp32(kernel1_data[1]));
// Expected output calculation by hand:
printf("\nExpected output (manual calculation):\n");
for (int h = 0; h < 2; h++) {
for (int w = 0; w < 2; w++) {
float ch0_val = 0 * 10 + h * 2 + w + 1; // channel 0 value
float ch1_val = 1 * 10 + h * 2 + w + 1; // channel 1 value
float expected = ch0_val * 1.0f + ch1_val * 1.0f;
printf(" expected[w=%d,h=%d] = %.0f*1 + %.0f*1 = %.0f\n", w, h, ch0_val, ch1_val, expected);
}
}
// Create operations: im2col+gemm vs direct
struct ggml_tensor * result_im2col = ggml_conv_2d(ctx1, kernel1, input1, 1, 1, 0, 0, 1, 1);
struct ggml_tensor * result_direct = ggml_conv_2d_direct(ctx2, kernel2, input2, 1, 1, 0, 0, 1, 1);
// Build and compute graphs
struct ggml_cgraph * graph1 = ggml_new_graph(ctx1);
struct ggml_cgraph * graph2 = ggml_new_graph(ctx2);
ggml_build_forward_expand(graph1, result_im2col);
ggml_build_forward_expand(graph2, result_direct);
ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_graph_compute(backend, graph1);
ggml_backend_graph_compute(backend, graph2);
// Check actual outputs (mixed precision produces F16 outputs)
ggml_fp16_t * result_im2col_data = (ggml_fp16_t*)result_im2col->data;
ggml_fp16_t * result_direct_data = (ggml_fp16_t*)result_direct->data;
printf("\nIM2COL+GEMM output (mixed precision -> F16):\n");
for (int h = 0; h < 2; h++) {
for (int w = 0; w < 2; w++) {
int idx = h * 2 + w;
printf(" im2col[w=%d,h=%d] = %.1f\n", w, h, ggml_fp16_to_fp32(result_im2col_data[idx]));
}
}
printf("\nDirect/Tiled output (mixed precision -> F16):\n");
for (int h = 0; h < 2; h++) {
for (int w = 0; w < 2; w++) {
int idx = h * 2 + w;
printf(" direct[w=%d,h=%d] = %.1f\n", w, h, ggml_fp16_to_fp32(result_direct_data[idx]));
}
}
// Compare them (with tolerance for mixed precision)
bool match = true;
float tolerance = 1e-3f; // Tolerance for mixed precision
for (int i = 0; i < 4; i++) {
float val_im2col = ggml_fp16_to_fp32(result_im2col_data[i]);
float val_direct = ggml_fp16_to_fp32(result_direct_data[i]);
if (fabs(val_im2col - val_direct) > tolerance) {
match = false;
break;
}
}
printf("\nMixed Precision Comparison: %s\n", match ? "✅ MATCH" : "❌ MISMATCH");
printf("(F16 kernel + F32 input -> F16 output)\n");
ggml_backend_free(backend);
ggml_free(ctx1);
ggml_free(ctx2);
printf("\n");
}
int main() {
ggml_time_init(); // Initialize GGML timing
printf("=== Debug Tensor Layout ===\n");
debug_tensor_layout();
printf("\n");
test_simple_conv2d();
test_larger_conv2d();
// Mixed precision test
printf("=== Multi-Output Test (2x2x2 -> 2x2x2) ===\n");
struct ggml_init_params params2 = {
.mem_size = 128*1024*1024,
.mem_buffer = NULL,
.no_alloc = false,
};
struct ggml_context * ctx3 = ggml_init(params2);
struct ggml_context * ctx4 = ggml_init(params2);
// Test: 2x2 input, 2 channels, 1x1 kernel, 2 output channels
// Input: 2x2x2x1 (W=2, H=2, C=2, N=1)
// Kernel: 1x1x2x2 (KW=1, KH=1, IC=2, OC=2)
// Output: 2x2x2x1
struct ggml_tensor * input3 = ggml_new_tensor_4d(ctx3, GGML_TYPE_F32, 2, 2, 2, 1);
struct ggml_tensor * kernel3 = ggml_new_tensor_4d(ctx3, GGML_TYPE_F32, 1, 1, 2, 2);
struct ggml_tensor * input4 = ggml_new_tensor_4d(ctx4, GGML_TYPE_F32, 2, 2, 2, 1);
struct ggml_tensor * kernel4 = ggml_new_tensor_4d(ctx4, GGML_TYPE_F32, 1, 1, 2, 2);
// Fill input with same pattern as before
printf("Input tensor (WHCN layout):\n");
for (int c = 0; c < 2; c++) {
for (int h = 0; h < 2; h++) {
for (int w = 0; w < 2; w++) {
float val = c * 10 + h * 2 + w + 1;
float * ptr3 = (float*)((char*)input3->data + w * input3->nb[0] + h * input3->nb[1] + c * input3->nb[2]);
float * ptr4 = (float*)((char*)input4->data + w * input4->nb[0] + h * input4->nb[1] + c * input4->nb[2]);
*ptr3 = *ptr4 = val;
printf(" input[w=%d,h=%d,c=%d] = %.0f\n", w, h, c, val);
}
}
}
// Fill kernel with different weights for each output channel
float * kernel3_data = (float*)kernel3->data;
float * kernel4_data = (float*)kernel4->data;
// Output channel 0: weights [1, 2] (for input channels 0,1)
// Output channel 1: weights [0.5, 1.5] (for input channels 0,1)
kernel3_data[0] = kernel4_data[0] = 1.0f; // OC=0, IC=0
kernel3_data[1] = kernel4_data[1] = 2.0f; // OC=0, IC=1
kernel3_data[2] = kernel4_data[2] = 0.5f; // OC=1, IC=0
kernel3_data[3] = kernel4_data[3] = 1.5f; // OC=1, IC=1
printf("\nKernel weights:\n");
printf(" Output channel 0: [%.1f, %.1f]\n", kernel3_data[0], kernel3_data[1]);
printf(" Output channel 1: [%.1f, %.1f]\n", kernel3_data[2], kernel3_data[3]);
// Expected output calculation
printf("\nExpected output (manual calculation):\n");
for (int oc = 0; oc < 2; oc++) {
printf(" Output channel %d:\n", oc);
for (int h = 0; h < 2; h++) {
for (int w = 0; w < 2; w++) {
float ch0_val = 0 * 10 + h * 2 + w + 1; // input channel 0
float ch1_val = 1 * 10 + h * 2 + w + 1; // input channel 1
float weight0 = kernel3_data[oc * 2 + 0];
float weight1 = kernel3_data[oc * 2 + 1];
float expected = ch0_val * weight0 + ch1_val * weight1;
printf(" expected[w=%d,h=%d] = %.0f*%.1f + %.0f*%.1f = %.1f\n",
w, h, ch0_val, weight0, ch1_val, weight1, expected);
}
}
}
// Create operations
struct ggml_tensor * result3 = ggml_conv_2d(ctx3, kernel3, input3, 1, 1, 0, 0, 1, 1);
struct ggml_tensor * result4 = ggml_conv_2d_direct(ctx4, kernel4, input4, 1, 1, 0, 0, 1, 1);
// Build and compute graphs
struct ggml_cgraph * graph3 = ggml_new_graph(ctx3);
struct ggml_cgraph * graph4 = ggml_new_graph(ctx4);
ggml_build_forward_expand(graph3, result3);
ggml_build_forward_expand(graph4, result4);
ggml_backend_t backend2 = ggml_backend_cpu_init();
ggml_backend_graph_compute(backend2, graph3);
ggml_backend_graph_compute(backend2, graph4);
// Check outputs
float * result3_data = (float*)result3->data;
float * result4_data = (float*)result4->data;
printf("\nIM2COL+GEMM output:\n");
for (int oc = 0; oc < 2; oc++) {
printf(" Output channel %d:\n", oc);
for (int h = 0; h < 2; h++) {
for (int w = 0; w < 2; w++) {
int idx = oc * 4 + h * 2 + w; // Assuming WHCN layout for output
printf(" im2col[w=%d,h=%d] = %.1f\n", w, h, result3_data[idx]);
}
}
}
printf("\nDirect/Tiled output:\n");
for (int oc = 0; oc < 2; oc++) {
printf(" Output channel %d:\n", oc);
for (int h = 0; h < 2; h++) {
for (int w = 0; w < 2; w++) {
int idx = oc * 4 + h * 2 + w; // Assuming WHCN layout for output
printf(" direct[w=%d,h=%d] = %.1f\n", w, h, result4_data[idx]);
}
}
}
// Compare
bool match2 = true;
for (int i = 0; i < 8; i++) {
if (fabs(result3_data[i] - result4_data[i]) > 1e-4) {
match2 = false;
break;
}
}
printf("\nMulti-output comparison: %s\n", match2 ? "✅ MATCH" : "❌ MISMATCH");
ggml_backend_free(backend2);
ggml_free(ctx3);
ggml_free(ctx4);
printf("Done with focused tests.\n\n");
printf("# Conv2D Implementation Performance Comparison\n\n");
// === FP16 vs FP32 Performance Tests ===
test_performance_fp16(112, 112, 32, 1, 2, 2, 64, 1, 0, 1); // pointwise
test_performance_fp16(56, 56, 64, 1, 1, 1, 128, 1, 0, 1); // pointwise
test_performance_fp16(112, 112, 32, 1, 3, 3, 32, 1, 1, 1); // depthwise (groups=32)
test_performance_fp16(512, 512, 3, 8, 7, 7, 64, 2, 3, 1); // Large ResNet50 first layer
printf("\n# IM2COL vs SIMD Performance Tests (FP32)\n");
printf("| Input Size | Kernel | Config | IM2COL (ms) | SIMD (ms) | Speedup | Match | Type |\n");
printf("|------------|--------|--------|-------------|-----------|---------|-------|------|\n");
// === ResNet-style convolutions ===
// ResNet50 first conv layer (ImageNet input)
test_performance(224, 224, 3, 1, 7, 7, 64, 2, 3, 1);
// ResNet50 conv2_x layers
test_performance(56, 56, 64, 1, 1, 1, 64, 1, 0, 1); // 1x1 bottleneck
test_performance(56, 56, 64, 1, 3, 3, 64, 1, 1, 1); // 3x3 conv
test_performance(56, 56, 64, 1, 1, 1, 256, 1, 0, 1); // 1x1 expansion
// ResNet50 conv3_x layers
test_performance(28, 28, 256, 1, 1, 1, 128, 1, 0, 1); // 1x1 bottleneck
test_performance(28, 28, 128, 1, 3, 3, 128, 1, 1, 1); // 3x3 conv
test_performance(28, 28, 128, 1, 1, 1, 512, 1, 0, 1); // 1x1 expansion
// ResNet50 conv4_x layers
test_performance(14, 14, 512, 1, 1, 1, 256, 1, 0, 1); // 1x1 bottleneck
test_performance(14, 14, 256, 1, 3, 3, 256, 1, 1, 1); // 3x3 conv
test_performance(14, 14, 256, 1, 1, 1, 1024, 1, 0, 1);// 1x1 expansion
// ResNet50 conv5_x layers
test_performance(7, 7, 1024, 1, 1, 1, 512, 1, 0, 1); // 1x1 bottleneck
test_performance(7, 7, 512, 1, 3, 3, 512, 1, 1, 1); // 3x3 conv
test_performance(7, 7, 512, 1, 1, 1, 2048, 1, 0, 1); // 1x1 expansion
// === VGG-style convolutions ===
// VGG16 early layers
test_performance(224, 224, 3, 1, 3, 3, 64, 1, 1, 1); // conv1_1
test_performance(224, 224, 16, 1, 3, 3, 64, 1, 1, 1); // conv1_2
test_performance(112, 112, 64, 1, 3, 3, 128, 1, 1, 1);// conv2_1
test_performance(112, 112, 16, 1, 3, 3, 16, 1, 1, 1);// conv2_2
// VGG16 middle layers
test_performance(56, 56, 128, 1, 3, 3, 256, 1, 1, 1); // conv3_1
test_performance(56, 56, 256, 1, 3, 3, 256, 1, 1, 1); // conv3_2,3_3
test_performance(28, 28, 256, 1, 3, 3, 512, 1, 1, 1); // conv4_1
test_performance(28, 28, 512, 1, 3, 3, 512, 1, 1, 1); // conv4_2,4_3
// === MobileNet-style convolutions ===
// MobileNetV1 depthwise separable convs
test_performance(112, 112, 32, 1, 3, 3, 32, 1, 1, 1); // depthwise (groups=32)
test_performance(112, 112, 32, 1, 1, 1, 64, 1, 0, 1); // pointwise
test_performance(56, 56, 64, 1, 1, 1, 128, 1, 0, 1); // pointwise
test_performance(28, 28, 128, 1, 1, 1, 256, 1, 0, 1); // pointwise
// === EfficientNet-style convolutions ===
// EfficientNet-B0 layers
test_performance(224, 224, 3, 1, 3, 3, 32, 2, 1, 1); // stem conv
test_performance(112, 112, 16, 1, 3, 3, 96, 1, 1, 1); // expand conv
test_performance(112, 112, 24, 1, 1, 1, 14, 1, 0, 1);// expand conv
test_performance(56, 56, 40, 1, 1, 1, 24, 1, 0, 1); // expand conv
// === Modern CNN typical layers ===
// ConvNeXt-style convolutions
test_performance(56, 56, 96, 1, 7, 7, 96, 1, 3, 1); // large kernel depthwise
test_performance(28, 28, 192, 1, 7, 7, 192, 1, 3, 1); // large kernel depthwise
test_performance(14, 14, 384, 1, 7, 7, 384, 1, 3, 1); // large kernel depthwise
// === Batch processing scenarios ===
// Inference with batch size 4 (common for edge devices)
test_performance(224, 224, 3, 4, 3, 3, 64, 1, 1, 1); // batch input processing
test_performance(56, 56, 64, 4, 3, 3, 64, 1, 1, 1); // batch feature processing
test_performance(28, 28, 128, 4, 1, 1, 64, 1, 0, 1); // batch pointwise
test_performance(512, 512, 3, 1, 3, 3, 1, 1, 0, 1); // batch pointwise
test_performance(256, 512, 3, 1, 3, 3, 9, 1, 0, 1); // batch pointwise
test_performance(896, 896, 1, 1, 3, 3, 1, 1, 0, 1); // batch pointwise
//test CLIP
test_performance(224, 224, 3, 1, 3, 3, 768, 1, 1, 2); // batch pointwise
printf("\n");
printf("\n# Large Conv2D Performance Tests (Realistic Neural Network Sizes)\n");
printf("| Input Size | Kernel | Config | IM2COL (ms) | SIMD (ms) | Speedup | Match | Type |\n");
printf("|------------|--------|--------|-------------|-----------|---------|-------|------|\n");
// === Large ResNet-style convolutions ===
test_performance(512, 512, 3, 8, 7, 7, 64, 2, 3, 1); // Large input ResNet50 first layer (8 batch)
printf("\n# FP16 Large Conv2D Performance Tests\n");
printf("| Input Size | Kernel | Config | FP32 (ms) | FP16 (ms) | Speedup | Match | Type |\n");
printf("|------------|--------|--------|-----------|-----------|---------|-------|------|\n");
test_performance_fp16(512, 512, 3, 8, 7, 7, 64, 2, 3, 1); // Large input ResNet50 first layer (8 batch)
// === Large FP16 vs FP32 tests ===
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment