Last active
June 30, 2025 14:30
-
-
Save am17an/38713d4d2ac1517d7d315321cb745db2 to your computer and use it in GitHub Desktop.
Vibe-coded performance benchmark for ggml conv2d
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include "ggml.h" | |
| #include "ggml-cpu.h" | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <math.h> | |
| #include <string.h> | |
| #include <algorithm> | |
| #include <vector> | |
| // Function to print a tensor in readable format | |
| static void print_tensor(const char* name, struct ggml_tensor * tensor) { | |
| printf("\n%s shape: [%ld, %ld, %ld, %ld]\n", name, | |
| tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); | |
| float * data = (float*)tensor->data; | |
| int64_t W = tensor->ne[0]; | |
| int64_t H = tensor->ne[1]; | |
| int64_t C = tensor->ne[2]; | |
| int64_t N = tensor->ne[3]; | |
| for (int64_t n = 0; n < N; n++) { | |
| for (int64_t c = 0; c < C; c++) { | |
| printf("%s[%ld,%ld]:\n", name, n, c); | |
| for (int64_t h = 0; h < H; h++) { | |
| printf(" "); | |
| for (int64_t w = 0; w < W; w++) { | |
| int64_t idx = n * (C * H * W) + c * (H * W) + h * W + w; | |
| printf("%7.3f ", data[idx]); | |
| } | |
| printf("\n"); | |
| } | |
| } | |
| } | |
| } | |
| // Function to compare two tensors with tolerance | |
| static bool tensors_equal(struct ggml_tensor * a, struct ggml_tensor * b, float tolerance) { | |
| if (ggml_nelements(a) != ggml_nelements(b)) { | |
| printf("Different number of elements: %zu vs %zu\n", ggml_nelements(a), ggml_nelements(b)); | |
| return false; | |
| } | |
| for (int i = 0; i < 4; i++) { | |
| if (a->ne[i] != b->ne[i]) { | |
| printf("Shape mismatch at dim %d: %ld vs %ld\n", i, a->ne[i], b->ne[i]); | |
| return false; | |
| } | |
| } | |
| float * data_a = (float*)a->data; | |
| float * data_b = (float*)b->data; | |
| float max_diff = 0.0f; | |
| int diff_count = 0; | |
| for (int i = 0; i < ggml_nelements(a); i++) { | |
| float diff = fabsf(data_a[i] - data_b[i]); | |
| if (diff > tolerance) { | |
| diff_count++; | |
| if (diff_count <= 10) { // Show first 10 differences | |
| printf("Diff at index %zu: %.6f vs %.6f (diff: %.6f)\n", | |
| i, data_a[i], data_b[i], diff); | |
| } | |
| } | |
| if (diff > max_diff) { | |
| max_diff = diff; | |
| } | |
| } | |
| printf("Max difference: %.6f, Elements above tolerance: %d/%zu\n", | |
| max_diff, diff_count, ggml_nelements(a)); | |
| return diff_count == 0; | |
| } | |
// Record of one benchmark run: the convolution configuration plus the
// measured timings. Field names follow the parameter names used by the
// benchmark functions in this file.
typedef struct {
    // Convolution shape (same names as the test_performance parameters).
    int IW;
    int IH;
    int IC;
    int N;
    int KW;
    int KH;
    int OC;

    // Convolution hyper-parameters.
    int stride;
    int padding;
    int dilation;

    // Measured wall-clock times, in milliseconds.
    double im2col_time_ms;
    double simd_time_ms;

    double speedup;

    // Whether the two implementations produced matching output.
    bool results_match;

    // Total operations for this configuration
    int64_t total_ops;
} benchmark_result_t;
// Count the multiply-accumulate operations of a 2-D convolution.
//
// IW, IH  - input width / height
// IC, OC  - input / output channel counts
// N       - batch size
// KW, KH  - kernel width / height
// stride  - stride applied in both dimensions
// padding - zero padding applied on each side in both dimensions
//
// Returns N * OC * OH * OW * IC * KH * KW, where
// OW = (IW + 2*padding - KW) / stride + 1 (and likewise for OH).
// Returns 0 when the configuration produces no output: a non-positive stride,
// or a kernel larger than the padded input.
static int64_t calculate_conv_ops(int IW, int IH, int IC, int N, int KW, int KH, int OC, int stride, int padding) {
    // A zero stride would divide by zero below; a negative one is meaningless.
    if (stride <= 0) {
        return 0;
    }

    const int span_w = IW + 2 * padding - KW;
    const int span_h = IH + 2 * padding - KH;
    // Kernel does not fit: without this guard, truncating division could still
    // yield OW/OH == 1 (e.g. span -1, stride 2) or a negative op count.
    if (span_w < 0 || span_h < 0) {
        return 0;
    }

    const int64_t OW = span_w / stride + 1;
    const int64_t OH = span_h / stride + 1;
    // Leading cast keeps the whole product in 64-bit arithmetic.
    return (int64_t)N * OC * OH * OW * IC * KH * KW;
}
| // Simplified timing function | |
| static double get_time_ms() { | |
| static int64_t first_call = 0; | |
| int64_t now = ggml_time_us(); | |
| if (first_call == 0) first_call = now; | |
| return (now - first_call) / 1000.0; | |
| } | |
| // Simple performance test function | |
| static void test_performance(int IW, int IH, int IC, int N, int KW, int KH, int OC, | |
| int stride, int padding, int dilation) { | |
| // Create contexts | |
| struct ggml_init_params params = { | |
| .mem_size = 128*1024*1024*200, | |
| .mem_buffer = NULL, | |
| .no_alloc = false, | |
| }; | |
| struct ggml_context * ctx1 = ggml_init(params); | |
| struct ggml_context * ctx2 = ggml_init(params); | |
| if (!ctx1 || !ctx2) return; | |
| // Create tensors | |
| struct ggml_tensor * input1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, IW, IH, IC, N); | |
| struct ggml_tensor * input2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F32, IW, IH, IC, N); | |
| struct ggml_tensor * kernel1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, KW, KH, IC, OC); | |
| struct ggml_tensor * kernel2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F32, KW, KH, IC, OC); | |
| // Initialize data | |
| srand(42); | |
| float * input1_data = (float*)input1->data; | |
| float * input2_data = (float*)input2->data; | |
| float * kernel1_data = (float*)kernel1->data; | |
| float * kernel2_data = (float*)kernel2->data; | |
| for (int i = 0; i < ggml_nelements(input1); i++) { | |
| float val = ((float)rand() / RAND_MAX - 0.5f) * 2.0f; | |
| input1_data[i] = val; | |
| input2_data[i] = val; | |
| } | |
| for (int i = 0; i < ggml_nelements(kernel1); i++) { | |
| float val = ((float)rand() / RAND_MAX - 0.5f) * 2.0f; | |
| kernel1_data[i] = val; | |
| kernel2_data[i] = val; | |
| } | |
| // Create operations | |
| struct ggml_tensor * result_im2col = ggml_conv_2d(ctx1, kernel1, input1, | |
| stride, stride, padding, padding, | |
| dilation, dilation); | |
| struct ggml_tensor * result_simd = ggml_conv_2d_direct(ctx2, kernel2, input2, | |
| stride, stride, padding, padding, | |
| dilation, dilation); | |
| if (!result_im2col || !result_simd) { | |
| ggml_free(ctx1); | |
| ggml_free(ctx2); | |
| return; | |
| } | |
| // Build graphs | |
| struct ggml_cgraph * graph1 = ggml_new_graph(ctx1); | |
| struct ggml_cgraph * graph2 = ggml_new_graph(ctx2); | |
| ggml_build_forward_expand(graph1, result_im2col); | |
| ggml_build_forward_expand(graph2, result_simd); | |
| ggml_backend_t backend = ggml_backend_cpu_init(); | |
| if (!backend) { | |
| ggml_free(ctx1); | |
| ggml_free(ctx2); | |
| return; | |
| } | |
| // Time IM2COL | |
| int64_t start = ggml_time_us(); | |
| ggml_backend_graph_compute(backend, graph1); | |
| int64_t im2col_time = ggml_time_us() - start; | |
| // Time SIMD | |
| start = ggml_time_us(); | |
| ggml_backend_graph_compute(backend, graph2); | |
| int64_t simd_time = ggml_time_us() - start; | |
| // Compare results | |
| bool match = true; | |
| std::vector<float> diffs; | |
| if (ggml_nelements(result_im2col) == ggml_nelements(result_simd)) { | |
| float * data_a = (float*)result_im2col->data; | |
| float * data_b = (float*)result_simd->data; | |
| for (size_t i = 0; i < ggml_nelements(result_im2col); i++) { | |
| if (fabsf(data_a[i] - data_b[i]) > 1e-4f) { | |
| diffs.push_back(fabsf(data_a[i] - data_b[i])); | |
| match = false; | |
| } | |
| } | |
| } else { | |
| match = false; | |
| } | |
| // Calculate speedup | |
| double im2col_ms = im2col_time / 1000.0; | |
| double simd_ms = simd_time / 1000.0; | |
| double speedup = im2col_ms / simd_ms; | |
| const char* faster = speedup > 1.0 ? "SIMD" : "IM2COL"; | |
| if (speedup < 1.0) speedup = 1.0 / speedup; | |
| // Print table row | |
| printf("| %dx%dx%d | %dx%dx%d→%d | s%d p%d | %6.3f | %6.3f | %5.2fx %s | %s | F32 |\n", | |
| IW, IH, IC, KW, KH, IC, OC, stride, padding, | |
| im2col_ms, simd_ms, speedup, faster, | |
| match ? "✅" : "❌"); | |
| if(!match) { | |
| printf("NumDiffers: %ld, MaxDifference: %f\n", diffs.size(), *std::max_element(diffs.begin(), diffs.end())); | |
| } | |
| ggml_backend_free(backend); | |
| ggml_free(ctx1); | |
| ggml_free(ctx2); | |
| } | |
| // FP16 performance test function - compare new fp16 implementation with old fp32 baseline | |
| static void test_performance_fp16(int IW, int IH, int IC, int N, int KW, int KH, int OC, | |
| int stride, int padding, int dilation) { | |
| // Create contexts | |
| struct ggml_init_params params = { | |
| .mem_size = 128*1024*1024*10, | |
| .mem_buffer = NULL, | |
| .no_alloc = false, | |
| }; | |
| struct ggml_context * ctx1 = ggml_init(params); | |
| struct ggml_context * ctx2 = ggml_init(params); | |
| if (!ctx1 || !ctx2) return; | |
| // Create tensors - NEW: fp16 kernel, OLD: fp32 kernel | |
| struct ggml_tensor * input1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, IW, IH, IC, N); | |
| struct ggml_tensor * kernel1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F16, KW, KH, IC, OC); // NEW: FP16 | |
| // Initialize data | |
| srand(42); | |
| float * input1_data = (float*)input1->data; | |
| ggml_fp16_t * kernel1_data = (ggml_fp16_t*)kernel1->data; | |
| for (int i = 0; i < ggml_nelements(input1); i++) { | |
| float val = ((float)rand() / RAND_MAX - 0.5f) * 2.0f; | |
| input1_data[i] = val; | |
| } | |
| for (int i = 0; i < ggml_nelements(kernel1); i++) { | |
| float val = ((float)rand() / RAND_MAX - 0.5f) * 2.0f; | |
| kernel1_data[i] = ggml_fp32_to_fp16(val); | |
| } | |
| // Create operations | |
| struct ggml_tensor * result_new = ggml_conv_2d_direct(ctx1, kernel1, input1, | |
| stride, stride, padding, padding, | |
| dilation, dilation); | |
| struct ggml_tensor * result_old = ggml_conv_2d(ctx2, kernel1, input1, | |
| stride, stride, padding, padding, | |
| dilation, dilation); | |
| if (!result_new || !result_old) { | |
| ggml_free(ctx1); | |
| ggml_free(ctx2); | |
| return; | |
| } | |
| // Build graphs | |
| struct ggml_cgraph * graph1 = ggml_new_graph(ctx1); | |
| struct ggml_cgraph * graph2 = ggml_new_graph(ctx2); | |
| ggml_build_forward_expand(graph1, result_new); | |
| ggml_build_forward_expand(graph2, result_old); | |
| ggml_backend_t backend = ggml_backend_cpu_init(); | |
| if (!backend) { | |
| ggml_free(ctx1); | |
| ggml_free(ctx2); | |
| return; | |
| } | |
| // Time NEW (fp16) | |
| int64_t start = ggml_time_us(); | |
| ggml_backend_graph_compute(backend, graph1); | |
| int64_t new_time = ggml_time_us() - start; | |
| // Time OLD (fp32) | |
| start = ggml_time_us(); | |
| ggml_backend_graph_compute(backend, graph2); | |
| int64_t old_time = ggml_time_us() - start; | |
| // Compare results | |
| bool match = true; | |
| std::vector<float> diffs; | |
| if (ggml_nelements(result_new) == ggml_nelements(result_old)) { | |
| float * data_new = (float*)result_new->data; | |
| float * data_old = (float*)result_old->data; | |
| for (size_t i = 0; i < ggml_nelements(result_new); i++) { | |
| if (fabsf(data_new[i] - data_old[i]) > 2e-2f) { // Slightly higher tolerance for fp16 | |
| diffs.push_back(fabsf(data_new[i] - data_old[i])); | |
| match = false; | |
| } | |
| } | |
| } else { | |
| match = false; | |
| } | |
| // Calculate speedup | |
| double new_ms = new_time / 1000.0; | |
| double old_ms = old_time / 1000.0; | |
| double speedup = old_ms / new_ms; | |
| const char* faster = speedup > 1.0 ? "NEW" : "OLD"; | |
| if (speedup < 1.0) speedup = 1.0 / speedup; | |
| // Print table row | |
| printf("| %dx%dx%d | %dx%dx%d→%d | s%d p%d | %6.3f | %6.3f | %5.2fx %s | %s | F16 |\n", | |
| IW, IH, IC, KW, KH, IC, OC, stride, padding, | |
| old_ms, new_ms, speedup, faster, | |
| match ? "✅" : "❌"); | |
| if(!match) { | |
| printf("NumDiffers: %ld, MaxDifference: %f\n", diffs.size(), *std::max_element(diffs.begin(), diffs.end())); | |
| } | |
| ggml_backend_free(backend); | |
| ggml_free(ctx1); | |
| ggml_free(ctx2); | |
| } | |
| void debug_tensor_layout() { | |
| struct ggml_init_params params = { | |
| .mem_size = 128*1024*1024, | |
| .mem_buffer = NULL, | |
| .no_alloc = false, | |
| }; | |
| struct ggml_context * ctx = ggml_init(params); | |
| // Create a simple 2x2x2x1 tensor (W=2, H=2, C=2, N=1) | |
| struct ggml_tensor * tensor = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 2, 2, 2, 1); | |
| printf("Tensor shape: [%ld, %ld, %ld, %ld]\n", | |
| tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); | |
| printf("Tensor strides (bytes): [%ld, %ld, %ld, %ld]\n", | |
| tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[3]); | |
| // Fill with known pattern: channel*100 + h*10 + w | |
| float * data = (float*)tensor->data; | |
| for (int n = 0; n < 1; n++) { | |
| for (int c = 0; c < 2; c++) { | |
| for (int h = 0; h < 2; h++) { | |
| for (int w = 0; w < 2; w++) { | |
| float value = c * 100 + h * 10 + w; | |
| // Try different indexing approaches | |
| int idx_linear = n * (2*2*2) + c * (2*2) + h * 2 + w; | |
| data[idx_linear] = value; | |
| printf("Set [n=%d,c=%d,h=%d,w=%d] = %.0f at linear index %d\n", n, c, h, w, value, idx_linear); | |
| } | |
| } | |
| } | |
| } | |
| printf("\nReading back using stride calculation:\n"); | |
| for (int n = 0; n < 1; n++) { | |
| for (int c = 0; c < 2; c++) { | |
| for (int h = 0; h < 2; h++) { | |
| for (int w = 0; w < 2; w++) { | |
| // Calculate using strides | |
| char* base = (char*)tensor->data; | |
| float* ptr = (float*)(base + n * tensor->nb[3] + c * tensor->nb[2] + h * tensor->nb[1] + w * tensor->nb[0]); | |
| printf("Read [n=%d,c=%d,h=%d,w=%d] = %.0f using strides\n", n, c, h, w, *ptr); | |
| } | |
| } | |
| } | |
| } | |
| ggml_free(ctx); | |
| } | |
| void test_simple_conv2d() { | |
| printf("=== Simple Conv2D Test (2x2x2 -> 2x2x1) ===\n"); | |
| struct ggml_init_params params = { | |
| .mem_size = 128*1024*1024, | |
| .mem_buffer = NULL, | |
| .no_alloc = false, | |
| }; | |
| struct ggml_context * ctx1 = ggml_init(params); | |
| struct ggml_context * ctx2 = ggml_init(params); | |
| // Very simple: 2x2 input, 2 channels, 1x1 kernel, no padding/stride | |
| // Input: 2x2x2x1 (W=2, H=2, C=2, N=1) | |
| // Kernel: 1x1x2x1 (KW=1, KH=1, IC=2, OC=1) | |
| // Output: 2x2x1x1 | |
| struct ggml_tensor * input1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, 2, 2, 2, 1); | |
| struct ggml_tensor * kernel1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, 1, 1, 2, 1); | |
| struct ggml_tensor * input2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F32, 2, 2, 2, 1); | |
| struct ggml_tensor * kernel2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F32, 1, 1, 2, 1); | |
| // Fill input with simple pattern (same data for both) | |
| printf("Input tensor (WHCN layout):\n"); | |
| for (int c = 0; c < 2; c++) { | |
| for (int h = 0; h < 2; h++) { | |
| for (int w = 0; w < 2; w++) { | |
| float val = c * 10 + h * 2 + w + 1; // channel*10 + position + 1 | |
| // Fill input1 | |
| float * ptr1 = (float*)((char*)input1->data + w * input1->nb[0] + h * input1->nb[1] + c * input1->nb[2]); | |
| *ptr1 = val; | |
| // Fill input2 with same data | |
| float * ptr2 = (float*)((char*)input2->data + w * input2->nb[0] + h * input2->nb[1] + c * input2->nb[2]); | |
| *ptr2 = val; | |
| printf(" input[w=%d,h=%d,c=%d] = %.0f\n", w, h, c, val); | |
| } | |
| } | |
| } | |
| // Fill kernel: [1, 1] (so output = 1*ch0 + 1*ch1) - same for both | |
| float * kernel1_data = (float*)kernel1->data; | |
| float * kernel2_data = (float*)kernel2->data; | |
| kernel1_data[0] = kernel2_data[0] = 1.0f; // weight for channel 0 | |
| kernel1_data[1] = kernel2_data[1] = 1.0f; // weight for channel 1 | |
| printf("\nKernel weights: [%.0f, %.0f]\n", kernel1_data[0], kernel1_data[1]); | |
| // Expected output calculation by hand: | |
| printf("\nExpected output (manual calculation):\n"); | |
| for (int h = 0; h < 2; h++) { | |
| for (int w = 0; w < 2; w++) { | |
| float ch0_val = 0 * 10 + h * 2 + w + 1; // channel 0 value | |
| float ch1_val = 1 * 10 + h * 2 + w + 1; // channel 1 value | |
| float expected = ch0_val * 1.0f + ch1_val * 1.0f; | |
| printf(" expected[w=%d,h=%d] = %.0f*1 + %.0f*1 = %.0f\n", w, h, ch0_val, ch1_val, expected); | |
| } | |
| } | |
| // Create operations: im2col+gemm vs direct | |
| struct ggml_tensor * result_im2col = ggml_conv_2d(ctx1, kernel1, input1, 1, 1, 0, 0, 1, 1); | |
| struct ggml_tensor * result_direct = ggml_conv_2d_direct(ctx2, kernel2, input2, 1, 1, 0, 0, 1, 1); | |
| // Build and compute graphs | |
| struct ggml_cgraph * graph1 = ggml_new_graph(ctx1); | |
| struct ggml_cgraph * graph2 = ggml_new_graph(ctx2); | |
| ggml_build_forward_expand(graph1, result_im2col); | |
| ggml_build_forward_expand(graph2, result_direct); | |
| ggml_backend_t backend = ggml_backend_cpu_init(); | |
| ggml_backend_graph_compute(backend, graph1); | |
| ggml_backend_graph_compute(backend, graph2); | |
| // Check actual outputs | |
| float * result_im2col_data = (float*)result_im2col->data; | |
| float * result_direct_data = (float*)result_direct->data; | |
| printf("\nIM2COL+GEMM output:\n"); | |
| for (int h = 0; h < 2; h++) { | |
| for (int w = 0; w < 2; w++) { | |
| int idx = h * 2 + w; | |
| printf(" im2col[w=%d,h=%d] = %.0f\n", w, h, result_im2col_data[idx]); | |
| } | |
| } | |
| printf("\nDirect/Tiled output:\n"); | |
| for (int h = 0; h < 2; h++) { | |
| for (int w = 0; w < 2; w++) { | |
| int idx = h * 2 + w; | |
| printf(" direct[w=%d,h=%d] = %.0f\n", w, h, result_direct_data[idx]); | |
| } | |
| } | |
| // Compare them | |
| bool match = true; | |
| for (int i = 0; i < 4; i++) { | |
| if (fabs(result_im2col_data[i] - result_direct_data[i]) > 1e-4) { | |
| match = false; | |
| break; | |
| } | |
| } | |
| printf("\nComparison: %s\n", match ? "✅ MATCH" : "❌ MISMATCH"); | |
| ggml_backend_free(backend); | |
| ggml_free(ctx1); | |
| ggml_free(ctx2); | |
| printf("\n"); | |
| } | |
| void test_larger_conv2d() { | |
| printf("=== Larger Conv2D Test (3x3x3 -> 2x2x1) ===\n"); | |
| struct ggml_init_params params = { | |
| .mem_size = 128*1024*1024, | |
| .mem_buffer = NULL, | |
| .no_alloc = false, | |
| }; | |
| struct ggml_context * ctx = ggml_init(params); | |
| // Larger test: 3x3 input, 3 channels, 2x2 kernel, no padding/stride | |
| // Input: 3x3x3x1 (W=3, H=3, C=3, N=1) | |
| // Kernel: 2x2x3x1 (KW=2, KH=2, IC=3, OC=1) | |
| // Output: 2x2x1x1 (since (3-2)/1+1 = 2) | |
| struct ggml_tensor * input = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 3, 1); | |
| struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 2, 2, 3, 1); | |
| // Fill input with simple incremental pattern | |
| float * input_data = (float*)input->data; | |
| printf("Input tensor (WHCN layout):\n"); | |
| float input_val = 1.0f; | |
| for (int c = 0; c < 3; c++) { | |
| printf(" Channel %d:\n", c); | |
| for (int h = 0; h < 3; h++) { | |
| printf(" "); | |
| for (int w = 0; w < 3; w++) { | |
| float * ptr = (float*)((char*)input_data + w * input->nb[0] + h * input->nb[1] + c * input->nb[2]); | |
| *ptr = input_val++; | |
| printf("%.0f ", *ptr); | |
| } | |
| printf("\n"); | |
| } | |
| } | |
| // Fill kernel with different weights per channel to make it interesting | |
| float * kernel_data = (float*)kernel->data; | |
| printf("\nKernel weights (2x2 per channel):\n"); | |
| float kernel_weights[3][2][2] = { | |
| {{1, 2}, {3, 4}}, // Channel 0: weights 1,2,3,4 | |
| {{0.5, 1.5}, {2.5, 3.5}}, // Channel 1: weights 0.5,1.5,2.5,3.5 | |
| {{2, 1}, {0.5, 1.5}} // Channel 2: weights 2,1,0.5,1.5 | |
| }; | |
| int idx = 0; | |
| for (int c = 0; c < 3; c++) { | |
| printf(" Channel %d:\n", c); | |
| for (int h = 0; h < 2; h++) { | |
| printf(" "); | |
| for (int w = 0; w < 2; w++) { | |
| kernel_data[idx] = kernel_weights[c][h][w]; | |
| printf("%.1f ", kernel_data[idx]); | |
| idx++; | |
| } | |
| printf("\n"); | |
| } | |
| } | |
| // Manual calculation of expected output | |
| printf("\nExpected output (manual calculation):\n"); | |
| float expected[2][2]; | |
| for (int out_h = 0; out_h < 2; out_h++) { | |
| for (int out_w = 0; out_w < 2; out_w++) { | |
| float sum = 0.0f; | |
| printf(" Output[%d,%d]: ", out_w, out_h); | |
| // For each channel | |
| for (int c = 0; c < 3; c++) { | |
| printf("("); | |
| // For each kernel position | |
| for (int kh = 0; kh < 2; kh++) { | |
| for (int kw = 0; kw < 2; kw++) { | |
| int input_h = out_h + kh; | |
| int input_w = out_w + kw; | |
| float input_val = c * 9 + input_h * 3 + input_w + 1; | |
| float kernel_val = kernel_weights[c][kh][kw]; | |
| sum += input_val * kernel_val; | |
| printf("%.0f*%.1f", input_val, kernel_val); | |
| if (!(kh == 1 && kw == 1)) printf("+"); | |
| } | |
| } | |
| printf(")"); | |
| if (c < 2) printf("+"); | |
| } | |
| expected[out_h][out_w] = sum; | |
| printf(" = %.1f\n", sum); | |
| } | |
| } | |
| // Create conv2d operation | |
| struct ggml_tensor * result = ggml_conv_2d(ctx, kernel, input, 1, 1, 0, 0, 1, 1); | |
| // Build and compute graph | |
| struct ggml_cgraph * graph = ggml_new_graph(ctx); | |
| ggml_build_forward_expand(graph, result); | |
| ggml_backend_t backend = ggml_backend_cpu_init(); | |
| ggml_backend_graph_compute(backend, graph); | |
| // Check actual output | |
| float * result_data = (float*)result->data; | |
| printf("\nActual output:\n"); | |
| bool all_match = true; | |
| for (int h = 0; h < 2; h++) { | |
| for (int w = 0; w < 2; w++) { | |
| int idx = h * 2 + w; | |
| printf(" actual[%d,%d] = %.1f\n", w, h, result_data[idx]); | |
| if (fabs(result_data[idx] - expected[h][w]) > 1e-4) { | |
| all_match = false; | |
| } | |
| } | |
| } | |
| if (all_match) { | |
| printf("✅ PASS: Large test matches expected result\n"); | |
| } else { | |
| printf("❌ FAIL: Some outputs don't match expected values\n"); | |
| } | |
| ggml_backend_free(backend); | |
| ggml_free(ctx); | |
| printf("\n"); | |
| } | |
| void test_mixed_precision_conv2d() { | |
| printf("=== Mixed Precision Conv2D Test (F16 kernel, F32 input -> F16 output) ===\n"); | |
| struct ggml_init_params params = { | |
| .mem_size = 128*1024*1024, | |
| .mem_buffer = NULL, | |
| .no_alloc = false, | |
| }; | |
| struct ggml_context * ctx1 = ggml_init(params); | |
| struct ggml_context * ctx2 = ggml_init(params); | |
| // Mixed precision: F32 input, F16 kernel -> F16 output | |
| // Input: 2x2x2x1 (W=2, H=2, C=2, N=1) - F32 | |
| // Kernel: 1x1x2x1 (KW=1, KH=1, IC=2, OC=1) - F16 | |
| // Output: 2x2x1x1 - F16 | |
| struct ggml_tensor * input1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F32, 2, 2, 2, 1); | |
| struct ggml_tensor * kernel1 = ggml_new_tensor_4d(ctx1, GGML_TYPE_F16, 1, 1, 2, 1); | |
| struct ggml_tensor * input2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F32, 2, 2, 2, 1); | |
| struct ggml_tensor * kernel2 = ggml_new_tensor_4d(ctx2, GGML_TYPE_F16, 1, 1, 2, 1); | |
| // Fill F32 input with simple pattern (same data for both) | |
| printf("Input tensor (WHCN layout, F32):\n"); | |
| for (int c = 0; c < 2; c++) { | |
| for (int h = 0; h < 2; h++) { | |
| for (int w = 0; w < 2; w++) { | |
| float val = c * 10 + h * 2 + w + 1; // channel*10 + position + 1 | |
| // Fill input1 | |
| float * ptr1 = (float*)((char*)input1->data + w * input1->nb[0] + h * input1->nb[1] + c * input1->nb[2]); | |
| *ptr1 = val; | |
| // Fill input2 with same data | |
| float * ptr2 = (float*)((char*)input2->data + w * input2->nb[0] + h * input2->nb[1] + c * input2->nb[2]); | |
| *ptr2 = val; | |
| printf(" input[w=%d,h=%d,c=%d] = %.0f (F32)\n", w, h, c, val); | |
| } | |
| } | |
| } | |
| // Fill F16 kernel: [1, 1] (so output = 1*ch0 + 1*ch1) - same for both | |
| ggml_fp16_t * kernel1_data = (ggml_fp16_t*)kernel1->data; | |
| ggml_fp16_t * kernel2_data = (ggml_fp16_t*)kernel2->data; | |
| kernel1_data[0] = kernel2_data[0] = ggml_fp32_to_fp16(1.0f); // weight for channel 0 | |
| kernel1_data[1] = kernel2_data[1] = ggml_fp32_to_fp16(1.0f); // weight for channel 1 | |
| printf("\nKernel weights (F16): [%.0f, %.0f]\n", | |
| ggml_fp16_to_fp32(kernel1_data[0]), ggml_fp16_to_fp32(kernel1_data[1])); | |
| // Expected output calculation by hand: | |
| printf("\nExpected output (manual calculation):\n"); | |
| for (int h = 0; h < 2; h++) { | |
| for (int w = 0; w < 2; w++) { | |
| float ch0_val = 0 * 10 + h * 2 + w + 1; // channel 0 value | |
| float ch1_val = 1 * 10 + h * 2 + w + 1; // channel 1 value | |
| float expected = ch0_val * 1.0f + ch1_val * 1.0f; | |
| printf(" expected[w=%d,h=%d] = %.0f*1 + %.0f*1 = %.0f\n", w, h, ch0_val, ch1_val, expected); | |
| } | |
| } | |
| // Create operations: im2col+gemm vs direct | |
| struct ggml_tensor * result_im2col = ggml_conv_2d(ctx1, kernel1, input1, 1, 1, 0, 0, 1, 1); | |
| struct ggml_tensor * result_direct = ggml_conv_2d_direct(ctx2, kernel2, input2, 1, 1, 0, 0, 1, 1); | |
| // Build and compute graphs | |
| struct ggml_cgraph * graph1 = ggml_new_graph(ctx1); | |
| struct ggml_cgraph * graph2 = ggml_new_graph(ctx2); | |
| ggml_build_forward_expand(graph1, result_im2col); | |
| ggml_build_forward_expand(graph2, result_direct); | |
| ggml_backend_t backend = ggml_backend_cpu_init(); | |
| ggml_backend_graph_compute(backend, graph1); | |
| ggml_backend_graph_compute(backend, graph2); | |
| // Check actual outputs (mixed precision produces F16 outputs) | |
| ggml_fp16_t * result_im2col_data = (ggml_fp16_t*)result_im2col->data; | |
| ggml_fp16_t * result_direct_data = (ggml_fp16_t*)result_direct->data; | |
| printf("\nIM2COL+GEMM output (mixed precision -> F16):\n"); | |
| for (int h = 0; h < 2; h++) { | |
| for (int w = 0; w < 2; w++) { | |
| int idx = h * 2 + w; | |
| printf(" im2col[w=%d,h=%d] = %.1f\n", w, h, ggml_fp16_to_fp32(result_im2col_data[idx])); | |
| } | |
| } | |
| printf("\nDirect/Tiled output (mixed precision -> F16):\n"); | |
| for (int h = 0; h < 2; h++) { | |
| for (int w = 0; w < 2; w++) { | |
| int idx = h * 2 + w; | |
| printf(" direct[w=%d,h=%d] = %.1f\n", w, h, ggml_fp16_to_fp32(result_direct_data[idx])); | |
| } | |
| } | |
| // Compare them (with tolerance for mixed precision) | |
| bool match = true; | |
| float tolerance = 1e-3f; // Tolerance for mixed precision | |
| for (int i = 0; i < 4; i++) { | |
| float val_im2col = ggml_fp16_to_fp32(result_im2col_data[i]); | |
| float val_direct = ggml_fp16_to_fp32(result_direct_data[i]); | |
| if (fabs(val_im2col - val_direct) > tolerance) { | |
| match = false; | |
| break; | |
| } | |
| } | |
| printf("\nMixed Precision Comparison: %s\n", match ? "✅ MATCH" : "❌ MISMATCH"); | |
| printf("(F16 kernel + F32 input -> F16 output)\n"); | |
| ggml_backend_free(backend); | |
| ggml_free(ctx1); | |
| ggml_free(ctx2); | |
| printf("\n"); | |
| } | |
| int main() { | |
| ggml_time_init(); // Initialize GGML timing | |
| printf("=== Debug Tensor Layout ===\n"); | |
| debug_tensor_layout(); | |
| printf("\n"); | |
| test_simple_conv2d(); | |
| test_larger_conv2d(); | |
| // Mixed precision test | |
| printf("=== Multi-Output Test (2x2x2 -> 2x2x2) ===\n"); | |
| struct ggml_init_params params2 = { | |
| .mem_size = 128*1024*1024, | |
| .mem_buffer = NULL, | |
| .no_alloc = false, | |
| }; | |
| struct ggml_context * ctx3 = ggml_init(params2); | |
| struct ggml_context * ctx4 = ggml_init(params2); | |
| // Test: 2x2 input, 2 channels, 1x1 kernel, 2 output channels | |
| // Input: 2x2x2x1 (W=2, H=2, C=2, N=1) | |
| // Kernel: 1x1x2x2 (KW=1, KH=1, IC=2, OC=2) | |
| // Output: 2x2x2x1 | |
| struct ggml_tensor * input3 = ggml_new_tensor_4d(ctx3, GGML_TYPE_F32, 2, 2, 2, 1); | |
| struct ggml_tensor * kernel3 = ggml_new_tensor_4d(ctx3, GGML_TYPE_F32, 1, 1, 2, 2); | |
| struct ggml_tensor * input4 = ggml_new_tensor_4d(ctx4, GGML_TYPE_F32, 2, 2, 2, 1); | |
| struct ggml_tensor * kernel4 = ggml_new_tensor_4d(ctx4, GGML_TYPE_F32, 1, 1, 2, 2); | |
| // Fill input with same pattern as before | |
| printf("Input tensor (WHCN layout):\n"); | |
| for (int c = 0; c < 2; c++) { | |
| for (int h = 0; h < 2; h++) { | |
| for (int w = 0; w < 2; w++) { | |
| float val = c * 10 + h * 2 + w + 1; | |
| float * ptr3 = (float*)((char*)input3->data + w * input3->nb[0] + h * input3->nb[1] + c * input3->nb[2]); | |
| float * ptr4 = (float*)((char*)input4->data + w * input4->nb[0] + h * input4->nb[1] + c * input4->nb[2]); | |
| *ptr3 = *ptr4 = val; | |
| printf(" input[w=%d,h=%d,c=%d] = %.0f\n", w, h, c, val); | |
| } | |
| } | |
| } | |
| // Fill kernel with different weights for each output channel | |
| float * kernel3_data = (float*)kernel3->data; | |
| float * kernel4_data = (float*)kernel4->data; | |
| // Output channel 0: weights [1, 2] (for input channels 0,1) | |
| // Output channel 1: weights [0.5, 1.5] (for input channels 0,1) | |
| kernel3_data[0] = kernel4_data[0] = 1.0f; // OC=0, IC=0 | |
| kernel3_data[1] = kernel4_data[1] = 2.0f; // OC=0, IC=1 | |
| kernel3_data[2] = kernel4_data[2] = 0.5f; // OC=1, IC=0 | |
| kernel3_data[3] = kernel4_data[3] = 1.5f; // OC=1, IC=1 | |
| printf("\nKernel weights:\n"); | |
| printf(" Output channel 0: [%.1f, %.1f]\n", kernel3_data[0], kernel3_data[1]); | |
| printf(" Output channel 1: [%.1f, %.1f]\n", kernel3_data[2], kernel3_data[3]); | |
| // Expected output calculation | |
| printf("\nExpected output (manual calculation):\n"); | |
| for (int oc = 0; oc < 2; oc++) { | |
| printf(" Output channel %d:\n", oc); | |
| for (int h = 0; h < 2; h++) { | |
| for (int w = 0; w < 2; w++) { | |
| float ch0_val = 0 * 10 + h * 2 + w + 1; // input channel 0 | |
| float ch1_val = 1 * 10 + h * 2 + w + 1; // input channel 1 | |
| float weight0 = kernel3_data[oc * 2 + 0]; | |
| float weight1 = kernel3_data[oc * 2 + 1]; | |
| float expected = ch0_val * weight0 + ch1_val * weight1; | |
| printf(" expected[w=%d,h=%d] = %.0f*%.1f + %.0f*%.1f = %.1f\n", | |
| w, h, ch0_val, weight0, ch1_val, weight1, expected); | |
| } | |
| } | |
| } | |
| // Create operations | |
| struct ggml_tensor * result3 = ggml_conv_2d(ctx3, kernel3, input3, 1, 1, 0, 0, 1, 1); | |
| struct ggml_tensor * result4 = ggml_conv_2d_direct(ctx4, kernel4, input4, 1, 1, 0, 0, 1, 1); | |
| // Build and compute graphs | |
| struct ggml_cgraph * graph3 = ggml_new_graph(ctx3); | |
| struct ggml_cgraph * graph4 = ggml_new_graph(ctx4); | |
| ggml_build_forward_expand(graph3, result3); | |
| ggml_build_forward_expand(graph4, result4); | |
| ggml_backend_t backend2 = ggml_backend_cpu_init(); | |
| ggml_backend_graph_compute(backend2, graph3); | |
| ggml_backend_graph_compute(backend2, graph4); | |
| // Check outputs | |
| float * result3_data = (float*)result3->data; | |
| float * result4_data = (float*)result4->data; | |
| printf("\nIM2COL+GEMM output:\n"); | |
| for (int oc = 0; oc < 2; oc++) { | |
| printf(" Output channel %d:\n", oc); | |
| for (int h = 0; h < 2; h++) { | |
| for (int w = 0; w < 2; w++) { | |
| int idx = oc * 4 + h * 2 + w; // Assuming WHCN layout for output | |
| printf(" im2col[w=%d,h=%d] = %.1f\n", w, h, result3_data[idx]); | |
| } | |
| } | |
| } | |
| printf("\nDirect/Tiled output:\n"); | |
| for (int oc = 0; oc < 2; oc++) { | |
| printf(" Output channel %d:\n", oc); | |
| for (int h = 0; h < 2; h++) { | |
| for (int w = 0; w < 2; w++) { | |
| int idx = oc * 4 + h * 2 + w; // Assuming WHCN layout for output | |
| printf(" direct[w=%d,h=%d] = %.1f\n", w, h, result4_data[idx]); | |
| } | |
| } | |
| } | |
| // Compare | |
| bool match2 = true; | |
| for (int i = 0; i < 8; i++) { | |
| if (fabs(result3_data[i] - result4_data[i]) > 1e-4) { | |
| match2 = false; | |
| break; | |
| } | |
| } | |
| printf("\nMulti-output comparison: %s\n", match2 ? "✅ MATCH" : "❌ MISMATCH"); | |
| ggml_backend_free(backend2); | |
| ggml_free(ctx3); | |
| ggml_free(ctx4); | |
| printf("Done with focused tests.\n\n"); | |
| printf("# Conv2D Implementation Performance Comparison\n\n"); | |
| // === FP16 vs FP32 Performance Tests === | |
| test_performance_fp16(112, 112, 32, 1, 2, 2, 64, 1, 0, 1); // pointwise | |
| test_performance_fp16(56, 56, 64, 1, 1, 1, 128, 1, 0, 1); // pointwise | |
| test_performance_fp16(112, 112, 32, 1, 3, 3, 32, 1, 1, 1); // depthwise (groups=32) | |
| test_performance_fp16(512, 512, 3, 8, 7, 7, 64, 2, 3, 1); // Large ResNet50 first layer | |
| printf("\n# IM2COL vs SIMD Performance Tests (FP32)\n"); | |
| printf("| Input Size | Kernel | Config | IM2COL (ms) | SIMD (ms) | Speedup | Match | Type |\n"); | |
| printf("|------------|--------|--------|-------------|-----------|---------|-------|------|\n"); | |
| // === ResNet-style convolutions === | |
| // ResNet50 first conv layer (ImageNet input) | |
| test_performance(224, 224, 3, 1, 7, 7, 64, 2, 3, 1); | |
| // ResNet50 conv2_x layers | |
| test_performance(56, 56, 64, 1, 1, 1, 64, 1, 0, 1); // 1x1 bottleneck | |
| test_performance(56, 56, 64, 1, 3, 3, 64, 1, 1, 1); // 3x3 conv | |
| test_performance(56, 56, 64, 1, 1, 1, 256, 1, 0, 1); // 1x1 expansion | |
| // ResNet50 conv3_x layers | |
| test_performance(28, 28, 256, 1, 1, 1, 128, 1, 0, 1); // 1x1 bottleneck | |
| test_performance(28, 28, 128, 1, 3, 3, 128, 1, 1, 1); // 3x3 conv | |
| test_performance(28, 28, 128, 1, 1, 1, 512, 1, 0, 1); // 1x1 expansion | |
| // ResNet50 conv4_x layers | |
| test_performance(14, 14, 512, 1, 1, 1, 256, 1, 0, 1); // 1x1 bottleneck | |
| test_performance(14, 14, 256, 1, 3, 3, 256, 1, 1, 1); // 3x3 conv | |
| test_performance(14, 14, 256, 1, 1, 1, 1024, 1, 0, 1);// 1x1 expansion | |
| // ResNet50 conv5_x layers | |
| test_performance(7, 7, 1024, 1, 1, 1, 512, 1, 0, 1); // 1x1 bottleneck | |
| test_performance(7, 7, 512, 1, 3, 3, 512, 1, 1, 1); // 3x3 conv | |
| test_performance(7, 7, 512, 1, 1, 1, 2048, 1, 0, 1); // 1x1 expansion | |
| // === VGG-style convolutions === | |
| // VGG16 early layers | |
| test_performance(224, 224, 3, 1, 3, 3, 64, 1, 1, 1); // conv1_1 | |
| test_performance(224, 224, 16, 1, 3, 3, 64, 1, 1, 1); // conv1_2 | |
| test_performance(112, 112, 64, 1, 3, 3, 128, 1, 1, 1);// conv2_1 | |
| test_performance(112, 112, 16, 1, 3, 3, 16, 1, 1, 1);// conv2_2 | |
| // VGG16 middle layers | |
| test_performance(56, 56, 128, 1, 3, 3, 256, 1, 1, 1); // conv3_1 | |
| test_performance(56, 56, 256, 1, 3, 3, 256, 1, 1, 1); // conv3_2,3_3 | |
| test_performance(28, 28, 256, 1, 3, 3, 512, 1, 1, 1); // conv4_1 | |
| test_performance(28, 28, 512, 1, 3, 3, 512, 1, 1, 1); // conv4_2,4_3 | |
| // === MobileNet-style convolutions === | |
| // MobileNetV1 depthwise separable convs | |
| test_performance(112, 112, 32, 1, 3, 3, 32, 1, 1, 1); // depthwise (groups=32) | |
| test_performance(112, 112, 32, 1, 1, 1, 64, 1, 0, 1); // pointwise | |
| test_performance(56, 56, 64, 1, 1, 1, 128, 1, 0, 1); // pointwise | |
| test_performance(28, 28, 128, 1, 1, 1, 256, 1, 0, 1); // pointwise | |
| // === EfficientNet-style convolutions === | |
| // EfficientNet-B0 layers | |
| test_performance(224, 224, 3, 1, 3, 3, 32, 2, 1, 1); // stem conv | |
| test_performance(112, 112, 16, 1, 3, 3, 96, 1, 1, 1); // expand conv | |
| test_performance(112, 112, 24, 1, 1, 1, 14, 1, 0, 1);// expand conv | |
| test_performance(56, 56, 40, 1, 1, 1, 24, 1, 0, 1); // expand conv | |
| // === Modern CNN typical layers === | |
| // ConvNeXt-style convolutions | |
| test_performance(56, 56, 96, 1, 7, 7, 96, 1, 3, 1); // large kernel depthwise | |
| test_performance(28, 28, 192, 1, 7, 7, 192, 1, 3, 1); // large kernel depthwise | |
| test_performance(14, 14, 384, 1, 7, 7, 384, 1, 3, 1); // large kernel depthwise | |
| // === Batch processing scenarios === | |
| // Inference with batch size 4 (common for edge devices) | |
| test_performance(224, 224, 3, 4, 3, 3, 64, 1, 1, 1); // batch input processing | |
| test_performance(56, 56, 64, 4, 3, 3, 64, 1, 1, 1); // batch feature processing | |
| test_performance(28, 28, 128, 4, 1, 1, 64, 1, 0, 1); // batch pointwise | |
| test_performance(512, 512, 3, 1, 3, 3, 1, 1, 0, 1); // 3x3 conv, batch 1, single output channel | |
| test_performance(256, 512, 3, 1, 3, 3, 9, 1, 0, 1); // 3x3 conv, batch 1, non-square input | |
| test_performance(896, 896, 1, 1, 3, 3, 1, 1, 0, 1); // 3x3 conv, batch 1, single input/output channel | |
| //test CLIP | |
| test_performance(224, 224, 3, 1, 3, 3, 768, 1, 1, 2); // 3x3 conv, batch 1, 768 output channels | |
| printf("\n"); | |
| printf("\n# Large Conv2D Performance Tests (Realistic Neural Network Sizes)\n"); | |
| printf("| Input Size | Kernel | Config | IM2COL (ms) | SIMD (ms) | Speedup | Match | Type |\n"); | |
| printf("|------------|--------|--------|-------------|-----------|---------|-------|------|\n"); | |
| // === Large ResNet-style convolutions === | |
| test_performance(512, 512, 3, 8, 7, 7, 64, 2, 3, 1); // Large input ResNet50 first layer (8 batch) | |
| printf("\n# FP16 Large Conv2D Performance Tests\n"); | |
| printf("| Input Size | Kernel | Config | FP32 (ms) | FP16 (ms) | Speedup | Match | Type |\n"); | |
| printf("|------------|--------|--------|-----------|-----------|---------|-------|------|\n"); | |
| test_performance_fp16(512, 512, 3, 8, 7, 7, 64, 2, 3, 1); // Large input ResNet50 first layer (8 batch) | |
| // === Large FP16 vs FP32 tests === | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment