hollance · May 4, 2025 12:45
diff --git a/tdf2.cpp b/tdf2.cpp
 /*

  Is it faster to process 4 channels of audio in parallel through their own
  biquads using a hand-rolled SIMD implementation.

  Interestingly enough, the SIMD version is faster the more channels you add,
  up to a point... With 64 channels, the naive code is a tiny bit faster,
  presumably because now the compiler will vectorize the loop itself? (Didn't
  check the assembly.)

  Or maybe now it becomes more of a memory access pattern thing, i.e. the
  computation is no longer the bottleneck but RAM speed is.

  The number of samples does not seem to matter.

  TODO:
    - Intel version (SSE + AVX for 8 channels)

  Compile and run:

  $ clang -std=c++17 -lstdc++ -Wall -Wextra -O3 -ftree-vectorize -ffast-math -march=native tdf2.cpp -o tdf2
  $ ./tdf2

  Run it a few times in a row to get more reliable results (warm up that cache).
 */

 #include <cstdio>
 #include <cmath>
 #include <ctime>
 #include <chrono>
 #include <memory>
 #include <string>
 #include <limits>
 #include <vector>

 #if defined(__SSE__)
 #include <xmmintrin.h>
 #elif defined(__ARM_NEON__)
 #include <arm_neon.h>
 #endif

 constexpr size_t numSteps = 10000;
 constexpr size_t numChannels = 16;  // must be multiple of 4
 constexpr size_t numSamples = 1024;

 // Input and output signals
 float* xs = nullptr;
 float* ys1 = nullptr;
 float* ys2 = nullptr;

 // Filter coefficients
 float* a0;
 float* a1;
 float* a2;
 float* b1;
 float* b2;

 // Filter delay units (not used by SIMD version)
 float z1[numChannels] = { 0.0f };
 float z2[numChannels] = { 0.0f };

 long long elapsed1;
 long long elapsed2;

 // =============================================================================

 // TDF2 biquad without explicit vectorization

 void benchmark_1()
 {
    auto startTime = std::chrono::high_resolution_clock::now();

    for (size_t i = 0; i < numSteps; ++i) {
        std::memset(z1, 0, numChannels * sizeof(float));
        std::memset(z2, 0, numChannels * sizeof(float));

        for (size_t s = 0; s < numSamples; ++s) {
            size_t j = s * numChannels;
            for (size_t c = 0; c < numChannels; ++c) {
                float x = xs[j + c];

                float y = a0[c] * x + z1[c];
                z1[c] = a1[c] * x - b1[c] * y + z2[c];
                z2[c] = a2[c] * x - b2[c] * y;

                // To avoid differences in the output to floating point
                // operations not being associative, do the operations in
                // the exact same order as the SIMD code. This should give
                // the exact same results but for some reason there is still
                // a small error (at the limit of float precision).
                //float y = z1[c] + a0[c] * x;
                //z1[c] = z2[c] + a1[c] * x - b1[c] * y;
                //z2[c] = a2[c] * x - b2[c] * y;

                ys1[j + c] = y;
            }
        }
    }

    auto endTime = std::chrono::high_resolution_clock::now();
    auto elapsed = endTime - startTime;
    elapsed1 = elapsed.count();
    printf("[benchmark_1] elapsed: %lld\n", elapsed1);

    /*
    // Do something with the result so the loop doesn't get optimized out.
    printf("first timestep:\n");
    for (size_t c = 0; c < numChannels; ++c) {
        printf("%.24f\n", ys1[c]);
    }
    printf("last timestep:\n");
    for (size_t c = 0; c < numChannels; ++c) {
        printf("%.24f\n", ys1[(numSamples - 1) * numChannels + c]);
    }
    */
 }

 // =============================================================================

 // TDF2 biquad with SIMD implemented by hand (ARM NEON only for now)

 void benchmark_2()
 {
    /*
    // Version for 4 channels:

    float32x4_t va0 = vld1q_f32(a0);
    float32x4_t va1 = vld1q_f32(a1);
    float32x4_t va2 = vld1q_f32(a2);
    float32x4_t vb1 = vld1q_f32(b1);
    float32x4_t vb2 = vld1q_f32(b2);

    auto startTime = std::chrono::high_resolution_clock::now();

    for (size_t i = 0; i < numSteps; ++i) {
        float zero = 0.0f;
        float32x4_t vz1 = vld1q_dup_f32(&zero);
        float32x4_t vz2 = vld1q_dup_f32(&zero);

        for (size_t s = 0; s < numSamples; ++s) {
            size_t j = s * numChannels;

            float32x4_t vx = vld1q_f32(xs + j);

            float32x4_t vy = vmlaq_f32(vz1, va0, vx);  // y = a0[c] * x + z1[c]

            float32x4_t t1 = vmlaq_f32(vz2, va1, vx);  // t1 = a1[c] * x + z2[c]
            vz1 = vmlsq_f32(t1, vb1, vy);              // z1 = t1 - b1[c] * y

            float32x4_t t2 = vmulq_f32(va2, vx);       // t2 = a2[c] * x
            vz2 = vmlsq_f32(t2, vb2, vy);              // z2 = t2 - b2[c] * y

            vst1q_f32(ys2 + j, vy);
        }
    }
    */

    constexpr int vc = numChannels / 4;

    float32x4_t va0[vc];
    float32x4_t va1[vc];
    float32x4_t va2[vc];
    float32x4_t vb1[vc];
    float32x4_t vb2[vc];

    for (size_t c = 0; c < vc; ++c) {
        va0[c] = vld1q_f32(a0 + c*4);
        va1[c] = vld1q_f32(a1 + c*4);
        va2[c] = vld1q_f32(a2 + c*4);
        vb1[c] = vld1q_f32(b1 + c*4);
        vb2[c] = vld1q_f32(b2 + c*4);
    }

    float32x4_t vz1[vc];
    float32x4_t vz2[vc];

    auto startTime = std::chrono::high_resolution_clock::now();

    for (size_t i = 0; i < numSteps; ++i) {
        float zero = 0.0f;
        for (size_t c = 0; c < vc; ++c) {
            vz1[c] = vld1q_dup_f32(&zero);
            vz2[c] = vld1q_dup_f32(&zero);
        }

        for (size_t s = 0; s < numSamples; ++s) {
            const size_t j = s * numChannels;
            for (size_t c = 0; c < vc; ++c) {
                const size_t k = j + c*4;
                float32x4_t vx = vld1q_f32(xs + k);
                float32x4_t vy = vmlaq_f32(vz1[c], va0[c], vx);  // y = a0[c] * x + z1[c]
                float32x4_t t1 = vmlaq_f32(vz2[c], va1[c], vx);  // t1 = a1[c] * x + z2[c]
                vz1[c] = vmlsq_f32(t1, vb1[c], vy);              // z1 = t1 - b1[c] * y
                float32x4_t t2 = vmulq_f32(va2[c], vx);          // t2 = a2[c] * x
                vz2[c] = vmlsq_f32(t2, vb2[c], vy);              // z2 = t2 - b2[c] * y
                vst1q_f32(ys2 + k, vy);
            }
        }
    }

    auto endTime = std::chrono::high_resolution_clock::now();
    auto elapsed = endTime - startTime;
    elapsed2 = elapsed.count();
    printf("[benchmark_2] elapsed: %lld\n", elapsed2);

    /*
    // Sanity check!
    printf("first timestep:\n");
    for (size_t c = 0; c < numChannels; ++c) {
        printf("%.24f\n", ys2[c]);
    }
    printf("last timestep:\n");
    for (size_t c = 0; c < numChannels; ++c) {
        printf("%.24f\n", ys2[(numSamples - 1) * numChannels + c]);
    }
    */

    // Compare the output values at every timestep.
    float maxDiff = 0.0f;
    float threshold = 1e-6f;
    for (size_t s = 0; s < numSamples; ++s) {
        size_t j = s * numChannels;
        for (size_t c = 0; c < numChannels; ++c) {
            float diff = ys1[j + c] - ys2[j + c];
            if (diff > maxDiff) { maxDiff = diff; }
            if (diff > threshold) {
                printf("diff! @%zu:%zu %.24f (%.24f vs %.24f)\n", s, c, diff, ys1[j + c], ys2[j + c]);
                goto buhbye;
            }
        }
    }
 buhbye:
    // If the max diff is equal to or smaller than epsilon, I'm happy.
    printf("max diff = %.24f\n", maxDiff);
    printf(" epsilon = %.24f\n", std::numeric_limits<float>::epsilon());

    printf("faster? %gx\n", double(elapsed1) / elapsed2);
 }

 // =============================================================================

 int main(int /*argc*/, char** /*argv*/)
 {
    // Disable subnormals.
    #if defined(__SSE__)
    _mm_setcsr(_mm_getcsr() | 0x8040);
    #elif defined(__ARM_NEON__)
    uintptr_t fpsr = 0;
    asm volatile("mrs %0, fpcr" : "=r"(fpsr));
    fpsr |= 0x01000000U;
    asm volatile("msr fpcr, %0" : : "ri"(fpsr));
    #endif

    // Make sure the memory is aligned
    // For 128-bit vector registers we only need alignment of 16
    // but let's do 32 anyway.
    xs  = (float*)std::aligned_alloc(32, numChannels * numSamples * sizeof(float));
    ys1 = (float*)std::aligned_alloc(32, numChannels * numSamples * sizeof(float));
    ys2 = (float*)std::aligned_alloc(32, numChannels * numSamples * sizeof(float));

    std::srand(std::time(nullptr));

    for (size_t i = 0; i < numChannels * numSamples; ++i) {
        xs[i] = 2.0f * float(std::rand()) / RAND_MAX - 1.0f;
        ys1[i] = 0.0f;
        ys2[i] = 0.0f;
    }

    // Also align memory here. Note that size in bytes should be multiple of 32,
    // hence the extra factor 2. Should probably stick these into a single buffer
    // to avoid wasting space.
    a0 = (float*)std::aligned_alloc(32, numChannels * 2 * sizeof(float));
    a1 = (float*)std::aligned_alloc(32, numChannels * 2 * sizeof(float));
    a2 = (float*)std::aligned_alloc(32, numChannels * 2 * sizeof(float));
    b1 = (float*)std::aligned_alloc(32, numChannels * 2 * sizeof(float));
    b2 = (float*)std::aligned_alloc(32, numChannels * 2 * sizeof(float));

    // Make some fake but realistic coefficients for the different channels.
    for (size_t c = 0; c < numChannels; ++c) {
        a0[c] = 0.08365524702654487f + float(c)*0.001f;
        a1[c] = 0.16731049405308973f + float(c)*0.002f;
        a2[c] = 0.08365524702654487f - float(c)*0.0001f;
        b1[c] = -1.031904346533239f - float(c)*0.002f;
        b2[c] = 0.36652533463941855f + float(c)*0.001f;
    }

    printf("***ignore this result (warming up the cache)***\n");
    benchmark_1();
    benchmark_2();

    printf("***OK now pay attention again***\n");
    benchmark_1();
    benchmark_2();

    return 0;
 }
	/*

	Is it faster to process 4 channels of audio in parallel through their own
	biquads using a hand-rolled SIMD implementation.

	Interestingly enough, the SIMD version is faster the more channels you add,
	up to a point... With 64 channels, the naive code is a tiny bit faster,
	presumably because now the compiler will vectorize the loop itself? (Didn't
	check the assembly.)

	Or maybe now it becomes more of a memory access pattern thing, i.e. the
	computation is no longer the bottleneck but RAM speed is.

	The number of samples does not seem to matter.

	TODO:
	- Intel version (SSE + AVX for 8 channels)

	Compile and run:

	$ clang -std=c++17 -lstdc++ -Wall -Wextra -O3 -ftree-vectorize -ffast-math -march=native tdf2.cpp -o tdf2
	$ ./tdf2

	Run it a few times in a row to get more reliable results (warm up that cache).
	*/

	#include <cstdio>
	#include <cmath>
	#include <ctime>
	#include <chrono>
	#include <memory>
	#include <string>
	#include <limits>
	#include <vector>

	#if defined(__SSE__)
	#include <xmmintrin.h>
	#elif defined(__ARM_NEON__)
	#include <arm_neon.h>
	#endif

	constexpr size_t numSteps = 10000;
	constexpr size_t numChannels = 16; // must be multiple of 4
	constexpr size_t numSamples = 1024;

	// Input and output signals
	float* xs = nullptr;
	float* ys1 = nullptr;
	float* ys2 = nullptr;

	// Filter coefficients
	float* a0;
	float* a1;
	float* a2;
	float* b1;
	float* b2;

	// Filter delay units (not used by SIMD version)
	float z1[numChannels] = { 0.0f };
	float z2[numChannels] = { 0.0f };

	long long elapsed1;
	long long elapsed2;

	// =============================================================================

	// TDF2 biquad without explicit vectorization

	void benchmark_1()
	{
	auto startTime = std::chrono::high_resolution_clock::now();

	for (size_t i = 0; i < numSteps; ++i) {
	std::memset(z1, 0, numChannels * sizeof(float));
	std::memset(z2, 0, numChannels * sizeof(float));

	for (size_t s = 0; s < numSamples; ++s) {
	size_t j = s * numChannels;
	for (size_t c = 0; c < numChannels; ++c) {
	float x = xs[j + c];

	float y = a0[c] * x + z1[c];
	z1[c] = a1[c] * x - b1[c] * y + z2[c];
	z2[c] = a2[c] * x - b2[c] * y;

	// To avoid differences in the output to floating point
	// operations not being associative, do the operations in
	// the exact same order as the SIMD code. This should give
	// the exact same results but for some reason there is still
	// a small error (at the limit of float precision).
	//float y = z1[c] + a0[c] * x;
	//z1[c] = z2[c] + a1[c] * x - b1[c] * y;
	//z2[c] = a2[c] * x - b2[c] * y;

	ys1[j + c] = y;
	}
	}
	}

	auto endTime = std::chrono::high_resolution_clock::now();
	auto elapsed = endTime - startTime;
	elapsed1 = elapsed.count();
	printf("[benchmark_1] elapsed: %lld\n", elapsed1);

	/*
	// Do something with the result so the loop doesn't get optimized out.
	printf("first timestep:\n");
	for (size_t c = 0; c < numChannels; ++c) {
	printf("%.24f\n", ys1[c]);
	}
	printf("last timestep:\n");
	for (size_t c = 0; c < numChannels; ++c) {
	printf("%.24f\n", ys1[(numSamples - 1) * numChannels + c]);
	}
	*/
	}

	// =============================================================================

	// TDF2 biquad with SIMD implemented by hand (ARM NEON only for now)

	void benchmark_2()
	{
	/*
	// Version for 4 channels:

	float32x4_t va0 = vld1q_f32(a0);
	float32x4_t va1 = vld1q_f32(a1);
	float32x4_t va2 = vld1q_f32(a2);
	float32x4_t vb1 = vld1q_f32(b1);
	float32x4_t vb2 = vld1q_f32(b2);

	auto startTime = std::chrono::high_resolution_clock::now();

	for (size_t i = 0; i < numSteps; ++i) {
	float zero = 0.0f;
	float32x4_t vz1 = vld1q_dup_f32(&zero);
	float32x4_t vz2 = vld1q_dup_f32(&zero);

	for (size_t s = 0; s < numSamples; ++s) {
	size_t j = s * numChannels;

	float32x4_t vx = vld1q_f32(xs + j);

	float32x4_t vy = vmlaq_f32(vz1, va0, vx); // y = a0[c] * x + z1[c]

	float32x4_t t1 = vmlaq_f32(vz2, va1, vx); // t1 = a1[c] * x + z2[c]
	vz1 = vmlsq_f32(t1, vb1, vy); // z1 = t1 - b1[c] * y

	float32x4_t t2 = vmulq_f32(va2, vx); // t2 = a2[c] * x
	vz2 = vmlsq_f32(t2, vb2, vy); // z2 = t2 - b2[c] * y

	vst1q_f32(ys2 + j, vy);
	}
	}
	*/

	constexpr int vc = numChannels / 4;

	float32x4_t va0[vc];
	float32x4_t va1[vc];
	float32x4_t va2[vc];
	float32x4_t vb1[vc];
	float32x4_t vb2[vc];

	for (size_t c = 0; c < vc; ++c) {
	va0[c] = vld1q_f32(a0 + c*4);
	va1[c] = vld1q_f32(a1 + c*4);
	va2[c] = vld1q_f32(a2 + c*4);
	vb1[c] = vld1q_f32(b1 + c*4);
	vb2[c] = vld1q_f32(b2 + c*4);
	}

	float32x4_t vz1[vc];
	float32x4_t vz2[vc];

	auto startTime = std::chrono::high_resolution_clock::now();

	for (size_t i = 0; i < numSteps; ++i) {
	float zero = 0.0f;
	for (size_t c = 0; c < vc; ++c) {
	vz1[c] = vld1q_dup_f32(&zero);
	vz2[c] = vld1q_dup_f32(&zero);
	}

	for (size_t s = 0; s < numSamples; ++s) {
	const size_t j = s * numChannels;
	for (size_t c = 0; c < vc; ++c) {
	const size_t k = j + c*4;
	float32x4_t vx = vld1q_f32(xs + k);
	float32x4_t vy = vmlaq_f32(vz1[c], va0[c], vx); // y = a0[c] * x + z1[c]
	float32x4_t t1 = vmlaq_f32(vz2[c], va1[c], vx); // t1 = a1[c] * x + z2[c]
	vz1[c] = vmlsq_f32(t1, vb1[c], vy); // z1 = t1 - b1[c] * y
	float32x4_t t2 = vmulq_f32(va2[c], vx); // t2 = a2[c] * x
	vz2[c] = vmlsq_f32(t2, vb2[c], vy); // z2 = t2 - b2[c] * y
	vst1q_f32(ys2 + k, vy);
	}
	}
	}

	auto endTime = std::chrono::high_resolution_clock::now();
	auto elapsed = endTime - startTime;
	elapsed2 = elapsed.count();
	printf("[benchmark_2] elapsed: %lld\n", elapsed2);

	/*
	// Sanity check!
	printf("first timestep:\n");
	for (size_t c = 0; c < numChannels; ++c) {
	printf("%.24f\n", ys2[c]);
	}
	printf("last timestep:\n");
	for (size_t c = 0; c < numChannels; ++c) {
	printf("%.24f\n", ys2[(numSamples - 1) * numChannels + c]);
	}
	*/

	// Compare the output values at every timestep.
	float maxDiff = 0.0f;
	float threshold = 1e-6f;
	for (size_t s = 0; s < numSamples; ++s) {
	size_t j = s * numChannels;
	for (size_t c = 0; c < numChannels; ++c) {
	float diff = ys1[j + c] - ys2[j + c];
	if (diff > maxDiff) { maxDiff = diff; }
	if (diff > threshold) {
	printf("diff! @%zu:%zu %.24f (%.24f vs %.24f)\n", s, c, diff, ys1[j + c], ys2[j + c]);
	goto buhbye;
	}
	}
	}
	buhbye:
	// If the max diff is equal to or smaller than epsilon, I'm happy.
	printf("max diff = %.24f\n", maxDiff);
	printf(" epsilon = %.24f\n", std::numeric_limits<float>::epsilon());

	printf("faster? %gx\n", double(elapsed1) / elapsed2);
	}

	// =============================================================================

	int main(int /argc/, char** /argv/)
	{
	// Disable subnormals.
	#if defined(__SSE__)
	_mm_setcsr(_mm_getcsr() \| 0x8040);
	#elif defined(__ARM_NEON__)
	uintptr_t fpsr = 0;
	asm volatile("mrs %0, fpcr" : "=r"(fpsr));
	fpsr \|= 0x01000000U;
	asm volatile("msr fpcr, %0" : : "ri"(fpsr));
	#endif

	// Make sure the memory is aligned
	// For 128-bit vector registers we only need alignment of 16
	// but let's do 32 anyway.
	xs = (float)std::aligned_alloc(32, numChannels numSamples * sizeof(float));
	ys1 = (float)std::aligned_alloc(32, numChannels numSamples * sizeof(float));
	ys2 = (float)std::aligned_alloc(32, numChannels numSamples * sizeof(float));

	std::srand(std::time(nullptr));

	for (size_t i = 0; i < numChannels * numSamples; ++i) {
	xs[i] = 2.0f * float(std::rand()) / RAND_MAX - 1.0f;
	ys1[i] = 0.0f;
	ys2[i] = 0.0f;
	}

	// Also align memory here. Note that size in bytes should be multiple of 32,
	// hence the extra factor 2. Should probably stick these into a single buffer
	// to avoid wasting space.
	a0 = (float)std::aligned_alloc(32, numChannels 2 * sizeof(float));
	a1 = (float)std::aligned_alloc(32, numChannels 2 * sizeof(float));
	a2 = (float)std::aligned_alloc(32, numChannels 2 * sizeof(float));
	b1 = (float)std::aligned_alloc(32, numChannels 2 * sizeof(float));
	b2 = (float)std::aligned_alloc(32, numChannels 2 * sizeof(float));

	// Make some fake but realistic coefficients for the different channels.
	for (size_t c = 0; c < numChannels; ++c) {
	a0[c] = 0.08365524702654487f + float(c)*0.001f;
	a1[c] = 0.16731049405308973f + float(c)*0.002f;
	a2[c] = 0.08365524702654487f - float(c)*0.0001f;
	b1[c] = -1.031904346533239f - float(c)*0.002f;
	b2[c] = 0.36652533463941855f + float(c)*0.001f;
	}

	printf("*ignore this result (warming up the cache)*\n");
	benchmark_1();
	benchmark_2();

	printf("*OK now pay attention again*\n");
	benchmark_1();
	benchmark_2();

	return 0;
	}