

@hollance
Created May 4, 2025 12:45
Comparing naive multichannel TDF2 biquad vs handrolled SIMD
/*
Is it faster to process multiple channels of audio in parallel through their own
biquads using a hand-rolled SIMD implementation (4 channels per vector) than to
let the compiler auto-vectorize the naive per-channel loop?
Interestingly enough, the SIMD version is faster the more channels you add,
up to a point... With 64 channels, the naive code is a tiny bit faster,
presumably because now the compiler will vectorize the loop itself? (Didn't
check the assembly.)
Or maybe now it becomes more of a memory access pattern thing, i.e. the
computation is no longer the bottleneck but RAM speed is.
The number of samples does not seem to matter.
TODO:
- Intel version (SSE + AVX for 8 channels); a rough, untested SSE sketch is included after benchmark_2 below
Compile and run:
$ clang -std=c++17 -lstdc++ -Wall -Wextra -O3 -ftree-vectorize -ffast-math -march=native tdf2.cpp -o tdf2
$ ./tdf2
Run it a few times in a row to get more reliable results (warm up that cache).
*/
#include <cstdio>
#include <cstdlib>   // std::aligned_alloc, std::srand, std::rand
#include <cstring>   // std::memset
#include <cstdint>   // uintptr_t
#include <cmath>
#include <ctime>
#include <chrono>
#include <memory>
#include <string>
#include <limits>
#include <vector>
#if defined(__SSE__)
#include <xmmintrin.h>
#elif defined(__ARM_NEON__)
#include <arm_neon.h>
#endif
constexpr size_t numSteps = 10000;
constexpr size_t numChannels = 16; // must be multiple of 4
constexpr size_t numSamples = 1024;
// Input and output signals
float* xs = nullptr;
float* ys1 = nullptr;
float* ys2 = nullptr;
// Filter coefficients
float* a0;
float* a1;
float* a2;
float* b1;
float* b2;
// Filter delay units (not used by SIMD version)
float z1[numChannels] = { 0.0f };
float z2[numChannels] = { 0.0f };
long long elapsed1;
long long elapsed2;
// =============================================================================
// TDF2 biquad without explicit vectorization
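// Per channel, the transposed direct form II recurrence is:
//   y[n] = a0*x[n] + z1
//   z1   = a1*x[n] - b1*y[n] + z2
//   z2   = a2*x[n] - b2*y[n]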
void benchmark_1()
{
auto startTime = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < numSteps; ++i) {
std::memset(z1, 0, numChannels * sizeof(float));
std::memset(z2, 0, numChannels * sizeof(float));
for (size_t s = 0; s < numSamples; ++s) {
size_t j = s * numChannels;
for (size_t c = 0; c < numChannels; ++c) {
float x = xs[j + c];
float y = a0[c] * x + z1[c];
z1[c] = a1[c] * x - b1[c] * y + z2[c];
z2[c] = a2[c] * x - b2[c] * y;
// To avoid output differences due to floating-point operations not
// being associative, do the operations in the exact same order as the
// SIMD code. This should give the exact same results, but for some
// reason there is still a small error (at the limit of float precision).
//float y = z1[c] + a0[c] * x;
//z1[c] = z2[c] + a1[c] * x - b1[c] * y;
//z2[c] = a2[c] * x - b2[c] * y;
ys1[j + c] = y;
}
}
}
auto endTime = std::chrono::high_resolution_clock::now();
auto elapsed = endTime - startTime;
elapsed1 = elapsed.count();
printf("[benchmark_1] elapsed: %lld\n", elapsed1);
/*
// Do something with the result so the loop doesn't get optimized out.
printf("first timestep:\n");
for (size_t c = 0; c < numChannels; ++c) {
printf("%.24f\n", ys1[c]);
}
printf("last timestep:\n");
for (size_t c = 0; c < numChannels; ++c) {
printf("%.24f\n", ys1[(numSamples - 1) * numChannels + c]);
}
*/
}
// =============================================================================
// TDF2 biquad with SIMD implemented by hand (ARM NEON only for now)
void benchmark_2()
{
/*
// Version for 4 channels:
float32x4_t va0 = vld1q_f32(a0);
float32x4_t va1 = vld1q_f32(a1);
float32x4_t va2 = vld1q_f32(a2);
float32x4_t vb1 = vld1q_f32(b1);
float32x4_t vb2 = vld1q_f32(b2);
auto startTime = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < numSteps; ++i) {
float zero = 0.0f;
float32x4_t vz1 = vld1q_dup_f32(&zero);
float32x4_t vz2 = vld1q_dup_f32(&zero);
for (size_t s = 0; s < numSamples; ++s) {
size_t j = s * numChannels;
float32x4_t vx = vld1q_f32(xs + j);
float32x4_t vy = vmlaq_f32(vz1, va0, vx); // y = a0[c] * x + z1[c]
float32x4_t t1 = vmlaq_f32(vz2, va1, vx); // t1 = a1[c] * x + z2[c]
vz1 = vmlsq_f32(t1, vb1, vy); // z1 = t1 - b1[c] * y
float32x4_t t2 = vmulq_f32(va2, vx); // t2 = a2[c] * x
vz2 = vmlsq_f32(t2, vb2, vy); // z2 = t2 - b2[c] * y
vst1q_f32(ys2 + j, vy);
}
}
*/
constexpr size_t vc = numChannels / 4;
float32x4_t va0[vc];
float32x4_t va1[vc];
float32x4_t va2[vc];
float32x4_t vb1[vc];
float32x4_t vb2[vc];
for (size_t c = 0; c < vc; ++c) {
va0[c] = vld1q_f32(a0 + c*4);
va1[c] = vld1q_f32(a1 + c*4);
va2[c] = vld1q_f32(a2 + c*4);
vb1[c] = vld1q_f32(b1 + c*4);
vb2[c] = vld1q_f32(b2 + c*4);
}
float32x4_t vz1[vc];
float32x4_t vz2[vc];
auto startTime = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < numSteps; ++i) {
float zero = 0.0f;
for (size_t c = 0; c < vc; ++c) {
vz1[c] = vld1q_dup_f32(&zero);
vz2[c] = vld1q_dup_f32(&zero);
}
for (size_t s = 0; s < numSamples; ++s) {
const size_t j = s * numChannels;
for (size_t c = 0; c < vc; ++c) {
const size_t k = j + c*4;
float32x4_t vx = vld1q_f32(xs + k);
float32x4_t vy = vmlaq_f32(vz1[c], va0[c], vx); // y = a0[c] * x + z1[c]
float32x4_t t1 = vmlaq_f32(vz2[c], va1[c], vx); // t1 = a1[c] * x + z2[c]
vz1[c] = vmlsq_f32(t1, vb1[c], vy); // z1 = t1 - b1[c] * y
float32x4_t t2 = vmulq_f32(va2[c], vx); // t2 = a2[c] * x
vz2[c] = vmlsq_f32(t2, vb2[c], vy); // z2 = t2 - b2[c] * y
vst1q_f32(ys2 + k, vy);
}
}
}
auto endTime = std::chrono::high_resolution_clock::now();
auto elapsed = endTime - startTime;
elapsed2 = elapsed.count();
printf("[benchmark_2] elapsed: %lld\n", elapsed2);
/*
// Sanity check!
printf("first timestep:\n");
for (size_t c = 0; c < numChannels; ++c) {
printf("%.24f\n", ys2[c]);
}
printf("last timestep:\n");
for (size_t c = 0; c < numChannels; ++c) {
printf("%.24f\n", ys2[(numSamples - 1) * numChannels + c]);
}
*/
// Compare the output values at every timestep.
float maxDiff = 0.0f;
float threshold = 1e-6f;
for (size_t s = 0; s < numSamples; ++s) {
size_t j = s * numChannels;
for (size_t c = 0; c < numChannels; ++c) {
float diff = std::fabs(ys1[j + c] - ys2[j + c]);
if (diff > maxDiff) { maxDiff = diff; }
if (diff > threshold) {
printf("diff! @%zu:%zu %.24f (%.24f vs %.24f)\n", s, c, diff, ys1[j + c], ys2[j + c]);
goto buhbye;
}
}
}
buhbye:
// If the max diff is equal to or smaller than epsilon, I'm happy.
printf("max diff = %.24f\n", maxDiff);
printf(" epsilon = %.24f\n", std::numeric_limits<float>::epsilon());
printf("faster? %gx\n", double(elapsed1) / elapsed2);
}
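// =============================================================================
// A rough, untested sketch of the Intel version from the TODO above: plain SSE
// only (no FMA/AVX), so each multiply-accumulate becomes a separate mul plus
// add/sub. The structure mirrors benchmark_2; benchmark_3 is a made-up name,
// timing/comparison code is omitted, and the whole thing is left commented out
// because it hasn't been benchmarked.
/*
#if defined(__SSE__)
void benchmark_3()
{
    constexpr size_t vc = numChannels / 4;
    __m128 va0[vc], va1[vc], va2[vc], vb1[vc], vb2[vc];
    for (size_t c = 0; c < vc; ++c) {
        va0[c] = _mm_load_ps(a0 + c*4);
        va1[c] = _mm_load_ps(a1 + c*4);
        va2[c] = _mm_load_ps(a2 + c*4);
        vb1[c] = _mm_load_ps(b1 + c*4);
        vb2[c] = _mm_load_ps(b2 + c*4);
    }
    __m128 vz1[vc];
    __m128 vz2[vc];
    for (size_t i = 0; i < numSteps; ++i) {
        for (size_t c = 0; c < vc; ++c) {
            vz1[c] = _mm_setzero_ps();
            vz2[c] = _mm_setzero_ps();
        }
        for (size_t s = 0; s < numSamples; ++s) {
            const size_t j = s * numChannels;
            for (size_t c = 0; c < vc; ++c) {
                const size_t k = j + c*4;
                __m128 vx = _mm_load_ps(xs + k);
                __m128 vy = _mm_add_ps(vz1[c], _mm_mul_ps(va0[c], vx));              // y = a0[c] * x + z1[c]
                __m128 t1 = _mm_add_ps(vz2[c], _mm_mul_ps(va1[c], vx));              // t1 = a1[c] * x + z2[c]
                vz1[c] = _mm_sub_ps(t1, _mm_mul_ps(vb1[c], vy));                     // z1 = t1 - b1[c] * y
                vz2[c] = _mm_sub_ps(_mm_mul_ps(va2[c], vx), _mm_mul_ps(vb2[c], vy)); // z2 = a2[c] * x - b2[c] * y
                _mm_store_ps(ys2 + k, vy);
            }
        }
    }
}
#endif
*/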
// =============================================================================
int main(int /*argc*/, char** /*argv*/)
{
// Disable subnormals.
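// (On x86 this sets the FTZ and DAZ bits in MXCSR; on AArch64 it sets the FZ
// bit in FPCR, so tiny filter tails flush to zero instead of hitting the slow
// denormal path.)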
#if defined(__SSE__)
_mm_setcsr(_mm_getcsr() | 0x8040);
#elif defined(__ARM_NEON__)
uintptr_t fpcr = 0;
asm volatile("mrs %0, fpcr" : "=r"(fpcr));
fpcr |= 0x01000000U;
asm volatile("msr fpcr, %0" : : "ri"(fpcr));
#endif
// Make sure the memory is aligned
// For 128-bit vector registers we only need alignment of 16
// but let's do 32 anyway.
xs = (float*)std::aligned_alloc(32, numChannels * numSamples * sizeof(float));
ys1 = (float*)std::aligned_alloc(32, numChannels * numSamples * sizeof(float));
ys2 = (float*)std::aligned_alloc(32, numChannels * numSamples * sizeof(float));
std::srand(std::time(nullptr));
for (size_t i = 0; i < numChannels * numSamples; ++i) {
xs[i] = 2.0f * float(std::rand()) / RAND_MAX - 1.0f;
ys1[i] = 0.0f;
ys2[i] = 0.0f;
}
// Also align memory here. Note that size in bytes should be multiple of 32,
// hence the extra factor 2. Should probably stick these into a single buffer
// to avoid wasting space.
a0 = (float*)std::aligned_alloc(32, numChannels * 2 * sizeof(float));
a1 = (float*)std::aligned_alloc(32, numChannels * 2 * sizeof(float));
a2 = (float*)std::aligned_alloc(32, numChannels * 2 * sizeof(float));
b1 = (float*)std::aligned_alloc(32, numChannels * 2 * sizeof(float));
b2 = (float*)std::aligned_alloc(32, numChannels * 2 * sizeof(float));
// Make some fake but realistic coefficients for the different channels.
for (size_t c = 0; c < numChannels; ++c) {
a0[c] = 0.08365524702654487f + float(c)*0.001f;
a1[c] = 0.16731049405308973f + float(c)*0.002f;
a2[c] = 0.08365524702654487f - float(c)*0.0001f;
b1[c] = -1.031904346533239f - float(c)*0.002f;
b2[c] = 0.36652533463941855f + float(c)*0.001f;
}
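// For reference, an untested sketch of how real lowpass coefficients could be
// derived per channel, RBJ-cookbook style, mapped onto this file's naming where
// a0..a2 are the feedforward and b1, b2 the feedback coefficients. The sample
// rate and per-channel cutoff/Q below are made-up values, not used above.
/*
for (size_t c = 0; c < numChannels; ++c) {
    const float sampleRate = 44100.0f;
    const float cutoff = 1000.0f + 100.0f * float(c);
    const float Q = 0.707f;
    const float w0 = 2.0f * float(M_PI) * cutoff / sampleRate;
    const float alpha = std::sin(w0) / (2.0f * Q);
    const float cosw0 = std::cos(w0);
    const float norm = 1.0f / (1.0f + alpha);
    a0[c] = 0.5f * (1.0f - cosw0) * norm;
    a1[c] = (1.0f - cosw0) * norm;
    a2[c] = a0[c];
    b1[c] = -2.0f * cosw0 * norm;
    b2[c] = (1.0f - alpha) * norm;
}
*/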
printf("***ignore this result (warming up the cache)***\n");
benchmark_1();
benchmark_2();
printf("***OK now pay attention again***\n");
benchmark_1();
benchmark_2();
return 0;
}