Created
May 4, 2025 12:45
-
-
Save hollance/e96eb64176e924b44f05d314565deb55 to your computer and use it in GitHub Desktop.
Comparing naive multichannel TDF2 biquad vs handrolled SIMD
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Is it faster to process 4 channels of audio in parallel through their own | |
biquads using a hand-rolled SIMD implementation. | |
Interestingly enough, the SIMD version is faster the more channels you add, | |
up to a point... With 64 channels, the naive code is a tiny bit faster, | |
presumably because now the compiler will vectorize the loop itself? (Didn't | |
check the assembly.) | |
Or maybe now it becomes more of a memory access pattern thing, i.e. the | |
computation is no longer the bottleneck but RAM speed is. | |
The number of samples does not seem to matter. | |
TODO: | |
- Intel version (SSE + AVX for 8 channels) | |
Compile and run: | |
$ clang -std=c++17 -lstdc++ -Wall -Wextra -O3 -ftree-vectorize -ffast-math -march=native tdf2.cpp -o tdf2 | |
$ ./tdf2 | |
Run it a few times in a row to get more reliable results (warm up that cache). | |
*/ | |
#include <cstdio> | |
#include <cmath> | |
#include <ctime> | |
#include <chrono> | |
#include <memory> | |
#include <string> | |
#include <limits> | |
#include <vector> | |
#if defined(__SSE__) | |
#include <xmmintrin.h> | |
#elif defined(__ARM_NEON__) | |
#include <arm_neon.h> | |
#endif | |
constexpr size_t numSteps = 10000; | |
constexpr size_t numChannels = 16; // must be multiple of 4 | |
constexpr size_t numSamples = 1024; | |
// Input and output signals | |
float* xs = nullptr; | |
float* ys1 = nullptr; | |
float* ys2 = nullptr; | |
// Filter coefficients | |
float* a0; | |
float* a1; | |
float* a2; | |
float* b1; | |
float* b2; | |
// Filter delay units (not used by SIMD version) | |
float z1[numChannels] = { 0.0f }; | |
float z2[numChannels] = { 0.0f }; | |
long long elapsed1; | |
long long elapsed2; | |
// ============================================================================= | |
// TDF2 biquad without explicit vectorization | |
void benchmark_1() | |
{ | |
auto startTime = std::chrono::high_resolution_clock::now(); | |
for (size_t i = 0; i < numSteps; ++i) { | |
std::memset(z1, 0, numChannels * sizeof(float)); | |
std::memset(z2, 0, numChannels * sizeof(float)); | |
for (size_t s = 0; s < numSamples; ++s) { | |
size_t j = s * numChannels; | |
for (size_t c = 0; c < numChannels; ++c) { | |
float x = xs[j + c]; | |
float y = a0[c] * x + z1[c]; | |
z1[c] = a1[c] * x - b1[c] * y + z2[c]; | |
z2[c] = a2[c] * x - b2[c] * y; | |
// To avoid differences in the output to floating point | |
// operations not being associative, do the operations in | |
// the exact same order as the SIMD code. This should give | |
// the exact same results but for some reason there is still | |
// a small error (at the limit of float precision). | |
//float y = z1[c] + a0[c] * x; | |
//z1[c] = z2[c] + a1[c] * x - b1[c] * y; | |
//z2[c] = a2[c] * x - b2[c] * y; | |
ys1[j + c] = y; | |
} | |
} | |
} | |
auto endTime = std::chrono::high_resolution_clock::now(); | |
auto elapsed = endTime - startTime; | |
elapsed1 = elapsed.count(); | |
printf("[benchmark_1] elapsed: %lld\n", elapsed1); | |
/* | |
// Do something with the result so the loop doesn't get optimized out. | |
printf("first timestep:\n"); | |
for (size_t c = 0; c < numChannels; ++c) { | |
printf("%.24f\n", ys1[c]); | |
} | |
printf("last timestep:\n"); | |
for (size_t c = 0; c < numChannels; ++c) { | |
printf("%.24f\n", ys1[(numSamples - 1) * numChannels + c]); | |
} | |
*/ | |
} | |
// ============================================================================= | |
// TDF2 biquad with SIMD implemented by hand (ARM NEON only for now) | |
void benchmark_2() | |
{ | |
/* | |
// Version for 4 channels: | |
float32x4_t va0 = vld1q_f32(a0); | |
float32x4_t va1 = vld1q_f32(a1); | |
float32x4_t va2 = vld1q_f32(a2); | |
float32x4_t vb1 = vld1q_f32(b1); | |
float32x4_t vb2 = vld1q_f32(b2); | |
auto startTime = std::chrono::high_resolution_clock::now(); | |
for (size_t i = 0; i < numSteps; ++i) { | |
float zero = 0.0f; | |
float32x4_t vz1 = vld1q_dup_f32(&zero); | |
float32x4_t vz2 = vld1q_dup_f32(&zero); | |
for (size_t s = 0; s < numSamples; ++s) { | |
size_t j = s * numChannels; | |
float32x4_t vx = vld1q_f32(xs + j); | |
float32x4_t vy = vmlaq_f32(vz1, va0, vx); // y = a0[c] * x + z1[c] | |
float32x4_t t1 = vmlaq_f32(vz2, va1, vx); // t1 = a1[c] * x + z2[c] | |
vz1 = vmlsq_f32(t1, vb1, vy); // z1 = t1 - b1[c] * y | |
float32x4_t t2 = vmulq_f32(va2, vx); // t2 = a2[c] * x | |
vz2 = vmlsq_f32(t2, vb2, vy); // z2 = t2 - b2[c] * y | |
vst1q_f32(ys2 + j, vy); | |
} | |
} | |
*/ | |
constexpr int vc = numChannels / 4; | |
float32x4_t va0[vc]; | |
float32x4_t va1[vc]; | |
float32x4_t va2[vc]; | |
float32x4_t vb1[vc]; | |
float32x4_t vb2[vc]; | |
for (size_t c = 0; c < vc; ++c) { | |
va0[c] = vld1q_f32(a0 + c*4); | |
va1[c] = vld1q_f32(a1 + c*4); | |
va2[c] = vld1q_f32(a2 + c*4); | |
vb1[c] = vld1q_f32(b1 + c*4); | |
vb2[c] = vld1q_f32(b2 + c*4); | |
} | |
float32x4_t vz1[vc]; | |
float32x4_t vz2[vc]; | |
auto startTime = std::chrono::high_resolution_clock::now(); | |
for (size_t i = 0; i < numSteps; ++i) { | |
float zero = 0.0f; | |
for (size_t c = 0; c < vc; ++c) { | |
vz1[c] = vld1q_dup_f32(&zero); | |
vz2[c] = vld1q_dup_f32(&zero); | |
} | |
for (size_t s = 0; s < numSamples; ++s) { | |
const size_t j = s * numChannels; | |
for (size_t c = 0; c < vc; ++c) { | |
const size_t k = j + c*4; | |
float32x4_t vx = vld1q_f32(xs + k); | |
float32x4_t vy = vmlaq_f32(vz1[c], va0[c], vx); // y = a0[c] * x + z1[c] | |
float32x4_t t1 = vmlaq_f32(vz2[c], va1[c], vx); // t1 = a1[c] * x + z2[c] | |
vz1[c] = vmlsq_f32(t1, vb1[c], vy); // z1 = t1 - b1[c] * y | |
float32x4_t t2 = vmulq_f32(va2[c], vx); // t2 = a2[c] * x | |
vz2[c] = vmlsq_f32(t2, vb2[c], vy); // z2 = t2 - b2[c] * y | |
vst1q_f32(ys2 + k, vy); | |
} | |
} | |
} | |
auto endTime = std::chrono::high_resolution_clock::now(); | |
auto elapsed = endTime - startTime; | |
elapsed2 = elapsed.count(); | |
printf("[benchmark_2] elapsed: %lld\n", elapsed2); | |
/* | |
// Sanity check! | |
printf("first timestep:\n"); | |
for (size_t c = 0; c < numChannels; ++c) { | |
printf("%.24f\n", ys2[c]); | |
} | |
printf("last timestep:\n"); | |
for (size_t c = 0; c < numChannels; ++c) { | |
printf("%.24f\n", ys2[(numSamples - 1) * numChannels + c]); | |
} | |
*/ | |
// Compare the output values at every timestep. | |
float maxDiff = 0.0f; | |
float threshold = 1e-6f; | |
for (size_t s = 0; s < numSamples; ++s) { | |
size_t j = s * numChannels; | |
for (size_t c = 0; c < numChannels; ++c) { | |
float diff = ys1[j + c] - ys2[j + c]; | |
if (diff > maxDiff) { maxDiff = diff; } | |
if (diff > threshold) { | |
printf("diff! @%zu:%zu %.24f (%.24f vs %.24f)\n", s, c, diff, ys1[j + c], ys2[j + c]); | |
goto buhbye; | |
} | |
} | |
} | |
buhbye: | |
// If the max diff is equal to or smaller than epsilon, I'm happy. | |
printf("max diff = %.24f\n", maxDiff); | |
printf(" epsilon = %.24f\n", std::numeric_limits<float>::epsilon()); | |
printf("faster? %gx\n", double(elapsed1) / elapsed2); | |
} | |
// ============================================================================= | |
int main(int /*argc*/, char** /*argv*/) | |
{ | |
// Disable subnormals. | |
#if defined(__SSE__) | |
_mm_setcsr(_mm_getcsr() | 0x8040); | |
#elif defined(__ARM_NEON__) | |
uintptr_t fpsr = 0; | |
asm volatile("mrs %0, fpcr" : "=r"(fpsr)); | |
fpsr |= 0x01000000U; | |
asm volatile("msr fpcr, %0" : : "ri"(fpsr)); | |
#endif | |
// Make sure the memory is aligned | |
// For 128-bit vector registers we only need alignment of 16 | |
// but let's do 32 anyway. | |
xs = (float*)std::aligned_alloc(32, numChannels * numSamples * sizeof(float)); | |
ys1 = (float*)std::aligned_alloc(32, numChannels * numSamples * sizeof(float)); | |
ys2 = (float*)std::aligned_alloc(32, numChannels * numSamples * sizeof(float)); | |
std::srand(std::time(nullptr)); | |
for (size_t i = 0; i < numChannels * numSamples; ++i) { | |
xs[i] = 2.0f * float(std::rand()) / RAND_MAX - 1.0f; | |
ys1[i] = 0.0f; | |
ys2[i] = 0.0f; | |
} | |
// Also align memory here. Note that size in bytes should be multiple of 32, | |
// hence the extra factor 2. Should probably stick these into a single buffer | |
// to avoid wasting space. | |
a0 = (float*)std::aligned_alloc(32, numChannels * 2 * sizeof(float)); | |
a1 = (float*)std::aligned_alloc(32, numChannels * 2 * sizeof(float)); | |
a2 = (float*)std::aligned_alloc(32, numChannels * 2 * sizeof(float)); | |
b1 = (float*)std::aligned_alloc(32, numChannels * 2 * sizeof(float)); | |
b2 = (float*)std::aligned_alloc(32, numChannels * 2 * sizeof(float)); | |
// Make some fake but realistic coefficients for the different channels. | |
for (size_t c = 0; c < numChannels; ++c) { | |
a0[c] = 0.08365524702654487f + float(c)*0.001f; | |
a1[c] = 0.16731049405308973f + float(c)*0.002f; | |
a2[c] = 0.08365524702654487f - float(c)*0.0001f; | |
b1[c] = -1.031904346533239f - float(c)*0.002f; | |
b2[c] = 0.36652533463941855f + float(c)*0.001f; | |
} | |
printf("***ignore this result (warming up the cache)***\n"); | |
benchmark_1(); | |
benchmark_2(); | |
printf("***OK now pay attention again***\n"); | |
benchmark_1(); | |
benchmark_2(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment