Created
March 9, 2021 03:17
-
-
Save cyanreg/3df2c0b272d47c3ee0d62124542c828c to your computer and use it in GitHub Desktop.
8-point FFT (2x at a time)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Scalar 8-point in-place complex transform on z[0..7].
 *
 * All eight inputs are read into locals before any element of z is written,
 * so the function is safe to run in place. Outputs with even index
 * (z[0], z[2], z[4], z[6]) come from the pairwise sums; outputs with odd
 * index (z[1], z[3], z[5], z[7]) come from the pairwise differences, with
 * the z[2]/z[6] and z[3]/z[7] differences scaled by +/- M_SQRT1_2.
 *
 * NOTE(review): the input does not appear to be in natural order (z[4]
 * ends up as (z0+z1+z4+z5) - (z2+z3+z6+z7)), so the caller presumably
 * applies a permutation beforehand -- confirm against the caller.
 *
 * The local names (q, k, r, j, s, g, w, h, z1-4, l, t, u, o) are the same
 * labels used in the comments of the SIMD FFT8 macro kept alongside this
 * function; the per-step comments below look like SIMD instruction counts
 * for that macro.
 *
 * s    - not used by this function
 * z    - array of at least 8 complex samples, transformed in place
 * temp - not used by this function
 */
static void fft8(void *s, FFTComplex *z, FFTComplex *temp)
{
    /* Differences of elements four apart. */
    FFTSample r1 = z[0].re - z[4].re;
    FFTSample r2 = z[0].im - z[4].im;
    FFTSample r3 = z[1].re - z[5].re;
    FFTSample r4 = z[1].im - z[5].im;

    FFTSample j1 = z[2].re - z[6].re;
    FFTSample j2 = z[2].im - z[6].im;
    FFTSample j3 = z[3].re - z[7].re;
    FFTSample j4 = z[3].im - z[7].im;

    /* Sums of elements four apart. */
    FFTSample q1 = z[0].re + z[4].re;
    FFTSample q2 = z[0].im + z[4].im;
    FFTSample q3 = z[1].re + z[5].re;
    FFTSample q4 = z[1].im + z[5].im;

    FFTSample k1 = z[2].re + z[6].re;
    FFTSample k2 = z[2].im + z[6].im;
    FFTSample k3 = z[3].re + z[7].re;
    FFTSample k4 = z[3].im + z[7].im;
    /* 2 add 2 sub = 4 */

    /* 2 shufs, 1 add 1 sub = 4 */
    FFTSample s1 = q1 + q3;
    FFTSample s2 = q2 + q4;
    FFTSample g1 = k3 + k1;
    FFTSample g2 = k2 + k4;

    FFTSample s3 = q1 - q3;
    FFTSample s4 = q2 - q4;
    FFTSample g4 = k3 - k1; /* note: g4 pairs with s4, g3 with s3 below */
    FFTSample g3 = k2 - k4;

    /* 1 unpack + 1 shuffle = 2 */

    /* 1 add */
    FFTSample w1 = s1 + g1;
    FFTSample w2 = s2 + g2;
    FFTSample w3 = s3 + g3;
    FFTSample w4 = s4 + g4;

    /* 1 sub */
    FFTSample h1 = s1 - g1;
    FFTSample h2 = s2 - g2;
    FFTSample h3 = s3 - g3;
    FFTSample h4 = s4 - g4;

    // 12 unhappy
    /* Even-indexed outputs. Every input was already captured above. */
    z[0].re = w1;
    z[0].im = w2;
    z[2].re = w3;
    z[2].im = w4;
    z[4].re = h1;
    z[4].im = h2;
    z[6].re = h3;
    z[6].im = h4;

    /* 1 shuf + 1 shuf + 1 xor + 1 addsub */
    FFTSample z1 = r1 + r4;
    FFTSample z2 = r2 - r3;
    FFTSample z3 = r1 - r4;
    FFTSample z4 = r2 + r3;

    /* 1 mult */
    j1 *=  M_SQRT1_2;
    j2 *= -M_SQRT1_2;
    j3 *= -M_SQRT1_2;
    j4 *=  M_SQRT1_2;

    /* 1 shuf + 1 addsub */
    FFTSample l2 = j1 - j2;
    FFTSample l1 = j2 + j1;
    FFTSample l4 = j3 - j4;
    FFTSample l3 = j4 + j3;

    /* 1 shuf + 1 addsub */
    FFTSample t1 = l3 - l2;
    FFTSample t2 = l4 + l1;
    FFTSample t3 = l1 - l4;
    FFTSample t4 = l2 + l3;

    /* 1 sub */
    FFTSample u1 = z1 - t1;
    FFTSample u2 = z2 - t2;
    FFTSample u3 = z3 - t3;
    FFTSample u4 = z4 - t4;

    /* 1 add */
    FFTSample o1 = z1 + t1;
    FFTSample o2 = z2 + t2;
    FFTSample o3 = z3 + t3;
    FFTSample o4 = z4 + t4;

    // 11
    /* Odd-indexed outputs. */
    z[1].re = u1;
    z[1].im = u2;
    z[3].re = u3;
    z[3].im = u4;
    z[5].re = o1;
    z[5].im = o2;
    z[7].re = o3;
    z[7].im = o4;
}
/*
 * Disabled region: SIMD counterpart of fft8(), written as an assembler
 * macro (looks like NASM with x86inc-style operand abstraction -- confirm).
 * It is kept out of the C build by the #if 0 guard. The labels in the
 * trailing comments (q, k, r, j, s, g, z, l, t, h, u, w, o) are the names
 * of the locals in fft8().
 */
#if 0
; Single 8-point in-place complex FFT (will do 2 transforms in [AVX] mode)
; %1 - even coefficients (a0.reim, a2.reim, [b0.reim, b2.reim])
; %2 - even coefficients (a4.reim, a6.reim, [b4.reim, b6.reim])
; %3 - odd coefficients (a1.reim, a3.reim, [b1.reim, b3.reim])
; %4 - odd coefficients (a5.reim, a7.reim, [b5.reim, b7.reim])
; %5 - temporary
; %6 - temporary
;
; On exit, per the comments on the last four instructions:
; %1 = w1234, %2 = h1234, %3 = o1234, %4 = u1234.
%macro FFT8 6
    addps    %5, %1, %3              ; q1-8
    addps    %6, %2, %4              ; k1-8

    subps    %1, %3                  ; r1-8
    subps    %2, %4                  ; j1-8

    shufps   %4, %1, %1, q2323       ; r4343
    shufps   %3, %5, %6, q3032       ; q34, k14

    shufps   %1, %1, q1010           ; r1212
    shufps   %5, %6, q1210           ; q12, k32

    xorps    %4, [mask_pmmppmmp]     ; r4343 * pmmp
    addps    %6, %5, %3              ; s12, g12

    mulps    %2, [d8_mult_odd]       ; j1-8 * d8_mult_odd
    subps    %5, %3                  ; s34, g43

    addps    %3, %1, %4              ; z1234
    unpcklpd %1, %6, %5              ; s1234

    shufps   %4, %2, %2, q2301       ; j2143
    shufps   %6, %5, q2332           ; g1234

    addsubps %2, %4                  ; l2143
    shufps   %5, %2, %2, q0123       ; l3412 awful 4-instruction dep chain
    addsubps %5, %2                  ; t1234

    subps    %2, %1, %6              ; h1234
    subps    %4, %3, %5              ; u1234

    addps    %1, %6                  ; w1234
    addps    %3, %5                  ; o1234
%endmacro
#endif
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment