Last active
March 3, 2021 11:02
-
-
Save cyanreg/a04716311ef19d28af9a5dba21e6c7ca to your computer and use it in GitHub Desktop.
16-point FFT
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// from https://gist.github.com/cyanreg/665b9c79cbe51df9296a969257f2a16c | |
static void fft4(FFTComplex *z) | |
{ | |
FFTSample r1 = z[0].re - z[4].re; | |
FFTSample r2 = z[0].im - z[4].im; | |
FFTSample r3 = z[1].re - z[5].re; | |
FFTSample r4 = z[1].im - z[5].im; | |
/* r5-r8 second transform */ | |
FFTSample t1 = z[0].re + z[4].re; | |
FFTSample t2 = z[0].im + z[4].im; | |
FFTSample t3 = z[1].re + z[5].re; | |
FFTSample t4 = z[1].im + z[5].im; | |
/* t5-t8 second transform */ | |
/* 1sub + 1add = 2 instructions */ | |
FFTSample a3 = t1 - t3; | |
FFTSample a1 = t1 + t3; | |
FFTSample a4 = t2 - t4; | |
FFTSample a2 = t2 + t4; | |
/* shuf + shuf + addsub = 3 */ | |
FFTSample b3 = r1 - r4; | |
FFTSample b1 = r1 + r4; | |
FFTSample b2 = r2 - r3; | |
FFTSample b4 = r2 + r3; | |
/* shuf + shuf + addsub = 3 */ | |
/* shufps + shufps = 10 */ | |
z[0].re = a1; | |
z[0].im = a2; | |
z[1].re = a3; | |
z[1].im = a4; | |
z[4].re = b1; | |
z[4].im = b2; | |
z[5].re = b3; | |
z[5].im = b4; | |
} | |
// from https://gist.github.com/cyanreg/bbf25c8a8dfb910ed3b9ae7663983ca6 | |
static void fft8(FFTComplex *z) | |
{ | |
/* _1_ sub */ | |
FFTSample r1 = z[0].re - z[4].re; | |
FFTSample r2 = z[0].im - z[4].im; // IN LOW m1 | |
FFTSample r3 = z[1].re - z[5].re; | |
FFTSample r4 = z[1].im - z[5].im; | |
FFTSample r5 = z[2].re - z[6].re; | |
FFTSample r6 = z[2].im - z[6].im; // IN HIGH m1 | |
FFTSample r7 = z[3].re - z[7].re; | |
FFTSample r8 = z[3].im - z[7].im; | |
/* _1_ add */ | |
FFTSample q1 = z[0].re + z[4].re; | |
FFTSample q2 = z[0].im + z[4].im; // IN LOW m2 | |
FFTSample q3 = z[1].re + z[5].re; | |
FFTSample q4 = z[1].im + z[5].im; | |
FFTSample q5 = z[2].re + z[6].re; | |
FFTSample q6 = z[2].im + z[6].im; // IN HIGH m2 | |
FFTSample q7 = z[3].re + z[7].re; | |
FFTSample q8 = z[3].im + z[7].im; | |
/* 2 shufps, 1 xor MEM, 1 add = _4_ */ | |
FFTSample s1 = q1 + q3; | |
FFTSample s2 = q2 + q4; | |
FFTSample s3 = q1 - q3; | |
FFTSample s4 = q2 - q4; | |
FFTSample s5 = q5 + q7; | |
FFTSample s6 = q6 + q8; | |
FFTSample s7 = q5 - q7; | |
FFTSample s8 = q6 - q8; | |
/* (1 extract, 1 shuffle = _2_) */ | |
/* additionally 1 insert, 1 xor, 1 add = _3_ to actually perform it */ | |
q1 = s1 + s5; | |
q2 = s2 + s6; | |
q3 = s3 + s8; | |
q4 = s4 - s7; | |
q5 = s1 - s5; | |
q6 = s2 - s6; | |
q7 = s3 - s8; | |
q8 = s4 + s7; | |
z[0].re = q1; | |
z[0].im = q2; | |
z[1].re = q3; | |
z[1].im = q4; | |
z[2].re = q5; | |
z[2].im = q6; | |
z[3].re = q7; | |
z[3].im = q8; | |
/* 11 instructions */ | |
/* 2 shufps + 1 xor + 1 addsub = 4 instr */ | |
FFTSample z1 = r1 + r4; | |
FFTSample z2 = r3 - r2; | |
FFTSample z3 = r3 + r2; | |
FFTSample z4 = r1 - r4; | |
FFTSample z5 = r5 + r6; | |
FFTSample z6 = r7 - r8; | |
FFTSample z7 = r7 + r8; | |
FFTSample z8 = r5 - r6; | |
/* 1 mov, 2 shuffles, 1 addsub = 3 */ | |
FFTSample t5 = z6 + z5; | |
FFTSample t6 = z7 - z8; | |
FFTSample t7 = z7 + z8; | |
FFTSample t8 = z6 - z5; | |
/* 2 extract, 1 shuffle, 1 fma, 1 xors = 5 */ | |
FFTSample u1 = z1 + t5*( M_SQRT1_2); | |
FFTSample u2 = -z2 + t6*( M_SQRT1_2); | |
FFTSample u3 = z4 + t7*(-M_SQRT1_2); | |
FFTSample u4 = z3 + t8*( M_SQRT1_2); | |
FFTSample u5 = z1 + t5*(-M_SQRT1_2); | |
FFTSample u6 = -z2 + t6*(-M_SQRT1_2); | |
FFTSample u7 = z4 + t7*( M_SQRT1_2); | |
FFTSample u8 = z3 + t8*(-M_SQRT1_2); | |
z[4].re = u1; | |
z[4].im = u2; | |
z[5].re = u3; | |
z[5].im = u4; | |
z[6].re = u5; | |
z[6].im = u6; | |
z[7].re = u7; | |
z[7].im = u8; | |
/* 12 instructions = 23 instructions total */ | |
} | |
static void fft16(AVTXContext *ctx, FFTComplex *temp, FFTComplex *z) | |
{ | |
FFTSample cos_16_1 = TX_NAME(ff_cos_16)[1]; | |
FFTSample cos_16_3 = TX_NAME(ff_cos_16)[3]; | |
fft8(z); | |
fft4(z+8); | |
fft4(z+10); | |
FFTSample s[32]; | |
/* | |
xorps m1, m1 - free | |
mulps m0 | |
shufps m1, m1, m0 | |
xorps | |
addsub | |
shufps | |
mulps | |
mulps | |
addps | |
or (fma3) | |
shufps | |
shufps | |
mulps | |
mulps | |
fma | |
fma | |
*/ | |
s[0] = z[8].re*( 1) - z[8].im*( 0); | |
s[1] = z[8].im*( 1) + z[8].re*( 0); | |
s[2] = z[9].re*( 1) - z[9].im*(-1); | |
s[3] = z[9].im*( 1) + z[9].re*(-1); | |
s[4] = z[10].re*( 1) - z[10].im*( 0); | |
s[5] = z[10].im*( 1) + z[10].re*( 0); | |
s[6] = z[11].re*( 1) - z[11].im*( 1); | |
s[7] = z[11].im*( 1) + z[11].re*( 1); | |
s[8] = z[12].re*( cos_16_1) - z[12].im*( -cos_16_3); | |
s[9] = z[12].im*( cos_16_1) + z[12].re*( -cos_16_3); | |
s[10] = z[13].re*( cos_16_3) - z[13].im*( -cos_16_1); | |
s[11] = z[13].im*( cos_16_3) + z[13].re*( -cos_16_1); | |
s[12] = z[14].re*( cos_16_1) - z[14].im*( cos_16_3); | |
s[13] = z[14].im*( -cos_16_1) + z[14].re*( -cos_16_3); | |
s[14] = z[15].re*( cos_16_3) - z[15].im*( cos_16_1); | |
s[15] = z[15].im*( -cos_16_3) + z[15].re*( -cos_16_1); | |
s[2] *= M_SQRT1_2; | |
s[3] *= M_SQRT1_2; | |
s[5] *= -1; | |
s[6] *= M_SQRT1_2; | |
s[7] *= -M_SQRT1_2; | |
FFTSample w5 = s[0] + s[4]; | |
FFTSample w6 = s[1] - s[5]; | |
FFTSample x5 = s[2] + s[6]; | |
FFTSample x6 = s[3] - s[7]; | |
FFTSample w3 = s[4] - s[0]; | |
FFTSample w4 = s[5] + s[1]; | |
FFTSample x3 = s[6] - s[2]; | |
FFTSample x4 = s[7] + s[3]; | |
FFTSample y5 = s[8] + s[12]; | |
FFTSample y6 = s[9] - s[13]; | |
FFTSample u5 = s[10] + s[14]; | |
FFTSample u6 = s[11] - s[15]; | |
FFTSample y3 = s[12] - s[8]; | |
FFTSample y4 = s[13] + s[9]; | |
FFTSample u3 = s[14] - s[10]; | |
FFTSample u4 = s[15] + s[11]; | |
/* 2xorps, 2vperm2fs, 2 adds, 2 vpermilps = 8 */ | |
FFTSample o1 = z[0].re + w5; | |
FFTSample o2 = z[0].im + w6; | |
FFTSample o5 = z[1].re + x5; | |
FFTSample o6 = z[1].im + x6; | |
FFTSample o9 = z[2].re + w4; //h | |
FFTSample o10 = z[2].im + w3; | |
FFTSample o13 = z[3].re + x4; | |
FFTSample o14 = z[3].im + x3; | |
FFTSample o17 = z[0].re - w5; | |
FFTSample o18 = z[0].im - w6; | |
FFTSample o21 = z[1].re - x5; | |
FFTSample o22 = z[1].im - x6; | |
FFTSample o25 = z[2].re - w4; //h | |
FFTSample o26 = z[2].im - w3; | |
FFTSample o29 = z[3].re - x4; | |
FFTSample o30 = z[3].im - x3; | |
FFTSample o3 = z[4].re + y5; | |
FFTSample o4 = z[4].im + y6; | |
FFTSample o7 = z[5].re + u5; | |
FFTSample o8 = z[5].im + u6; | |
FFTSample o11 = z[6].re + y4; //h | |
FFTSample o12 = z[6].im + y3; | |
FFTSample o15 = z[7].re + u4; | |
FFTSample o16 = z[7].im + u3; | |
FFTSample o19 = z[4].re - y5; | |
FFTSample o20 = z[4].im - y6; | |
FFTSample o23 = z[5].re - u5; | |
FFTSample o24 = z[5].im - u6; | |
FFTSample o27 = z[6].re - y4; //h | |
FFTSample o28 = z[6].im - y3; | |
FFTSample o31 = z[7].re - u4; | |
FFTSample o32 = z[7].im - u3; | |
/* This is just deinterleaving, it's separate */ | |
z[0] = (FFTComplex){ o1, o2 }; | |
z[1] = (FFTComplex){ o3, o4 }; | |
z[2] = (FFTComplex){ o5, o6 }; | |
z[3] = (FFTComplex){ o7, o8 }; | |
z[4] = (FFTComplex){ o9, o10 }; | |
z[5] = (FFTComplex){ o11, o12 }; | |
z[6] = (FFTComplex){ o13, o14 }; | |
z[7] = (FFTComplex){ o15, o16 }; | |
z[8] = (FFTComplex){ o17, o18 }; | |
z[9] = (FFTComplex){ o19, o20 }; | |
z[10] = (FFTComplex){ o21, o22 }; | |
z[11] = (FFTComplex){ o23, o24 }; | |
z[12] = (FFTComplex){ o25, o26 }; | |
z[13] = (FFTComplex){ o27, o28 }; | |
z[14] = (FFTComplex){ o29, o30 }; | |
z[15] = (FFTComplex){ o31, o32 }; | |
} | |
#if 0 | |
%define M_SQRT1_2 0.707106781186547524401 | |
%define COS16_1 0.92387950420379638671875 | |
%define COS16_3 0.3826834261417388916015625 | |
s16_mult_even: dd 1.0, 1.0, M_SQRT1_2, M_SQRT1_2, 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2 | |
s16_mult_odd1: dd COS16_1, COS16_1, COS16_3, COS16_3, COS16_1, -COS16_1, COS16_3, -COS16_3 | |
s16_mult_odd2: dd COS16_3, -COS16_3, COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1 | |
s16_perm: dd 0, 1, 2, 3, 1, 0, 3, 2 | |
mask_mpmppmpm: dd NEG, POS, NEG, POS, POS, NEG, POS, NEG | |
INIT_YMM avx | |
cglobal fft16, 4, 4, 8, ctx, out, in, stride | |
mov ctxq, [ctxq + AVTXContext.revtab] | |
LOADZ_LUT 0, inq, ctxq + 16*0, strideq, 4 | |
LOADZ_LUT 1, inq, ctxq + 16*1, strideq, 4 | |
LOADZ_LUT 2, inq, ctxq + 16*2, strideq, 4 | |
LOADZ_LUT 3, inq, ctxq + 16*3, strideq, 4 | |
FFT4 m2, m3, m4, m5 | |
FFT8 m0, m1, m6, m7 | |
movaps m6, [mask_mpmppmpm] | |
movaps m7, [s16_perm] | |
xorps m4, m4 ; 0 | |
shufps m4, m4, m2, q2301 ; 0, 0, z[8].im, z[8].re... | |
xorps m4, [mask_mppmmpmp] | |
addps m4, m2, m4 ; s[0..7] | |
mulps m4, [s16_mult_even] ; s[0..7]*costab | |
shufps m5, m3, m3, q2301 ; z[12].im, z[12].re, z[13].im, z[13].re... | |
mulps m3, [s16_mult_odd1] ; z.reim * costab | |
mulps m5, [s16_mult_odd2] ; z.imre * costab | |
addps m5, m3, m5 ; s[8..15] | |
xorps m2, m4, m6 ; s[0..7]*mpmppmpm | |
vperm2f128 m2, m2, q0001 ; s[4..7, 0..3] | |
addps m4, m2 ; w5, w6, x5, x6, w3, w4, x3, x4 | |
vpermilps m4, m7 ; w5, w6, x5, x6, w4, w3, x4, x3 | |
xorps m3, m5, m6 ; s[8..15]*mpmppmpm | |
vperm2f128 m3, m3, q0001 ; s[12..15, 8..11] | |
addps m5, m3 ; y5, y6, u5, u6, y3, y4, u3, u4 | |
vpermilps m5, m7 ; y5, y6, u5, u6, y4, y3, u4, u3 | |
subps m3, m1, m5 ; odd part 2 | |
addps m2, m1, m5 ; odd part 1 | |
subps m1, m0, m4 ; even part 2 | |
addps m0, m0, m4 ; even part 1 | |
; deinterleaving | |
unpcklpd m5, m1, m3 | |
unpcklpd m4, m0, m2 | |
unpckhpd m1, m3 | |
unpckhpd m0, m2 | |
vextractf128 [outq + 16*0], m4, 0 | |
vextractf128 [outq + 16*1], m0, 0 | |
vextractf128 [outq + 16*2], m4, 1 | |
vextractf128 [outq + 16*3], m0, 1 | |
vextractf128 [outq + 16*4], m5, 0 | |
vextractf128 [outq + 16*5], m1, 0 | |
vextractf128 [outq + 16*6], m5, 1 | |
vextractf128 [outq + 16*7], m1, 1 | |
ret | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment