Skip to content

Instantly share code, notes, and snippets.

@cyanreg
Last active March 3, 2021 11:02
Show Gist options
  • Save cyanreg/a04716311ef19d28af9a5dba21e6c7ca to your computer and use it in GitHub Desktop.
Save cyanreg/a04716311ef19d28af9a5dba21e6c7ca to your computer and use it in GitHub Desktop.
16-point FFT
// from https://gist.github.com/cyanreg/665b9c79cbe51df9296a969257f2a16c
static void fft4(FFTComplex *z)
{
FFTSample r1 = z[0].re - z[4].re;
FFTSample r2 = z[0].im - z[4].im;
FFTSample r3 = z[1].re - z[5].re;
FFTSample r4 = z[1].im - z[5].im;
/* r5-r8 second transform */
FFTSample t1 = z[0].re + z[4].re;
FFTSample t2 = z[0].im + z[4].im;
FFTSample t3 = z[1].re + z[5].re;
FFTSample t4 = z[1].im + z[5].im;
/* t5-t8 second transform */
/* 1sub + 1add = 2 instructions */
FFTSample a3 = t1 - t3;
FFTSample a1 = t1 + t3;
FFTSample a4 = t2 - t4;
FFTSample a2 = t2 + t4;
/* shuf + shuf + addsub = 3 */
FFTSample b3 = r1 - r4;
FFTSample b1 = r1 + r4;
FFTSample b2 = r2 - r3;
FFTSample b4 = r2 + r3;
/* shuf + shuf + addsub = 3 */
/* shufps + shufps = 10 */
z[0].re = a1;
z[0].im = a2;
z[1].re = a3;
z[1].im = a4;
z[4].re = b1;
z[4].im = b2;
z[5].re = b3;
z[5].im = b4;
}
// from https://gist.github.com/cyanreg/bbf25c8a8dfb910ed3b9ae7663983ca6
static void fft8(FFTComplex *z)
{
/* _1_ sub */
FFTSample r1 = z[0].re - z[4].re;
FFTSample r2 = z[0].im - z[4].im; // IN LOW m1
FFTSample r3 = z[1].re - z[5].re;
FFTSample r4 = z[1].im - z[5].im;
FFTSample r5 = z[2].re - z[6].re;
FFTSample r6 = z[2].im - z[6].im; // IN HIGH m1
FFTSample r7 = z[3].re - z[7].re;
FFTSample r8 = z[3].im - z[7].im;
/* _1_ add */
FFTSample q1 = z[0].re + z[4].re;
FFTSample q2 = z[0].im + z[4].im; // IN LOW m2
FFTSample q3 = z[1].re + z[5].re;
FFTSample q4 = z[1].im + z[5].im;
FFTSample q5 = z[2].re + z[6].re;
FFTSample q6 = z[2].im + z[6].im; // IN HIGH m2
FFTSample q7 = z[3].re + z[7].re;
FFTSample q8 = z[3].im + z[7].im;
/* 2 shufps, 1 xor MEM, 1 add = _4_ */
FFTSample s1 = q1 + q3;
FFTSample s2 = q2 + q4;
FFTSample s3 = q1 - q3;
FFTSample s4 = q2 - q4;
FFTSample s5 = q5 + q7;
FFTSample s6 = q6 + q8;
FFTSample s7 = q5 - q7;
FFTSample s8 = q6 - q8;
/* (1 extract, 1 shuffle = _2_) */
/* additionally 1 insert, 1 xor, 1 add = _3_ to actually perform it */
q1 = s1 + s5;
q2 = s2 + s6;
q3 = s3 + s8;
q4 = s4 - s7;
q5 = s1 - s5;
q6 = s2 - s6;
q7 = s3 - s8;
q8 = s4 + s7;
z[0].re = q1;
z[0].im = q2;
z[1].re = q3;
z[1].im = q4;
z[2].re = q5;
z[2].im = q6;
z[3].re = q7;
z[3].im = q8;
/* 11 instructions */
/* 2 shufps + 1 xor + 1 addsub = 4 instr */
FFTSample z1 = r1 + r4;
FFTSample z2 = r3 - r2;
FFTSample z3 = r3 + r2;
FFTSample z4 = r1 - r4;
FFTSample z5 = r5 + r6;
FFTSample z6 = r7 - r8;
FFTSample z7 = r7 + r8;
FFTSample z8 = r5 - r6;
/* 1 mov, 2 shuffles, 1 addsub = 3 */
FFTSample t5 = z6 + z5;
FFTSample t6 = z7 - z8;
FFTSample t7 = z7 + z8;
FFTSample t8 = z6 - z5;
/* 2 extract, 1 shuffle, 1 fma, 1 xors = 5 */
FFTSample u1 = z1 + t5*( M_SQRT1_2);
FFTSample u2 = -z2 + t6*( M_SQRT1_2);
FFTSample u3 = z4 + t7*(-M_SQRT1_2);
FFTSample u4 = z3 + t8*( M_SQRT1_2);
FFTSample u5 = z1 + t5*(-M_SQRT1_2);
FFTSample u6 = -z2 + t6*(-M_SQRT1_2);
FFTSample u7 = z4 + t7*( M_SQRT1_2);
FFTSample u8 = z3 + t8*(-M_SQRT1_2);
z[4].re = u1;
z[4].im = u2;
z[5].re = u3;
z[5].im = u4;
z[6].re = u5;
z[6].im = u6;
z[7].re = u7;
z[7].im = u8;
/* 12 instructions = 23 instructions total */
}
static void fft16(AVTXContext *ctx, FFTComplex *temp, FFTComplex *z)
{
FFTSample cos_16_1 = TX_NAME(ff_cos_16)[1];
FFTSample cos_16_3 = TX_NAME(ff_cos_16)[3];
fft8(z);
fft4(z+8);
fft4(z+10);
FFTSample s[32];
/*
xorps m1, m1 - free
mulps m0
shufps m1, m1, m0
xorps
addsub
shufps
mulps
mulps
addps
or (fma3)
shufps
shufps
mulps
mulps
fma
fma
*/
s[0] = z[8].re*( 1) - z[8].im*( 0);
s[1] = z[8].im*( 1) + z[8].re*( 0);
s[2] = z[9].re*( 1) - z[9].im*(-1);
s[3] = z[9].im*( 1) + z[9].re*(-1);
s[4] = z[10].re*( 1) - z[10].im*( 0);
s[5] = z[10].im*( 1) + z[10].re*( 0);
s[6] = z[11].re*( 1) - z[11].im*( 1);
s[7] = z[11].im*( 1) + z[11].re*( 1);
s[8] = z[12].re*( cos_16_1) - z[12].im*( -cos_16_3);
s[9] = z[12].im*( cos_16_1) + z[12].re*( -cos_16_3);
s[10] = z[13].re*( cos_16_3) - z[13].im*( -cos_16_1);
s[11] = z[13].im*( cos_16_3) + z[13].re*( -cos_16_1);
s[12] = z[14].re*( cos_16_1) - z[14].im*( cos_16_3);
s[13] = z[14].im*( -cos_16_1) + z[14].re*( -cos_16_3);
s[14] = z[15].re*( cos_16_3) - z[15].im*( cos_16_1);
s[15] = z[15].im*( -cos_16_3) + z[15].re*( -cos_16_1);
s[2] *= M_SQRT1_2;
s[3] *= M_SQRT1_2;
s[5] *= -1;
s[6] *= M_SQRT1_2;
s[7] *= -M_SQRT1_2;
FFTSample w5 = s[0] + s[4];
FFTSample w6 = s[1] - s[5];
FFTSample x5 = s[2] + s[6];
FFTSample x6 = s[3] - s[7];
FFTSample w3 = s[4] - s[0];
FFTSample w4 = s[5] + s[1];
FFTSample x3 = s[6] - s[2];
FFTSample x4 = s[7] + s[3];
FFTSample y5 = s[8] + s[12];
FFTSample y6 = s[9] - s[13];
FFTSample u5 = s[10] + s[14];
FFTSample u6 = s[11] - s[15];
FFTSample y3 = s[12] - s[8];
FFTSample y4 = s[13] + s[9];
FFTSample u3 = s[14] - s[10];
FFTSample u4 = s[15] + s[11];
/* 2xorps, 2vperm2fs, 2 adds, 2 vpermilps = 8 */
FFTSample o1 = z[0].re + w5;
FFTSample o2 = z[0].im + w6;
FFTSample o5 = z[1].re + x5;
FFTSample o6 = z[1].im + x6;
FFTSample o9 = z[2].re + w4; //h
FFTSample o10 = z[2].im + w3;
FFTSample o13 = z[3].re + x4;
FFTSample o14 = z[3].im + x3;
FFTSample o17 = z[0].re - w5;
FFTSample o18 = z[0].im - w6;
FFTSample o21 = z[1].re - x5;
FFTSample o22 = z[1].im - x6;
FFTSample o25 = z[2].re - w4; //h
FFTSample o26 = z[2].im - w3;
FFTSample o29 = z[3].re - x4;
FFTSample o30 = z[3].im - x3;
FFTSample o3 = z[4].re + y5;
FFTSample o4 = z[4].im + y6;
FFTSample o7 = z[5].re + u5;
FFTSample o8 = z[5].im + u6;
FFTSample o11 = z[6].re + y4; //h
FFTSample o12 = z[6].im + y3;
FFTSample o15 = z[7].re + u4;
FFTSample o16 = z[7].im + u3;
FFTSample o19 = z[4].re - y5;
FFTSample o20 = z[4].im - y6;
FFTSample o23 = z[5].re - u5;
FFTSample o24 = z[5].im - u6;
FFTSample o27 = z[6].re - y4; //h
FFTSample o28 = z[6].im - y3;
FFTSample o31 = z[7].re - u4;
FFTSample o32 = z[7].im - u3;
/* This is just deinterleaving, it's separate */
z[0] = (FFTComplex){ o1, o2 };
z[1] = (FFTComplex){ o3, o4 };
z[2] = (FFTComplex){ o5, o6 };
z[3] = (FFTComplex){ o7, o8 };
z[4] = (FFTComplex){ o9, o10 };
z[5] = (FFTComplex){ o11, o12 };
z[6] = (FFTComplex){ o13, o14 };
z[7] = (FFTComplex){ o15, o16 };
z[8] = (FFTComplex){ o17, o18 };
z[9] = (FFTComplex){ o19, o20 };
z[10] = (FFTComplex){ o21, o22 };
z[11] = (FFTComplex){ o23, o24 };
z[12] = (FFTComplex){ o25, o26 };
z[13] = (FFTComplex){ o27, o28 };
z[14] = (FFTComplex){ o29, o30 };
z[15] = (FFTComplex){ o31, o32 };
}
#if 0
%define M_SQRT1_2 0.707106781186547524401
%define COS16_1 0.92387950420379638671875
%define COS16_3 0.3826834261417388916015625
s16_mult_even: dd 1.0, 1.0, M_SQRT1_2, M_SQRT1_2, 1.0, -1.0, M_SQRT1_2, -M_SQRT1_2
s16_mult_odd1: dd COS16_1, COS16_1, COS16_3, COS16_3, COS16_1, -COS16_1, COS16_3, -COS16_3
s16_mult_odd2: dd COS16_3, -COS16_3, COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1
s16_perm: dd 0, 1, 2, 3, 1, 0, 3, 2
mask_mpmppmpm: dd NEG, POS, NEG, POS, POS, NEG, POS, NEG
INIT_YMM avx
cglobal fft16, 4, 4, 8, ctx, out, in, stride
mov ctxq, [ctxq + AVTXContext.revtab]
LOADZ_LUT 0, inq, ctxq + 16*0, strideq, 4
LOADZ_LUT 1, inq, ctxq + 16*1, strideq, 4
LOADZ_LUT 2, inq, ctxq + 16*2, strideq, 4
LOADZ_LUT 3, inq, ctxq + 16*3, strideq, 4
FFT4 m2, m3, m4, m5
FFT8 m0, m1, m6, m7
movaps m6, [mask_mpmppmpm]
movaps m7, [s16_perm]
xorps m4, m4 ; 0
shufps m4, m4, m2, q2301 ; 0, 0, z[8].im, z[8].re...
xorps m4, [mask_mppmmpmp]
addps m4, m2, m4 ; s[0..7]
mulps m4, [s16_mult_even] ; s[0..7]*costab
shufps m5, m3, m3, q2301 ; z[12].im, z[12].re, z[13].im, z[13].re...
mulps m3, [s16_mult_odd1] ; z.reim * costab
mulps m5, [s16_mult_odd2] ; z.imre * costab
addps m5, m3, m5 ; s[8..15]
xorps m2, m4, m6 ; s[0..7]*mpmppmpm
vperm2f128 m2, m2, q0001 ; s[4..7, 0..3]
addps m4, m2 ; w5, w6, x5, x6, w3, w4, x3, x4
vpermilps m4, m7 ; w5, w6, x5, x6, w4, w3, x4, x3
xorps m3, m5, m6 ; s[8..15]*mpmppmpm
vperm2f128 m3, m3, q0001 ; s[12..15, 8..11]
addps m5, m3 ; y5, y6, u5, u6, y3, y4, u3, u4
vpermilps m5, m7 ; y5, y6, u5, u6, y4, y3, u4, u3
subps m3, m1, m5 ; odd part 2
addps m2, m1, m5 ; odd part 1
subps m1, m0, m4 ; even part 2
addps m0, m0, m4 ; even part 1
; deinterleaving
unpcklpd m5, m1, m3
unpcklpd m4, m0, m2
unpckhpd m1, m3
unpckhpd m0, m2
vextractf128 [outq + 16*0], m4, 0
vextractf128 [outq + 16*1], m0, 0
vextractf128 [outq + 16*2], m4, 1
vextractf128 [outq + 16*3], m0, 1
vextractf128 [outq + 16*4], m5, 0
vextractf128 [outq + 16*5], m1, 0
vextractf128 [outq + 16*6], m5, 1
vextractf128 [outq + 16*7], m1, 1
ret
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment