Last active
April 2, 2021 14:47
-
-
Save cyanreg/12b83d1ade1238b6e1af9901ad70852e to your computer and use it in GitHub Desktop.
Split-Radix recombine loop
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Butterfly: writes the difference of (in0) and (in1) into `dif` and their
 * sum into `sum`, as one statement (do/while(0) wrapper).
 *
 * `dif` is assigned before `sum`, so `sum` may alias in0/in1 (the FFT code
 * relies on this, e.g. BF(q3, q5, q5, q1)), but `dif` must not alias either
 * input or the sum would see the clobbered value.
 */
#define BF(dif, sum, in0, in1) \
    do {                       \
        dif = (in0) - (in1);   \
        sum = (in0) + (in1);   \
    } while (0)
/*
 * Variant of BUTTERFLIES with the twiddled inputs passed explicitly
 * (P1, P2, P5, P6) instead of taken from the fixed q1/q2/q5/q6 variables.
 * Combines the two twiddled upper-quarter points (P5/P6 paths) with the
 * lower-quarter points a0/a1, writing all four of a0..a3 in place.
 *
 * Expands using the free variables r0, i0, r1, i1, t3, t4 from the
 * enclosing scope; the caller must declare them.
 * NOTE(review): not referenced anywhere in this file chunk -- presumably
 * kept for a caller outside this view.
 */
#define BUTTERFLIES_MIX(a0,a1,a2,a3, P1, P2, P5, P6) \
    do {                                             \
        r0=a0.re;                                    \
        i0=a0.im;                                    \
        r1=a1.re;                                    \
        i1=a1.im;                                    \
        BF(t3, P5, P5, P1);                          \
        BF(a2.re, a0.re, r0, P5);                    \
        BF(a3.im, a1.im, i1, t3);                    \
        BF(t4, P6, P2, P6);                          \
        BF(a3.re, a1.re, r1, t4);                    \
        BF(a2.im, a0.im, i0, P6);                    \
    } while (0)
#undef BUTTERFLIES
/*
 * Core split-radix butterfly: combines the twiddled upper-quarter values
 * held in q1/q2 (from a2) and q5/q6 (from a3) with the lower-quarter
 * points a0/a1, writing all four of a0..a3 in place.
 *
 * Expands using the free variables q1..q6, r0, i0, r1, i1 from the
 * enclosing scope (see recombine_new below); r0/i0/r1/i1 latch the
 * original a0/a1 values before they are overwritten.
 */
#define BUTTERFLIES(a0,a1,a2,a3)      \
    do {                              \
        r0=a0.re;                     \
        i0=a0.im;                     \
        r1=a1.re;                     \
        i1=a1.im;                     \
        BF(q3, q5, q5, q1);           \
        BF(a2.re, a0.re, r0, q5);     \
        BF(a3.im, a1.im, i1, q3);     \
        BF(q4, q6, q2, q6);           \
        BF(a3.re, a1.re, r1, q4);     \
        BF(a2.im, a0.im, i0, q6);     \
    } while (0)
#undef TRANSFORM
/*
 * One full split-radix transform step on the 4-point group (a0,a1,a2,a3):
 * rotates a2 by the conjugate twiddle (wre, -wim) into q1/q2 and a3 by
 * (wre, wim) into q5/q6, then applies BUTTERFLIES to combine them.
 *
 * CMUL is defined further down the file; that is fine for macros, since
 * expansion happens at the use site.  q1/q2/q5/q6 must be declared in the
 * caller's scope.  Only used by the disabled reference path in
 * recombine_new (#if 0 block).
 */
#define TRANSFORM(a0,a1,a2,a3,wre,wim)            \
    do {                                          \
        CMUL(q1, q2, a2.re, a2.im, wre, -wim);    \
        CMUL(q5, q6, a3.re, a3.im, wre, wim);     \
        BUTTERFLIES(a0, a1, a2, a3);              \
    } while (0)
/*
 * Complex multiply: (dst_re + i*dst_im) = (s_re + i*s_im) * (t_re + i*t_im).
 *
 * Every argument is expanded more than once, so arguments must be free of
 * side effects.  dst_re is written before dst_im is computed, so dst_re
 * must not alias any of the four inputs.
 */
#define CMUL(dst_re, dst_im, s_re, s_im, t_re, t_im)    \
    do {                                                \
        (dst_re) = (s_re) * (t_re) - (s_im) * (t_im);   \
        (dst_im) = (s_re) * (t_im) + (s_im) * (t_re);   \
    } while (0)
/*
 * Split-radix FFT recombination pass (scalar prototype of the SIMD code in
 * the #if 0 assembly block below).
 *
 * For each group of 8 complex points it rotates the third- and
 * fourth-quarter coefficients (z[o2..], z[o3..]) by twiddle factors from
 * the `cos` table and its reversed tail `wim`, then adds/subtracts the
 * results into the first and second quarters, all in place.  The two
 * unrolled halves handle the even (cos[0],cos[2],..) and odd
 * (cos[1],cos[3],..) twiddle lanes -- cf. the disabled TRANSFORM reference
 * calls near the bottom of the loop.  The w[]/j[]/t[]/r[] temporaries and
 * the instruction-count comments mirror the AVX register layout the
 * assembly version uses.
 *
 * s        - transform context; only touched by the disabled debug code
 * z        - coefficient array, modified in place
 * cos      - twiddle table (NOTE: shadows libm's cos() in this scope)
 * n        - quarter-transform length; each iteration consumes 8 points
 * temp_buf - scratch buffer; only touched by the disabled debug code
 */
static void recombine_new(AVTXContext *s, FFTComplex *z, const FFTSample *cos,
                          unsigned int n, FFTComplex *temp_buf)
{
    /* Offsets (in FFTComplex elements) of quarters 2, 3 and 4 within z. */
    const int o1 = 2*n;
    const int o2 = 4*n;
    const int o3 = 6*n;
    /* Imaginary-part twiddles: the cos table read backwards from its end
     * (t2[7], t2[5], ... below); wim is stepped down as cos steps up. */
    const FFTSample *wim = cos + o1 - 7;
    /* Scratch for the disabled TRANSFORM reference path only. */
    FFTSample q1, q2, q3, q4, q5, q6, r0, i0, r1, i1;
    // LOAD AND ROTATE VIA VPERM2F128! 1 cycle less!
    /* NOTE(review): signed loop index compared against unsigned n --
     * fine for sane sizes, but confirm n <= INT_MAX. */
    for (int i = 0; i < n; i += 4) {
        /* NOTE(review): h[] is never used -- presumably leftover scratch. */
        FFTSample h[8], j[8], r[8], w[8];
        FFTSample t[8];
        FFTComplex *m0 = &z[0];        /* quarter 1, first 4 points  */
        FFTComplex *m1 = &z[4];        /* quarter 1, next 4 points   */
        FFTComplex *m2 = &z[o2 + 0];   /* quarter 3                  */
        FFTComplex *m3 = &z[o2 + 4];   /* quarter 3, upper half      */
        const FFTSample *t1 = &cos[0]; /* even cos lanes: t1[0,2,4,6] */
        const FFTSample *t2 = &wim[0]; /* reversed wim:   t2[7,5,3,1] */
        /* 2 loads (tabs) */
        /* 2 vperm2fs, 2 shufs (im), 2 shufs (tabs) */
        /* 1 xor, 1 add, 1 sub, 4 mults OR 2 mults, 2 fmas */
        /* 13 OR 10ish (-2 each for second passovers!) */
        /* w[]: first pair of m2/m3 rotated by (cos, -wim) -- matches
         * CMUL(..., wre, -wim) in TRANSFORM, with re/im swapped. */
        w[0] = m2[0].im*t1[0] - m2[0].re*t2[7];
        w[1] = m2[0].re*t1[0] + m2[0].im*t2[7];
        w[2] = m2[1].im*t1[2] - m2[1].re*t2[5];
        w[3] = m2[1].re*t1[2] + m2[1].im*t2[5];
        w[4] = m3[0].im*t1[4] - m3[0].re*t2[3];
        w[5] = m3[0].re*t1[4] + m3[0].im*t2[3];
        w[6] = m3[1].im*t1[6] - m3[1].re*t2[1];
        w[7] = m3[1].re*t1[6] + m3[1].im*t2[1];
        /* j[]: second pair of m2/m3 rotated by (cos, +wim) -- matches
         * CMUL(..., wre, wim) in TRANSFORM. */
        j[0] = m2[2].im*t1[0] + m2[2].re*t2[7];
        j[1] = m2[2].re*t1[0] - m2[2].im*t2[7];
        j[2] = m2[3].im*t1[2] + m2[3].re*t2[5];
        j[3] = m2[3].re*t1[2] - m2[3].im*t2[5];
        j[4] = m3[2].im*t1[4] + m3[2].re*t2[3];
        j[5] = m3[2].re*t1[4] - m3[2].im*t2[3];
        j[6] = m3[3].im*t1[6] + m3[3].re*t2[1];
        j[7] = m3[3].re*t1[6] - m3[3].im*t2[1];
        /* 1 add + 1 shuf */
        /* t[] = j + w with re/im lanes swapped (note the crossed indices:
         * the shuf in the SIMD version). */
        t[1] = j[0] + w[0];
        t[0] = j[1] + w[1];
        t[3] = j[2] + w[2];
        t[2] = j[3] + w[3];
        t[5] = j[4] + w[4];
        t[4] = j[5] + w[5];
        t[7] = j[6] + w[6];
        t[6] = j[7] + w[7];
        /* 1 sub + 1 xor */
        /* r[] = w - j with odd lanes negated (the pmpm xor mask). */
        r[0] = (w[0] - j[0]);
        r[1] = -(w[1] - j[1]);
        r[2] = (w[2] - j[2]);
        r[3] = -(w[3] - j[3]);
        r[4] = (w[4] - j[4]);
        r[5] = -(w[5] - j[5]);
        r[6] = (w[6] - j[6]);
        r[7] = -(w[7] - j[7]);
        /* Min: 2 subs, 2 adds, 2 vperm2fs (OPTIONAL) */
        /* Upper quarters get lower minus t/r... */
        m2[0].re = m0[0].re - t[0];
        m2[0].im = m0[0].im - t[1];
        m2[1].re = m0[1].re - t[2];
        m2[1].im = m0[1].im - t[3];
        m3[0].re = m0[2].re - t[4];
        m3[0].im = m0[2].im - t[5];
        m3[1].re = m0[3].re - t[6];
        m3[1].im = m0[3].im - t[7];
        m2[2].re = m1[0].re - r[0];
        m2[2].im = m1[0].im - r[1];
        m2[3].re = m1[1].re - r[2];
        m2[3].im = m1[1].im - r[3];
        m3[2].re = m1[2].re - r[4];
        m3[2].im = m1[2].im - r[5];
        m3[3].re = m1[3].re - r[6];
        m3[3].im = m1[3].im - r[7];
        /* ...and lower quarters get lower plus t/r (m2/m3 were stored
         * first, so m0/m1 still hold their original values here). */
        m0[0].re = m0[0].re + t[0];
        m0[0].im = m0[0].im + t[1];
        m0[1].re = m0[1].re + t[2];
        m0[1].im = m0[1].im + t[3];
        m0[2].re = m0[2].re + t[4];
        m0[2].im = m0[2].im + t[5];
        m0[3].re = m0[3].re + t[6];
        m0[3].im = m0[3].im + t[7];
        m1[0].re = m1[0].re + r[0];
        m1[0].im = m1[0].im + r[1];
        m1[1].re = m1[1].re + r[2];
        m1[1].im = m1[1].im + r[3];
        m1[2].re = m1[2].re + r[4];
        m1[2].im = m1[2].im + r[5];
        m1[3].re = m1[3].re + r[6];
        m1[3].im = m1[3].im + r[7];
#if 1
        /* Identical for below, but with the following parameters */
        /* Second (odd-lane) pass: quarters 2 and 4, odd twiddle indices
         * (t1[0]==cos[1], t2[7]==wim[6], ...). */
        m0 = &z[o1];
        m1 = &z[o1 + 4];
        m2 = &z[o3 + 0];
        m3 = &z[o3 + 4];
        t1 = &cos[1];
        t2 = &wim[-1];
#endif
        w[0] = m2[0].im*t1[0] - m2[0].re*t2[7];
        w[1] = m2[0].re*t1[0] + m2[0].im*t2[7];
        w[2] = m2[1].im*t1[2] - m2[1].re*t2[5];
        w[3] = m2[1].re*t1[2] + m2[1].im*t2[5];
        w[4] = m3[0].im*t1[4] - m3[0].re*t2[3];
        w[5] = m3[0].re*t1[4] + m3[0].im*t2[3];
        w[6] = m3[1].im*t1[6] - m3[1].re*t2[1];
        w[7] = m3[1].re*t1[6] + m3[1].im*t2[1];
        j[0] = m2[2].im*t1[0] + m2[2].re*t2[7];
        j[1] = m2[2].re*t1[0] - m2[2].im*t2[7];
        j[2] = m2[3].im*t1[2] + m2[3].re*t2[5];
        j[3] = m2[3].re*t1[2] - m2[3].im*t2[5];
        j[4] = m3[2].im*t1[4] + m3[2].re*t2[3];
        j[5] = m3[2].re*t1[4] - m3[2].im*t2[3];
        j[6] = m3[3].im*t1[6] + m3[3].re*t2[1];
        j[7] = m3[3].re*t1[6] - m3[3].im*t2[1];
        /* 1 add + 1 shuf */
        t[1] = j[0] + w[0];
        t[0] = j[1] + w[1];
        t[3] = j[2] + w[2];
        t[2] = j[3] + w[3];
        t[5] = j[4] + w[4];
        t[4] = j[5] + w[5];
        t[7] = j[6] + w[6];
        t[6] = j[7] + w[7];
        /* 1 sub + 1 xor */
        r[0] = (w[0] - j[0]);
        r[1] = -(w[1] - j[1]);
        r[2] = (w[2] - j[2]);
        r[3] = -(w[3] - j[3]);
        r[4] = (w[4] - j[4]);
        r[5] = -(w[5] - j[5]);
        r[6] = (w[6] - j[6]);
        r[7] = -(w[7] - j[7]);
        /* Min: 2 subs, 2 adds, 2 vperm2fs (OPTIONAL) */
        m2[0].re = m0[0].re - t[0];
        m2[0].im = m0[0].im - t[1];
        m2[1].re = m0[1].re - t[2];
        m2[1].im = m0[1].im - t[3];
        m3[0].re = m0[2].re - t[4];
        m3[0].im = m0[2].im - t[5];
        m3[1].re = m0[3].re - t[6];
        m3[1].im = m0[3].im - t[7];
        m2[2].re = m1[0].re - r[0];
        m2[2].im = m1[0].im - r[1];
        m2[3].re = m1[1].re - r[2];
        m2[3].im = m1[1].im - r[3];
        m3[2].re = m1[2].re - r[4];
        m3[2].im = m1[2].im - r[5];
        m3[3].re = m1[3].re - r[6];
        m3[3].im = m1[3].im - r[7];
        m0[0].re = m0[0].re + t[0];
        m0[0].im = m0[0].im + t[1];
        m0[1].re = m0[1].re + t[2];
        m0[1].im = m0[1].im + t[3];
        m0[2].re = m0[2].re + t[4];
        m0[2].im = m0[2].im + t[5];
        m0[3].re = m0[3].re + t[6];
        m0[3].im = m0[3].im + t[7];
        m1[0].re = m1[0].re + r[0];
        m1[0].im = m1[0].im + r[1];
        m1[1].re = m1[1].re + r[2];
        m1[1].im = m1[1].im + r[3];
        m1[2].re = m1[2].re + r[4];
        m1[2].im = m1[2].im + r[5];
        m1[3].re = m1[3].re + r[6];
        m1[3].im = m1[3].im + r[7];
#if 0
        /* Disabled reference path: the unrolled code above should compute
         * exactly what these eight TRANSFORMs do. */
        TRANSFORM(z[ 0 + 0], z[ 0 + 4], z[o2 + 0], z[o2 + 2], cos[0], wim[7]);
        TRANSFORM(z[ 0 + 1], z[ 0 + 5], z[o2 + 1], z[o2 + 3], cos[2], wim[5]);
        TRANSFORM(z[ 0 + 2], z[ 0 + 6], z[o2 + 4], z[o2 + 6], cos[4], wim[3]);
        TRANSFORM(z[ 0 + 3], z[ 0 + 7], z[o2 + 5], z[o2 + 7], cos[6], wim[1]);
        TRANSFORM(z[o1 + 0], z[o1 + 4], z[o3 + 0], z[o3 + 2], cos[1], wim[6]);
        TRANSFORM(z[o1 + 1], z[o1 + 5], z[o3 + 1], z[o3 + 3], cos[3], wim[4]);
        TRANSFORM(z[o1 + 2], z[o1 + 6], z[o3 + 4], z[o3 + 6], cos[5], wim[2]);
        TRANSFORM(z[o1 + 3], z[o1 + 7], z[o3 + 5], z[o3 + 7], cos[7], wim[0]);
#endif
#if 0
        /* Disabled debug dump comparing this code's output against the
         * fft32 reference, with a 3-ulp tolerance. */
        ff_fft32_float_fma3(s, temp_buf, temp_buf, 0);
        FFTComplex zztop[] = {
            m2[2], m2[3], m3[2], m3[3],
        };
        FFTComplex *out = (FFTComplex *)zztop;
        FFTComplex *temp = temp_buf;
        temp += 0; out += temp - temp_buf;
        FFTComplex *ptr = (void *)out;
        printf("\nIDX = 0re 0im 1re 1im 2re 2im 3re 3im\n"
               "REF = %f %f %f %f %f %f %f %f\n"
               "OUT = %f %f %f %f %f %f %f %f\n"
               "TST = %i %i %i %i %i %i %i %i\n\n",
               ptr[0].re, ptr[0].im, ptr[1].re, ptr[1].im,
               ptr[2].re, ptr[2].im, ptr[3].re, ptr[3].im,
               temp[0].re, temp[0].im, temp[1].re, temp[1].im,
               temp[2].re, temp[2].im, temp[3].re, temp[3].im,
               fabsf(ptr[0].re - temp[0].re) <= 3*FLT_EPSILON, fabsf(ptr[0].im - temp[0].im) <= 3*FLT_EPSILON,
               fabsf(ptr[1].re - temp[1].re) <= 3*FLT_EPSILON, fabsf(ptr[1].im - temp[1].im) <= 3*FLT_EPSILON,
               fabsf(ptr[2].re - temp[2].re) <= 3*FLT_EPSILON, fabsf(ptr[2].im - temp[2].im) <= 3*FLT_EPSILON,
               fabsf(ptr[3].re - temp[3].re) <= 3*FLT_EPSILON, fabsf(ptr[3].im - temp[3].im) <= 3*FLT_EPSILON);
#endif
        /* Advance to the next group of 8 points; wim walks backwards
         * through the table as cos walks forwards. */
        z += 2*4;
        cos += 2*4;
        wim -= 2*4;
    }
}
#if 0
; x86 AVX/FMA3 implementation (NASM syntax) of the recombination pass in
; recombine_new above.  Kept inside #if 0 because this is assembly, not C;
; it belongs in a .asm file.  The w/j/t/r register comments correspond to
; the identically named temporaries in the C prototype.
; Combines m0...m8 (tx1[even, even, odd, odd], tx2,3[even], tx2,3[odd]) coeffs
; Uses all 16 of registers.
; Output is slightly permuted such that tx2,3's coefficients are interleaved
; on a 2-point basis (look at `doc/transforms.md`)
%macro SPLIT_RADIX_COMBINE 11
%if %11 && mmsize == 32
    vperm2f128 m12, %5, %6, 0x20    ; m2[0], m2[1], m3[0], m3[1] even
    vperm2f128 m14, %8, %7, 0x20    ; m2[0], m2[1], m3[0], m3[1] odd
    vperm2f128 m13, %5, %6, 0x31    ; m2[2], m2[3], m3[2], m3[3] even
    vperm2f128 m15, %8, %7, 0x31    ; m2[2], m2[3], m3[2], m3[3] odd
%endif
    shufps m10, %9, %9, q2200       ; cos00224466
    shufps m11, %10, %10, q1133     ; wim77553311
    movshdup %9, %9                 ; cos11335577
    shufps %10, %10, q0022          ; wim66442200
%if %11 && mmsize == 32
    shufps %5, m12, m12, q2301      ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre even
    shufps %7, m14, m14, q2301      ; m2[0].imre, m2[1].imre, m2[2].imre, m2[3].imre odd
    shufps %6, m13, m13, q2301      ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre even
    shufps %8, m15, m15, q2301      ; m3[0].imre, m3[1].imre, m3[2].imre, m3[3].imre odd
    ; reorder the multiplies to save movs reg, reg in the %if above
    mulps m12, m11                  ; m2[0123]reim * wim7531 even
    mulps m14, %10                  ; m2[0123]reim * wim7531 odd
    mulps m13, m11                  ; m3[0123]reim * wim7531 even
    mulps m15, %10                  ; m3[0123]reim * wim7531 odd
%else
    mulps m12, %5, m11              ; m2,3[01]reim * wim7531 even
    mulps m14, %7, %10              ; m2,3[01]reim * wim7531 odd
    mulps m13, %6, m11              ; m2,3[23]reim * wim7531 even
    mulps m15, %8, %10              ; m2,3[23]reim * wim7531 odd
    shufps %5, %5, q2301            ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    shufps %7, %7, q2301            ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre odd
    shufps %6, %6, q2301            ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
    shufps %8, %8, q2301            ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre odd
%endif
%if cpuflag(fma3) ; 6 instructions saved through FMA!
    fmaddsubps %5, %5, m10, m12     ; w[0..8] even
    fmaddsubps %7, %7, %9, m14      ; w[0..8] odd
    fmsubaddps %6, %6, m10, m13     ; j[0..8] even
    fmsubaddps %8, %8, %9, m15      ; j[0..8] odd
    movaps m11, [mask_pmpmpmpm]     ; "subaddps? pfft, who needs that!"
%else
    mulps %5, m10                   ; m2,3[01]imre * cos0246
    mulps %7, m8                    ; m2,3[01]imre * cos0246
    movaps m11, [mask_pmpmpmpm]     ; "subaddps? pfft, who needs that!"
    mulps %8, m8                    ; m2,3[23]reim * cos0246
    mulps %6, m10                   ; m2,3[23]reim * cos0246
    addsubps %5, m12                ; w[0..8]
    addsubps %7, m14                ; w[0..8]
    xorps m13, m11                  ; +-m2,3[23]imre * wim7531
    xorps m15, m11                  ; +-m2,3[23]imre * wim7531
    addps %6, m13                   ; j[0..8]
    addps %8, m15                   ; j[0..8]
%endif
    addps m12, %5, m5               ; t10235476 even
    addps m14, %7, m7               ; t10235476 odd
    subps m13, %5, m5               ; +-r[0..7] even
    subps m15, %7, m7               ; +-r[0..7] odd
    shufps m12, m12, q2301          ; t[0..7] even
    shufps m14, m14, q2301          ; t[0..7] odd
    xorps m13, m11                  ; r[0..7] even
    xorps m15, m11                  ; r[0..7] odd
    subps %5, %1, m12               ; %3,3[01] even
    subps %7, %3, m14               ; %3,3[01] odd
    subps %6, %2, m13               ; %3,3[23] even
    subps %8, %4, m15               ; %3,3[23] odd
    addps %1, m12                   ; m0 even
    addps %3, m14                   ; m0 odd
    addps %2, m13                   ; m1 even
    addps %4, m15                   ; m1 odd
%endmacro
; Same as above, only does a single parity at a time, dependent on whether
; the first argument is 0 or 1. Uses 3 temporary registers instead of 6.
%macro SPLIT_RADIX_COMBINE_HALF 10
%if %1
    shufps %8, %6, %6, q2200        ; cos00224466
    shufps %9, %7, %7, q1133        ; wim77553311
%else
    shufps %8, %6, %6, q3311        ; cos11335577
    shufps %9, %7, %7, q0022        ; wim66442200
%endif
    mulps %10, %4, %9               ; m2,3[01]reim * wim7531 even
    mulps %9, %5                    ; m2,3[23]reim * wim7531 even
    shufps %4, %4, q2301            ; m2[0].imre, m2[1].imre, m3[0].imre, m3[1].imre even
    shufps %5, %5, q2301            ; m2[2].imre, m2[3].imre, m3[2].imre, m3[3].imre even
%if cpuflag(fma3)
    fmaddsubps %4, %4, %8, %10      ; w[0..8] even
    fmsubaddps %5, %5, %8, %9       ; j[0..8] even
    movaps %8, [mask_pmpmpmpm]
%else
    mulps %4, %8                    ; m2,3[01]imre * cos0246
    mulps %5, %8                    ; m2,3[23]reim * cos0246
    movaps %8, [mask_pmpmpmpm]
    addsubps %4, %10                ; w[0..8]
    xorps %9, %8                    ; +-m2,3[23]imre * wim7531
    addps %5, %9                    ; j[0..8]
%endif
    addps %10, %4, %5               ; t10235476
    subps %9, %4, %5                ; +-r[0..7]
    shufps %10, %10, q2301          ; t[0..7]
    xorps %9, %8                    ; r[0..7]
    subps %4, %2, %10               ; %3,3[01]
    subps %5, %3, %9                ; %3,3[23]
    addps %2, %10                   ; m0
    addps %3, %9                    ; m1
%endmacro
#endif
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment