Created
April 19, 2021 13:10
-
-
Save cyanreg/50fe5ecf6bc5935490aeb189e1bb5d45 to your computer and use it in GitHub Desktop.
FFmpeg transform debugging code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* This file is part of FFmpeg. | |
* | |
* FFmpeg is free software; you can redistribute it and/or | |
* modify it under the terms of the GNU Lesser General Public | |
* License as published by the Free Software Foundation; either | |
* version 2.1 of the License, or (at your option) any later version. | |
* | |
* FFmpeg is distributed in the hope that it will be useful, | |
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
* Lesser General Public License for more details. | |
* | |
* You should have received a copy of the GNU Lesser General Public | |
* License along with FFmpeg; if not, write to the Free Software | |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
*/ | |
#define TX_FLOAT | |
#include "libavutil/tx_priv.h" | |
#include "libavutil/attributes.h" | |
#include "libavutil/x86/cpu.h" | |
void ff_fft2_float_sse3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride); | |
void ff_fft4_inv_float_sse2 (AVTXContext *s, void *out, void *in, ptrdiff_t stride); | |
void ff_fft4_fwd_float_sse2 (AVTXContext *s, void *out, void *in, ptrdiff_t stride); | |
void ff_fft8_float_sse3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride); | |
void ff_fft8_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride); | |
void ff_fft16_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride); | |
void ff_fft16_float_fma3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride); | |
void ff_fft32_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride); | |
void ff_fft32_float_fma3 (AVTXContext *s, void *out, void *in, ptrdiff_t stride); | |
void ff_split_radix_fft_float_avx (AVTXContext *s, void *out, void *in, ptrdiff_t stride); | |
void ff_split_radix_fft_float_fma3(AVTXContext *s, void *out, void *in, ptrdiff_t stride); | |
#include <float.h> | |
#define FFTSample av_unused FFTSample | |
extern const FFTSample ff_cos_16_float[]; | |
extern const FFTSample ff_cos_32_float[]; | |
extern const FFTSample ff_cos_64_float[]; | |
extern const FFTSample ff_cos_128_float[]; | |
extern const FFTSample ff_cos_256_float[]; | |
extern const FFTSample ff_cos_512_float[]; | |
#define BUTTERFLIES(a0,a1,a2,a3) \ | |
do { \ | |
r0=a0.re; \ | |
i0=a0.im; \ | |
r1=a1.re; \ | |
i1=a1.im; \ | |
BF(t3, t5, t5, t1); \ | |
BF(a2.re, a0.re, r0, t5); \ | |
BF(a3.im, a1.im, i1, t3); \ | |
BF(t4, t6, t2, t6); \ | |
BF(a3.re, a1.re, r1, t4); \ | |
BF(a2.im, a0.im, i0, t6); \ | |
} while (0) | |
#define TRANSFORM(a0,a1,a2,a3,wre,wim) \ | |
do { \ | |
CMUL(t1, t2, a2.re, a2.im, wre, -wim); \ | |
CMUL(t5, t6, a3.re, a3.im, wre, wim); \ | |
BUTTERFLIES(a0, a1, a2, a3); \ | |
} while (0) | |
static void fft4(FFTComplex *z) | |
{ | |
FFTSample t1, t2, t3, t4, t5, t6, t7, t8; | |
BF(t3, t1, z[0].re, z[1].re); | |
BF(t8, t6, z[3].re, z[2].re); | |
BF(z[2].re, z[0].re, t1, t6); | |
BF(t4, t2, z[0].im, z[1].im); | |
BF(t7, t5, z[2].im, z[3].im); | |
BF(z[3].im, z[1].im, t4, t8); | |
BF(z[3].re, z[1].re, t3, t7); | |
BF(z[2].im, z[0].im, t2, t5); | |
} | |
static void fft8(FFTComplex *z) | |
{ | |
FFTSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1; | |
fft4(z); | |
BF(t1, z[5].re, z[4].re, -z[5].re); | |
BF(t2, z[5].im, z[4].im, -z[5].im); | |
BF(t5, z[7].re, z[6].re, -z[7].re); | |
BF(t6, z[7].im, z[6].im, -z[7].im); | |
BUTTERFLIES(z[0], z[2], z[4], z[6]); | |
TRANSFORM(z[1], z[3], z[5], z[7], RESCALE(M_SQRT1_2), RESCALE(M_SQRT1_2)); | |
} | |
static void fft16(FFTComplex *z) | |
{ | |
FFTSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1; | |
FFTSample cos_16_1 = TX_NAME(ff_cos_16)[1]; | |
FFTSample cos_16_2 = TX_NAME(ff_cos_16)[2]; | |
FFTSample cos_16_3 = TX_NAME(ff_cos_16)[3]; | |
fft8(z + 0); | |
fft4(z + 8); | |
fft4(z + 12); | |
t1 = z[ 8].re; | |
t2 = z[ 8].im; | |
t5 = z[12].re; | |
t6 = z[12].im; | |
BUTTERFLIES(z[0], z[4], z[8], z[12]); | |
TRANSFORM(z[ 2], z[ 6], z[10], z[14], cos_16_2, cos_16_2); | |
TRANSFORM(z[ 1], z[ 5], z[ 9], z[13], cos_16_1, cos_16_3); | |
TRANSFORM(z[ 3], z[ 7], z[11], z[15], cos_16_3, cos_16_1); | |
} | |
static void split_radix_combine(FFTComplex *z, const FFTSample *cos, int n, int loops) | |
{ | |
int o1 = 2*n; | |
int o2 = 4*n; | |
int o3 = 6*n; | |
const FFTSample *wim = cos + o1 - 7; | |
FFTSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1; | |
FFTComplex *oz = z + o3; | |
for (int i = 0; i < 4*loops; i += 4) { | |
TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]); | |
TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]); | |
TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]); | |
TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]); | |
TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]); | |
TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]); | |
TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]); | |
TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]); | |
skip: | |
z += 2*4; | |
cos += 2*4; | |
wim -= 2*4; | |
} | |
} | |
int old_revtab[4096]; | |
static void fft32(FFTComplex *z) | |
{ | |
int n4 = 8; | |
fft16(z); | |
fft8(z+n4*2); | |
fft8(z+n4*3); | |
split_radix_combine(z, ff_cos_32_float, n4/2, 1); | |
} | |
static void fft64(FFTComplex *z, int combine) | |
{ | |
int n4 = 16; | |
fft32(z); | |
fft16(z+n4*2); | |
fft16(z+n4*3); | |
split_radix_combine(z, ff_cos_64_float, n4/2, combine); | |
} | |
static void fft128(FFTComplex *z, int combine) | |
{ | |
int n4 = 32; | |
fft64(z, 2); | |
fft32(z+n4*2); | |
fft32(z+n4*3); | |
split_radix_combine(z, ff_cos_128_float, n4/2, combine); | |
} | |
static void fft256(FFTComplex *z, int combine) | |
{ | |
int n4 = 64; | |
fft128(z, 4); | |
fft64(z+n4*2, 2); | |
fft64(z+n4*3, 2); | |
split_radix_combine(z, ff_cos_256_float, n4/2, combine); | |
} | |
static void fft512(FFTComplex *z, int combine) | |
{ | |
int n4 = 128; | |
fft256(z, 8); | |
fft128(z+n4*2, 4); | |
fft128(z+n4*3, 4); | |
split_radix_combine(z, ff_cos_512_float, n4/2, combine); | |
} | |
#define BF(x, y, a, b) \ | |
do { \ | |
x = (a) - (b); \ | |
y = (a) + (b); \ | |
} while (0) | |
#define BUTTERFLIES_MIX(a0,a1,a2,a3, P1, P2, P5, P6) \ | |
do { \ | |
r0=a0.re; \ | |
i0=a0.im; \ | |
r1=a1.re; \ | |
i1=a1.im; \ | |
BF(t3, P5, P5, P1); \ | |
BF(a2.re, a0.re, r0, P5); \ | |
BF(a3.im, a1.im, i1, t3); \ | |
BF(t4, P6, P2, P6); \ | |
BF(a3.re, a1.re, r1, t4); \ | |
BF(a2.im, a0.im, i0, P6); \ | |
} while (0) | |
#undef BUTTERFLIES | |
#define BUTTERFLIES(a0,a1,a2,a3) \ | |
do { \ | |
r0=a0.re; \ | |
i0=a0.im; \ | |
r1=a1.re; \ | |
i1=a1.im; \ | |
BF(q3, q5, q5, q1); \ | |
BF(a2.re, a0.re, r0, q5); \ | |
BF(a3.im, a1.im, i1, q3); \ | |
BF(q4, q6, q2, q6); \ | |
BF(a3.re, a1.re, r1, q4); \ | |
BF(a2.im, a0.im, i0, q6); \ | |
} while (0) | |
#undef TRANSFORM | |
#define TRANSFORM(a0,a1,a2,a3,wre,wim) \ | |
do { \ | |
CMUL(q1, q2, a2.re, a2.im, wre, -wim); \ | |
CMUL(q5, q6, a3.re, a3.im, wre, wim); \ | |
BUTTERFLIES(a0, a1, a2, a3); \ | |
} while (0) | |
#define CMUL(dre, dim, are, aim, bre, bim) \ | |
do { \ | |
(dre) = (are) * (bre) - (aim) * (bim); \ | |
(dim) = (are) * (bim) + (aim) * (bre); \ | |
} while (0) | |
int64_t ff_aliens = 0; | |
static av_unused void recombine_new(AVTXContext *s, FFTComplex *z, const FFTSample *cos, | |
unsigned int n, FFTComplex *test, FFTComplex *ref) | |
{ | |
const int o1 = 2*n; | |
const int o2 = 4*n; | |
const int o3 = 6*n; | |
const FFTSample *wim = cos + o1 - 7; | |
FFTSample q1, q2, q3, q4, q5, q6, r0, i0, r1, i1; | |
for (int i = 0; i < 8; i += 4) { | |
TRANSFORM(z[0 + 0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]); | |
TRANSFORM(z[0 + 1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[2], wim[5]); | |
TRANSFORM(z[0 + 2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[4], wim[3]); | |
TRANSFORM(z[0 + 3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[6], wim[1]); | |
TRANSFORM(z[0 + 8], z[o1 + 8], z[o2 + 8], z[o3 + 8], cos[1], wim[6]); | |
TRANSFORM(z[0 + 9], z[o1 + 9], z[o2 + 9], z[o3 + 9], cos[3], wim[4]); | |
TRANSFORM(z[0 + 10], z[o1 + 10], z[o2 + 10], z[o3 + 10], cos[5], wim[2]); | |
TRANSFORM(z[0 + 11], z[o1 + 11], z[o2 + 11], z[o3 + 11], cos[7], wim[0]); | |
z += 4; | |
cos += 2*4; | |
wim -= 2*4; | |
} | |
} | |
static av_unused void test_fft(AVTXContext *s, void *_out, void *_in, | |
ptrdiff_t stride) | |
{ | |
FFTComplex *in = _in; | |
FFTComplex *out = _out; | |
FFTComplex *out_inter = av_mallocz(2 * s->m * sizeof(FFTComplex)), *out_inter_orig = out_inter; | |
FFTComplex *out_test = av_mallocz(2 * s->m * sizeof(FFTComplex)), *out_test_orig = out_test; | |
FFTComplex *out_main = av_mallocz(2 * s->m * sizeof(FFTComplex)), *out_main_orig = out_main; | |
FFTComplex *in_inter = av_mallocz(2 * s->m * sizeof(FFTComplex)), *in_inter_orig = in_inter; | |
FFTComplex *in_test = av_mallocz(2 * s->m * sizeof(FFTComplex)), *in_test_orig = in_test; | |
FFTComplex *in_main = av_mallocz(2 * s->m * sizeof(FFTComplex)), *in_main_orig = in_main; | |
for (int i = 0; i < s->m; i++) { | |
in_main[i] = in[i]; | |
in_test[i] = in[i]; | |
in_inter[i] = in[i]; | |
out[i] = in[old_revtab[i]]; | |
} | |
printf("\n"); | |
if (s->m == 64) | |
fft64(out, 2); | |
else if (s->m == 32) | |
fft32(out); | |
else if (s->m == 128) | |
fft128(out, 4); | |
else if (s->m == 256) | |
fft256(out, 8); | |
else if (s->m == 512) | |
fft512(out, 16); | |
else { | |
} | |
ff_split_radix_fft_float_fma3(s, out_main, in_main, stride); | |
// recombine_new(s, out_main, ff_cos_128_float, 16, in_inter, out); | |
#if 1 | |
FFTComplex *temp = out_main; | |
FFTComplex *ref = out + (temp - out_main); | |
int match; | |
#define MONTE 0 | |
#if 0 | |
int hay = -1; | |
int idx = 112; | |
float needle = ref[idx].re; | |
for (int i = 0; i < s->m; i++) { | |
int fr = fabsf(needle - temp[i].re) <= 32*FLT_EPSILON; | |
int fi = fabsf(needle - temp[i].im) <= 32*FLT_EPSILON; | |
if (fr || fi) { | |
hay = i; | |
break; | |
} | |
} | |
printf("z[%i] = %f %i\n", idx, needle, hay); | |
#endif | |
#if MONTE | |
int start = 96; | |
do { | |
FFTSample q1, q2, q3, q4, q5, q6, r0, i0, r1, i1; | |
int n = 16; | |
const int o1 = 2*n; | |
const int o2 = 4*n; | |
const int o3 = 6*n; | |
const FFTSample *cos = ff_cos_128_float; | |
const FFTSample *wim = cos + o1 - 7; | |
int r[6]; | |
for (int i = 0; i < 6; i++) { | |
float val = (double)rand() / (double)RAND_MAX; | |
float val2 = val * s->m; | |
int lolval = lrintf(val2); | |
r[i] = lolval; | |
} | |
FFTComplex *z = temp + 0; | |
cos += 2*4*0; | |
wim -= 2*4*0; | |
FFTComplex backup[4] = { | |
z[0 + r[0]], z[0 + r[1]], z[0 + r[2]], z[0 + r[3]], | |
}; | |
TRANSFORM(z[0 + r[0]], z[0 + r[1]], z[0 + r[2]], z[0 + r[3]], cos[r[4]], cos[r[5]]); | |
#endif | |
int found = 0; | |
int i = 0; | |
match = 0; | |
for (i = 0; i < s->m; i++) | |
{ | |
int correct = 0; | |
int j = 0; | |
for (j = 0; j < s->m; j++) | |
{ | |
int fr = fabsf(ref[i].re - temp[j].re) <= 32*FLT_EPSILON; | |
int fi = fabsf(ref[i].im - temp[j].im) <= 32*FLT_EPSILON; | |
correct += fr && fi; | |
found += correct; | |
if (fr && fi) { | |
match++; | |
if (correct > 0) | |
printf("%i %i\n", i, j); | |
break; | |
} | |
} | |
if (!correct) { | |
printf("%i nope\n", i); | |
} | |
} | |
#if MONTE | |
if (match > start) { | |
printf("Matches = { %i %i %i %i %i %i } %i\n", r[0], r[1], r[2], r[3], r[4], r[5], match); | |
start++; | |
} else { | |
z[0 + r[0]] = backup[0]; | |
z[0 + r[1]] = backup[1]; | |
z[0 + r[2]] = backup[2]; | |
z[0 + r[3]] = backup[3]; | |
} | |
} while (1); | |
#endif | |
for (int i = 0; i < s->m; i++) { | |
int fr = fabsf(0 - temp[i].re) <= 32*FLT_EPSILON; | |
int fi = fabsf(0 - temp[i].im) <= 32*FLT_EPSILON; | |
// if (fr && fi) | |
// printf("Quack at %i\n", i); | |
} | |
temp += 0; | |
ref = out + (temp - out_main); | |
printf("\nIDX = 0re 0im 1re 1im 2re 2im 3re 3im\n" | |
"REF = %f %f %f %f %f %f %f %f\n" | |
"OUT = %f %f %f %f %f %f %f %f\n" | |
"TST = %i %i %i %i %i %i %i %i\n" | |
"MCH = %i/%i, %li\n\n", | |
ref[0].re, ref[0].im, ref[1].re, ref[1].im, | |
ref[2].re, ref[2].im, ref[3].re, ref[3].im, | |
temp[0].re, temp[0].im, temp[1].re, temp[1].im, | |
temp[2].re, temp[2].im, temp[3].re, temp[3].im, | |
fabsf(ref[0].re - temp[0].re) <= 8*FLT_EPSILON, fabsf(ref[0].im - temp[0].im) <= 8*FLT_EPSILON, | |
fabsf(ref[1].re - temp[1].re) <= 8*FLT_EPSILON, fabsf(ref[1].im - temp[1].im) <= 8*FLT_EPSILON, | |
fabsf(ref[2].re - temp[2].re) <= 8*FLT_EPSILON, fabsf(ref[2].im - temp[2].im) <= 8*FLT_EPSILON, | |
fabsf(ref[3].re - temp[3].re) <= 8*FLT_EPSILON, fabsf(ref[3].im - temp[3].im) <= 8*FLT_EPSILON, | |
match, s->m, ff_aliens); | |
#endif | |
// memcpy(out, orig, s->m*sizeof(*test)); | |
av_free(in_inter_orig); | |
av_free(in_test_orig); | |
av_free(in_main_orig); | |
av_free(out_inter_orig); | |
av_free(out_test_orig); | |
av_free(out_main_orig); | |
} | |
av_cold void ff_tx_init_float_x86(AVTXContext *s, av_tx_fn *tx) | |
{ | |
int cpu_flags = av_get_cpu_flags(); | |
int gen_revtab = 0, basis, revtab_interleave; | |
if (s->flags & AV_TX_UNALIGNED) | |
return; | |
if (ff_tx_type_is_mdct(s->type)) | |
return; | |
#define TXFN(fn, gentab, sr_basis, interleave) \ | |
do { \ | |
*tx = fn; \ | |
gen_revtab = gentab; \ | |
basis = sr_basis; \ | |
revtab_interleave = interleave; \ | |
} while (0) | |
if (s->n == 1) { | |
if (EXTERNAL_SSE2(cpu_flags)) { | |
if (s->m == 4 && s->inv) | |
TXFN(ff_fft4_inv_float_sse2, 0, 0, 0); | |
else if (s->m == 4) | |
TXFN(ff_fft4_fwd_float_sse2, 0, 0, 0); | |
} | |
if (EXTERNAL_SSE3(cpu_flags)) { | |
if (s->m == 2) | |
TXFN(ff_fft2_float_sse3, 0, 0, 0); | |
else if (s->m == 8) | |
TXFN(ff_fft8_float_sse3, 1, 8, 0); | |
} | |
if (EXTERNAL_AVX_FAST(cpu_flags)) { | |
if (s->m == 8) | |
TXFN(ff_fft8_float_avx, 1, 8, 0); | |
else if (s->m == 16) | |
TXFN(ff_fft16_float_avx, 1, 8, 2); | |
#if ARCH_X86_64 | |
else if (s->m == 32) | |
TXFN(ff_fft32_float_avx, 1, 8, 2); | |
// else if (s->m >= 64 && s->m <= 131072 && !(s->flags & AV_TX_INPLACE)) | |
// TXFN(ff_split_radix_fft_float_avx, 1, 8, 2); | |
#endif | |
} | |
if (EXTERNAL_FMA3_FAST(cpu_flags)) { | |
if (s->m == 16) | |
TXFN(ff_fft16_float_fma3, 1, 8, 2); | |
#if ARCH_X86_64 | |
else if (s->m == 32) | |
TXFN(ff_fft32_float_fma3, 1, 8, 2); | |
else if (s->m >= 64 && s->m <= 131072 && !(s->flags & AV_TX_INPLACE)) | |
TXFN(ff_split_radix_fft_float_fma3, 1, 8, 2); | |
#endif | |
} | |
} | |
// if (s->n == 1) | |
// TXFN(test_fft, 1, 8, 2); | |
// if (s->revtab) | |
// memcpy(old_revtab, s->revtab, s->m*sizeof(*s->revtab)); | |
if (gen_revtab) | |
ff_tx_gen_split_radix_parity_revtab(s->revtab, s->m, s->inv, basis, | |
revtab_interleave); | |
#undef TXFN | |
#if 0 | |
int *revtab = av_malloc(20*s->m*sizeof(*revtab)); | |
memcpy(revtab, s->revtab, s->m*sizeof(*revtab)); | |
memset(s->revtab, 0, s->m * sizeof(*s->revtab)); | |
revtab_avx(s->revtab, s->m, s->inv, 0, s->m); | |
for (int i = 0; i < s->m; i++) { | |
printf("Remap entry [%i]: %i -> %i\n", i, revtab[i], s->revtab[i]); | |
} | |
av_free(revtab); | |
#endif | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment