Created
January 25, 2024 20:07
-
-
Save fredrik-johansson/2f88376c5dd9dc0b668e7b6d91b9d266 to your computer and use it in GitHub Desktop.
mulhigh.c
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> | |
#include "profiler.h" | |
#include "mpn_extras.h" | |
#include "crt_helpers.h" | |
#include "ulong_extras.h" | |
FLINT_FORCE_INLINE mp_limb_t | |
flint_mpn_inline_addmul_1(mp_ptr res, mp_srcptr a, mp_size_t n, mp_limb_t c) | |
{ | |
mp_limb_t hi, lo, cy_limb; | |
slong i; | |
umul_ppmm(hi, lo, a[0], c); | |
add_ssaaaa(cy_limb, res[0], hi, lo, 0, res[0]); | |
for (i = 1; i < n; i++) | |
{ | |
umul_ppmm(hi, lo, a[i], c); | |
add_ssaaaa(hi, lo, hi, lo, 0, res[i]); | |
add_ssaaaa(cy_limb, res[i], hi, lo, 0, cy_limb); | |
} | |
return cy_limb; | |
} | |
#undef NN_DOTREV_S3_1X1 | |
/* {s0,s1,s2} = u[0]v[n-1] + u[1]v[n-2] + ... */ | |
/* Assumes n >= 2 */ | |
#define NN_DOTREV_S3_1X1(s2, s1, s0, u, v, n) \ | |
do { \ | |
mp_limb_t __dt0, __dt1, __ds0, __ds1, __ds2; \ | |
slong __i; \ | |
FLINT_ASSERT((n) >= 2); \ | |
umul_ppmm(__ds1, __ds0, (u)[0], (v)[(n) - 1]); \ | |
umul_ppmm(__dt1, __dt0, (u)[1], (v)[(n) - 2]); \ | |
add_sssaaaaaa(__ds2, __ds1, __ds0, 0, __ds1, __ds0, 0, __dt1, __dt0); \ | |
for (__i = 2; __i < (n); __i++) \ | |
{ \ | |
umul_ppmm(__dt1, __dt0, (u)[__i], (v)[(n) - 1 - __i]); \ | |
add_sssaaaaaa(__ds2, __ds1, __ds0, __ds2, __ds1, __ds0, 0, __dt1, __dt0); \ | |
} \ | |
(s0) = __ds0; (s1) = __ds1; (s2) = __ds2; \ | |
} while (0) \ | |
/* {r0,r1,r2} = {s0,s1,s2} + u[0]v[n-1] + u[1]v[n-2] + ... */ | |
/* Assumes n >= 1. May have s2 != 0, but the final sum is assumed to fit in 3 limbs. */ | |
#define NN_DOTREV_S3_A3_1X1(r2, r1, r0, s2, s1, s0, u, v, n) \ | |
do { \ | |
mp_limb_t __dt0, __dt1, __ds0, __ds1, __ds2; \ | |
slong __i; \ | |
FLINT_ASSERT((n) >= 1); \ | |
__ds0 = (s0); __ds1 = (s1); __ds2 = (s2); \ | |
for (__i = 0; __i < (n); __i++) \ | |
{ \ | |
umul_ppmm(__dt1, __dt0, (u)[__i], (v)[(n) - 1 - __i]); \ | |
add_sssaaaaaa(__ds2, __ds1, __ds0, __ds2, __ds1, __ds0, 0, __dt1, __dt0); \ | |
} \ | |
(r0) = __ds0; (r1) = __ds1; (r2) = __ds2; \ | |
} while (0) \ | |
#define NN_MUL_1X1 umul_ppmm | |
/* {r0,r1} = {s0,s1} + x * y, with no carry-out. */ | |
#define NN_ADDMUL_S2_A2_1X1(r1, r0, s1, s0, x, y) \ | |
do { \ | |
mp_limb_t __dt0, __dt1; \ | |
umul_ppmm(__dt1, __dt0, (x), (y)); \ | |
add_ssaaaa(r1, r0, s1, s0, __dt1, __dt0); \ | |
} while (0); \ | |
void flint_mpn_mulhigh_1(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
NN_MUL_1X1(res[1], res[0], u[0], v[0]); | |
} | |
void flint_mpn_mulhigh_2(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[1], u, v, 2); | |
NN_ADDMUL_S2_A2_1X1(res[3], res[2], b, a, u[1], v[1]); | |
} | |
void flint_mpn_mulhigh_3(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[2], u, v, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[3], 0, b, a, u + 1, v + 1, 2); | |
NN_ADDMUL_S2_A2_1X1(res[5], res[4], b, a, u[2], v[2]); | |
} | |
void flint_mpn_mulhigh_4(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[3], u, v, 4); | |
NN_DOTREV_S3_A3_1X1(b, a, res[4], 0, b, a, u + 1, v + 1, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[5], 0, b, a, u + 2, v + 2, 2); | |
NN_ADDMUL_S2_A2_1X1(res[7], res[6], b, a, u[3], v[3]); | |
} | |
void flint_mpn_mulhigh_5(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[4], u, v, 5); | |
NN_DOTREV_S3_A3_1X1(b, a, res[5], 0, b, a, u + 1, v + 1, 4); | |
NN_DOTREV_S3_A3_1X1(b, a, res[6], 0, b, a, u + 2, v + 2, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[7], 0, b, a, u + 3, v + 3, 2); | |
NN_ADDMUL_S2_A2_1X1(res[9], res[8], b, a, u[4], v[4]); | |
} | |
void flint_mpn_mulhigh_6(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[5], u, v, 6); | |
NN_DOTREV_S3_A3_1X1(b, a, res[6], 0, b, a, u + 1, v + 1, 5); | |
NN_DOTREV_S3_A3_1X1(b, a, res[7], 0, b, a, u + 2, v + 2, 4); | |
NN_DOTREV_S3_A3_1X1(b, a, res[8], 0, b, a, u + 3, v + 3, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[9], 0, b, a, u + 4, v + 4, 2); | |
NN_ADDMUL_S2_A2_1X1(res[11], res[10], b, a, u[5], v[5]); | |
} | |
void flint_mpn_mulhigh_7(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[6], u, v, 7); | |
NN_DOTREV_S3_A3_1X1(b, a, res[7], 0, b, a, u + 1, v + 1, 6); | |
NN_DOTREV_S3_A3_1X1(b, a, res[8], 0, b, a, u + 2, v + 2, 5); | |
NN_DOTREV_S3_A3_1X1(b, a, res[9], 0, b, a, u + 3, v + 3, 4); | |
NN_DOTREV_S3_A3_1X1(b, a, res[10], 0, b, a, u + 4, v + 4, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[11], 0, b, a, u + 5, v + 5, 2); | |
NN_ADDMUL_S2_A2_1X1(res[13], res[12], b, a, u[6], v[6]); | |
} | |
void flint_mpn_mulhigh_8(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[7], u, v, 8); | |
NN_DOTREV_S3_A3_1X1(b, a, res[8], 0, b, a, u + 1, v + 1, 7); | |
NN_DOTREV_S3_A3_1X1(b, a, res[9], 0, b, a, u + 2, v + 2, 6); | |
NN_DOTREV_S3_A3_1X1(b, a, res[10], 0, b, a, u + 3, v + 3, 5); | |
NN_DOTREV_S3_A3_1X1(b, a, res[11], 0, b, a, u + 4, v + 4, 4); | |
NN_DOTREV_S3_A3_1X1(b, a, res[12], 0, b, a, u + 5, v + 5, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[13], 0, b, a, u + 6, v + 6, 2); | |
NN_ADDMUL_S2_A2_1X1(res[15], res[14], b, a, u[7], v[7]); | |
} | |
void flint_mpn_mulhigh_9(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[8], u, v, 9); | |
NN_DOTREV_S3_A3_1X1(b, a, res[9], 0, b, a, u + 1, v + 1, 8); | |
NN_DOTREV_S3_A3_1X1(b, a, res[10], 0, b, a, u + 2, v + 2, 7); | |
NN_DOTREV_S3_A3_1X1(b, a, res[11], 0, b, a, u + 3, v + 3, 6); | |
NN_DOTREV_S3_A3_1X1(b, a, res[12], 0, b, a, u + 4, v + 4, 5); | |
NN_DOTREV_S3_A3_1X1(b, a, res[13], 0, b, a, u + 5, v + 5, 4); | |
NN_DOTREV_S3_A3_1X1(b, a, res[14], 0, b, a, u + 6, v + 6, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[15], 0, b, a, u + 7, v + 7, 2); | |
NN_ADDMUL_S2_A2_1X1(res[17], res[16], b, a, u[8], v[8]); | |
} | |
void flint_mpn_mulhigh_10(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[9], u, v, 10); | |
NN_DOTREV_S3_A3_1X1(b, a, res[10], 0, b, a, u + 1, v + 1, 9); | |
NN_DOTREV_S3_A3_1X1(b, a, res[11], 0, b, a, u + 2, v + 2, 8); | |
NN_DOTREV_S3_A3_1X1(b, a, res[12], 0, b, a, u + 3, v + 3, 7); | |
NN_DOTREV_S3_A3_1X1(b, a, res[13], 0, b, a, u + 4, v + 4, 6); | |
NN_DOTREV_S3_A3_1X1(b, a, res[14], 0, b, a, u + 5, v + 5, 5); | |
NN_DOTREV_S3_A3_1X1(b, a, res[15], 0, b, a, u + 6, v + 6, 4); | |
NN_DOTREV_S3_A3_1X1(b, a, res[16], 0, b, a, u + 7, v + 7, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[17], 0, b, a, u + 8, v + 8, 2); | |
NN_ADDMUL_S2_A2_1X1(res[19], res[18], b, a, u[9], v[9]); | |
} | |
void flint_mpn_mulhigh_11(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[10], u, v, 11); | |
NN_DOTREV_S3_A3_1X1(b, a, res[11], 0, b, a, u + 1, v + 1, 10); | |
NN_DOTREV_S3_A3_1X1(b, a, res[12], 0, b, a, u + 2, v + 2, 9); | |
NN_DOTREV_S3_A3_1X1(b, a, res[13], 0, b, a, u + 3, v + 3, 8); | |
NN_DOTREV_S3_A3_1X1(b, a, res[14], 0, b, a, u + 4, v + 4, 7); | |
NN_DOTREV_S3_A3_1X1(b, a, res[15], 0, b, a, u + 5, v + 5, 6); | |
NN_DOTREV_S3_A3_1X1(b, a, res[16], 0, b, a, u + 6, v + 6, 5); | |
NN_DOTREV_S3_A3_1X1(b, a, res[17], 0, b, a, u + 7, v + 7, 4); | |
NN_DOTREV_S3_A3_1X1(b, a, res[18], 0, b, a, u + 8, v + 8, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 9, v + 9, 2); | |
NN_ADDMUL_S2_A2_1X1(res[21], res[20], b, a, u[10], v[10]); | |
} | |
void flint_mpn_mulhigh_12(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[11], u, v, 12); | |
NN_DOTREV_S3_A3_1X1(b, a, res[12], 0, b, a, u + 1, v + 1, 11); | |
NN_DOTREV_S3_A3_1X1(b, a, res[13], 0, b, a, u + 2, v + 2, 10); | |
NN_DOTREV_S3_A3_1X1(b, a, res[14], 0, b, a, u + 3, v + 3, 9); | |
NN_DOTREV_S3_A3_1X1(b, a, res[15], 0, b, a, u + 4, v + 4, 8); | |
NN_DOTREV_S3_A3_1X1(b, a, res[16], 0, b, a, u + 5, v + 5, 7); | |
NN_DOTREV_S3_A3_1X1(b, a, res[17], 0, b, a, u + 6, v + 6, 6); | |
NN_DOTREV_S3_A3_1X1(b, a, res[18], 0, b, a, u + 7, v + 7, 5); | |
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 8, v + 8, 4); | |
NN_DOTREV_S3_A3_1X1(b, a, res[20], 0, b, a, u + 9, v + 9, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[21], 0, b, a, u + 10, v + 10, 2); | |
NN_ADDMUL_S2_A2_1X1(res[23], res[22], b, a, u[11], v[11]); | |
} | |
void flint_mpn_mulhigh_13(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[12], u, v, 13); | |
NN_DOTREV_S3_A3_1X1(b, a, res[13], 0, b, a, u + 1, v + 1, 12); | |
NN_DOTREV_S3_A3_1X1(b, a, res[14], 0, b, a, u + 2, v + 2, 11); | |
NN_DOTREV_S3_A3_1X1(b, a, res[15], 0, b, a, u + 3, v + 3, 10); | |
NN_DOTREV_S3_A3_1X1(b, a, res[16], 0, b, a, u + 4, v + 4, 9); | |
NN_DOTREV_S3_A3_1X1(b, a, res[17], 0, b, a, u + 5, v + 5, 8); | |
NN_DOTREV_S3_A3_1X1(b, a, res[18], 0, b, a, u + 6, v + 6, 7); | |
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 7, v + 7, 6); | |
NN_DOTREV_S3_A3_1X1(b, a, res[20], 0, b, a, u + 8, v + 8, 5); | |
NN_DOTREV_S3_A3_1X1(b, a, res[21], 0, b, a, u + 9, v + 9, 4); | |
NN_DOTREV_S3_A3_1X1(b, a, res[22], 0, b, a, u + 10, v + 10, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[23], 0, b, a, u + 11, v + 11, 2); | |
NN_ADDMUL_S2_A2_1X1(res[25], res[24], b, a, u[12], v[12]); | |
} | |
void flint_mpn_mulhigh_14(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[13], u, v, 14); | |
NN_DOTREV_S3_A3_1X1(b, a, res[14], 0, b, a, u + 1, v + 1, 13); | |
NN_DOTREV_S3_A3_1X1(b, a, res[15], 0, b, a, u + 2, v + 2, 12); | |
NN_DOTREV_S3_A3_1X1(b, a, res[16], 0, b, a, u + 3, v + 3, 11); | |
NN_DOTREV_S3_A3_1X1(b, a, res[17], 0, b, a, u + 4, v + 4, 10); | |
NN_DOTREV_S3_A3_1X1(b, a, res[18], 0, b, a, u + 5, v + 5, 9); | |
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 6, v + 6, 8); | |
NN_DOTREV_S3_A3_1X1(b, a, res[20], 0, b, a, u + 7, v + 7, 7); | |
NN_DOTREV_S3_A3_1X1(b, a, res[21], 0, b, a, u + 8, v + 8, 6); | |
NN_DOTREV_S3_A3_1X1(b, a, res[22], 0, b, a, u + 9, v + 9, 5); | |
NN_DOTREV_S3_A3_1X1(b, a, res[23], 0, b, a, u + 10, v + 10, 4); | |
NN_DOTREV_S3_A3_1X1(b, a, res[24], 0, b, a, u + 11, v + 11, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[25], 0, b, a, u + 12, v + 12, 2); | |
NN_ADDMUL_S2_A2_1X1(res[27], res[26], b, a, u[13], v[13]); | |
} | |
void flint_mpn_mulhigh_15(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[14], u, v, 15); | |
NN_DOTREV_S3_A3_1X1(b, a, res[15], 0, b, a, u + 1, v + 1, 14); | |
NN_DOTREV_S3_A3_1X1(b, a, res[16], 0, b, a, u + 2, v + 2, 13); | |
NN_DOTREV_S3_A3_1X1(b, a, res[17], 0, b, a, u + 3, v + 3, 12); | |
NN_DOTREV_S3_A3_1X1(b, a, res[18], 0, b, a, u + 4, v + 4, 11); | |
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 5, v + 5, 10); | |
NN_DOTREV_S3_A3_1X1(b, a, res[20], 0, b, a, u + 6, v + 6, 9); | |
NN_DOTREV_S3_A3_1X1(b, a, res[21], 0, b, a, u + 7, v + 7, 8); | |
NN_DOTREV_S3_A3_1X1(b, a, res[22], 0, b, a, u + 8, v + 8, 7); | |
NN_DOTREV_S3_A3_1X1(b, a, res[23], 0, b, a, u + 9, v + 9, 6); | |
NN_DOTREV_S3_A3_1X1(b, a, res[24], 0, b, a, u + 10, v + 10, 5); | |
NN_DOTREV_S3_A3_1X1(b, a, res[25], 0, b, a, u + 11, v + 11, 4); | |
NN_DOTREV_S3_A3_1X1(b, a, res[26], 0, b, a, u + 12, v + 12, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[27], 0, b, a, u + 13, v + 13, 2); | |
NN_ADDMUL_S2_A2_1X1(res[29], res[28], b, a, u[14], v[14]); | |
} | |
void flint_mpn_mulhigh_16(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[15], u, v, 16); | |
NN_DOTREV_S3_A3_1X1(b, a, res[16], 0, b, a, u + 1, v + 1, 15); | |
NN_DOTREV_S3_A3_1X1(b, a, res[17], 0, b, a, u + 2, v + 2, 14); | |
NN_DOTREV_S3_A3_1X1(b, a, res[18], 0, b, a, u + 3, v + 3, 13); | |
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 4, v + 4, 12); | |
NN_DOTREV_S3_A3_1X1(b, a, res[20], 0, b, a, u + 5, v + 5, 11); | |
NN_DOTREV_S3_A3_1X1(b, a, res[21], 0, b, a, u + 6, v + 6, 10); | |
NN_DOTREV_S3_A3_1X1(b, a, res[22], 0, b, a, u + 7, v + 7, 9); | |
NN_DOTREV_S3_A3_1X1(b, a, res[23], 0, b, a, u + 8, v + 8, 8); | |
NN_DOTREV_S3_A3_1X1(b, a, res[24], 0, b, a, u + 9, v + 9, 7); | |
NN_DOTREV_S3_A3_1X1(b, a, res[25], 0, b, a, u + 10, v + 10, 6); | |
NN_DOTREV_S3_A3_1X1(b, a, res[26], 0, b, a, u + 11, v + 11, 5); | |
NN_DOTREV_S3_A3_1X1(b, a, res[27], 0, b, a, u + 12, v + 12, 4); | |
NN_DOTREV_S3_A3_1X1(b, a, res[28], 0, b, a, u + 13, v + 13, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[29], 0, b, a, u + 14, v + 14, 2); | |
NN_ADDMUL_S2_A2_1X1(res[31], res[30], b, a, u[15], v[15]); | |
} | |
void flint_mpn_mulhigh_17(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[16], u, v, 17); | |
NN_DOTREV_S3_A3_1X1(b, a, res[17], 0, b, a, u + 1, v + 1, 16); | |
NN_DOTREV_S3_A3_1X1(b, a, res[18], 0, b, a, u + 2, v + 2, 15); | |
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 3, v + 3, 14); | |
NN_DOTREV_S3_A3_1X1(b, a, res[20], 0, b, a, u + 4, v + 4, 13); | |
NN_DOTREV_S3_A3_1X1(b, a, res[21], 0, b, a, u + 5, v + 5, 12); | |
NN_DOTREV_S3_A3_1X1(b, a, res[22], 0, b, a, u + 6, v + 6, 11); | |
NN_DOTREV_S3_A3_1X1(b, a, res[23], 0, b, a, u + 7, v + 7, 10); | |
NN_DOTREV_S3_A3_1X1(b, a, res[24], 0, b, a, u + 8, v + 8, 9); | |
NN_DOTREV_S3_A3_1X1(b, a, res[25], 0, b, a, u + 9, v + 9, 8); | |
NN_DOTREV_S3_A3_1X1(b, a, res[26], 0, b, a, u + 10, v + 10, 7); | |
NN_DOTREV_S3_A3_1X1(b, a, res[27], 0, b, a, u + 11, v + 11, 6); | |
NN_DOTREV_S3_A3_1X1(b, a, res[28], 0, b, a, u + 12, v + 12, 5); | |
NN_DOTREV_S3_A3_1X1(b, a, res[29], 0, b, a, u + 13, v + 13, 4); | |
NN_DOTREV_S3_A3_1X1(b, a, res[30], 0, b, a, u + 14, v + 14, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[31], 0, b, a, u + 15, v + 15, 2); | |
NN_ADDMUL_S2_A2_1X1(res[33], res[32], b, a, u[16], v[16]); | |
} | |
void flint_mpn_mulhigh_18(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[17], u, v, 18); | |
NN_DOTREV_S3_A3_1X1(b, a, res[18], 0, b, a, u + 1, v + 1, 17); | |
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 2, v + 2, 16); | |
NN_DOTREV_S3_A3_1X1(b, a, res[20], 0, b, a, u + 3, v + 3, 15); | |
NN_DOTREV_S3_A3_1X1(b, a, res[21], 0, b, a, u + 4, v + 4, 14); | |
NN_DOTREV_S3_A3_1X1(b, a, res[22], 0, b, a, u + 5, v + 5, 13); | |
NN_DOTREV_S3_A3_1X1(b, a, res[23], 0, b, a, u + 6, v + 6, 12); | |
NN_DOTREV_S3_A3_1X1(b, a, res[24], 0, b, a, u + 7, v + 7, 11); | |
NN_DOTREV_S3_A3_1X1(b, a, res[25], 0, b, a, u + 8, v + 8, 10); | |
NN_DOTREV_S3_A3_1X1(b, a, res[26], 0, b, a, u + 9, v + 9, 9); | |
NN_DOTREV_S3_A3_1X1(b, a, res[27], 0, b, a, u + 10, v + 10, 8); | |
NN_DOTREV_S3_A3_1X1(b, a, res[28], 0, b, a, u + 11, v + 11, 7); | |
NN_DOTREV_S3_A3_1X1(b, a, res[29], 0, b, a, u + 12, v + 12, 6); | |
NN_DOTREV_S3_A3_1X1(b, a, res[30], 0, b, a, u + 13, v + 13, 5); | |
NN_DOTREV_S3_A3_1X1(b, a, res[31], 0, b, a, u + 14, v + 14, 4); | |
NN_DOTREV_S3_A3_1X1(b, a, res[32], 0, b, a, u + 15, v + 15, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[33], 0, b, a, u + 16, v + 16, 2); | |
NN_ADDMUL_S2_A2_1X1(res[35], res[34], b, a, u[17], v[17]); | |
} | |
void flint_mpn_mulhigh_19(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
mp_limb_t a, b; | |
NN_DOTREV_S3_1X1(b, a, res[18], u, v, 19); | |
NN_DOTREV_S3_A3_1X1(b, a, res[19], 0, b, a, u + 1, v + 1, 18); | |
NN_DOTREV_S3_A3_1X1(b, a, res[20], 0, b, a, u + 2, v + 2, 17); | |
NN_DOTREV_S3_A3_1X1(b, a, res[21], 0, b, a, u + 3, v + 3, 16); | |
NN_DOTREV_S3_A3_1X1(b, a, res[22], 0, b, a, u + 4, v + 4, 15); | |
NN_DOTREV_S3_A3_1X1(b, a, res[23], 0, b, a, u + 5, v + 5, 14); | |
NN_DOTREV_S3_A3_1X1(b, a, res[24], 0, b, a, u + 6, v + 6, 13); | |
NN_DOTREV_S3_A3_1X1(b, a, res[25], 0, b, a, u + 7, v + 7, 12); | |
NN_DOTREV_S3_A3_1X1(b, a, res[26], 0, b, a, u + 8, v + 8, 11); | |
NN_DOTREV_S3_A3_1X1(b, a, res[27], 0, b, a, u + 9, v + 9, 10); | |
NN_DOTREV_S3_A3_1X1(b, a, res[28], 0, b, a, u + 10, v + 10, 9); | |
NN_DOTREV_S3_A3_1X1(b, a, res[29], 0, b, a, u + 11, v + 11, 8); | |
NN_DOTREV_S3_A3_1X1(b, a, res[30], 0, b, a, u + 12, v + 12, 7); | |
NN_DOTREV_S3_A3_1X1(b, a, res[31], 0, b, a, u + 13, v + 13, 6); | |
NN_DOTREV_S3_A3_1X1(b, a, res[32], 0, b, a, u + 14, v + 14, 5); | |
NN_DOTREV_S3_A3_1X1(b, a, res[33], 0, b, a, u + 15, v + 15, 4); | |
NN_DOTREV_S3_A3_1X1(b, a, res[34], 0, b, a, u + 16, v + 16, 3); | |
NN_DOTREV_S3_A3_1X1(b, a, res[35], 0, b, a, u + 17, v + 17, 2); | |
NN_ADDMUL_S2_A2_1X1(res[37], res[36], b, a, u[18], v[18]); | |
} | |
void | |
flint_mpn_mulhigh_n_classical(mp_ptr res, mp_srcptr u, mp_srcptr v, mp_size_t n) | |
{ | |
slong i; | |
res += n - 1; | |
umul_ppmm(res[1], res[0], u[n - 1], v[0]); | |
for (i = 2 ; i <= n ; i++) | |
res[i] = mpn_addmul_1(res, u + n - i, i, v[i - 1]); | |
} | |
void flint_mpn_mulhigh_n_basecase(mp_ptr rp, mp_srcptr np, mp_srcptr mp, mp_size_t n) | |
{ | |
switch (n) | |
{ | |
case 0: FLINT_UNREACHABLE; | |
case 1: flint_mpn_mulhigh_1(rp, np, mp); break; | |
case 2: flint_mpn_mulhigh_2(rp, np, mp); break; | |
case 3: flint_mpn_mulhigh_3(rp, np, mp); break; | |
case 4: flint_mpn_mulhigh_4(rp, np, mp); break; | |
case 5: flint_mpn_mulhigh_5(rp, np, mp); break; | |
case 6: flint_mpn_mulhigh_6(rp, np, mp); break; | |
case 7: flint_mpn_mulhigh_7(rp, np, mp); break; | |
case 8: flint_mpn_mulhigh_8(rp, np, mp); break; | |
case 9: flint_mpn_mulhigh_9(rp, np, mp); break; | |
case 10: flint_mpn_mulhigh_10(rp, np, mp); break; | |
case 11: flint_mpn_mulhigh_11(rp, np, mp); break; | |
case 12: flint_mpn_mulhigh_12(rp, np, mp); break; | |
case 13: flint_mpn_mulhigh_13(rp, np, mp); break; | |
case 14: flint_mpn_mulhigh_14(rp, np, mp); break; | |
case 15: flint_mpn_mulhigh_15(rp, np, mp); break; | |
case 16: flint_mpn_mulhigh_16(rp, np, mp); break; | |
case 17: flint_mpn_mulhigh_17(rp, np, mp); break; | |
case 18: flint_mpn_mulhigh_18(rp, np, mp); break; | |
case 19: flint_mpn_mulhigh_19(rp, np, mp); break; | |
default: flint_mpn_mulhigh_n_classical(rp, np, mp, n); break; | |
} | |
} | |
#if 0 | |
FLINT_FORCE_INLINE | |
void flint_mpn_mulhigh_1(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
umul_ppmm(res[1], res[0], u[0], v[0]); | |
} | |
FLINT_FORCE_INLINE | |
void flint_mpn_mulhigh_2(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
umul_ppmm(res[1], res[0], u[1], v[0]); | |
res[2] = flint_mpn_inline_addmul_1(res, u + 0, 2, v[1]); | |
} | |
FLINT_FORCE_INLINE | |
void flint_mpn_mulhigh_3(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
umul_ppmm(res[1], res[0], u[2], v[0]); | |
res[2] = flint_mpn_inline_addmul_1(res, u + 1, 2, v[1]); | |
res[3] = flint_mpn_inline_addmul_1(res, u + 0, 3, v[2]); | |
} | |
FLINT_FORCE_INLINE | |
void flint_mpn_mulhigh_4(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
umul_ppmm(res[1], res[0], u[3], v[0]); | |
res[2] = flint_mpn_inline_addmul_1(res, u + 2, 2, v[1]); | |
res[3] = flint_mpn_inline_addmul_1(res, u + 1, 3, v[2]); | |
res[4] = flint_mpn_inline_addmul_1(res, u + 0, 4, v[3]); | |
} | |
void flint_mpn_mulhigh_5(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
umul_ppmm(res[1], res[0], u[4], v[0]); | |
res[2] = flint_mpn_inline_addmul_1(res, u + 3, 2, v[1]); | |
res[3] = flint_mpn_inline_addmul_1(res, u + 2, 3, v[2]); | |
res[4] = flint_mpn_inline_addmul_1(res, u + 1, 4, v[3]); | |
res[5] = flint_mpn_inline_addmul_1(res, u + 0, 5, v[4]); | |
} | |
void flint_mpn_mulhigh_6a(mp_ptr res, mp_srcptr u, mp_srcptr v) | |
{ | |
umul_ppmm(res[1], res[0], u[5], v[0]); | |
res[2] = flint_mpn_inline_addmul_1(res, u + 4, 2, v[1]); | |
res[3] = flint_mpn_inline_addmul_1(res, u + 3, 3, v[2]); | |
res[4] = flint_mpn_inline_addmul_1(res, u + 2, 4, v[3]); | |
res[5] = flint_mpn_inline_addmul_1(res, u + 1, 5, v[4]); | |
res[6] = flint_mpn_inline_addmul_1(res, u + 0, 6, v[5]); | |
} | |
#endif | |
/* Tuning values against GMP 6.3.0 and fft_small on Zen3. */ | |
/* Table stores k / 4 so that entries can fit in a byte. */ | |
#define KTAB_LEN 800 | |
static unsigned char flint_mpn_mulhigh_ktab[KTAB_LEN] = { | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
0, 0, 12, 0, 0, 0, 0, 0, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 12, 12, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, | |
14, 14, 14, 14, 16, 14, 14, 16, 16, 16, 16, 16, 16, 17, 19, 16, 19, 19, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 19, | |
20, 19, 22, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 27, 24, 24, 24, 24, 25, 24, 27, 24, 24, 27, 24, 27, 27, 24, 27, 27, 27, 27, | |
27, 27, 26, 27, 24, 27, 27, 27, 27, 26, 27, 29, 28, 29, 28, 27, 27, 27, 29, 29, 30, 28, 27, 27, 27, 28, 28, 27, 33, 27, 27, 33, | |
28, 34, 36, 30, 29, 35, 36, 37, 35, 36, 35, 35, 36, 37, 36, 37, 36, 36, 37, 37, 36, 37, 38, 37, 37, 34, 39, 38, 36, 37, 37, 38, | |
37, 36, 37, 37, 36, 37, 37, 38, 37, 38, 38, 37, 36, 38, 36, 35, 36, 37, 43, 47, 47, 47, 39, 47, 47, 38, 47, 47, 47, 47, 43, 47, | |
47, 47, 46, 47, 47, 47, 43, 47, 47, 47, 43, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 51, 47, 47, 47, 47, 47, 47, 47, 55, 47, 55, | |
51, 55, 46, 47, 51, 51, 51, 51, 55, 51, 51, 51, 55, 47, 51, 51, 47, 47, 55, 55, 55, 47, 55, 54, 51, 55, 55, 54, 63, 59, 55, 59, | |
55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 64, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 67, 55, 64, 67, 64, 67, 67, 63, | |
67, 66, 59, 66, 67, 67, 66, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 63, 67, 67, 67, 67, 67, 67, 71, 71, 71, 67, 67, 72, 67, | |
67, 67, 67, 71, 67, 75, 72, 75, 75, 75, 71, 76, 75, 76, 75, 71, 71, 75, 82, 75, 82, 75, 75, 75, 76, 82, 75, 75, 82, 75, 82, 82, | |
76, 82, 75, 82, 82, 82, 82, 82, 76, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, | |
82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 82, 94, 110, 108, 112, 109, 110, 112, 108, 111, 109, | |
112, 110, 111, 112, 113, 110, 112, 112, 110, 113, 114, 115, 115, 113, 112, 117, 115, 118, 115, 118, 116, 119, 115, 119, 117, 120, 120, 118, 119, 119, 120, 119, | |
118, 119, 120, 119, 119, 118, 120, 120, 119, 120, 120, 120, 120, 120, 119, 119, 120, 119, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, | |
120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, | |
120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 141, 120, 140, 136, 140, 140, 139, 138, 139, 140, 141, 140, 144, 141, 142, 144, 144, 144, | |
141, 144, 144, 144, 144, 142, 144, 144, 144, 144, 144, 144, 144, 143, 143, 144, 143, 144, 144, 144, 144, 144, 144, 144, 144, 143, 144, 144, 144, 144, 144, 143, | |
144, 144, 143, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 141, 144, 144, 144, 144, 144, 144, 144, 144, 144, | |
144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 144, 142, 143, 144, 144, 144, 144, 142, 144, 143, 143, 144, 144, 144, 140, 141, 141, 141, | |
144, 143, 139, 143, 142, 144, 139, 142, 139, 141, 140, 141, 144, 142, 142, 143, 143, 144, 143, 144, 143, 144, 141, 143, 143, 143, 141, 142, 144, 143, 142, 144, | |
140, 144, 143, 140, 142, 141, 141, 144, 142, 142, 144, 143, 144, 143, 142, 142, 142, 142, 143, 143, 143, 144, 144, 144, 140, 141, 142, 140, 143, 142, 142, 144, | |
144, 144, 144, 255, 255, 255, 255, 255, 255, 143, 255, 255, 255, 255, 255, 194, 255, 194, 193, 193, 193, 194, 195, 194, 193, 195, 255, 194, 194, 193, 194, 195, | |
}; | |
void flint_mpn_mulhigh_n(mp_ptr rp, mp_srcptr np, mp_srcptr mp, mp_size_t n); | |
void | |
flint_mpn_mulhigh_n_recursive(mp_ptr res, mp_srcptr u, mp_srcptr v, mp_size_t n, int k) | |
{ | |
mp_limb_t cy; | |
mp_size_t l; | |
if (k == 0) | |
{ | |
flint_mpn_mulhigh_n_basecase(res, u, v, n); | |
return; | |
} | |
else if (k == 255) | |
{ | |
flint_mpn_mul_n(res, u, v, n); | |
return; | |
} | |
/* Required for correctness (see MPFR). */ | |
k = FLINT_MAX(k, (n + 4) / 2); | |
k = FLINT_MIN(k, n - 1); | |
l = n - k; | |
flint_mpn_mul_n(res + 2 * l, u + l, v + l, k); | |
flint_mpn_mulhigh_n(res, u + k, v, l); | |
cy = mpn_add_n(res + n - 1, res + n - 1, res + l - 1, l + 1); | |
flint_mpn_mulhigh_n(res, u, v + k, l); | |
cy += mpn_add_n(res + n - 1, res + n - 1, res + l - 1, l + 1); | |
mpn_add_1(res + n + l, res + n + l, k, cy); | |
} | |
void | |
flint_mpn_mulhigh_n(mp_ptr res, mp_srcptr u, mp_srcptr v, mp_size_t n) | |
{ | |
int k; | |
switch (n) | |
{ | |
case 1: flint_mpn_mulhigh_1(res, u, v); break; | |
case 2: flint_mpn_mulhigh_2(res, u, v); break; | |
case 3: flint_mpn_mulhigh_3(res, u, v); break; | |
case 4: flint_mpn_mulhigh_4(res, u, v); break; | |
case 5: flint_mpn_mulhigh_5(res, u, v); break; | |
case 6: flint_mpn_mulhigh_6(res, u, v); break; | |
case 7: flint_mpn_mulhigh_7(res, u, v); break; | |
case 8: flint_mpn_mulhigh_8(res, u, v); break; | |
case 9: flint_mpn_mulhigh_9(res, u, v); break; | |
case 10: flint_mpn_mulhigh_10(res, u, v); break; | |
case 11: flint_mpn_mulhigh_11(res, u, v); break; | |
case 12: flint_mpn_mulhigh_12(res, u, v); break; | |
case 13: flint_mpn_mulhigh_13(res, u, v); break; | |
case 14: flint_mpn_mulhigh_14(res, u, v); break; | |
case 15: flint_mpn_mulhigh_15(res, u, v); break; | |
case 16: flint_mpn_mulhigh_16(res, u, v); break; | |
case 17: flint_mpn_mulhigh_17(res, u, v); break; | |
case 18: flint_mpn_mulhigh_18(res, u, v); break; | |
case 19: flint_mpn_mulhigh_19(res, u, v); break; | |
default: | |
{ | |
if (n < KTAB_LEN) | |
{ | |
k = (int) flint_mpn_mulhigh_ktab[n]; | |
if (k != 255) | |
k *= 4; | |
if (k == 0) | |
flint_mpn_mulhigh_n_basecase(res, u, v, n); | |
else | |
flint_mpn_mulhigh_n_recursive(res, u, v, n, k); | |
} | |
else | |
{ | |
flint_mpn_mul_n(res, u, v, n); | |
} | |
} | |
} | |
} | |
void mpfr_mulhigh_n (mp_ptr rp, mp_srcptr np, mp_srcptr mp, mp_size_t n); | |
void mpfr_sqrhigh_n(mp_ptr rp, mp_srcptr np, mp_size_t n); | |
#undef TIMEIT_END_REPEAT | |
#undef TIMEIT_STOP_VALUES | |
#define TIMEIT_END_REPEAT(__timer, __reps) \ | |
} \ | |
timeit_stop(__timer); \ | |
if (__timer->cpu >= 10) \ | |
break; \ | |
__reps *= 10; \ | |
} \ | |
} while (0); | |
#define TIMEIT_STOP_VALUES(tcpu, twall) \ | |
TIMEIT_END_REPEAT(__timer, __reps) \ | |
(tcpu) = __timer->cpu*0.001 / __reps; \ | |
(twall) = __timer->wall*0.001 / __reps; \ | |
} while (0); | |
slong next_r(slong r) | |
{ | |
if (r <= 15) | |
return r + 1; | |
if ((r & (r - 1)) == 0) | |
r = (3 * r) / 2; | |
else | |
r = (4 * r) / 3; | |
return r; | |
} | |
int main() | |
{ | |
int iter; | |
flint_rand_t state; | |
flint_randinit(state); | |
for (iter = 0; iter < 100000; iter++) | |
{ | |
slong i, N; | |
mp_ptr X, Y, R1, R2, ERR, TOL; | |
if (n_randint(state, 100) == 0) | |
N = 1 + n_randtest(state) % 1000; | |
else if (n_randint(state, 10) == 0) | |
N = 1 + n_randtest(state) % 100; | |
else | |
N = 1 + n_randtest(state) % 20; | |
X = flint_malloc(sizeof(mp_limb_t) * N); | |
Y = flint_malloc(sizeof(mp_limb_t) * N); | |
R1 = flint_malloc(sizeof(mp_limb_t) * 2 * N); | |
R2 = flint_malloc(sizeof(mp_limb_t) * 2 * N); | |
ERR = flint_malloc(sizeof(mp_limb_t) * (N + 1)); | |
TOL = flint_malloc(sizeof(mp_limb_t) * (N + 1)); | |
for (i = 0; i < N; i++) | |
{ | |
X[i] = n_randtest(state); | |
Y[i] = n_randtest(state); | |
} | |
for (i = 0; i < 2 * N; i++) | |
R1[i] = n_randtest(state); | |
flint_mpn_mulhigh_n(R1, X, Y, N); | |
flint_mpn_mul_n(R2, X, Y, N); | |
ERR[N] = mpn_sub_n(ERR, R2 + N, R1 + N, N); | |
mpn_zero(TOL + 1, N); | |
TOL[0] = N - 1; | |
if (mpn_cmp(ERR, TOL, N + 1) > 0) | |
{ | |
flint_printf("FAIL: N = %wd\n", N); | |
flint_printf("X = "); flint_mpn_debug(X, N); | |
flint_printf("Y = "); flint_mpn_debug(Y, N); | |
flint_printf("R1 = "); flint_mpn_debug(R1, 2 * N); | |
flint_printf("R2 = "); flint_mpn_debug(R2, 2 * N); | |
flint_printf("ERR = "); flint_mpn_debug(ERR, N + 1); | |
flint_printf("TOL = "); flint_mpn_debug(TOL, N + 1); | |
flint_abort(); | |
} | |
flint_free(X); | |
flint_free(Y); | |
flint_free(R1); | |
flint_free(R2); | |
flint_free(ERR); | |
flint_free(TOL); | |
} | |
flint_printf("OK\n"); | |
{ | |
slong i, n; | |
mp_limb_t x[4000], y[2000], z[2000]; | |
for (i = 0; i < 2000; i++) | |
{ | |
x[i] = n_randtest(state); | |
y[i] = n_randtest(state); | |
z[i] = n_randtest(state); | |
} | |
flint_printf("\n"); | |
flint_printf("n, flint_mpn_mul_n, flint_mpn_mul_n / mpfr, flint_mpn_mul_n / flint_mpn_mulhigh_n\n"); | |
for (n = 1; n <= 2000; n = next_r(n)) | |
{ | |
double t1, t2, t3, tt, __attribute__((unused)) __; | |
TIMEIT_START | |
flint_mpn_mul_n(z, x, y, n); | |
TIMEIT_STOP_VALUES(__, t1) | |
TIMEIT_START | |
flint_mpn_mul_n(z, x, y, n); | |
TIMEIT_STOP_VALUES(__, tt) | |
t1 = FLINT_MIN(t1, tt); | |
TIMEIT_START | |
mpfr_mulhigh_n(z, x, y, n); | |
TIMEIT_STOP_VALUES(__, t2) | |
TIMEIT_START | |
mpfr_mulhigh_n(z, x, y, n); | |
TIMEIT_STOP_VALUES(__, tt) | |
t2 = FLINT_MIN(t2, tt); | |
TIMEIT_START | |
flint_mpn_mulhigh_n(z, x, y, n); | |
TIMEIT_STOP_VALUES(__, t3) | |
TIMEIT_START | |
flint_mpn_mulhigh_n(z, x, y, n); | |
TIMEIT_STOP_VALUES(__, tt) | |
t3 = FLINT_MIN(t3, tt); | |
flint_printf("%wd %g %.3f %.3f %s\n", n, t1, t1 / t2, t1 / t3, (t3 < 1.01 * t2 ? " " : "!!!!!")); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment