mpn intrinsics
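/* Benchmark of several mpn_addmul_1 variants (rp[] += up[] * v0 over n limbs)
   built from different multiply and add-with-carry primitives, compared
   against GMP's mpn_addmul_1 using FLINT's profiler.

   Build sketch (an assumption, not part of the original gist; the file name
   and exact flags will vary by system):
       gcc -O2 -march=native thisfile.c -lflint -lgmp */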
#include <x86intrin.h>
#include "flint.h"
#include "longlong.h"
#include "ulong_extras.h"
#include "mpn_extras.h"
#include "profiler.h"

#define umul_ppmm_mulx(w1, w0, u, v) \
  __asm__ ("mulx\t%3, %q0, %q1" \
           : "=r" (w0), "=r" (w1) \
           : "%d" ((ulong)(u)), "rm" ((ulong)(v)))

#define umul_ppmm_int128(r1, r0, u, v) \
  do { \
      __uint128_t __rh = (__uint128_t) (u) * (__uint128_t) (v); \
      (r0) = (ulong) __rh; \
      (r1) = (ulong) (__rh >> 64); \
  } while (0)
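/* Illustrative sanity check (a minimal sketch, not part of the original gist):
   both macros split the full 128-bit product u*v into a high and a low limb,
   so for u = v = 2^32 the high limb is 1 and the low limb is 0. */
FLINT_FORCE_INLINE void sanity_check_umul_ppmm_variants(void)
{
    ulong hi, lo;

    umul_ppmm_int128(hi, lo, UWORD(1) << 32, UWORD(1) << 32);
    if (hi != UWORD(1) || lo != UWORD(0))
        flint_abort();

    umul_ppmm_mulx(hi, lo, UWORD(1) << 32, UWORD(1) << 32);
    if (hi != UWORD(1) || lo != UWORD(0))
        flint_abort();
}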
FLINT_FORCE_INLINE unsigned char n_addc_intrinsic(unsigned char cf, ulong x, ulong y, ulong* s)
{
    long long unsigned int _s;
    cf = _addcarry_u64(cf, (long long unsigned int) (x),
                           (long long unsigned int) (y),
                           &_s);
    *s = (ulong) (_s);
    return cf;
}

#if 1
/* code suggested on https://gcc.gnu.org/onlinedocs/gcc/Integer-Overflow-Builtins.html */
#define __builtin_addcl(a, b, carry_in, carry_out) \
  ({ __typeof__ (a) s; \
     __typeof__ (a) c1 = __builtin_add_overflow (a, b, &s); \
     __typeof__ (a) c2 = __builtin_add_overflow (s, carry_in, &s); \
     *(carry_out) = c1 | c2; \
     s; })
#endif

FLINT_FORCE_INLINE unsigned char n_addc_builtin(unsigned char cf, ulong x, ulong y, ulong* s)
{
    ulong cy;
    *s = __builtin_addcl(x, y, cf, &cy);
    return cy;
}
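/* Illustrative use of the carry-chaining helpers (a minimal sketch, not used
   by the benchmark): add two 2-limb numbers (a1:a0) + (b1:b0), propagating
   the carry out of the low limbs into the high limbs and returning the final
   carry-out. n_addc_builtin could be substituted for n_addc_intrinsic. */
FLINT_FORCE_INLINE unsigned char
add_2limb_sketch(ulong a1, ulong a0, ulong b1, ulong b0, ulong* s1, ulong* s0)
{
    unsigned char cf;
    cf = n_addc_intrinsic(0, a0, b0, s0);   /* low limbs, no carry in */
    cf = n_addc_intrinsic(cf, a1, b1, s1);  /* high limbs, carry from below */
    return cf;
}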
/* generic version from GMP */
mp_limb_t
mpn_addmul_1_a(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
    mp_limb_t u0, crec, c, p1, p0, r0;

    crec = 0;
    do
    {
        u0 = *up++;
        umul_ppmm(p1, p0, u0, v0);

        r0 = *rp;

        p0 = r0 + p0;       /* add low product limb to the limb of rp */
        c = r0 > p0;        /* unsigned wraparound means a carry into p1 */
        p1 = p1 + c;

        r0 = p0 + crec;     /* add the carry limb from the previous iteration */
        c = p0 > r0;
        crec = p1 + c;      /* carry limb for the next iteration */
        *rp++ = r0;
    }
    while (--n != 0);

    return crec;
}
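/* The variants below (_b through _i) keep the same rp[] += up[] * v0 loop
   structure and only swap out the multiply primitive (umul_ppmm, mulx inline
   asm, or __uint128_t) and, from _d onward, the add-with-carry primitive
   (comparison trick, _addcarry_u64 intrinsic, or __builtin_add_overflow). */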
mp_limb_t
mpn_addmul_1_b(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
    mp_limb_t u0, crec, c, p1, p0, r0;

    crec = 0;
    do
    {
        u0 = *up++;
        umul_ppmm_mulx(p1, p0, u0, v0);

        r0 = *rp;

        p0 = r0 + p0;
        c = r0 > p0;
        p1 = p1 + c;

        r0 = p0 + crec;
        c = p0 > r0;
        crec = p1 + c;
        *rp++ = r0;
    }
    while (--n != 0);

    return crec;
}

mp_limb_t
mpn_addmul_1_c(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
    mp_limb_t u0, crec, c, p1, p0, r0;

    crec = 0;
    do
    {
        u0 = *up++;
        umul_ppmm_int128(p1, p0, u0, v0);

        r0 = *rp;

        p0 = r0 + p0;
        c = r0 > p0;
        p1 = p1 + c;

        r0 = p0 + crec;
        c = p0 > r0;
        crec = p1 + c;
        *rp++ = r0;
    }
    while (--n != 0);

    return crec;
}

mp_limb_t
mpn_addmul_1_d(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
    mp_limb_t u0, crec, c, p1, p0, r0;
    slong i;

    crec = 0;

    for (i = 0; i < n; i++)
    {
        u0 = up[i];
        umul_ppmm(p1, p0, u0, v0);
        r0 = rp[i];
        c = n_addc_intrinsic(0, r0, p0, &p0);
        p1 += c;
        c = n_addc_intrinsic(0, p0, crec, &r0);
        crec = p1 + c;
        rp[i] = r0;
    }

    return crec;
}

mp_limb_t
mpn_addmul_1_e(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
    mp_limb_t u0, crec, c, p1, p0, r0;
    slong i;

    crec = 0;

    for (i = 0; i < n; i++)
    {
        u0 = up[i];
        umul_ppmm_mulx(p1, p0, u0, v0);
        r0 = rp[i];
        c = n_addc_intrinsic(0, r0, p0, &p0);
        p1 += c;
        c = n_addc_intrinsic(0, p0, crec, &r0);
        crec = p1 + c;
        rp[i] = r0;
    }

    return crec;
}

mp_limb_t
mpn_addmul_1_f(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
    mp_limb_t u0, crec, c, p1, p0, r0;
    slong i;

    crec = 0;

    for (i = 0; i < n; i++)
    {
        u0 = up[i];
        umul_ppmm_int128(p1, p0, u0, v0);
        r0 = rp[i];
        c = n_addc_intrinsic(0, r0, p0, &p0);
        p1 += c;
        c = n_addc_intrinsic(0, p0, crec, &r0);
        crec = p1 + c;
        rp[i] = r0;
    }

    return crec;
}

mp_limb_t
mpn_addmul_1_g(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
    mp_limb_t u0, crec, c, p1, p0, r0;
    slong i;

    crec = 0;

    for (i = 0; i < n; i++)
    {
        u0 = up[i];
        umul_ppmm(p1, p0, u0, v0);
        r0 = rp[i];
        c = n_addc_builtin(0, r0, p0, &p0);
        p1 += c;
        c = n_addc_builtin(0, p0, crec, &r0);
        crec = p1 + c;
        rp[i] = r0;
    }

    return crec;
}

mp_limb_t
mpn_addmul_1_h(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
    mp_limb_t u0, crec, c, p1, p0, r0;
    slong i;

    crec = 0;

    for (i = 0; i < n; i++)
    {
        u0 = up[i];
        umul_ppmm_mulx(p1, p0, u0, v0);
        r0 = rp[i];
        c = n_addc_builtin(0, r0, p0, &p0);
        p1 += c;
        c = n_addc_builtin(0, p0, crec, &r0);
        crec = p1 + c;
        rp[i] = r0;
    }

    return crec;
}

mp_limb_t
mpn_addmul_1_i(mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
{
    mp_limb_t u0, crec, c, p1, p0, r0;
    slong i;

    crec = 0;

    for (i = 0; i < n; i++)
    {
        u0 = up[i];
        umul_ppmm_int128(p1, p0, u0, v0);
        r0 = rp[i];
        c = n_addc_builtin(0, r0, p0, &p0);
        p1 += c;
        c = n_addc_builtin(0, p0, crec, &r0);
        crec = p1 + c;
        rp[i] = r0;
    }

    return crec;
}
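/* Benchmark driver: for a range of lengths N, repeatedly time GMP's
   mpn_addmul_1 and each of the variants above, printing each variant's wall
   time relative to GMP (values below 1.0 mean the variant is faster).
   Feeding the accumulators c and d back into every call prevents the
   benchmarked calls from being optimised away. */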
#define NMAX 256

int main()
{
    mp_limb_t X[NMAX], Y[NMAX], Z[NMAX], d, c = 0;
    slong i, N;
    double tcpu, twall, twall_gmp;
    flint_rand_t state;

    flint_randinit(state);

    for (i = 0; i < NMAX; i++)
    {
        X[i] = n_randlimb(state);
        Y[i] = n_randlimb(state);
        Z[i] = n_randlimb(state);
    }

    d = n_randlimb(state);

    for (N = 1; N <= NMAX; N = FLINT_MAX(N + 1, N + N / 2))
    {
        // N = 1;
        for (i = 0; i < 5; i++)
        {
            flint_printf("%5wd", N);

            TIMEIT_START
            c += mpn_addmul_1(Y, X, N, d);
            d ^= c;
            TIMEIT_STOP_VALUES(tcpu, twall_gmp)

            TIMEIT_START
            c += mpn_addmul_1_a(Y, X, N, d);
            d ^= c;
            TIMEIT_STOP_VALUES(tcpu, twall)
            flint_printf(" %6.3g", twall / twall_gmp);

            TIMEIT_START
            c += mpn_addmul_1_b(Y, X, N, d);
            d ^= c;
            TIMEIT_STOP_VALUES(tcpu, twall)
            flint_printf(" %6.3g", twall / twall_gmp);

            TIMEIT_START
            c += mpn_addmul_1_c(Y, X, N, d);
            d ^= c;
            TIMEIT_STOP_VALUES(tcpu, twall)
            flint_printf(" %6.3g", twall / twall_gmp);

            TIMEIT_START
            c += mpn_addmul_1_d(Y, X, N, d);
            d ^= c;
            TIMEIT_STOP_VALUES(tcpu, twall)
            flint_printf(" %6.3g", twall / twall_gmp);

            TIMEIT_START
            c += mpn_addmul_1_e(Y, X, N, d);
            d ^= c;
            TIMEIT_STOP_VALUES(tcpu, twall)
            flint_printf(" %6.3g", twall / twall_gmp);

            TIMEIT_START
            c += mpn_addmul_1_f(Y, X, N, d);
            d ^= c;
            TIMEIT_STOP_VALUES(tcpu, twall)
            flint_printf(" %6.3g", twall / twall_gmp);

            TIMEIT_START
            c += mpn_addmul_1_g(Y, X, N, d);
            d ^= c;
            TIMEIT_STOP_VALUES(tcpu, twall)
            flint_printf(" %6.3g", twall / twall_gmp);

            TIMEIT_START
            c += mpn_addmul_1_h(Y, X, N, d);
            d ^= c;
            TIMEIT_STOP_VALUES(tcpu, twall)
            flint_printf(" %6.3g", twall / twall_gmp);

            TIMEIT_START
            c += mpn_addmul_1_i(Y, X, N, d);
            d ^= c;
            TIMEIT_STOP_VALUES(tcpu, twall)
            flint_printf(" %6.3g", twall / twall_gmp);

            flint_printf("\n");
        }
    }

    flint_printf("c = %wu\n", c);

    flint_randclear(state);
    return 0;
}