fredrik-johansson · March 7, 2024 19:54
diff --git a/taylortest.c b/taylortest.c
 #include <math.h>
 #include "longlong.h"
 #include "ulong_extras.h"
 #include "mpn_extras.h"
 #include "arf.h"
 #include "arb.h"
 #include "profiler.h"

 /* util: print fixed-point number given by n limbs where fracn limbs are fractional. */
 void
 flint_mpn_print_fixed(mp_srcptr x, mp_size_t n, mp_size_t fracn)
 {
    if (n == 0)
    {
        flint_printf("0");
    }
    else
    {
        arf_t y;
        fmpz_t t;
        fmpz_init(t);
        slong d, fracd, dmag;

        arf_init(y);
        fmpz_init(t);
        fmpz_set_ui_array(t, x, n);

        arf_set_fmpz(y, t);
        arf_mul_2exp_si(y, y, -fracn * FLINT_BITS);
        dmag = log10(arf_get_d(y, ARF_RND_DOWN));
        fracd = fracn * (FLINT_BITS * 0.3010299956639812);
        d = FLINT_MAX(1, dmag + fracd);
        arf_printd(y, d);

        fmpz_clear(t);
        arf_clear(y);
    }

    flint_printf("\n");
 }

 /* c_n = (20! / n!) */
 ulong cs[] = {
    UWORD(2432902008176640000),
    UWORD(2432902008176640000),
    UWORD(1216451004088320000),
    UWORD(405483668029440000),
    UWORD(101370917007360000),
    UWORD(20274183401472000),
    UWORD(3379030566912000),
    UWORD(482718652416000),
    UWORD(60339831552000),
    UWORD(6704425728000),
    UWORD(670442572800),
    UWORD(60949324800),
    UWORD(5079110400),
    UWORD(390700800),
    UWORD(27907200),
    UWORD(1860480),
    UWORD(116280),
    UWORD(6840),
    UWORD(380),
    UWORD(20),
    UWORD(1),
 };

 mp_limb_t __gmpn_preinv_divrem_1(mp_ptr qp, mp_size_t fn, mp_srcptr np, mp_size_t nn, mp_limb_t d, mp_limb_t dinv, int cnt);

 /* Single-limb inverse of 20! */
 ulong c0_preinv1;
 int c0_preinv1_norm;

 /* Multi-limb inverse of 20! */
 mp_limb_t c0inv[100];

 void
 precompute()
 {
    c0_preinv1 = n_preinvert_limb(cs[0]);
    c0_preinv1_norm = FLINT_BITS - FLINT_BIT_COUNT(cs[0]);

    /* 1/cs[0] */
    flint_mpn_zero(c0inv, 11);
    c0inv[11] = 1;
    mpn_divrem_1(c0inv, 0, c0inv, 12, cs[0]);
 }

 /* a0 + x (a1 + x (a2 + x (a3 + x (a4 + x (a5 + x (a6 + x (a7 + x (a8 + x a9)))))))) */
 void
 mpn_exp_series_10_horner(mp_ptr res, mp_srcptr c, mp_srcptr x)
 {
    mp_limb_t t[10];
    mp_limb_t s[10];

    s[1] = c[9];
    s[0] = 0;
    flint_mpn_mulhigh_n(t, s, x + 9 - 2, 2);
    t[2] = c[8];
    flint_mpn_mulhigh_n(s, t, x + 9 - 3, 3);
    s[3] = c[7];
    flint_mpn_mulhigh_n(t, s, x + 9 - 4, 4);
    t[4] = c[6];
    flint_mpn_mulhigh_n(s, t, x + 9 - 5, 5);
    s[5] = c[5];
    flint_mpn_mulhigh_n(t, s, x + 9 - 6, 6);
    t[6] = c[4];
    flint_mpn_mulhigh_n(s, t, x + 9 - 7, 7);
    s[7] = c[3];
    flint_mpn_mulhigh_n(t, s, x + 9 - 8, 8);
    t[8] = c[2];
    flint_mpn_mulhigh_n(s, t, x + 9 - 9, 9);
    s[9] = c[1];
    flint_mpn_mulhigh_n(res + 1, s + 1, x, 9);
    res[10] = c[0];
 }


 /* (a0 + a1 x + a2 x^2) + x^3 * ((a3 + a4 x + a5 x^2) + x^3*((a6 + a7 x + a8 x^2) + a9 x^3))

   todo: consider
   a0 + (a1 x + a2 x^2) + a3 x^3 + x^3 * ((a4 x + a5 x^2) + a6 x^3 + x^3*((a7 x + a8 x^2) + a9 x^3))
   to reduce the precision of the muls
 */

 void
 mpn_exp_series_10_rs3b(mp_ptr res, mp_srcptr c, mp_srcptr x)
 {
    mp_limb_t x2[8];
    mp_limb_t x3p[8];
    mp_limb_t s[11];
    mp_limb_t t[11];

    /* zero-pad so that we can fudge the final nx(n-1) mul as an nxn mul */
    mp_ptr x3 = x3p + 1;
    x3p[0] = 0;

    /* x^2 with 8 limbs of precision */
    flint_mpn_sqrhigh(x2, x + 1, 8);
    /* x^3 with 7 limbs of precision */
    flint_mpn_mulhigh_n(x3, x + 2, x2 + 1, 7);

    umul_ppmm(t[1], t[0], x3[6], c[9]);
    t[2] = mpn_addmul_1(t, x2 + 8 - 2, 2, c[8]);
    t[3] = mpn_addmul_1(t, x + 9 - 3, 3, c[7]);
    t[4] = c[6];
    flint_mpn_mulhigh_n(s, t, x3 + 7 - 5, 5);
    s[5] = mpn_addmul_1(s, x2 + 8 - 5, 5, c[5]);
    s[6] = mpn_addmul_1(s, x + 9 - 6, 6, c[4]);
    s[7] = c[3];

    flint_mpn_mulhigh_n(res, s, x3 - 1, 8);
    res[8] = mpn_addmul_1(res, x2, 8, c[2]);
    __gmpn_preinv_divrem_1(res, 0, res, 9, c[0], c0_preinv1, c0_preinv1_norm);

    res[9] = mpn_add_n(res, res, x, 9);
    res[10] = 1;
 }

 void
 mpn_exp_series_10_rs3c(mp_ptr res, mp_srcptr c, mp_srcptr x)
 {
    mp_limb_t x2[8];
    mp_limb_t x3p[8];
    mp_limb_t s[11];
    mp_limb_t t[11];

    /* zero-pad so that we can fudge the final nx(n-1) mul as an nxn mul */
    mp_ptr x3 = x3p + 1;
    x3p[0] = 0;

    /* x^2 with 8 limbs of precision */
    flint_mpn_sqrhigh(x2, x + 1, 8);
    /* x^3 with 7 limbs of precision */
    flint_mpn_mulhigh_n(x3, x + 2, x2 + 1, 7);

    umul_ppmm(t[1], t[0], x3[6], c[9]);
    t[2] = mpn_addmul_1(t, x2 + 8 - 2, 2, c[8]);
    t[3] = mpn_addmul_1(t, x + 9 - 3, 3, c[7]);
    t[4] = c[6];
    flint_mpn_mulhigh_n(s, t, x3 + 7 - 5, 5);
    s[5] = mpn_addmul_1(s, x2 + 8 - 5, 5, c[5]);
    s[6] = mpn_addmul_1(s, x + 9 - 6, 6, c[4]);
    s[7] = c[3];

    flint_mpn_mulhigh_n(t, s, x3 - 1, 8);
    t[8] = mpn_addmul_1(t, x2, 8, c[2]);
    flint_mpn_mulhigh_n(res, t, c0inv + 2, 9);
    res[9] = mpn_add_n(res, res, x, 9);
    res[10] = 1;
 }

 void
 mpn_exp_series_10_rs3d(mp_ptr res, mp_srcptr c, mp_srcptr x)
 {
    mp_limb_t x2[8];
    mp_limb_t x3d[8];
    mp_limb_t s[11];
    mp_limb_t t[11];

    /* zero-pad x^3 so that we can fudge the final the 8x7 horner mul by x^3
       as an 8x8 mulhigh_n */
    x3d[0] = 0;
    mp_ptr x3 = x3d + 1;

    /* x^2 with 8 limbs of precision */
    flint_mpn_sqrhigh(x2, x + 1, 8);
    /* x^3 with 7 limbs of precision */
    flint_mpn_mulhigh_n(x3, x + 2, x2 + 1, 7);

    umul_ppmm(t[1], t[0], x3[6], c[9]);  /* t[1] = mpn_addmul_1(t, x3 + 8 - 1, 1, c[9]); */
    t[2] = mpn_addmul_1(t, x2 + 8 - 2, 2, c[8]);
    t[3] = mpn_addmul_1(t, x + 9 - 3, 3, c[7]);
    t[4] = c[6];
    flint_mpn_mulhigh_n(s, t, x3 + 7 - 5, 5);
    s[5] = mpn_addmul_1(s, x2 + 8 - 5, 5, c[5]);
    s[6] = mpn_addmul_1(s, x + 9 - 6, 6, c[4]);
    s[7] = c[3];
    flint_mpn_mulhigh_n(t, s, x3 - 1, 8);
    /* divide by factorial */
    flint_mpn_mulhigh_n(res, t, c0inv + 3, 8);

    /* + x^2 / 2 (9 limbs) */
    mpn_rshift(t, x2, 8, 1);
    res[8] = mpn_add_n(res, res, t, 8);
    /* + x (10 limbs) */
    res[9] = mpn_add_n(res, res, x, 9);
    /* + 1 */
    res[10] = 1;
 }

 /* (a0 + a1 x + a2 x^2 + a3 x^3) + x^4 * ((a4 + a5 x + a6 x^2 + a7 x^3) + x^4* (a8 + a9 x) */
 void
 mpn_exp_series_10_rs4(mp_ptr res, mp_srcptr c, mp_srcptr x)
 {
    mp_limb_t x2[8];
    mp_limb_t x3[8];
    mp_limb_t x4p[8];
    mp_limb_t s[11];
    mp_limb_t t[11];

    /* zero-pad so that we can fudge the final nx(n-1) mul as an nxn mul */
    mp_ptr x4 = x4p + 1;
    x4p[0] = 0;

    flint_mpn_sqrhigh(x2, x + 1, 8);
    flint_mpn_mulhigh_n(x3, x + 2, x2 + 1, 7);
    flint_mpn_sqrhigh(x4, x2 + 2, 6);

    umul_ppmm(t[1], t[0], x[8], c[9]);  /* t[1] = mpn_mul_1(t, x + 9 - 1, 1, c[9]); */
    t[2] = c[8];
    flint_mpn_mulhigh_n(s, t, x4 + 6 - 3, 3);
    s[3] = mpn_addmul_1(s, x3 + 7 - 3, 3, c[7]);
    s[4] = mpn_addmul_1(s, x2 + 8 - 4, 4, c[6]);
    s[5] = mpn_addmul_1(s, x  + 9 - 5, 5, c[5]);
    s[6] = c[4];
    flint_mpn_mulhigh_n(t, s, x4 + 6 - 7, 7);
    t[7] = mpn_addmul_1(t, x3, 7, c[3]);

    /* divide by factorial */
    flint_mpn_mulhigh_n(res, t, c0inv + 3, 8);

    /* + x^2 / 2 (9 limbs) */
    mpn_rshift(t, x2, 8, 1);
    res[8] = mpn_add_n(res, res, t, 8);
    /* + x (10 limbs) */
    res[9] = mpn_add_n(res, res, x, 9);
    /* + 1 */
    res[10] = 1;
 }





 int main()
 {
    mp_limb_t X[400], Y[400], Z[400];

    precompute();

    flint_mpn_zero(X, 100);
    flint_mpn_zero(Y, 100);

 #if 0
    ulong dinv = n_preinvert_limb(123987);
    int cnt = 64 - FLINT_BIT_COUNT(123987);

    slong n;
    for (n = 1; n < 20; n++)
    {
        flint_printf("%wd\n", n);
        TIMEIT_START
        flint_mpn_mulhigh_n(Z, X, Y, n);
        TIMEIT_STOP
        TIMEIT_START
        mpn_divrem_1(X, 0, X, n, 123987);
        TIMEIT_STOP
        TIMEIT_START
        __gmpn_preinv_divrem_1(X, 0, X, n, 123987, dinv, cnt);
        TIMEIT_STOP
    }
 #endif

    /* x = 1/10^20 < 2^-64 */
    X[10] = 1;
    mpn_divrem_1(X, 0, X, 11, n_pow(10, 10));
    mpn_divrem_1(X, 0, X, 10, n_pow(10, 10));

    arb_t y, x;
    arb_init(x);
    arb_init(y);
    arb_set_str(x, "1e-20", 640);
    mag_zero(arb_radref(x));

    flint_printf("arb_exp\n");
    TIMEIT_START
    arb_exp(y, x, 640);
    TIMEIT_STOP
    arb_printn(y, 0.3010299956639812 * 640, ARB_STR_NO_RADIUS); flint_printf("\n");

    flint_printf("\nmpn_exp_series_10_rs3d (mulhigh):\n");
    TIMEIT_START
    mpn_exp_series_10_rs3d(Y, cs, X);
    TIMEIT_STOP
    flint_mpn_print_fixed(Y, 11, 10);

    flint_printf("\n_arb_exp_taylor_rs:\n");
    TIMEIT_START
    _arb_exp_taylor_rs(Z, Y + 50, X, 10, 10);
    TIMEIT_STOP
    flint_mpn_print_fixed(Z, 11, 10);

    flint_printf("\nmpn_exp_series_10_horner (divrem_1):\n");
    TIMEIT_START
    mpn_exp_series_10_horner(Y, cs, X);
    mpn_divrem_1(Y, 0, Y, 11, cs[0]);
    TIMEIT_STOP
    flint_mpn_print_fixed(Y, 11, 10);

 #if 0
    flint_printf("\nmpn_exp_series_10_horner (preinv_divrem_1):\n");
    TIMEIT_START
    mpn_exp_series_10_horner(Y, cs, X);
    __gmpn_preinv_divrem_1(Y, 0, Y, 11, cs[0], c0_preinv1, c0_preinv1_norm);
    TIMEIT_STOP
    flint_mpn_print_fixed(Y, 11, 10);

    flint_printf("\nmpn_exp_series_10_horner (mulhigh):\n");
    TIMEIT_START
    mpn_exp_series_10_horner(Z, cs, X);
    flint_mpn_mulhigh_n(Y, Z, c0inv, 11);
    TIMEIT_STOP
    flint_mpn_print_fixed(Y, 11, 10);
 #endif

    flint_printf("\nmpn_exp_series_10_rs3d:\n");
    TIMEIT_START
    mpn_exp_series_10_rs3c(Y, cs, X);
    TIMEIT_STOP
    flint_mpn_print_fixed(Y, 11, 10);

    flint_printf("\nmpn_exp_series_10_rs4:\n");
    TIMEIT_START
    mpn_exp_series_10_rs4(Y, cs, X);
    TIMEIT_STOP
    flint_mpn_print_fixed(Y, 11, 10);

 }
	#include <math.h>
	#include "longlong.h"
	#include "ulong_extras.h"
	#include "mpn_extras.h"
	#include "arf.h"
	#include "arb.h"
	#include "profiler.h"

	/* util: print fixed-point number given by n limbs where fracn limbs are fractional. */
	void
	flint_mpn_print_fixed(mp_srcptr x, mp_size_t n, mp_size_t fracn)
	{
	if (n == 0)
	{
	flint_printf("0");
	}
	else
	{
	arf_t y;
	fmpz_t t;
	fmpz_init(t);
	slong d, fracd, dmag;

	arf_init(y);
	fmpz_init(t);
	fmpz_set_ui_array(t, x, n);

	arf_set_fmpz(y, t);
	arf_mul_2exp_si(y, y, -fracn * FLINT_BITS);
	dmag = log10(arf_get_d(y, ARF_RND_DOWN));
	fracd = fracn * (FLINT_BITS * 0.3010299956639812);
	d = FLINT_MAX(1, dmag + fracd);
	arf_printd(y, d);

	fmpz_clear(t);
	arf_clear(y);
	}

	flint_printf("\n");
	}

	/* c_n = (20! / n!) */
	ulong cs[] = {
	UWORD(2432902008176640000),
	UWORD(2432902008176640000),
	UWORD(1216451004088320000),
	UWORD(405483668029440000),
	UWORD(101370917007360000),
	UWORD(20274183401472000),
	UWORD(3379030566912000),
	UWORD(482718652416000),
	UWORD(60339831552000),
	UWORD(6704425728000),
	UWORD(670442572800),
	UWORD(60949324800),
	UWORD(5079110400),
	UWORD(390700800),
	UWORD(27907200),
	UWORD(1860480),
	UWORD(116280),
	UWORD(6840),
	UWORD(380),
	UWORD(20),
	UWORD(1),
	};

	mp_limb_t __gmpn_preinv_divrem_1(mp_ptr qp, mp_size_t fn, mp_srcptr np, mp_size_t nn, mp_limb_t d, mp_limb_t dinv, int cnt);

	/* Single-limb inverse of 20! */
	ulong c0_preinv1;
	int c0_preinv1_norm;

	/* Multi-limb inverse of 20! */
	mp_limb_t c0inv[100];

	void
	precompute()
	{
	c0_preinv1 = n_preinvert_limb(cs[0]);
	c0_preinv1_norm = FLINT_BITS - FLINT_BIT_COUNT(cs[0]);

	/* 1/cs[0] */
	flint_mpn_zero(c0inv, 11);
	c0inv[11] = 1;
	mpn_divrem_1(c0inv, 0, c0inv, 12, cs[0]);
	}

	/* a0 + x (a1 + x (a2 + x (a3 + x (a4 + x (a5 + x (a6 + x (a7 + x (a8 + x a9)))))))) */
	void
	mpn_exp_series_10_horner(mp_ptr res, mp_srcptr c, mp_srcptr x)
	{
	mp_limb_t t[10];
	mp_limb_t s[10];

	s[1] = c[9];
	s[0] = 0;
	flint_mpn_mulhigh_n(t, s, x + 9 - 2, 2);
	t[2] = c[8];
	flint_mpn_mulhigh_n(s, t, x + 9 - 3, 3);
	s[3] = c[7];
	flint_mpn_mulhigh_n(t, s, x + 9 - 4, 4);
	t[4] = c[6];
	flint_mpn_mulhigh_n(s, t, x + 9 - 5, 5);
	s[5] = c[5];
	flint_mpn_mulhigh_n(t, s, x + 9 - 6, 6);
	t[6] = c[4];
	flint_mpn_mulhigh_n(s, t, x + 9 - 7, 7);
	s[7] = c[3];
	flint_mpn_mulhigh_n(t, s, x + 9 - 8, 8);
	t[8] = c[2];
	flint_mpn_mulhigh_n(s, t, x + 9 - 9, 9);
	s[9] = c[1];
	flint_mpn_mulhigh_n(res + 1, s + 1, x, 9);
	res[10] = c[0];
	}


	/* (a0 + a1 x + a2 x^2) + x^3 * ((a3 + a4 x + a5 x^2) + x^3*((a6 + a7 x + a8 x^2) + a9 x^3))

	todo: consider
	a0 + (a1 x + a2 x^2) + a3 x^3 + x^3 * ((a4 x + a5 x^2) + a6 x^3 + x^3*((a7 x + a8 x^2) + a9 x^3))
	to reduce the precision of the muls
	*/

	void
	mpn_exp_series_10_rs3b(mp_ptr res, mp_srcptr c, mp_srcptr x)
	{
	mp_limb_t x2[8];
	mp_limb_t x3p[8];
	mp_limb_t s[11];
	mp_limb_t t[11];

	/* zero-pad so that we can fudge the final nx(n-1) mul as an nxn mul */
	mp_ptr x3 = x3p + 1;
	x3p[0] = 0;

	/* x^2 with 8 limbs of precision */
	flint_mpn_sqrhigh(x2, x + 1, 8);
	/* x^3 with 7 limbs of precision */
	flint_mpn_mulhigh_n(x3, x + 2, x2 + 1, 7);

	umul_ppmm(t[1], t[0], x3[6], c[9]);
	t[2] = mpn_addmul_1(t, x2 + 8 - 2, 2, c[8]);
	t[3] = mpn_addmul_1(t, x + 9 - 3, 3, c[7]);
	t[4] = c[6];
	flint_mpn_mulhigh_n(s, t, x3 + 7 - 5, 5);
	s[5] = mpn_addmul_1(s, x2 + 8 - 5, 5, c[5]);
	s[6] = mpn_addmul_1(s, x + 9 - 6, 6, c[4]);
	s[7] = c[3];

	flint_mpn_mulhigh_n(res, s, x3 - 1, 8);
	res[8] = mpn_addmul_1(res, x2, 8, c[2]);
	__gmpn_preinv_divrem_1(res, 0, res, 9, c[0], c0_preinv1, c0_preinv1_norm);

	res[9] = mpn_add_n(res, res, x, 9);
	res[10] = 1;
	}

	void
	mpn_exp_series_10_rs3c(mp_ptr res, mp_srcptr c, mp_srcptr x)
	{
	mp_limb_t x2[8];
	mp_limb_t x3p[8];
	mp_limb_t s[11];
	mp_limb_t t[11];

	/* zero-pad so that we can fudge the final nx(n-1) mul as an nxn mul */
	mp_ptr x3 = x3p + 1;
	x3p[0] = 0;

	/* x^2 with 8 limbs of precision */
	flint_mpn_sqrhigh(x2, x + 1, 8);
	/* x^3 with 7 limbs of precision */
	flint_mpn_mulhigh_n(x3, x + 2, x2 + 1, 7);

	umul_ppmm(t[1], t[0], x3[6], c[9]);
	t[2] = mpn_addmul_1(t, x2 + 8 - 2, 2, c[8]);
	t[3] = mpn_addmul_1(t, x + 9 - 3, 3, c[7]);
	t[4] = c[6];
	flint_mpn_mulhigh_n(s, t, x3 + 7 - 5, 5);
	s[5] = mpn_addmul_1(s, x2 + 8 - 5, 5, c[5]);
	s[6] = mpn_addmul_1(s, x + 9 - 6, 6, c[4]);
	s[7] = c[3];

	flint_mpn_mulhigh_n(t, s, x3 - 1, 8);
	t[8] = mpn_addmul_1(t, x2, 8, c[2]);
	flint_mpn_mulhigh_n(res, t, c0inv + 2, 9);
	res[9] = mpn_add_n(res, res, x, 9);
	res[10] = 1;
	}

	void
	mpn_exp_series_10_rs3d(mp_ptr res, mp_srcptr c, mp_srcptr x)
	{
	mp_limb_t x2[8];
	mp_limb_t x3d[8];
	mp_limb_t s[11];
	mp_limb_t t[11];

	/* zero-pad x^3 so that we can fudge the final the 8x7 horner mul by x^3
	as an 8x8 mulhigh_n */
	x3d[0] = 0;
	mp_ptr x3 = x3d + 1;

	/* x^2 with 8 limbs of precision */
	flint_mpn_sqrhigh(x2, x + 1, 8);
	/* x^3 with 7 limbs of precision */
	flint_mpn_mulhigh_n(x3, x + 2, x2 + 1, 7);

	umul_ppmm(t[1], t[0], x3[6], c[9]); /* t[1] = mpn_addmul_1(t, x3 + 8 - 1, 1, c[9]); */
	t[2] = mpn_addmul_1(t, x2 + 8 - 2, 2, c[8]);
	t[3] = mpn_addmul_1(t, x + 9 - 3, 3, c[7]);
	t[4] = c[6];
	flint_mpn_mulhigh_n(s, t, x3 + 7 - 5, 5);
	s[5] = mpn_addmul_1(s, x2 + 8 - 5, 5, c[5]);
	s[6] = mpn_addmul_1(s, x + 9 - 6, 6, c[4]);
	s[7] = c[3];
	flint_mpn_mulhigh_n(t, s, x3 - 1, 8);
	/* divide by factorial */
	flint_mpn_mulhigh_n(res, t, c0inv + 3, 8);

	/* + x^2 / 2 (9 limbs) */
	mpn_rshift(t, x2, 8, 1);
	res[8] = mpn_add_n(res, res, t, 8);
	/* + x (10 limbs) */
	res[9] = mpn_add_n(res, res, x, 9);
	/* + 1 */
	res[10] = 1;
	}

	/* (a0 + a1 x + a2 x^2 + a3 x^3) + x^4 * ((a4 + a5 x + a6 x^2 + a7 x^3) + x^4* (a8 + a9 x) */
	void
	mpn_exp_series_10_rs4(mp_ptr res, mp_srcptr c, mp_srcptr x)
	{
	mp_limb_t x2[8];
	mp_limb_t x3[8];
	mp_limb_t x4p[8];
	mp_limb_t s[11];
	mp_limb_t t[11];

	/* zero-pad so that we can fudge the final nx(n-1) mul as an nxn mul */
	mp_ptr x4 = x4p + 1;
	x4p[0] = 0;

	flint_mpn_sqrhigh(x2, x + 1, 8);
	flint_mpn_mulhigh_n(x3, x + 2, x2 + 1, 7);
	flint_mpn_sqrhigh(x4, x2 + 2, 6);

	umul_ppmm(t[1], t[0], x[8], c[9]); /* t[1] = mpn_mul_1(t, x + 9 - 1, 1, c[9]); */
	t[2] = c[8];
	flint_mpn_mulhigh_n(s, t, x4 + 6 - 3, 3);
	s[3] = mpn_addmul_1(s, x3 + 7 - 3, 3, c[7]);
	s[4] = mpn_addmul_1(s, x2 + 8 - 4, 4, c[6]);
	s[5] = mpn_addmul_1(s, x + 9 - 5, 5, c[5]);
	s[6] = c[4];
	flint_mpn_mulhigh_n(t, s, x4 + 6 - 7, 7);
	t[7] = mpn_addmul_1(t, x3, 7, c[3]);

	/* divide by factorial */
	flint_mpn_mulhigh_n(res, t, c0inv + 3, 8);

	/* + x^2 / 2 (9 limbs) */
	mpn_rshift(t, x2, 8, 1);
	res[8] = mpn_add_n(res, res, t, 8);
	/* + x (10 limbs) */
	res[9] = mpn_add_n(res, res, x, 9);
	/* + 1 */
	res[10] = 1;
	}





	int main()
	{
	mp_limb_t X[400], Y[400], Z[400];

	precompute();

	flint_mpn_zero(X, 100);
	flint_mpn_zero(Y, 100);

	#if 0
	ulong dinv = n_preinvert_limb(123987);
	int cnt = 64 - FLINT_BIT_COUNT(123987);

	slong n;
	for (n = 1; n < 20; n++)
	{
	flint_printf("%wd\n", n);
	TIMEIT_START
	flint_mpn_mulhigh_n(Z, X, Y, n);
	TIMEIT_STOP
	TIMEIT_START
	mpn_divrem_1(X, 0, X, n, 123987);
	TIMEIT_STOP
	TIMEIT_START
	__gmpn_preinv_divrem_1(X, 0, X, n, 123987, dinv, cnt);
	TIMEIT_STOP
	}
	#endif

	/* x = 1/10^20 < 2^-64 */
	X[10] = 1;
	mpn_divrem_1(X, 0, X, 11, n_pow(10, 10));
	mpn_divrem_1(X, 0, X, 10, n_pow(10, 10));

	arb_t y, x;
	arb_init(x);
	arb_init(y);
	arb_set_str(x, "1e-20", 640);
	mag_zero(arb_radref(x));

	flint_printf("arb_exp\n");
	TIMEIT_START
	arb_exp(y, x, 640);
	TIMEIT_STOP
	arb_printn(y, 0.3010299956639812 * 640, ARB_STR_NO_RADIUS); flint_printf("\n");

	flint_printf("\nmpn_exp_series_10_rs3d (mulhigh):\n");
	TIMEIT_START
	mpn_exp_series_10_rs3d(Y, cs, X);
	TIMEIT_STOP
	flint_mpn_print_fixed(Y, 11, 10);

	flint_printf("\n_arb_exp_taylor_rs:\n");
	TIMEIT_START
	_arb_exp_taylor_rs(Z, Y + 50, X, 10, 10);
	TIMEIT_STOP
	flint_mpn_print_fixed(Z, 11, 10);

	flint_printf("\nmpn_exp_series_10_horner (divrem_1):\n");
	TIMEIT_START
	mpn_exp_series_10_horner(Y, cs, X);
	mpn_divrem_1(Y, 0, Y, 11, cs[0]);
	TIMEIT_STOP
	flint_mpn_print_fixed(Y, 11, 10);

	#if 0
	flint_printf("\nmpn_exp_series_10_horner (preinv_divrem_1):\n");
	TIMEIT_START
	mpn_exp_series_10_horner(Y, cs, X);
	__gmpn_preinv_divrem_1(Y, 0, Y, 11, cs[0], c0_preinv1, c0_preinv1_norm);
	TIMEIT_STOP
	flint_mpn_print_fixed(Y, 11, 10);

	flint_printf("\nmpn_exp_series_10_horner (mulhigh):\n");
	TIMEIT_START
	mpn_exp_series_10_horner(Z, cs, X);
	flint_mpn_mulhigh_n(Y, Z, c0inv, 11);
	TIMEIT_STOP
	flint_mpn_print_fixed(Y, 11, 10);
	#endif

	flint_printf("\nmpn_exp_series_10_rs3d:\n");
	TIMEIT_START
	mpn_exp_series_10_rs3c(Y, cs, X);
	TIMEIT_STOP
	flint_mpn_print_fixed(Y, 11, 10);

	flint_printf("\nmpn_exp_series_10_rs4:\n");
	TIMEIT_START
	mpn_exp_series_10_rs4(Y, cs, X);
	TIMEIT_STOP
	flint_mpn_print_fixed(Y, 11, 10);

	}