/*=============================================================================

    This file is part of FLINT.

    FLINT is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    FLINT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with FLINT; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA

=============================================================================*/
/******************************************************************************

    Copyright (C) 2009 William Hart
    Copyright (C) 2010 Sebastian Pancratz
    Copyright (C) 2010 Fredrik Johansson

******************************************************************************/

*******************************************************************************

    Random functions 

*******************************************************************************

void n_randinit(flint_rand_t state)

    Initialise a random state for use in random functions. 

void n_randclear(flint_rand_t state)

    Release any memory used by a random state. 

mp_limb_t n_randlimb(flint_rand_t state)

    Returns a uniformly pseudo random limb. 

    The algorithm generates two random half limbs $s_j$, $j = 0, 1$, 
    by iterating respectively $v_{i+1} = (v_i a + b) \bmod{p_j}$ for 
    some initial seed $v_0$, randomly chosen values $a$ and $b$ and 
    \code{p_0 = 4294967311 = nextprime(2^32)} on a 64-bit machine 
    and \code{p_0 = nextprime(2^16)} on a 32-bit machine and 
    \code{p_1 = nextprime(p_0)}.

mp_limb_t n_randbits(flint_rand_t state, unsigned int bits)

    Returns a uniformly pseudo random number with the given number of 
    bits. The most significant bit is always set, unless zero is passed,
    in which case zero is returned.

mp_limb_t n_randtest_bits(flint_rand_t state, int bits)

    Returns a uniformly pseudo random number with the given number of 
    bits. The most significant bit is always set, unless zero is passed,
    in which case zero is returned. The probability of a value with a
    sparse binary representation being returned is increased. This
    function is intended for use in test code.

mp_limb_t n_randint(flint_rand_t state, mp_limb_t limit)

    Returns a uniformly pseudo random number up to but not including
    the given limit. If zero is passed as a parameter, an entire random
    limb is returned.

mp_limb_t n_randtest(flint_rand_t state)

    Returns a pseudo random number with a random number of bits,
    from $0$ to \code{FLINT_BITS}.  The probability of the special 
    values $0$, $1$, \code{COEFF_MAX} and \code{WORD_MAX} is increased
    as is the probability of a value with sparse binary representation.  
    This random function is mainly used for testing purposes.
    This function is intended for use in test code. 

mp_limb_t n_randtest_not_zero(flint_rand_t state)

    As for \code{n_randtest()}, but does not return $0$.
    This function is intended for use in test code. 

mp_limb_t n_randprime(flint_rand_t state, unsigned slong bits, int proved)

    Returns a random prime number (proved = 1) or probable prime (proved = 0)
    with \code{bits} bits, where \code{bits} must be at least 2 and
    at most \code{FLINT_BITS}.

mp_limb_t n_randtest_prime(flint_rand_t state, int proved)

    Returns a random prime number (proved = 1) or probable prime (proved = 0)
    with size randomly chosen between 2 and \code{FLINT_BITS} bits.
    This function is intended for use in test code.

*******************************************************************************

    Basic arithmetic 

*******************************************************************************

mp_limb_t n_pow(mp_limb_t n, ulong exp)

    Returns \code{n^exp}. No checking is done for overflow. The exponent
    may be zero. We define $0^0 = 1$.

    The algorithm simply uses a for loop. Repeated squaring is
    unlikely to speed up this algorithm.

mp_limb_t n_flog(mp_limb_t n, mp_limb_t b)

    Returns $\floor{\log_b x}$.

    Assumes that $x \geq 1$ and $b \geq 2$.

mp_limb_t n_clog(mp_limb_t n, mp_limb_t b)

    Returns $\ceil{\log_b x}$.

    Assumes that $x \geq 1$ and $b \geq 2$.

*******************************************************************************

    Miscellaneous

*******************************************************************************

ulong n_revbin(ulong in, ulong bits)

    Returns the binary reverse of \code{in}, assuming it is the given 
    number of bits in length, e.g.\ \code{n_revbin(10110, 6)} will return
    \code{110100}.

int n_sizeinbase(mp_limb_t n, int base)

    Returns the exact number of digits needed to represent $n$ as a
    string in base \code{base} assumed to be between 2 and 36.
    Returns 1 when $n = 0$.


*******************************************************************************

    Basic arithmetic with precomputed inverses

*******************************************************************************

mp_limb_t n_mod_precomp(mp_limb_t a, mp_limb_t n, double ninv)

    Returns $a \bmod{n}$ given a precomputed inverse of $n$ computed 
    by\\ \code{n_precompute_inverse()}. We require \code{n < 2^FLINT_D_BITS}
    and \code{a < 2^(FLINT_BITS-1)} and $0 \leq a < n^2$.

    We assume the processor is in the standard round to nearest
    mode. Thus \code{ninv} is correct to $53$ binary bits, the least 
    significant bit of which we shall call a place, and can be at most 
    half a place out. When $a$ is multiplied by $ninv$, the binary 
    representation of $a$ is exact and the mantissa is less than $2$, thus we 
    see that \code{a * ninv} can be at most one out in the mantissa. We now 
    truncate \code{a * ninv} to the nearest integer, which is always a round 
    down. Either we already have an integer, or we need to make a change down 
    of at least $1$ in the last place. In the latter case we either get 
    precisely the exact quotient or below it as when we rounded the
    product to the nearest place we changed by at most half a place.
    In the case that truncating to an integer takes us below the
    exact quotient, we have rounded down by less than $1$ plus half a 
    place. But as the product is less than $n$ and $n$ is less than $2^{53}$,
    half a place is less than $1$, thus we are out by less than $2$ from 
    the exact quotient, i.e.\ the quotient we have computed is the 
    quotient we are after or one too small. That leaves only the case 
    where we had to round up to the nearest place which happened to 
    be an integer, so that truncating to an integer didn't change 
    anything. But this implies that the exact quotient $a/n$ is less 
    than $2^{-54}$ from an integer. We deal with this rare case by 
    subtracting 1 from the quotient. Then the quotient we have computed is 
    either exactly what we are after, or one too small.

mp_limb_t n_mod2_precomp(mp_limb_t a, mp_limb_t n, double ninv)

    Returns $a \bmod{n}$ given a precomputed inverse of $n$ computed by\\ 
    \code{n_precompute_inverse()}. There are no restrictions on $a$ or 
    on $n$.

    As for \code{n_mod_precomp()} for $n < 2^{53}$ and $a < n^2$ the 
    computed quotient is either what we are after or one too large or small. 
    We deal with these cases. Otherwise we can be sure that the 
    top $52$ bits of the quotient are computed correctly. We take
    the remainder and adjust the quotient by multiplying the
    remainder by \code{ninv} to compute another approximate quotient as
    per \code{mod_precomp}. Now the remainder may be either 
    negative or positive, so the quotient we compute may be one
    out in either direction.

mp_limb_t n_mod2_preinv(mp_limb_t a, mp_limb_t n, mp_limb_t ninv)

    Returns $a \bmod{n}$ given a precomputed inverse of $n$ computed by 
    \code{n_preinvert_limb()}. There are no restrictions on $a$ or on $n$. 

    The old version of this function was implemented simply by 
    making use of\\ \code{udiv_qrnnd_preinv()}.

    The new version uses the new algorithm of Granlund and 
    M\"oller~\citep{GraMol2010}. First $n$ is normalised and a 
    shifted into two limbs to compensate. Then their algorithm is 
    applied verbatim and the result shifted back.

    % Torbjorn Granlund and Niels Moller
    % Improved Division by Invariant Integers
    % http://www.lysator.liu.se/~nisse/archive/draft-division-paper.pdf

mp_limb_t n_divrem2_precomp(mp_limb_t * q, mp_limb_t a, mp_limb_t n, 
                                                                   double npre)

    Returns $a \bmod{n}$ given a precomputed inverse of $n$ computed by\\ 
    \code{n_precompute_inverse()} and sets $q$ to the quotient. There 
    are no restrictions on $a$ or on $n$.

    This is as for \code{n_mod2_precomp()} with some additional care taken
    to retain the quotient information. There are also special
    cases to deal with the case where $a$ is already reduced modulo 
    $n$ and where $n$ is $64$ bits and $a$ is not reduced modulo $n$.

mp_limb_t n_ll_mod_preinv(mp_limb_t a_hi, mp_limb_t a_lo, 
                                     mp_limb_t n, mp_limb_t ninv)

    Returns $a \bmod{n}$ given a precomputed inverse of $n$ computed by 
    \code{n_preinvert_limb()}. There are no restrictions on $a$, which
    will be two limbs \code{(a_hi, a_lo)}, or on $n$.

    The old version of this function merely reduced the top limb 
    \code{a_hi} modulo $n$ so that \code{udiv_qrnnd_preinv()} could 
    be used.

    The new version reduces the top limb modulo $n$ as per 
    \code{n_mod2_preinv()} and then the algorithm of Granlund and 
    M\"oller~\citep{GraMol2010} is used again to reduce modulo $n$.

    % Torbjorn Granlund and Niels Moller
    % Improved Division by Invariant Integers
    % http://www.lysator.liu.se/~nisse/archive/draft-division-paper.pdf

mp_limb_t n_lll_mod_preinv(mp_limb_t a_hi, mp_limb_t a_mi, 
                       mp_limb_t a_lo, mp_limb_t n, mp_limb_t ninv)

    Returns $a \bmod{n}$, where $a$ has three limbs \code{(a_hi, a_mi, a_lo)}, 
    given a precomputed inverse of $n$ computed by \code{n_preinvert_limb()}. 
    It is assumed that \code{a_hi} is reduced modulo $n$. There are no 
    restrictions on $n$.

    This function uses the algorithm of Granlund and 
    M\"oller~\citep{GraMol2010} to first reduce the top two limbs 
    modulo $n$, then does the same on the bottom two limbs.

    % Torbjorn Granlund and Niels Moller
    % Improved Division by Invariant Integers
    % http://www.lysator.liu.se/~nisse/archive/draft-division-paper.pdf

mp_limb_t n_mulmod_precomp(mp_limb_t a, mp_limb_t b, mp_limb_t n, double ninv)

    Returns $a b \bmod{n}$ given a precomputed inverse of $n$ 
    computed by\\ \code{n_precompute_inverse()}. We require 
    \code{n < 2^FLINT_D_BITS} and $0 \leq a, b < n$.

    We assume the processor is in the standard round to nearest
    mode. Thus \code{ninv} is correct to $53$ binary bits, the least 
    significant bit of which we shall call a place, and can be at most half 
    a place out. The product of $a$ and $b$ is computed with error at most 
    half a place. When \code{a * b} is multiplied by $ninv$ we find that the 
    exact quotient and computed quotient differ by less than two places. As 
    the quotient is less than $n$ this means that the exact quotient is at 
    most $1$ away from the computed quotient. We truncate this quotient to 
    an integer which reduces the value by less than $1$. We end up with a 
    value which can be no more than two above the quotient we are after and 
    no less than two below. However an argument similar to that for 
    \code{n_mod_precomp()} shows that the truncated computed quotient cannot 
    be two smaller than the truncated exact quotient. In other words the 
    computed integer quotient is at most two above and one below the quotient 
    we are after.

mp_limb_t n_mulmod2_preinv(mp_limb_t a, mp_limb_t b, 
                                           mp_limb_t n, mp_limb_t ninv)

    Returns $a b \bmod{n}$ given a precomputed inverse of $n$ computed by\\ 
    \code{n_preinvert_limb()}. There are no restrictions on $a$, $b$ or 
    on $n$. This is implemented by multiplying using \code{umul_ppmm()} and 
    then reducing using \code{n_ll_mod_preinv()}.

mp_limb_t n_mulmod_preinv(mp_limb_t a, mp_limb_t b, mp_limb_t n, 
                                            mp_limb_t ninv, ulong norm)

    Returns $a b \pmod{n}$ given a precomputed inverse of $n$ computed by\\ 
    \code{n_preinvert_limb()}, assuming $a$ and $b$ are reduced modulo $n$ 
    and $n$ is normalised, i.e. with most significant bit set. There are 
    no other restrictions on $a$, $b$ or $n$.

    The value \code{norm} is provided for convenience. As $n$ is required
    to be normalised, it may be that $a$ and $b$ have been shifted to the
    left by \code{norm} bits before calling the function. Their product
    then has an extra factor of $2^\text{norm}$. Specifying a nonzero
    \code{norm} will shift the product right by this many bits before
    reducing it.

    The algorithm use is that of Granlund and M\"oller~\citep{GraMol2010}.

    % Torbjorn Granlund and Niels Moller
    % Improved Division by Invariant Integers
    % http://www.lysator.liu.se/~nisse/archive/draft-division-paper.pdf


*******************************************************************************

    Greatest common divisor

*******************************************************************************

mp_limb_t n_gcd(mp_limb_t x, mp_limb_t y)

    Returns the greatest common divisor $g$ of $x$ and $y$. 
    We require $x \geq y$.

    The algorithm is a slight embelishment of the Euclidean algorithm
    which uses some branches to avoid most divisions.

    One wishes to compute the quotient and remainder of $u_3 / v_3$ without 
    division where possible. This is accomplished when $u_3 < 4 v_3$, i.e.\ 
    the quotient is either $1$, $2$ or $3$.

    We first compute $s = u_3 - v_3$. If $s < v_3$, i.e.\ $u_3 < 2 v_3$, we 
    know the quotient is $1$, else if $s < 2 v_3$, i.e.\ $u_3 < 3 v_3$ we 
    know the quotient is $2$. In the remaining cases, the quotient must 
    be $3$. When the quotient is $4$ or above, we use division. However this 
    happens rarely for generic inputs.

mp_limb_t n_gcd_full(mp_limb_t x, mp_limb_t y)

    Returns the greatest common divisor $g$ of $x$ and $y$.
    No assumptions are made about $x$ and $y$.

mp_limb_t n_gcdinv(mp_limb_t * a, mp_limb_t x, mp_limb_t y)

    Returns the greatest common divisor $g$ of $x$ and $y$ and computes 
    $a$ such that $0 \leq a < y$ and $a x = \gcd(x, y) \bmod{y}$, when 
    this is defined. We require $0 \leq x < y$.

    This is merely an adaption of the extended Euclidean algorithm with 
    appropriate normalisation.

mp_limb_t n_xgcd(mp_limb_t * a, mp_limb_t * b, mp_limb_t x, mp_limb_t y)

    Returns the greatest common divisor $g$ of $x$ and $y$ and unsigned 
    values $a$ and $b$ such that $a x - b y = g$. We require $x \geq y$.

    We claim that computing the extended greatest common divisor via the 
    Euclidean algorithm always results in cofactor $\abs{a} < x/2$, 
    $\abs{b} < x/2$, with perhaps some small degenerate exceptions.

    We proceed by induction.

    Suppose we are at some step of the algorithm, with $x_n = q y_n + r$ 
    with $r \geq 1$, and suppose $1 = s y_n - t r$ with 
    $s < y_n / 2$, $t < y_n / 2$ by hypothesis. 

    Write $1 = s y_n - t (x_n - q y_n) = (s + t q) y_n - t x_n$. 

    It suffices to show that $(s + t q) < x_n / 2$ as $t < y_n / 2 < x_n / 2$, 
    which will complete the induction step. 

    But at the previous step in the backsubstitution we would have had 
    $1 = s r - c d$ with $s < r/2$ and $c < r/2$. 

    Then $s + t q < r/2 + y_n / 2 q = (r + q y_n)/2 = x_n / 2$. 

    See the documentation of \code{n_gcd()} for a description of the 
    branching in the algorithm, which is faster than using division.

*******************************************************************************

    Jacobi and Kronecker symbols

*******************************************************************************

int n_jacobi(mp_limb_signed_t x, mp_limb_t y)

    Computes the Jacobi symbol of $x \bmod{y}$.  Assumes that $y$ is positive 
    and odd, and for performance reasons that $\gcd(x, y) = 1$.

    This is just a straightforward application of the law of quadratic
    reciprocity. For performance, divisions are replaced with some 
    comparisons and subtractions where possible.

int n_jacobi_unsigned(mp_limb_t x, mp_limb_t y)

    Computes the Jacobi symbol, allowing $x$ to go up to a full limb.

*******************************************************************************

    Modular Arithmetic

*******************************************************************************

mp_limb_t n_addmod(mp_limb_t a, mp_limb_t b, mp_limb_t n)

    Returns $(a + b) \bmod{n}$.

mp_limb_t n_submod(mp_limb_t a, mp_limb_t b, mp_limb_t n)

    Returns $(a - b) \bmod{n}$.

mp_limb_t n_invmod(mp_limb_t x, mp_limb_t y)

    Returns a value $a$ such that $0 \leq a < y$ and 
    $a x = \gcd(x, y) \bmod{y}$, when this is defined. 
    We require $0 \leq x < y$.

    Specifically, when $x$ is coprime to $y$, $a$ is the inverse 
    of $x$ in $\Z / y \Z$.

    This is merely an adaption of the extended Euclidean algorithm 
    with appropriate normalisation.

mp_limb_t n_powmod_precomp(mp_limb_t a, mp_limb_signed_t exp, 
                                                      mp_limb_t n, double npre)

    Returns \code{a^exp} modulo $n$ given a precomputed inverse of $n$ 
    computed by\\ \code{n_precompute_inverse()}. We require $n < 2^{53}$ 
    and $0 \leq a < n$. There are no restrictions on \code{exp}, i.e.\ 
    it can be negative.

    This is implemented as a standard binary powering algorithm using
    repeated squaring and reducing modulo $n$ at each step.

mp_limb_t n_powmod_ui_precomp(mp_limb_t a, mp_limb_t exp, 
                                                      mp_limb_t n, double npre)

    Returns \code{a^exp} modulo $n$ given a precomputed inverse of $n$ 
    computed by\\ \code{n_precompute_inverse()}. We require $n < 2^{53}$ 
    and $0 \leq a < n$. The exponent \code{exp} is unsigned and so
    can be larger than allowed by \code{n_powmod_precomp}.

    This is implemented as a standard binary powering algorithm using
    repeated squaring and reducing modulo $n$ at each step.

mp_limb_t n_powmod(mp_limb_t a, mp_limb_signed_t exp, mp_limb_t n)

    Returns \code{a^exp} modulo $n$. We require \code{n < 2^FLINT_D_BITS} 
    and $0 \leq a < n$. There are no restrictions on \code{exp}, i.e.\ 
    it can be negative.

    This is implemented by precomputing an inverse and calling the 
    \code{precomp} version of this function.

mp_limb_t n_powmod2_preinv(mp_limb_t a, mp_limb_signed_t exp, mp_limb_t n, 
                                                                mp_limb_t ninv)

    Returns \code{(a^exp) % n} given a precomputed inverse of $n$ computed 
    by \code{n_preinvert_limb()}. We require $0 \leq a < n$, but there are no 
    restrictions on $n$ or on \code{exp}, i.e.\ it can be negative.

    This is implemented as a standard binary powering algorithm using
    repeated squaring and reducing modulo $n$ at each step.

mp_limb_t n_powmod2(mp_limb_t a, mp_limb_signed_t exp, mp_limb_t n)

    Returns \code{(a^exp) % n}. We require $0 \leq a < n$, but there are 
    no restrictions on $n$ or on \code{exp}, i.e.\ it can be negative.

    This is implemented by precomputing an inverse limb and calling the 
    \code{preinv} version of this function.

mp_limb_t n_powmod2_ui_preinv(mp_limb_t a, mp_limb_t exp, mp_limb_t n, 
                                                                mp_limb_t ninv)

    Returns \code{(a^exp) % n} given a precomputed inverse of $n$ computed 
    by \code{n_preinvert_limb()}. We require $0 \leq a < n$, but there are no 
    restrictions on $n$. The exponent \code{exp} is unsigned and so can be
    larger than allowed by \code{n_powmod2_preinv}.

    This is implemented as a standard binary powering algorithm using
    repeated squaring and reducing modulo $n$ at each step.

mp_limb_t n_sqrtmod(mp_limb_t a, mp_limb_t p)

    Computes a square root of $a$ modulo $p$.

    Assumes that $p$ is a prime and that $a$ is reduced modulo $p$.
    Returns 0 if $a$ is a quadratic non-residue modulo $p$.

slong n_sqrtmod_2pow(mp_limb_t ** sqrt, mp_limb_t a, slong exp)

    Computes all the square roots of \code{a} modulo \code{2^exp}. The roots 
    are stored in an array which is created and whose address is stored in 
    the location pointed to by \code{sqrt}. The array of roots is allocated 
    by the function but must be cleaned up by the user by calling 
    \code{flint_free}. The number of roots is returned by the function. If 
    \code{a} is not a quadratic residue modulo \code{2^exp} then 0 is 
    returned by the function and the location \code{sqrt} points to is set to 
    NULL. 

slong
n_sqrtmod_primepow(mp_limb_t ** sqrt, mp_limb_t a, mp_limb_t p, slong exp)

    Computes all the square roots of \code{a} modulo \code{p^exp}. The roots 
    are stored in an array which is created and whose address is stored in 
    the location pointed to by \code{sqrt}. The array of roots is allocated 
    by the function but must be cleaned up by the user by calling 
    \code{flint_free}. The number of roots is returned by the function. If 
    \code{a} is not a quadratic residue modulo \code{p^exp} then 0 is 
    returned by the function and the location \code{sqrt} points to is set to 
    NULL. 

slong n_sqrtmodn(mp_limb_t ** sqrt, mp_limb_t a, n_factor_t * fac)

    Computes all the square roots of \code{a} modulo \code{m} given the 
    factorisation of \code{m} in \code{fac}. The roots are stored in an array 
    which is created and whose address is stored in the location pointed to by 
    \code{sqrt}. The array of roots is allocated by the function but must be 
    cleaned up by the user by calling \code{flint_free}. The number of roots 
    is returned by the function. If \code{a} is not a quadratic residue modulo 
    \code{m} then 0 is returned by the function and the location \code{sqrt} 
    points to is set to NULL. 

*******************************************************************************

    Prime number generation and counting

*******************************************************************************

void n_primes_init(n_primes_t iter)

    Initialises the prime number iterator \code{iter} for use.

void n_primes_clear(n_primes_t iter)

    Clears memory allocated by the prime number iterator \code{iter}.

mp_limb_t n_primes_next(n_primes_t iter)

    Returns the next prime number and advances the state of \code{iter}.
    The first call returns 2.

    Small primes are looked up from \code{flint_small_primes}.
    When this table is exhausted, primes are generated in blocks
    by calling \code{n_primes_sieve_range}.

void n_primes_jump_after(n_primes_t iter, mp_limb_t n)

    Changes the state of \code{iter} to start generating primes
    after $n$ (excluding $n$ itself).

void n_primes_extend_small(n_primes_t iter, mp_limb_t bound)

    Extends the table of small primes in \code{iter} to contain
    at least two primes larger than or equal to \code{bound}.

void n_primes_sieve_range(n_primes_t iter, mp_limb_t a, mp_limb_t b)

    Sets the block endpoints of \code{iter} to the smallest and
    largest odd numbers between $a$ and $b$ inclusive, and
    sieves to mark all odd primes in this range.
    The iterator state is changed to point to the first
    number in the sieved range.

void n_compute_primes(ulong num_primes)

    Precomputes at least \code{num_primes} primes and their \code{double} 
    precomputed inverses and stores them in an internal cache.
    Assuming that FLINT has been built with support for thread-local storage,
    each thread has its own cache.

const mp_limb_t * n_primes_arr_readonly(ulong num_primes)

    Returns a pointer to a read-only array of the first \code{num_primes}
    prime numbers. The computed primes are cached for repeated calls.
    The pointer is valid until the user calls \code{n_cleanup_primes}
    in the same thread.

const double * n_prime_inverses_arr_readonly(ulong n)

    Returns a pointer to a read-only array of inverses of the first
    \code{num_primes} prime numbers. The computed primes are cached for
    repeated calls. The pointer is valid until the user calls
    \code{n_cleanup_primes} in the same thread.

void n_cleanup_primes()

    Frees the internal cache of prime numbers used by the current thread.
    This will invalidate any pointers returned by
    \code{n_primes_arr_readonly} or \code{n_prime_inverses_arr_readonly}.

mp_limb_t n_nextprime(mp_limb_t n, int proved)

    Returns the next prime after $n$. Assumes the result will fit in an
    \code{mp_limb_t}. If proved is $0$, i.e.\ false, the prime is not 
    proven prime, otherwise it is.

ulong n_prime_pi(mp_limb_t n)

    Returns the value of the prime counting function $\pi(n)$, i.e.\ the
    number of primes less than or equal to $n$. The invariant
    \code{n_prime_pi(n_nth_prime(n)) == n}.

    Currently, this function simply extends the table of cached primes up to
    an upper limit and then performs a binary search.

void n_prime_pi_bounds(ulong *lo, ulong *hi, mp_limb_t n)

    Calculates lower and upper bounds for the value of the prime counting
    function \code{lo <= pi(n) <= hi}. If \code{lo} and \code{hi} point to 
    the same location, the high value will be stored.

    The upper approximation is $1.25506 n / \ln n$, and the 
    lower is $n / \ln n$.  These bounds are due to Rosser and 
    Schoenfeld~\citep{RosSch1962} and valid for $n \geq 17$.

    We use the number of bits in $n$ (or one less) to form an 
    approximation to $\ln n$, taking care to use a value too
    small or too large to maintain the inequality.

mp_limb_t n_nth_prime(ulong n)

    Returns the $n$th prime number $p_n$, using the mathematical indexing
    convention $p_1 = 2, p_2 = 3, \dotsc$.

    This function simply ensures that the table of cached primes is large
    enough and then looks up the entry.

void n_nth_prime_bounds(mp_limb_t *lo, mp_limb_t *hi, ulong n)

    Calculates lower and upper bounds for the $n$th prime number $p_n$,
    \code{lo <= p_n <= hi}. If \code{lo} and \code{hi} point to the same 
    location, the high value will be stored. Note that this function will 
    overflow for sufficiently large $n$.

    We use the following estimates, valid for $n > 5$:
    \begin{align*}
    p_n  & >  n (\ln n + \ln \ln n - 1) \\
    p_n  & <  n (\ln n + \ln \ln n) \\
    p_n  & <  n (\ln n + \ln \ln n - 0.9427) \quad (n \geq 15985)
    \end{align*}

    The first inequality was proved by Dusart~\citep{Dus1999}, and the last 
    is due to Massias and Robin~\citep{MasRob1996}.  For a further overview, 
    see \url{http://primes.utm.edu/howmany.shtml}.

    We bound $\ln n$ using the number of bits in $n$ as in 
    \code{n_prime_pi_bounds()}, and estimate $\ln \ln n$ to the nearest 
    integer; this function is nearly constant.

    % P. Dusart, "The kth prime is greater than k(ln k+ln ln k-1) for k> 2,"
    % Math. Comp., 68:225 (January 1999) 411--415.

    % J. Massias and G. Robin, "Bornes effectives pour certaines fonctions
    % concernant les nombres premiers," J. Theorie Nombres Bordeaux, 8
    % (1996) 215-242.

*******************************************************************************

    Primality testing

*******************************************************************************

int n_is_oddprime_small(mp_limb_t n) 

    Returns $1$ if $n$ is an odd prime smaller than 
    \code{FLINT_ODDPRIME_SMALL_CUTOFF}. Expects $n$ 
    to be odd and smaller than the cutoff.

    This function merely uses a lookup table with one bit allocated for each
    odd number up to the cutoff.

int n_is_oddprime_binary(mp_limb_t n) 

    This function performs a simple binary search through 
    the table of cached primes for $n$. If it exists in the array it returns
    $1$, otherwise $0$. For the algorithm to operate correctly 
    $n$ should be odd and at least $17$. 

    Lower and upper bounds are computed with \code{n_prime_pi_bounds()}.
    Once we have bounds on where to look in the table, we 
    refine our search with a simple binary algorithm, taking
    the top or bottom of the current interval as necessary.

int n_is_prime_pocklington(mp_limb_t n, ulong iterations)

    Tests if $n$ is a prime using the Pocklington--Lehmer primality
    test. If $1$ is returned $n$ has been proved prime. If $0$ is returned 
    $n$ is composite. However $-1$ may be returned if nothing was proved 
    either way due to the number of iterations being too small. 

    The most time consuming part of the algorithm is factoring 
    $n - 1$. For this reason \code{n_factor_partial()} is used, 
    which uses a combination of trial factoring and Hart's one 
    line factor algorithm~\citep{Har2009} to try to quickly factor $n - 1$. 
    Additionally if the cofactor is less than the square root of 
    $n - 1$ the algorithm can still proceed.

    One can also specify a number of iterations if less time 
    should be taken. Simply set this to \code{~WORD(0)} if this is irrelevant.
    In most cases a greater number of iterations will not 
    significantly affect timings as most of the time is spent 
    factoring.

    See 
    \url{http://mathworld.wolfram.com/PocklingtonsTheorem.html}
    for a description of the algorithm.

int n_is_prime_pseudosquare(mp_limb_t n)

    Tests if $n$ is a prime according to~\citep[Theorem~2.7]{LukPatWil1996}.

    % "Some results on pseudosquares" by Lukes, Patterson and Williams,
    % Math. Comp. vol 65, No. 213. pp 361-372. See 
    % http://www.ams.org/mcom/1996-65-213/S0025-5718-96-00678-3/
    %   S0025-5718-96-00678-3.pdf

    We first factor $N$ using trial division up to some limit $B$.
    In fact, the number of primes used in the trial factoring is at 
    most \code{FLINT_PSEUDOSQUARES_CUTOFF}.

    Next we compute $N/B$ and find the next pseudosquare $L_p$ above
    this value, using a static table as per
    \url{http://research.att.com/~njas/sequences/b002189.txt}.

    As noted in the text, if $p$ is prime then Step~3 will pass. This
    test rejects many composites, and so by this time we suspect
    that $p$ is prime. If $N$ is $3$ or $7$ modulo $8$, we are done, 
    and $N$ is prime.

    We now run a probable prime test, for which no known 
    counterexamples are known, to reject any composites. We then 
    proceed to prove $N$ prime by executing Step~4. In the case that
    $N$ is $1$ modulo $8$, if Step~4 fails, we extend the number of primes
    $p_i$ at Step~3 and hope to find one which passes Step~4. We take
    the test one past the largest $p$ for which we have pseudosquares
    $L_p$ tabulated, as this already corresponds to the next $L_p$ which 
    is bigger than $2^{64}$ and hence larger than any prime we might be
    testing.

    As explained in the text, Condition~4 cannot fail if $N$ is prime.

    The possibility exists that the probable prime test declares a
    composite prime. However in that case an error is printed, as
    that would be of independent interest.

int n_is_prime(mp_limb_t n)

    Tests if $n$ is a prime. Up to $10^{16}$ this simply calls 
    \code{n_is_probabprime()} which is a primality test up to that limit. 
    Beyond that point it calls \code{n_is_probabprime()} and returns $0$ 
    if $n$ is composite, then it calls \code{n_is_prime_pocklington()} 
    which proves the primality of $n$ in most cases. As a fallback, 
    \code{n_is_prime_pseudosquare()} is called, which will unconditionally 
    prove the primality of $n$.

int n_is_strong_probabprime_precomp(mp_limb_t n, double npre, 
                                                      mp_limb_t a, mp_limb_t d)

    Tests if $n$ is a strong probable prime to the base $a$. We 
    require that $d$ is set to the largest odd factor of $n - 1$ and 
    \code{npre} is a precomputed inverse of $n$ computed with 
    \code{n_precompute_inverse()}.  We also require that $n < 2^{53}$, 
    $a$ to be reduced modulo $n$ and not $0$ and $n$ to be odd.

    If we write $n - 1 = 2^s d$ where $d$ is odd then $n$ is a strong 
    probable prime to the base $a$, i.e.\ an $a$-SPRP, if either 
    $a^d = 1 \pmod n$ or $(a^d)^{2^r} = -1 \pmod n$ for some~$r$ less 
    than~$s$.

    A description of strong probable primes is given here:
    \url{http://mathworld.wolfram.com/StrongPseudoprime.html}

int n_is_strong_probabprime2_preinv(mp_limb_t n, mp_limb_t ninv, 
                                                      mp_limb_t a, mp_limb_t d)

    Tests if $n$ is a strong probable prime to the base $a$. We require 
    that $d$ is set to the largest odd factor of $n - 1$ and \code{npre} 
    is a precomputed inverse of $n$ computed with \code{n_preinvert_limb()}.
    We require a to be reduced modulo $n$ and not $0$ and $n$ to be odd.

    If we write $n - 1 = 2^s d$ where $d$ is odd then $n$ is a strong 
    probable prime to the base $a$ (an $a$-SPRP) if either $a^d = 1 \pmod n$ 
    or $(a^d)^{2^r} = -1 \pmod n$ for some $r$ less than $s$.

    A description of strong probable primes is given here:
    \url{http://mathworld.wolfram.com/StrongPseudoprime.html}

int n_is_probabprime_fermat(mp_limb_t n, mp_limb_t i)

    Returns $1$ if $n$ is a base $i$ Fermat probable prime. Requires 
    $1 < i < n$ and that $i$ does not divide $n$.

    By Fermat's Little Theorem if $i^{n-1}$ is not congruent to $1$ 
    then $n$ is not prime.

int n_is_probabprime_fibonacci(mp_limb_t n)

    Let $F_j$ be the $j$th element of the Fibonacci sequence 
    $0, 1, 1, 2, 3, 5, \dotsc$, starting at $j = 0$. Then if $n$ is prime
    we have $F_{n - (n/5)} = 0 \pmod n$, where $(n/5)$ is the Jacobi
    symbol.

    For further details, see~\citep[pp.~142]{CraPom2005}.

    We require that $n$ is not divisible by $2$ or $5$. 

int n_is_probabprime_BPSW(mp_limb_t n)

    Implements the Baillie--Pomerance--Selfridge--Wagstaff probable primality
    test. There are no known counterexamples to this being a primality test.
    For further details, see~\citep{CraPom2005}.

int n_is_probabprime_lucas(mp_limb_t n)

    For details on Lucas pseudoprimes, see~\citep[pp.~143]{CraPom2005}.

    We implement a variant of the Lucas pseudoprime test as
    described by Baillie and Wagstaff~\citep{BaiWag1980}.

int n_is_probabprime(mp_limb_t n)

    Tests if $n$ is a probable prime. Up to \code{FLINT_ODDPRIME_SMALL_CUTOFF} 
    this algorithm uses \code{n_is_oddprime_small()} which uses a lookup table.
    Next it calls \code{n_compute_primes()} with the maximum table size and 
    uses this table to perform a binary search for $n$ up to the table limit.
    Then up to $10^{16}$ it uses a number of strong probable prime tests,
    \code{n_is_strong_probabprime_precomp()}, etc., for various bases. The 
    output of the algorithm is guaranteed to be correct up to this bound due 
    to exhaustive tables, described at 
    \url{http://uucode.com/obf/dalbec/alg.html}.

    Beyond that point the BPSW probabilistic primality test is used, by 
    calling the function \code{n_is_probabprime_BPSW()}. There are no known 
    counterexamples, but it may well declare some composites to be prime.

*******************************************************************************

    Square root and perfect power testing

*******************************************************************************

mp_limb_t n_sqrt(mp_limb_t a)

    Computes the integer truncation of the square root of $a$. The integer
    itself can be represented exactly as a double and its square root is 
    computed to the nearest place. If $a$ is one below a square, the 
    rounding may be up, whereas if it is one above a square, the rounding 
    will be down. Thus the square root may be one too large in some 
    instances. We also have to be careful when the square of this too large 
    value causes an overflow. The same assumptions hold for a single 
    precision float provided the square root itself can be represented 
    in a single float, i.e.\ for $a < 281474976710656 = 2^{46}$.

mp_limb_t n_sqrtrem(mp_limb_t * r, mp_limb_t a)

    Computes the integer truncation of the square root of $a$. The integer
    itself can be represented exactly as a double and its square root is 
    computed to the nearest place. If $a$ is one below a square, the 
    rounding may be up, whereas if it is one above a square, the rounding 
    will be down. Thus the square root may be one too large in some 
    instances. We also have to be careful when the square of this too 
    large value causes an overflow. The same assumptions hold for a 
    single precision float provided the square root itself can be 
    represented in a single float, i.e. for \
    $a < 281474976710656 = 2^{46}$. The remainder is computed by 
    subtracting the square of the computed square root from $a$.

int n_is_square(mp_limb_t x)

    Returns $1$ if $x$ is a square, otherwise $0$.

    This code first checks if $x$ is a square modulo $64$, 
    $63 = 3 \times 3 \times 7$ and $65 = 5 \times 13$, using lookup tables, 
    and if so it then takes a square root and checks that the square of this 
    equals the original value. 

int n_is_perfect_power235(mp_limb_t n)

    Returns $1$ if $n$ is a perfect square, cube or fifth power. 

    This function uses a series of modular tests to reject most
    non $235$-powers. Each modular test returns a value from $0$ to $7$ 
    whose bits respectively indicate whether the value is a square,
    cube or fifth power modulo the given modulus. When these are
    logically \code{AND}ed together, this gives a powerful test which will
    reject most non-$235$ powers. 

    If a bit remains set indicating it may be a square, a standard
    square root test is performed. Similarly a cube root or fifth 
    root can be taken, if indicated, to determine whether the power
    of that root is exactly equal to $n$.

*******************************************************************************

    Factorisation

*******************************************************************************

int n_remove(mp_limb_t * n, mp_limb_t p)

    Removes the highest possible power of $p$ from $n$, replacing
    $n$ with the quotient. The return value is that highest 
    power of $p$ that divided $n$. Assumes $n$ is not $0$.

    For $p = 2$ trailing zeroes are counted. For other primes
    $p$ is repeatedly squared and stored in a table of powers
    with the current highest power of $p$ removed at each step
    until no higher power can be removed. The algorithm then
    proceeds down the power tree again removing powers of $p$
    until none remain.

int n_remove2_precomp(mp_limb_t * n, mp_limb_t p, double ppre)

    Removes the highest possible power of $p$ from $n$, replacing
    $n$ with the quotient. The return value is that highest 
    power of $p$ that divided $n$. Assumes $n$ is not $0$. We require
    \code{ppre} to be set to a precomputed inverse of $p$ computed 
    with \code{n_precompute_inverse()}.

    For $p = 2$ trailing zeroes are counted. For other primes
    $p$ we make repeated use of \code{n_divrem2_precomp()} until division
    by $p$ is no longer possible.

void n_factor_insert(n_factor_t * factors, mp_limb_t p, ulong exp)

    Inserts the given prime power factor \code{p^exp} into 
    the \code{n_factor_t} \code{factors}. See the documentation for 
    \code{n_factor_trial()} for a description of the \code{n_factor_t} type. 

    The algorithm performs a simple search to see if $p$ already 
    exists as a prime factor in the structure. If so the exponent
    there is increased by the supplied exponent. Otherwise a new 
    factor \code{p^exp} is added to the end of the structure.

    There is no test code for this function other than its use by
    the various factoring functions, which have test code.

mp_limb_t n_factor_trial_range(n_factor_t * factors,
                  mp_limb_t n, ulong start, ulong num_primes)

    Trial factor $n$ with the first \code{num_primes} primes, but
    starting at the prime with index start (counting from zero).

    One requires an initialised \code{n_factor_t} structure, but factors
    will be added by default to an already used \code{n_factor_t}. Use 
    the function \code{n_factor_init()} defined in \code{ulong_extras} if 
    initialisation has not already been completed on factors.

    Once completed, \code{num} will contain the number of distinct 
    prime factors found. The field $p$ is an array of \code{mp_limb_t}'s 
    containing the distinct prime factors, \code{exp} an array 
    containing the corresponding exponents.

    The return value is the unfactored cofactor after trial 
    factoring is done.

    The function calls \code{n_compute_primes()} automatically. See
    the documentation for that function regarding limits.

    The algorithm stops when the current prime has a square 
    exceeding $n$, as no prime factor of $n$ can exceed this 
    unless $n$ is prime.

    The precomputed inverses of all the primes computed by
    \code{n_compute_primes()} are utilised with the \code{n_remove2_precomp()}
    function.

mp_limb_t n_factor_trial(n_factor_t * factors, mp_limb_t n, ulong num_primes)

    This function calls \code{n_factor_trial_range()}, with the value of 
    $0$ for \code{start}. By default this adds factors to an already existing
    \code{n_factor_t} or to a newly initialised one.

mp_limb_t n_factor_power235(ulong *exp, mp_limb_t n)

    Returns $0$ if $n$ is not a perfect square, cube or fifth power.
    Otherwise it returns the root and sets \code{exp} to either $2$, 
    $3$ or $5$ appropriately. 

    This function uses a series of modular tests to reject most
    non $235$-powers. Each modular test returns a value from $0$ to $7$ 
    whose bits respectively indicate whether the value is a square,
    cube or fifth power modulo the given modulus. When these are
    logically \code{AND}ed together, this gives a powerful test which will
    reject most non-$235$ powers. 

    If a bit remains set indicating it may be a square, a standard
    square root test is performed. Similarly a cube root or fifth 
    root can be taken, if indicated, to determine whether the power
    of that root is exactly equal to $n$.

mp_limb_t n_factor_one_line(mp_limb_t n, ulong iters)

    This implements Bill Hart's one line factoring algorithm~\citep{Har2009}.
    It is a variant of Fermat's algorithm which cycles through a large number 
    of multipliers instead of incrementing the square root. It is faster than 
    SQUFOF for $n$ less than about $2^{40}$.

mp_limb_t n_factor_lehman(mp_limb_t n)

    Lehman's factoring algorithm. Currently works up to $10^{16}$, but is
    not particularly efficient and so is not used in the general factor
    function. Always returns a factor of $n$.

mp_limb_t n_factor_SQUFOF(mp_limb_t n, ulong iters)

    Attempts to split $n$ using the given number of iterations
    of SQUFOF. Simply set \code{iters} to \code{~WORD(0)} for maximum 
    persistence.

    The version of SQUFOF imlemented here is as described by Gower 
    and Wagstaff~\citep{GowWag2008}.

    % Jason Gower and Sam Wagstaff
    % "Square form factoring"
    % Math. Comp. 77, 2008, pp 551-588, see:
    % http://www.ams.org/mcom/2008-77-261/S0025-5718-07-02010-8/home.html

    We start by trying SQUFOF directly on $n$. If that fails we
    multiply it by each of the primes in \code{flint_primes_small} in
    turn. As this multiplication may result in a two limb value
    we allow this in our implementation of SQUFOF. As SQUFOF 
    works with values about half the size of $n$ it only needs 
    single limb arithmetic internally.

    If SQUFOF fails to factor $n$ we return $0$, however with 
    \code{iters} large enough this should never happen.

void n_factor(n_factor_t * factors, mp_limb_t n, int proved)

    Factors $n$ with no restrictions on $n$. If the prime factors are 
    required to be certified prime, one may set \code{proved} to $1$, 
    otherwise set it to $0$, and they will only be probable primes
    (with no known counterexamples to the conjecture that they are
    in fact all prime).

    For details on the \code{n_factor_t} structure, see 
    \code{n_factor_trial()}.

    This function first tries trial factoring with a number of primes
    specified by the constant \code{FLINT_FACTOR_TRIAL_PRIMES}. If the 
    cofactor is $1$ or prime the function returns with all the factors.

    Otherwise, the cofactor is placed in the array \code{factor_arr}. Whilst 
    there are factors remaining in there which have not been split, the 
    algorithm continues. At each step each factor is first checked to 
    determine if it is a perfect power. If so it is replaced by the power 
    that has been found. Next if the factor is small enough and composite, 
    in particular, less than \code{FLINT_FACTOR_ONE_LINE_MAX} then 
    \code{n_factor_one_line()} is called with 
    \code{FLINT_FACTOR_ONE_LINE_ITERS} to try and split the factor. If 
    that fails or the factor is too large for \code{n_factor_one_line()} 
    then \code{n_factor_SQUFOF()} is called, with 
    \code{FLINT_FACTOR_SQUFOF_ITERS}. If that fails an error results and
    the program aborts. However this should not happen in practice.

mp_limb_t n_factor_trial_partial(n_factor_t * factors, mp_limb_t n, 
                  mp_limb_t * prod, ulong num_primes, mp_limb_t limit)

    Attempts trial factoring of $n$ with the first \code{num_primes primes}, 
    but stops when the product of prime factors so far exceeds \code{limit}.

    One requires an initialised \code{n_factor_t} structure, but factors
    will be added by default to an already used \code{n_factor_t}. Use 
    the function \code{n_factor_init()} defined in \code{ulong_extras} if 
    initialisation has not already been completed on \code{factors}.

    Once completed, \code{num} will contain the number of distinct 
    prime factors found. The field $p$ is an array of \code{mp_limb_t}'s 
    containing the distinct prime factors, \code{exp} an array 
    containing the corresponding exponents.

    The return value is the unfactored cofactor after trial 
    factoring is done. The value \code{prod} will be set to the product
    of the factors found.

    The function calls \code{n_compute_primes()} automatically. See
    the documentation for that function regarding limits.

    The algorithm stops when the current prime has a square 
    exceeding $n$, as no prime factor of $n$ can exceed this 
    unless $n$ is prime.

    The precomputed inverses of all the primes computed by
    \code{n_compute_primes()} are utilised with the \code{n_remove2_precomp()}
    function.

mp_limb_t n_factor_partial(n_factor_t * factors, 
                     mp_limb_t n, mp_limb_t limit, int proved)

    Factors $n$, but stops when the product of prime factors so far 
    exceeds \code{limit}.

    One requires an initialised \code{n_factor_t} structure, but factors
    will be added by default to an already used \code{n_factor_t}. Use 
    the function \code{n_factor_init()} defined in \code{ulong_extras} if 
    initialisation has not already been completed on \code{factors}.

    On exit, \code{num} will contain the number of distinct prime factors 
    found. The field $p$ is an array of \code{mp_limb_t}'s containing the 
    distinct prime factors, \code{exp} an array containing the corresponding 
    exponents.

    The return value is the unfactored cofactor after factoring is done. 

    The factors are proved prime if \code{proved} is $1$, otherwise
    they are merely probably prime.

mp_limb_t n_factor_pp1(mp_limb_t n, ulong B1, ulong c)

    Factors $n$ using Williams' $p + 1$ factoring algorithm, with prime
    limit set to $B1$. We require $c$ to be set to a random value. Each
    trial of the algorithm with a different value of $c$ gives another
    chance to factor $n$, with roughly exponentially decreasing chance
    of finding a missing factor. If $p + 1$ (or $p - 1$) is not smooth
    for any factor $p$ of $n$, the algorithm will never succeed. The
    value $c$ should be less than $n$ and greater than $2$.

    If the algorithm succeeds, it returns the factor, otherwise it
    returns $0$ or $1$ (the trivial factors modulo $n$).

*******************************************************************************

    Arithmetic functions

*******************************************************************************

int n_moebius_mu(mp_limb_t n)

    Computes the Moebius function $\mu(n)$, which is defined as $\mu(n) = 0$ 
    if $n$ has a prime factor of multiplicity greater than $1$, $\mu(n) = -1$ 
    if $n$ has an odd number of distinct prime factors, and $\mu(n) = 1$ if 
    $n$ has an even number of distinct prime factors. By convention, 
    $\mu(0) = 0$.

    For even numbers, we use the identities $\mu(4n) = 0$ and 
    $\mu(2n) = - \mu(n)$. Odd numbers up to a cutoff are then looked up from 
    a precomputed table storing $\mu(n) + 1$ in groups of two bits.

    For larger $n$, we first check if $n$ is divisible by a small odd square
    and otherwise call \code{n_factor()} and count the factors.

void n_moebius_mu_vec(int * mu, ulong len)

    Computes $\mu(n)$ for \code{n = 0, 1, ..., len - 1}. This 
    is done by sieving over each prime in the range, flipping the sign 
    of $\mu(n)$ for every multiple of a prime $p$ and setting $\mu(n) = 0$ 
    for every multiple of $p^2$.

int n_is_squarefree(mp_limb_t n)

    Returns $0$ if $n$ is divisible by some perfect square, and $1$ otherwise.
    This simply amounts to testing whether $\mu(n) \neq 0$. As special 
    cases, $1$ is considered squarefree and $0$ is not considered squarefree.

mp_limb_t n_euler_phi(mp_limb_t n)

    Computes the Euler totient function $\phi(n)$, counting the number of
    positive integers less than or equal to $n$ that are coprime to $n$.

*******************************************************************************

    Factorials

*******************************************************************************

mp_limb_t n_factorial_fast_mod2_preinv(ulong n, mp_limb_t p, mp_limb_t pinv)

    Returns $n! \bmod p$ given a precomputed inverse of $p$ as computed
    by \code{n_preinvert_limb()}. $p$ is not required to be a prime, but
    no special optimisations are made for composite $p$.
    Uses fast multipoint evaluation, running in about $O(n^{1/2})$ time.

mp_limb_t n_factorial_mod2_preinv(ulong n, mp_limb_t p, mp_limb_t pinv)

    Returns $n! \bmod p$ given a precomputed inverse of $p$ as computed
    by \code{n_preinvert_limb()}. $p$ is not required to be a prime, but
    no special optimisations are made for composite $p$.

    Uses a lookup table for small $n$, otherwise computes the product
    if $n$ is not too large, and calls the fast algorithm for extremely
    large $n$.

*******************************************************************************

    Primitive Roots and Discrete Logarithms

*******************************************************************************

mp_limb_t n_primitive_root_prime_prefactor(mp_limb_t p, n_factor_t * factors)

    Returns a primitive root for the multiplicative subgroup of $Z/pZ$
    where $p$ is prime given the factorisation ($factors$) of $p - 1$.


mp_limb_t n_primitive_root_prime(mp_limb_t p)

    Returns a primitive root for the multiplicative subgroup of $Z/pZ$
    where $p$ is prime.

mp_limb_t n_discrete_log_bsgs(mp_limb_t b, mp_limb_t a, mp_limb_t n)

    Returns the discrete logarithm of $b$ with respect to $a$ in the
    multiplicative subgroup of $Z/nZ$ when $Z/nZ$ is cyclic That is,
    it returns an number $x$ such that $a^x = b \bmod n$.  The
    multiplicative subgroup is only cyclic when $n$ is $2$, $4$,
    $p^k$, or $2p^k$ where $p$ is an odd prime and $k$ is a positive
    integer.