659 lines
31 KiB
Plaintext
659 lines
31 KiB
Plaintext
/*
|
|
Copyright 2009, 2011 William Hart. All rights reserved.
|
|
|
|
Redistribution and use in source and binary forms, with or without modification, are
|
|
permitted provided that the following conditions are met:
|
|
|
|
1. Redistributions of source code must retain the above copyright notice, this list of
|
|
conditions and the following disclaimer.
|
|
|
|
2. Redistributions in binary form must reproduce the above copyright notice, this list
|
|
of conditions and the following disclaimer in the documentation and/or other materials
|
|
provided with the distribution.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY William Hart ``AS IS'' AND ANY EXPRESS OR IMPLIED
|
|
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
|
FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL William Hart OR
|
|
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
|
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
The views and conclusions contained in the software and documentation are those of the
|
|
authors and should not be interpreted as representing official policies, either expressed
|
|
or implied, of William Hart.
|
|
*/
|
|
|
|
/******************************************************************************
|
|
|
|
Copyright (C) 2011 William Hart
|
|
|
|
******************************************************************************/
|
|
|
|
*******************************************************************************
|
|
|
|
Split/combine FFT coefficients
|
|
|
|
*******************************************************************************
|
|
|
|
mp_size_t fft_split_limbs(mp_limb_t ** poly, mp_srcptr limbs,
|
|
mp_size_t total_limbs, mp_size_t coeff_limbs, mp_size_t output_limbs)
|
|
|
|
Split an integer \code{(limbs, total_limbs)} into coefficients of length
|
|
\code{coeff_limbs} limbs and store as the coefficients of \code{poly}
|
|
which are assumed to have space for \code{output_limbs + 1} limbs per
|
|
coefficient. The coefficients of the polynomial do not need to be zeroed
|
|
before calling this function, however the number of coefficients written
|
|
is returned by the function and any coefficients beyond this point are
|
|
not touched.
|
|
|
|
mp_size_t fft_split_bits(mp_limb_t ** poly, mp_srcptr limbs,
|
|
mp_size_t total_limbs, mp_bitcnt_t bits, mp_size_t output_limbs)
|
|
|
|
Split an integer \code{(limbs, total_limbs)} into coefficients of the
|
|
given number of \code{bits} and store as the coefficients of \code{poly}
|
|
which are assumed to have space for \code{output_limbs + 1} limbs per
|
|
coefficient. The coefficients of the polynomial do not need to be zeroed
|
|
before calling this function, however the number of coefficients written
|
|
is returned by the function and any coefficients beyond this point are
|
|
not touched.
|
|
|
|
void fft_combine_limbs(mp_limb_t * res, mp_limb_t ** poly, slong length,
|
|
mp_size_t coeff_limbs, mp_size_t output_limbs, mp_size_t total_limbs)
|
|
|
|
Evaluate the polynomial \code{poly} of the given \code{length} at
|
|
\code{B^coeff_limbs}, where \code{B = 2^FLINT_BITS}, and add the
|
|
result to the integer \code{(res, total_limbs)} throwing away any bits
|
|
that exceed the given number of limbs. The polynomial coefficients are
|
|
assumed to have at least \code{output_limbs} limbs each, however any
|
|
additional limbs are ignored.
|
|
|
|
If the integer is initially zero the result will just be the evaluation
|
|
of the polynomial.
|
|
|
|
void fft_combine_bits(mp_limb_t * res, mp_limb_t ** poly, slong length,
|
|
mp_bitcnt_t bits, mp_size_t output_limbs, mp_size_t total_limbs)
|
|
|
|
Evaluate the polynomial \code{poly} of the given \code{length} at
|
|
\code{2^bits} and add the result to the integer
|
|
\code{(res, total_limbs)} throwing away any bits that exceed the given
|
|
number of limbs. The polynomial coefficients are assumed to have at least
|
|
\code{output_limbs} limbs each, however any additional limbs are ignored.
|
|
If the integer is initially zero the result will just be the evaluation
|
|
of the polynomial.
|
|
|
|
*******************************************************************************
|
|
|
|
Test helper functions
|
|
|
|
*******************************************************************************
|
|
|
|
void fermat_to_mpz(mpz_t m, mp_limb_t * i, mp_size_t limbs)
|
|
|
|
Convert the Fermat number \code{(i, limbs)} modulo \code{B^limbs + 1} to
|
|
an \code{mpz_t m}. Assumes \code{m} has been initialised. This function
|
|
is used only in test code.
|
|
|
|
*******************************************************************************
|
|
|
|
Arithmetic modulo a generalised Fermat number
|
|
|
|
*******************************************************************************
|
|
|
|
void mpn_addmod_2expp1_1(mp_limb_t * r, mp_size_t limbs, mp_limb_signed_t c)
|
|
|
|
Adds the signed limb \code{c} to the generalised fermat number \code{r}
|
|
modulo \code{B^limbs + 1}. The compiler should be able to inline
|
|
this for the case that there is no overflow from the first limb.
|
|
|
|
void mpn_normmod_2expp1(mp_limb_t * t, mp_size_t limbs)
|
|
|
|
Given \code{t} a signed integer of \code{limbs + 1} limbs in twos
|
|
complement format, reduce \code{t} to the corresponding value modulo the
|
|
generalised Fermat number \code{B^limbs + 1}, where
|
|
\code{B = 2^FLINT_BITS}.
|
|
|
|
void mpn_mul_2expmod_2expp1(mp_limb_t * t,
|
|
mp_limb_t * i1, mp_size_t limbs, mp_bitcnt_t d)
|
|
|
|
Given \code{i1} a signed integer of \code{limbs + 1} limbs in twos
|
|
complement format reduced modulo \code{B^limbs + 1} up to some
|
|
overflow, compute \code{t = i1*2^d} modulo $p$. The result will not
|
|
necessarily be fully reduced. The number of bits \code{d} must be
|
|
nonnegative and less than \code{FLINT_BITS}. Aliasing is permitted.
|
|
|
|
void mpn_div_2expmod_2expp1(mp_limb_t * t,
|
|
mp_limb_t * i1, mp_size_t limbs, mp_bitcnt_t d)
|
|
|
|
Given \code{i1} a signed integer of \code{limbs + 1} limbs in twos
|
|
complement format reduced modulo \code{B^limbs + 1} up to some
|
|
overflow, compute \code{t = i1/2^d} modulo $p$. The result will not
|
|
necessarily be fully reduced. The number of bits \code{d} must be
|
|
nonnegative and less than \code{FLINT_BITS}. Aliasing is permitted.
|
|
|
|
|
|
*******************************************************************************
|
|
|
|
Generic butterflies
|
|
|
|
*******************************************************************************
|
|
|
|
void fft_adjust(mp_limb_t * r, mp_limb_t * i1,
|
|
mp_size_t i, mp_size_t limbs, mp_bitcnt_t w)
|
|
|
|
Set \code{r} to \code{i1} times $z^i$ modulo \code{B^limbs + 1} where
|
|
$z$ corresponds to multiplication by $2^w$. This can be thought of as part
|
|
of a butterfly operation. We require $0 \leq i < n$ where $nw =$
|
|
\code{limbs*FLINT_BITS}. Aliasing is not supported.
|
|
|
|
void fft_adjust_sqrt2(mp_limb_t * r, mp_limb_t * i1,
|
|
mp_size_t i, mp_size_t limbs, mp_bitcnt_t w, mp_limb_t * temp)
|
|
|
|
Set \code{r} to \code{i1} times $z^i$ modulo \code{B^limbs + 1} where
|
|
$z$ corresponds to multiplication by $\sqrt{2}^w$. This can be thought of
|
|
as part of a butterfly operation. We require $0 \leq i < 2*n$ and odd
|
|
where $nw =$ \code{limbs*FLINT_BITS}.
|
|
|
|
void butterfly_lshB(mp_limb_t * t, mp_limb_t * u, mp_limb_t * i1,
|
|
mp_limb_t * i2, mp_size_t limbs, mp_size_t x, mp_size_t y)
|
|
|
|
We are given two integers \code{i1} and \code{i2} modulo
|
|
\code{B^limbs + 1} which are not necessarily normalised. We compute
|
|
\code{t = (i1 + i2)*B^x} and \code{u = (i1 - i2)*B^y} modulo $p$. Aliasing
|
|
between inputs and outputs is not permitted. We require \code{x} and
|
|
\code{y} to be less than \code{limbs} and nonnegative.
|
|
|
|
void butterfly_rshB(mp_limb_t * t, mp_limb_t * u, mp_limb_t * i1,
|
|
mp_limb_t * i2, mp_size_t limbs, mp_size_t x, mp_size_t y)
|
|
|
|
We are given two integers \code{i1} and \code{i2} modulo
|
|
\code{B^limbs + 1} which are not necessarily normalised. We compute
|
|
\code{t = (i1 + i2)/B^x} and \code{u = (i1 - i2)/B^y} modulo $p$. Aliasing
|
|
between inputs and outputs is not permitted. We require \code{x} and
|
|
\code{y} to be less than \code{limbs} and nonnegative.
|
|
|
|
*******************************************************************************
|
|
|
|
Radix 2 transforms
|
|
|
|
*******************************************************************************
|
|
|
|
void fft_butterfly(mp_limb_t * s, mp_limb_t * t, mp_limb_t * i1,
|
|
mp_limb_t * i2, mp_size_t i, mp_size_t limbs, mp_bitcnt_t w)
|
|
|
|
Set \code{s = i1 + i2}, \code{t = z1^i*(i1 - i2)} modulo
|
|
\code{B^limbs + 1} where \code{z1 = exp(Pi*I/n)} corresponds to
|
|
multiplication by $2^w$. Requires $0 \leq i < n$ where $nw =$
|
|
\code{limbs*FLINT_BITS}.
|
|
|
|
void ifft_butterfly(mp_limb_t * s, mp_limb_t * t, mp_limb_t * i1,
|
|
mp_limb_t * i2, mp_size_t i, mp_size_t limbs, mp_bitcnt_t w)
|
|
|
|
Set \code{s = i1 + z1^i*i2}, \code{t = i1 - z1^i*i2} modulo
|
|
\code{B^limbs + 1} where\\ \code{z1 = exp(-Pi*I/n)} corresponds to
|
|
division by $2^w$. Requires $0 \leq i < 2n$ where $nw =$
|
|
\code{limbs*FLINT_BITS}.
|
|
|
|
void fft_radix2(mp_limb_t ** ii,
|
|
mp_size_t n, mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2)
|
|
|
|
The radix 2 DIF FFT works as follows:
|
|
|
|
Input: \code{[i0, i1, ..., i(m-1)]}, for $m = 2n$ a power of $2$.
|
|
|
|
Output: \code{[r0, r1, ..., r(m-1)]}\\ \code{ = FFT[i0, i1, ..., i(m-1)]}.
|
|
|
|
Algorithm:
|
|
|
|
$\bullet$ Recursively compute \code{[r0, r2, r4, ...., r(m-2)]}\\
|
|
\code{= FFT[i0+i(m/2), i1+i(m/2+1), ..., i(m/2-1)+i(m-1)]}
|
|
|
|
$\bullet$ Let \code{[t0, t1, ..., t(m/2-1)]}\\
|
|
\code{= [i0-i(m/2), i1-i(m/2+1), ..., i(m/2-1)-i(m-1)]}
|
|
|
|
$\bullet$ Let \code{[u0, u1, ..., u(m/2-1)]}\\
|
|
\code{= [z1^0*t0, z1^1*t1, ..., z1^(m/2-1)*t(m/2-1)]}
|
|
where \code{z1 = exp(2*Pi*I/m)} corresponds to multiplication
|
|
by $2^w$.
|
|
|
|
$\bullet$ Recursively compute \code{[r1, r3, ..., r(m-1)]}\\
|
|
\code{= FFT[u0, u1, ..., u(m/2-1)]}
|
|
|
|
The parameters are as follows:
|
|
|
|
$\bullet$ \code{2*n} is the length of the input and output
|
|
arrays
|
|
|
|
$\bullet$ $w$ is such that $2^w$ is an $2n$-th root of unity
|
|
in the ring $\mathbb{Z}/p\mathbb{Z}$ that we are working in,
|
|
i.e. $p = 2^{wn} + 1$ (here $n$ is divisible by
|
|
\code{GMP_LIMB_BITS})
|
|
|
|
$\bullet$ \code{ii} is the array of inputs (each input is an
|
|
array of limbs of length \code{wn/GMP_LIMB_BITS + 1} (the
|
|
extra limbs being a "carry limb"). Outputs are written
|
|
in-place.
|
|
|
|
We require $nw$ to be at least 64 and the two temporary space pointers to
|
|
point to blocks of size \code{n*w + FLINT_BITS} bits.
|
|
|
|
void fft_truncate(mp_limb_t ** ii, mp_size_t n, mp_bitcnt_t w,
|
|
mp_limb_t ** t1, mp_limb_t ** t2, mp_size_t trunc)
|
|
|
|
As for \code{fft_radix2} except that only the first \code{trunc}
|
|
coefficients of the output are computed and the input is regarded as
|
|
having (implied) zero coefficients from coefficient \code{trunc} onwards.
|
|
The coefficients must exist as the algorithm needs to use this extra
|
|
space, but their value is irrelevant. The value of \code{trunc} must be
|
|
divisible by 2.
|
|
|
|
void fft_truncate1(mp_limb_t ** ii, mp_size_t n, mp_bitcnt_t w,
|
|
mp_limb_t ** t1, mp_limb_t ** t2, mp_size_t trunc)
|
|
|
|
As for \code{fft_radix2} except that only the first \code{trunc}
|
|
coefficients of the output are computed. The transform still needs all
|
|
$2n$ input coefficients to be specified.
|
|
|
|
void ifft_radix2(mp_limb_t ** ii,
|
|
mp_size_t n, mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2)
|
|
|
|
The radix 2 DIF IFFT works as follows:
|
|
|
|
Input: \code{[i0, i1, ..., i(m-1)]}, for $m = 2n$ a power of $2$.
|
|
|
|
Output: \code{[r0, r1, ..., r(m-1)]}\\
|
|
\code{ = IFFT[i0, i1, ..., i(m-1)]}.
|
|
|
|
Algorithm:
|
|
|
|
$\bullet$ Recursively compute \code{[s0, s1, ...., s(m/2-1)]}\\
|
|
\code{= IFFT[i0, i2, ..., i(m-2)]}
|
|
|
|
$\bullet$ Recursively compute \code{[t(m/2), t(m/2+1), ..., t(m-1)]}\\
|
|
\code{= IFFT[i1, i3, ..., i(m-1)]}
|
|
|
|
$\bullet$ Let \code{[r0, r1, ..., r(m/2-1)]}\\
|
|
\code{= [s0+z1^0*t0, s1+z1^1*t1, ..., s(m/2-1)+z1^(m/2-1)*t(m/2-1)]}
|
|
where \code{z1 = exp(-2*Pi*I/m)} corresponds to division by $2^w$.
|
|
|
|
$\bullet$ Let \code{[r(m/2), r(m/2+1), ..., r(m-1)]}\\
|
|
\code{= [s0-z1^0*t0, s1-z1^1*t1, ..., s(m/2-1)-z1^(m/2-1)*t(m/2-1)]}
|
|
|
|
The parameters are as follows:
|
|
|
|
$\bullet$ \code{2*n} is the length of the input and output
|
|
arrays
|
|
|
|
$\bullet$ $w$ is such that $2^w$ is an $2n$-th root of unity
|
|
in the ring $\mathbb{Z}/p\mathbb{Z}$ that we are working in,
|
|
i.e. $p = 2^{wn} + 1$ (here $n$ is divisible by
|
|
\code{GMP_LIMB_BITS})
|
|
|
|
$\bullet$ \code{ii} is the array of inputs (each input is an
|
|
array of limbs of length \code{wn/GMP_LIMB_BITS + 1} (the
|
|
extra limbs being a "carry limb"). Outputs are written
|
|
in-place.
|
|
|
|
We require $nw$ to be at least 64 and the two temporary space pointers
|
|
to point to blocks of size \code{n*w + FLINT_BITS} bits.
|
|
|
|
void ifft_truncate(mp_limb_t ** ii, mp_size_t n, mp_bitcnt_t w,
|
|
mp_limb_t ** t1, mp_limb_t ** t2, mp_size_t trunc)
|
|
|
|
As for \code{ifft_radix2} except that the output is assumed to have
|
|
zeros from coefficient trunc onwards and only the first trunc
|
|
coefficients of the input are specified. The remaining coefficients need
|
|
to exist as the extra space is needed, but their value is irrelevant.
|
|
The value of \code{trunc} must be divisible by 2.
|
|
|
|
Although the implementation does not require it, we assume for simplicity
|
|
that \code{trunc} is greater than $n$. The algorithm begins by computing
|
|
the inverse transform of the first $n$ coefficients of the input array.
|
|
The unspecified coefficients of the second half of the array are then
|
|
written: coefficient \code{trunc + i} is computed as a twist of
|
|
coefficient \code{i} by a root of unity. The values of these coefficients
|
|
are then equal to what they would have been if the inverse transform of
|
|
the right hand side of the input array had been computed with full data
|
|
from the start. The function \code{ifft_truncate1} is then called on the
|
|
entire right half of the input array with this auxilliary data filled in.
|
|
Finally a single layer of the IFFT is completed on all the coefficients
|
|
up to \code{trunc} being careful to note that this involves doubling the
|
|
coefficients from \code{trunc - n} up to \code{n}.
|
|
|
|
void ifft_truncate1(mp_limb_t ** ii, mp_size_t n, mp_bitcnt_t w,
|
|
mp_limb_t ** t1, mp_limb_t ** t2, mp_size_t trunc)
|
|
|
|
Computes the first \code{trunc} coefficients of the radix 2 inverse
|
|
transform assuming the first \code{trunc} coefficients are given and that
|
|
the remaining coefficients have been set to the value they would have if
|
|
an inverse transform had already been applied with full data.
|
|
|
|
The algorithm is the same as for \code{ifft_truncate} except that the
|
|
coefficients from \code{trunc} onwards after the inverse transform are
|
|
not inferred to be zero but the supplied values.
|
|
|
|
void fft_butterfly_sqrt2(mp_limb_t * s, mp_limb_t * t,
|
|
mp_limb_t * i1, mp_limb_t * i2, mp_size_t i,
|
|
mp_size_t limbs, mp_bitcnt_t w, mp_limb_t * temp)
|
|
|
|
Let $w = 2k + 1$, $i = 2j + 1$. Set \code{s = i1 + i2},
|
|
\code{t = z1^i*(i1 - i2)} modulo \code{B^limbs + 1} where
|
|
\code{z1^2 = exp(Pi*I/n)} corresponds to multiplication by $2^w$. Requires
|
|
$0 \leq i < 2n$ where $nw =$ \code{limbs*FLINT_BITS}.
|
|
|
|
Here \code{z1} corresponds to multiplication by $2^k$ then multiplication
|
|
by\\ \code{(2^(3nw/4) - 2^(nw/4))}. We see \code{z1^i} corresponds to
|
|
multiplication by \code{(2^(3nw/4) - 2^(nw/4))*2^(j+ik)}.
|
|
|
|
We first multiply by \code{2^(j + ik + wn/4)} then multiply by an
|
|
additional \code{2^(nw/2)} and subtract.
|
|
|
|
void ifft_butterfly_sqrt2(mp_limb_t * s, mp_limb_t * t, mp_limb_t * i1,
|
|
mp_limb_t * i2, mp_size_t i, mp_size_t limbs,
|
|
mp_bitcnt_t w, mp_limb_t * temp)
|
|
|
|
Let $w = 2k + 1$, $i = 2j + 1$. Set \code{s = i1 + z1^i*i2},
|
|
\code{t = i1 - z1^i*i2} modulo \code{B^limbs + 1} where
|
|
\code{z1^2 = exp(-Pi*I/n)} corresponds to division by $2^w$. Requires
|
|
$0 \leq i < 2n$ where $nw =$ \code{limbs*FLINT_BITS}.
|
|
|
|
Here \code{z1} corresponds to division by $2^k$ then division by
|
|
\code{(2^(3nw/4) - 2^(nw/4))}. We see \code{z1^i} corresponds to division
|
|
by \code{(2^(3nw/4) - 2^(nw/4))*2^(j+ik)} which is the same as division
|
|
by \code{2^(j+ik + 1)} then multiplication by
|
|
\code{(2^(3nw/4) - 2^(nw/4))}.
|
|
|
|
Of course, division by \code{2^(j+ik + 1)} is the same as multiplication
|
|
by \code{2^(2*wn - j - ik - 1)}. The exponent is positive as
|
|
$i \leq 2*n$, $j < n$, $k < w/2$.
|
|
|
|
We first multiply by \code{2^(2*wn - j - ik - 1 + wn/4)} then multiply by
|
|
an additional \code{2^(nw/2)} and subtract.
|
|
|
|
void fft_truncate_sqrt2(mp_limb_t ** ii, mp_size_t n, mp_bitcnt_t w,
|
|
mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp, mp_size_t trunc)
|
|
|
|
As per \code{fft_truncate} except that the transform is twice the usual
|
|
length, i.e. length $4n$ rather than $2n$. This is achieved by making use
|
|
of twiddles by powers of a square root of 2, not powers of 2 in the first
|
|
layer of the transform.
|
|
|
|
We require $nw$ to be at least 64 and the three temporary space pointers
|
|
to point to blocks of size \code{n*w + FLINT_BITS} bits.
|
|
|
|
void ifft_truncate_sqrt2(mp_limb_t ** ii, mp_size_t n, mp_bitcnt_t w,
|
|
mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp, mp_size_t trunc)
|
|
|
|
As per \code{ifft_truncate} except that the transform is twice the usual
|
|
length, i.e. length $4n$ instead of $2n$. This is achieved by making use
|
|
of twiddles by powers of a square root of 2, not powers of 2 in the final
|
|
layer of the transform.
|
|
|
|
We require $nw$ to be at least 64 and the three temporary space pointers
|
|
to point to blocks of size \code{n*w + FLINT_BITS} bits.
|
|
|
|
*******************************************************************************
|
|
|
|
Matrix Fourier Transforms
|
|
|
|
*******************************************************************************
|
|
|
|
void fft_butterfly_twiddle(mp_limb_t * u, mp_limb_t * v,
|
|
mp_limb_t * s, mp_limb_t * t, mp_size_t limbs,
|
|
mp_bitcnt_t b1, mp_bitcnt_t b2)
|
|
|
|
Set \code{u = 2^b1*(s + t)}, \code{v = 2^b2*(s - t)} modulo
|
|
\code{B^limbs + 1}. This is used to compute
|
|
\code{u = 2^(ws*tw1)*(s + t)},\\ \code{v = 2^(w+ws*tw2)*(s - t)} in the
|
|
matrix fourier algorithm, i.e. effectively computing an ordinary butterfly
|
|
with additional twiddles by \code{z1^rc} for row $r$ and column $c$ of the
|
|
matrix of coefficients. Aliasing is not allowed.
|
|
|
|
void ifft_butterfly_twiddle(mp_limb_t * u, mp_limb_t * v,
|
|
mp_limb_t * s, mp_limb_t * t, mp_size_t limbs,
|
|
mp_bitcnt_t b1, mp_bitcnt_t b2)
|
|
|
|
Set \code{u = s/2^b1 + t/2^b1)}, \code{v = s/2^b1 - t/2^b1} modulo
|
|
\code{B^limbs + 1}. This is used to compute
|
|
\code{u = 2^(-ws*tw1)*s + 2^(-ws*tw2)*t)},\\
|
|
\code{v = 2^(-ws*tw1)*s + 2^(-ws*tw2)*t)} in the matrix fourier algorithm,
|
|
i.e. effectively computing an ordinary butterfly with additional twiddles
|
|
by \code{z1^(-rc)} for row $r$ and column $c$ of the matrix of
|
|
coefficients. Aliasing is not allowed.
|
|
|
|
void fft_radix2_twiddle(mp_limb_t ** ii, mp_size_t is,
|
|
mp_size_t n, mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
|
|
mp_size_t ws, mp_size_t r, mp_size_t c, mp_size_t rs)
|
|
|
|
As for \code{fft_radix2} except that the coefficients are spaced by
|
|
\code{is} in the array \code{ii} and an additional twist by \code{z^c*i}
|
|
is applied to each coefficient where $i$ starts at $r$ and increases by
|
|
\code{rs} as one moves from one coefficient to the next. Here \code{z}
|
|
corresponds to multiplication by \code{2^ws}.
|
|
|
|
void ifft_radix2_twiddle(mp_limb_t ** ii, mp_size_t is,
|
|
mp_size_t n, mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
|
|
mp_size_t ws, mp_size_t r, mp_size_t c, mp_size_t rs)
|
|
|
|
As for \code{ifft_radix2} except that the coefficients are spaced by
|
|
\code{is} in the array \code{ii} and an additional twist by
|
|
\code{z^(-c*i)} is applied to each coefficient where $i$ starts at $r$
|
|
and increases by \code{rs} as one moves from one coefficient to the next.
|
|
Here \code{z} corresponds to multiplication by \code{2^ws}.
|
|
|
|
void fft_truncate1_twiddle(mp_limb_t ** ii, mp_size_t is,
|
|
mp_size_t n, mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
|
|
mp_size_t ws, mp_size_t r, mp_size_t c, mp_size_t rs, mp_size_t trunc)
|
|
|
|
As per \code{fft_radix2_twiddle} except that the transform is truncated
|
|
as per\\ \code{fft_truncate1}.
|
|
|
|
void ifft_truncate1_twiddle(mp_limb_t ** ii, mp_size_t is,
|
|
mp_size_t n, mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
|
|
mp_size_t ws, mp_size_t r, mp_size_t c, mp_size_t rs, mp_size_t trunc)
|
|
|
|
As per \code{ifft_radix2_twiddle} except that the transform is truncated
|
|
as per\\ \code{ifft_truncate1}.
|
|
|
|
void fft_mfa_truncate_sqrt2(mp_limb_t ** ii, mp_size_t n,
|
|
mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
|
|
mp_limb_t ** temp, mp_size_t n1, mp_size_t trunc)
|
|
|
|
This is as per the \code{fft_truncate_sqrt2} function except that the
|
|
matrix fourier algorithm is used for the left and right FFTs. The total
|
|
transform length is $4n$ where \code{n = 2^depth} so that the left and
|
|
right transforms are both length $2n$. We require \code{trunc > 2*n} and
|
|
that \code{trunc} is divisible by \code{2*n1} (explained below).
|
|
|
|
The matrix fourier algorithm, which is applied to each transform of length
|
|
$2n$, works as follows. We set \code{n1} to a power of 2 about the square
|
|
root of $n$. The data is then thought of as a set of \code{n2} rows each
|
|
with \code{n1} columns (so that \code{n1*n2 = 2n}).
|
|
|
|
The length $2n$ transform is then computed using a whole pile of short
|
|
transforms. These comprise \code{n1} column transforms of length \code{n2}
|
|
followed by some twiddles by roots of unity (namely \code{z^rc} where $r$
|
|
is the row and $c$ the column within the data) followed by \code{n2}
|
|
row transforms of length \code{n1}. Along the way the data needs to be
|
|
rearranged due to the fact that the short transforms output the data in
|
|
binary reversed order compared with what is needed.
|
|
|
|
The matrix fourier algorithm provides better cache locality by decomposing
|
|
the long length $2n$ transforms into many transforms of about the square
|
|
root of the original length.
|
|
|
|
For better cache locality the sqrt2 layer of the full length $4n$
|
|
transform is folded in with the column FFTs performed as part of the first
|
|
matrix fourier algorithm on the left half of the data.
|
|
|
|
The second half of the data requires a truncated version of the matrix
|
|
fourier algorithm. This is achieved by truncating to an exact multiple of
|
|
the row length so that the row transforms are full length. Moreover, the
|
|
column transforms will then be truncated transforms and their truncated
|
|
length needs to be a multiple of 2. This explains the condition on
|
|
\code{trunc} given above.
|
|
|
|
To improve performance, the extra twiddles by roots of unity are combined
|
|
with the butterflies performed at the last layer of the column transforms.
|
|
|
|
We require $nw$ to be at least 64 and the three temporary space pointers
|
|
to point to blocks of size \code{n*w + FLINT_BITS} bits.
|
|
|
|
void ifft_mfa_truncate_sqrt2(mp_limb_t ** ii, mp_size_t n,
|
|
mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
|
|
mp_limb_t ** temp, mp_size_t n1, mp_size_t trunc)
|
|
|
|
This is as per the \code{ifft_truncate_sqrt2} function except that the
|
|
matrix fourier algorithm is used for the left and right IFFTs. The total
|
|
transform length is $4n$ where \code{n = 2^depth} so that the left and
|
|
right transforms are both length $2n$. We require \code{trunc > 2*n} and
|
|
that \code{trunc} is divisible by \code{2*n1}.
|
|
|
|
We set \code{n1} to a power of 2 about the square root of $n$.
|
|
|
|
As per the matrix fourier FFT the sqrt2 layer is folded into the the
|
|
final column IFFTs for better cache locality and the extra twiddles that
|
|
occur in the matrix fourier algorithm are combined with the butterflied
|
|
performed at the first layer of the final column transforms.
|
|
|
|
We require $nw$ to be at least 64 and the three temporary space pointers
|
|
to point to blocks of size \code{n*w + FLINT_BITS} bits.
|
|
|
|
void fft_mfa_truncate_sqrt2_outer(mp_limb_t ** ii, mp_size_t n,
|
|
mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
|
|
mp_limb_t ** temp, mp_size_t n1, mp_size_t trunc)
|
|
|
|
Just the outer layers of \code{fft_mfa_truncate_sqrt2}.
|
|
|
|
void fft_mfa_truncate_sqrt2_inner(mp_limb_t ** ii, mp_limb_t ** jj,
|
|
mp_size_t n, mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
|
|
mp_limb_t ** temp, mp_size_t n1, mp_size_t trunc, mp_limb_t * tt)
|
|
|
|
The inner layers of \code{fft_mfa_truncate_sqrt2} and
|
|
\code{ifft_mfa_truncate_sqrt2} combined with pointwise mults.
|
|
|
|
void ifft_mfa_truncate_sqrt2_outer(mp_limb_t ** ii, mp_size_t n,
|
|
mp_bitcnt_t w, mp_limb_t ** t1, mp_limb_t ** t2,
|
|
mp_limb_t ** temp, mp_size_t n1, mp_size_t trunc)
|
|
|
|
The outer layers of \code{ifft_mfa_truncate_sqrt2} combined with
|
|
normalisation.
|
|
|
|
*******************************************************************************
|
|
|
|
Negacyclic multiplication
|
|
|
|
*******************************************************************************
|
|
|
|
void fft_negacyclic(mp_limb_t ** ii, mp_size_t n, mp_bitcnt_t w,
|
|
mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp)
|
|
|
|
As per \code{fft_radix2} except that it performs a sqrt2 negacyclic
|
|
transform of length $2n$. This is the same as the radix 2 transform
|
|
except that the $i$-th coefficient of the input is first multiplied by
|
|
$\sqrt{2}^{iw}$.
|
|
|
|
We require $nw$ to be at least 64 and the two temporary space pointers to
|
|
point to blocks of size \code{n*w + FLINT_BITS} bits.
|
|
|
|
void ifft_negacyclic(mp_limb_t ** ii, mp_size_t n, mp_bitcnt_t w,
|
|
mp_limb_t ** t1, mp_limb_t ** t2, mp_limb_t ** temp)
|
|
|
|
As per \code{ifft_radix2} except that it performs a sqrt2 negacyclic
|
|
inverse transform of length $2n$. This is the same as the radix 2 inverse
|
|
transform except that the $i$-th coefficient of the output is finally
|
|
divided by $\sqrt{2}^{iw}$.
|
|
|
|
We require $nw$ to be at least 64 and the two temporary space pointers to
|
|
point to blocks of size \code{n*w + FLINT_BITS} bits.
|
|
|
|
void fft_naive_convolution_1(mp_limb_t * r, mp_limb_t * ii,
|
|
mp_limb_t * jj, mp_size_t m)
|
|
|
|
Performs a naive negacyclic convolution of \code{ii} with \code{jj},
|
|
both of length $m$ and sets $r$ to the result. This is essentially
|
|
multiplication of polynomials modulo $x^m + 1$.
|
|
|
|
void _fft_mulmod_2expp1(mp_limb_t * r1, mp_limb_t * i1, mp_limb_t * i2,
|
|
mp_size_t r_limbs, mp_bitcnt_t depth, mp_bitcnt_t w)
|
|
|
|
Multiply \code{i1} by \code{i2} modulo \code{B^r_limbs + 1} where
|
|
\code{r_limbs = nw/FLINT_BITS} with \code{n = 2^depth}. Uses the
|
|
negacyclic FFT convolution CRT'd with a 1 limb naive convolution. We
|
|
require that \code{depth} and \code{w} have been selected as per the
|
|
wrapper \code{fft_mulmod_2expp1} below.
|
|
|
|
slong fft_adjust_limbs(mp_size_t limbs)
|
|
|
|
Given a number of limbs, returns a new number of limbs (no more than
|
|
the next power of 2) which will work with the Nussbaumer code. It is only
|
|
necessary to make this adjustment if
|
|
\code{limbs > FFT_MULMOD_2EXPP1_CUTOFF}.
|
|
|
|
void fft_mulmod_2expp1(mp_limb_t * r, mp_limb_t * i1, mp_limb_t * i2,
|
|
mp_size_t n, mp_size_t w, mp_limb_t * tt)
|
|
|
|
As per \code{_fft_mulmod_2expp1} but with a tuned cutoff below which more
|
|
classical methods are used for the convolution. The temporary space is
|
|
required to fit \code{n*w + FLINT_BITS} bits. There are no restrictions
|
|
on $n$, but if \code{limbs = n*w/FLINT_BITS} then if \code{limbs} exceeds
|
|
\code{FFT_MULMOD_2EXPP1_CUTOFF} the function \code{fft_adjust_limbs} must
|
|
be called to increase the number of limbs to an appropriate value.
|
|
|
|
*******************************************************************************
|
|
|
|
Integer multiplication
|
|
|
|
*******************************************************************************
|
|
|
|
void mul_truncate_sqrt2(mp_ptr r1, mp_srcptr i1, mp_size_t n1,
|
|
mp_srcptr i2, mp_size_t n2, mp_bitcnt_t depth, mp_bitcnt_t w)
|
|
|
|
Integer multiplication using the radix 2 truncated sqrt2 transforms.
|
|
|
|
Set \code{(r1, n1 + n2)} to the product of \code{(i1, n1)} by
|
|
\code{(i2, n2)}. This is achieved through an FFT convolution of length at
|
|
most \code{2^(depth + 2)} with coefficients of size $nw$ bits where
|
|
\code{n = 2^depth}. We require \code{depth >= 6}. The input data is
|
|
broken into chunks of data not exceeding \code{(nw - (depth + 1))/2}
|
|
bits. If breaking the first integer into chunks of this size results in
|
|
\code{j1} coefficients and breaking the second integer results in
|
|
\code{j2} chunks then \code{j1 + j2 - 1 <= 2^(depth + 2)}.
|
|
|
|
If \code{n = 2^depth} then we require $nw$ to be at least 64.
|
|
|
|
void mul_mfa_truncate_sqrt2(mp_ptr r1, mp_srcptr i1, mp_size_t n1,
|
|
mp_srcptr i2, mp_size_t n2, mp_bitcnt_t depth, mp_bitcnt_t w)
|
|
|
|
As for \code{mul_truncate_sqrt2} except that the cache friendly matrix
|
|
fourier algorithm is used.
|
|
|
|
If \code{n = 2^depth} then we require $nw$ to be at least 64. Here we
|
|
also require $w$ to be $2^i$ for some $i \geq 0$.
|
|
|
|
void flint_mpn_mul_fft_main(mp_ptr r1, mp_srcptr i1, mp_size_t n1,
|
|
mp_srcptr i2, mp_size_t n2)
|
|
|
|
The main integer multiplication routine. Sets \code{(r1, n1 + n2)} to
|
|
\code{(i1, n1)} times \code{(i2, n2)}. We require \code{n1 >= n2 > 0}.
|
|
|
|
*******************************************************************************
|
|
|
|
Convolution
|
|
|
|
*******************************************************************************
|
|
|
|
void fft_convolution(mp_limb_t ** ii, mp_limb_t ** jj, slong depth,
|
|
slong limbs, slong trunc, mp_limb_t ** t1,
|
|
mp_limb_t ** t2, mp_limb_t ** s1, mp_limb_t * tt)
|
|
|
|
Perform an FFT convolution of \code{ii} with \code{jj}, both of length
|
|
\code{4*n} where \code{n = 2^depth}. Assume that all but the first
|
|
\code{trunc} coefficients of the output (placed in \code{ii}) are zero.
|
|
Each coefficient is taken modulo \code{B^limbs + 1}. The temporary
|
|
spaces \code{t1}, \code{t2} and \code{s1} must have \code{limbs + 1}
|
|
limbs of space and \code{tt} must have \code{2*(limbs + 1)} of free
|
|
space.
|