141 lines
6.1 KiB
Plaintext
141 lines
6.1 KiB
Plaintext
/*=============================================================================
|
|
|
|
longlong.h -- 64 bit arithetic
|
|
|
|
This file is free software; you can redistribute it and/or modify it
|
|
under the terms of the GNU Lesser General Public License as published
|
|
by the Free Software Foundation; either version 2.1 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This file is distributed in the hope that it will be useful, but
|
|
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
|
License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with this file; see the file COPYING.LIB. If not, write
|
|
to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
|
|
Boston, MA 02110-1301, USA
|
|
|
|
=============================================================================*/
|
|
/******************************************************************************
|
|
|
|
Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001,
|
|
2002, 2003, 2004, 2005 Free Software Foundation, Inc.
|
|
Copyright (C) 2009 William Hart
|
|
Copyright (C) 2011 Fredrik Johansson
|
|
|
|
******************************************************************************/
|
|
|
|
*******************************************************************************
|
|
|
|
Auxiliary asm macros
|
|
|
|
*******************************************************************************
|
|
|
|
umul_ppmm(high_prod, low_prod, multipler, multiplicand)
|
|
|
|
Multiplies two single limb integers \code{MULTIPLER} and
|
|
\code{MULTIPLICAND}, and generates a two limb product in
|
|
\code{HIGH_PROD} and \code{LOW_PROD}.
|
|
|
|
smul_ppmm(high_prod, low_prod, multipler, multiplicand)
|
|
|
|
As for \code{umul_ppmm()} but the numbers are signed.
|
|
|
|
udiv_qrnnd(quotient, remainder, high_numerator, low_numerator, denominator)
|
|
|
|
Divides an unsigned integer, composed by the limb integers
|
|
\code{HIGH_NUMERATOR} and\\ \code{LOW_NUMERATOR}, by \code{DENOMINATOR}
|
|
and places the quotient in \code{QUOTIENT} and the remainder in
|
|
\code{REMAINDER}. \code{HIGH_NUMERATOR} must be less than
|
|
\code{DENOMINATOR} for correct operation.
|
|
|
|
sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator, denominator)
|
|
|
|
As for \code{udiv_qrnnd()} but the numbers are signed. The quotient is
|
|
rounded towards $0$. Note that as the quotient is signed it must lie in
|
|
the range $[-2^63, 2^63)$.
|
|
|
|
count_leading_zeros(count, x)
|
|
|
|
Counts the number of zero-bits from the msb to the first non-zero bit
|
|
in the limb \code{x}. This is the number of steps \code{x} needs to
|
|
be shifted left to set the msb. If \code{x} is $0$ then count is
|
|
undefined.
|
|
|
|
count_trailing_zeros(count, x)
|
|
|
|
As for \code{count_leading_zeros()}, but counts from the least
|
|
significant end. If \code{x} is zero then count is undefined.
|
|
|
|
add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
|
|
high_addend_2, low_addend_2)
|
|
|
|
Adds two limb integers, composed by \code{HIGH_ADDEND_1} and
|
|
\code{LOW_ADDEND_1}, and\\ \code{HIGH_ADDEND_2} and \code{LOW_ADDEND_2},
|
|
respectively. The result is placed in \code{HIGH_SUM} and
|
|
\code{LOW_SUM}. Overflow, i.e.\ carry out, is not stored anywhere,
|
|
and is lost.
|
|
|
|
add_sssaaaaaa(high_sum, mid_sum, low_sum, high_addend_1, mid_addend_1,
|
|
low_addend_1, high_addend_2, mid_addend_2, low_addend_2)
|
|
|
|
Adds two three limb integers. Carry out is lost.
|
|
|
|
sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
|
|
high_subtrahend, low_subtrahend)
|
|
|
|
Subtracts two limb integers, composed by \code{HIGH_MINUEND_1} and
|
|
\code{LOW_MINUEND_1}, and \code{HIGH_SUBTRAHEND_2} and
|
|
\code{LOW_SUBTRAHEND_2}, respectively. The result is placed in\\
|
|
\code{HIGH_DIFFERENCE} and \code{LOW_DIFFERENCE}. Overflow, i.e.\
|
|
carry out is not stored anywhere, and is lost.
|
|
|
|
invert_limb(invxl, xl)
|
|
|
|
Computes an approximate inverse \code{invxl} of the limb \code{xl},
|
|
with an implicit leading~$1$. More formally it computes
|
|
\begin{lstlisting}[language=c]
|
|
invxl = (B^2 - B*x - 1)/x = (B^2 - 1)/x - B
|
|
\end{lstlisting}
|
|
|
|
Note that $x$ must be normalised, i.e.\ with msb set. This inverse
|
|
makes use of the following theorem of Torbjorn Granlund and Peter
|
|
Montgomery~\citep[Lemma~8.1]{GraMon1994}:
|
|
|
|
Let $d$ be normalised, $d < B$, i.e.\ it fits in a word, and suppose
|
|
that $m d < B^2 \leq (m+1) d$. Let $0 \leq n \leq B d - 1$. Write
|
|
$n = n_2 B + n_1 B/2 + n_0$ with $n_1 = 0$ or $1$ and $n_0 < B/2$.
|
|
Suppose $q_1 B + q_0 = n_2 B + (n_2 + n_1) (m - B) + n_1 (d-B/2) + n_0$
|
|
and $0 \leq q_0 < B$. Then $0 \leq q_1 < B$ and $0 \leq n - q_1 d < 2 d$.
|
|
|
|
In the theorem, $m$ is the inverse of $d$. If we let
|
|
\code{m = invxl + B} and $d = x$ we have $m d = B^2 - 1 < B^2$ and
|
|
$(m+1) x = B^2 + d - 1 \geq B^2$.
|
|
|
|
The theorem is often applied as follows: note that $n_0$ and $n_1 (d-B/2)$
|
|
are both less than $B/2$. Also note that $n_1 (m-B) < B$. Thus the sum of
|
|
all these terms contributes at most $1$ to $q_1$. We are left with
|
|
$n_2 B + n_2 (m-B)$. But note that $(m-B)$ is precisely our precomputed
|
|
inverse \code{invxl}. If we write $q_1 B + q_0 = n_2 B + n_2 (m-B)$,
|
|
then from the theorem, we have $0 \leq n - q_1 d < 3 d$, i.e.\ the
|
|
quotient is out by at most $2$ and is always either correct or too small.
|
|
|
|
udiv_qrnnd_preinv(q, r, nh, nl, d, di)
|
|
|
|
As for \code{udiv_qrnnd()} but takes a precomputed inverse \code{di} as
|
|
computed by \code{invert_limb()}. The algorithm, in terms of the theorem
|
|
above, is:
|
|
|
|
\begin{lstlisting}[language=c]
|
|
nadj = n1*(d-B/2) + n0
|
|
xh, xl = (n2+n1)*(m-B)
|
|
xh, xl += nadj + n2*B ( xh, xl = n2*B + (n2+n1)*(m-B) + n1*(d-B/2) + n0 )
|
|
_q1 = B - xh - 1
|
|
xh, xl = _q1*d + nh, nl - B*d = nh, nl - q1*d - d so that xh = 0 or -1
|
|
r = xl + xh*d where xh is 0 if q1 is off by 1, otherwise -1
|
|
q = xh - _q1 = xh + 1 + n2
|
|
\end{lstlisting}
|
|
|