/*============================================================================= Copyright (C) 2007, 2008 David Harvey (zn_poly) Copyright (C) 2013 William Hart All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. =============================================================================*/ #include #include #include "flint.h" #include "nmod_vec.h" #include "nmod_poly.h" /* Multiplication/squaring using Kronecker substitution at 2^b, -2^b, 2^(-b) and -2^(-b). */ void _nmod_poly_mul_KS4(mp_ptr res, mp_srcptr op1, slong n1, mp_srcptr op2, slong n2, nmod_t mod) { int sqr, v3m_neg; ulong bits, b, w, a1, a2, a3; slong n1o, n1e, n2o, n2e, n3o, n3e, n3, k1, k2, k3; mp_ptr v1_buf0, v2_buf0, v1_buf1, v2_buf1, v1_buf2, v2_buf2, v1_buf3, v2_buf3, v1_buf4, v2_buf4; mp_ptr v1on, v1en, v1pn, v1mn, v2on, v2en, v2pn, v2mn, v3on, v3en, v3pn, v3mn; mp_ptr v1or, v1er, v1pr, v1mr, v2or, v2er, v2pr, v2mr, v3or, v3er, v3pr, v3mr; mp_ptr z, zn, zr; if (n2 == 1) { /* code below needs n2 > 1, so fall back on scalar multiplication */ _nmod_vec_scalar_mul_nmod(res, op1, n1, op2[0], mod); return; } sqr = (op1 == op2 && n1 == n2); /* bits in each output coefficient */ bits = 2 * (FLINT_BITS - mod.norm) + FLINT_CLOG2(n2); /* we're evaluating at x = B, -B, 1/B, -1/B, where B = 2^b, and b = ceil(bits / 4) */ b = (bits + 3) / 4; /* number of ulongs required to store each base-B^2 digit */ w = (2*b - 1)/FLINT_BITS + 1; /* Write f1(x) = f1e(x^2) + x * f1o(x^2) f2(x) = f2e(x^2) + x * f2o(x^2) h(x) = he(x^2) + x * ho(x^2) "e" = even, "o" = odd */ n1o = n1 / 2; n1e = n1 - n1o; n2o = n2 / 2; n2e = n2 - n2o; n3 = n1 + n2 - 1; /* length of h */ n3o = n3 / 2; n3e = n3 - n3o; /* Put k1 = number of limbs needed to store f1(B) and |f1(-B)|. In f1(B), the leading coefficient starts at bit position b * (n1 - 1) and has length 2b, and the coefficients overlap so we need an extra bit for the carry: this gives (n1 + 1) * b + 1 bits. Ditto for f2. */ k1 = ((n1 + 1) * b)/FLINT_BITS + 1; k2 = ((n2 + 1) * b)/FLINT_BITS + 1; k3 = k1 + k2; /* allocate space */ v1_buf0 = _nmod_vec_init(5*k3); /* k1 limbs */ v2_buf0 = v1_buf0 + k1; /* k2 limbs */ v1_buf1 = v2_buf0 + k2; /* k1 limbs */ v2_buf1 = v1_buf1 + k1; /* k2 limbs */ v1_buf2 = v2_buf1 + k2; /* k1 limbs */ v2_buf2 = v1_buf2 + k1; /* k2 limbs */ v1_buf3 = v2_buf2 + k2; /* k1 limbs */ v2_buf3 = v1_buf3 + k1; /* k2 limbs */ v1_buf4 = v2_buf3 + k2; /* k1 limbs */ v2_buf4 = v1_buf4 + k1; /* k2 limbs */ /* arrange overlapping buffers to minimise memory use "p" = plus, "m" = minus "n" = normal order, "r" = reciprocal order */ v1en = v1_buf0; v1on = v1_buf1; v1pn = v1_buf2; v1mn = v1_buf0; v2en = v2_buf0; v2on = v2_buf1; v2pn = v2_buf2; v2mn = v2_buf0; v3pn = v1_buf1; v3mn = v1_buf2; v3en = v1_buf0; v3on = v1_buf1; v1er = v1_buf2; v1or = v1_buf3; v1pr = v1_buf4; v1mr = v1_buf2; v2er = v2_buf2; v2or = v2_buf3; v2pr = v2_buf4; v2mr = v2_buf2; v3pr = v1_buf3; v3mr = v1_buf4; v3er = v1_buf2; v3or = v1_buf3; z = _nmod_vec_init(2*w*(n3e + 1)); zn = z; zr = z + w*(n3e + 1); /* ------------------------------------------------------------------------- "normal" evaluation points */ if (!sqr) { /* multiplication version */ /* evaluate f1e(B^2) and B * f1o(B^2) We need max(2 * b*n1e, 2 * b*n1o + b) bits for this packing step, which is safe since (n1 + 1) * b + 1 >= max(2 * b*n1e, 2 * b*n1o + b). Ditto for f2 below. */ _nmod_poly_KS2_pack(v1en, op1, n1e, 2, 2 * b, 0, k1); _nmod_poly_KS2_pack(v1on, op1 + 1, n1o, 2, 2 * b, b, k1); /* compute f1(B) = f1e(B^2) + B * f1o(B^2) and |f1(-B)| = |f1e(B^2) - B * f1o(B^2)| */ mpn_add_n (v1pn, v1en, v1on, k1); v3m_neg = signed_mpn_sub_n(v1mn, v1en, v1on, k1); /* evaluate f2e(B^2) and B * f2o(B^2) */ _nmod_poly_KS2_pack(v2en, op2, n2e, 2, 2 * b, 0, k2); _nmod_poly_KS2_pack(v2on, op2 + 1, n2o, 2, 2 * b, b, k2); /* compute f2(B) = f2e(B^2) + B * f2o(B^2) and |f2(-B)| = |f2e(B^2) - B * f2o(B^2)| */ mpn_add_n(v2pn, v2en, v2on, k2); v3m_neg ^= signed_mpn_sub_n(v2mn, v2en, v2on, k2); /* compute h(B) = f1(B) * f2(B) and |h(-B)| = |f1(-B)| * |f2(-B)| hn_neg is set if h(-B) is negative */ mpn_mul(v3pn, v1pn, k1, v2pn, k2); mpn_mul(v3mn, v1mn, k1, v2mn, k2); } else { /* squaring version */ /* evaluate f1e(B^2) and B * f1o(B^2) */ _nmod_poly_KS2_pack(v1en, op1, n1e, 2, 2 * b, 0, k1); _nmod_poly_KS2_pack(v1on, op1 + 1, n1o, 2, 2 * b, b, k1); /* compute f1(B) = f1e(B^2) + B * f1o(B^2) and |f1(-B)| = |f1e(B^2) - B * f1o(B^2)| */ mpn_add_n (v1pn, v1en, v1on, k1); signed_mpn_sub_n(v1mn, v1en, v1on, k1); /* compute h(B) = f1(B)^2 and h(-B) = |f1(-B)|^2 hn_neg is cleared since h(-B) is never negative */ mpn_mul(v3pn, v1pn, k1, v1pn, k1); mpn_mul(v3mn, v1mn, k1, v1mn, k1); v3m_neg = 0; } /* Each coefficient of h(B) is up to 4b bits long, so h(B) needs at most ((n1 + n2 + 2) * b + 1) bits. (The extra +1 is to accommodate carries generated by overlapping coefficients.) The buffer has at least ((n1 + n2 + 2) * b + 2) bits. Therefore we can safely store 2*h(B) etc. */ /* compute 2 * he(B^2) = h(B) + h(-B) and B * 2 * ho(B^2) = h(B) - h(-B) */ if (v3m_neg) { mpn_sub_n(v3en, v3pn, v3mn, k3); mpn_add_n (v3on, v3pn, v3mn, k3); } else { mpn_add_n (v3en, v3pn, v3mn, k3); mpn_sub_n (v3on, v3pn, v3mn, k3); } /* ------------------------------------------------------------------------- "reciprocal" evaluation points */ /* correction factors to take into account that if a polynomial has even length, its even and odd coefficients are swapped when the polynomial is reversed */ a1 = (n1 & 1) ? 0 : b; a2 = (n2 & 1) ? 0 : b; a3 = (n3 & 1) ? 0 : b; if (!sqr) { /* multiplication version */ /* evaluate B^(n1-1) * f1e(1/B^2) and B^(n1-2) * f1o(1/B^2) */ _nmod_poly_KS2_pack(v1er, op1 + 2*(n1e - 1), n1e, -2, 2 * b, a1, k1); _nmod_poly_KS2_pack(v1or, op1 + 1 + 2*(n1o - 1), n1o, -2, 2 * b, b - a1, k1); /* compute B^(n1-1) * f1(1/B) = B^(n1-1) * f1e(1/B^2) + B^(n1-2) * f1o(1/B^2) and |B^(n1-1) * f1(-1/B)| = |B^(n1-1) * f1e(1/B^2) - B^(n1-2) * f1o(1/B^2)| */ mpn_add_n(v1pr, v1er, v1or, k1); v3m_neg = signed_mpn_sub_n(v1mr, v1er, v1or, k1); /* evaluate B^(n2-1) * f2e(1/B^2) and B^(n2-2) * f2o(1/B^2) */ _nmod_poly_KS2_pack(v2er, op2 + 2*(n2e - 1), n2e, -2, 2 * b, a2, k2); _nmod_poly_KS2_pack(v2or, op2 + 1 + 2*(n2o - 1), n2o, -2, 2 * b, b - a2, k2); /* compute B^(n2-1) * f2(1/B) = B^(n2-1) * f2e(1/B^2) + B^(n2-2) * f2o(1/B^2) and |B^(n1-1) * f2(-1/B)| = |B^(n2-1) * f2e(1/B^2) - B^(n2-2) * f2o(1/B^2)| */ mpn_add_n (v2pr, v2er, v2or, k2); v3m_neg ^= signed_mpn_sub_n(v2mr, v2er, v2or, k2); /* compute B^(n3-1) * h(1/B) = (B^(n1-1) * f1(1/B)) * (B^(n2-1) * f2(1/B)) and |B^(n3-1) * h(-1/B)| = |B^(n1-1) * f1(-1/B)| * |B^(n2-1) * f2(-1/B)| hr_neg is set if h(-1/B) is negative */ mpn_mul(v3pr, v1pr, k1, v2pr, k2); mpn_mul(v3mr, v1mr, k1, v2mr, k2); } else { /* squaring version */ /* evaluate B^(n1-1) * f1e(1/B^2) and B^(n1-2) * f1o(1/B^2) */ _nmod_poly_KS2_pack(v1er, op1 + 2*(n1e - 1), n1e, -2, 2 * b, a1, k1); _nmod_poly_KS2_pack(v1or, op1 + 1 + 2*(n1o - 1), n1o, -2, 2 * b, b - a1, k1); /* compute B^(n1-1) * f1(1/B) = B^(n1-1) * f1e(1/B^2) + B^(n1-2) * f1o(1/B^2) and |B^(n1-1) * f1(-1/B)| = |B^(n1-1) * f1e(1/B^2) - B^(n1-2) * f1o(1/B^2)| */ mpn_add_n(v1pr, v1er, v1or, k1); signed_mpn_sub_n(v1mr, v1er, v1or, k1); /* compute B^(n3-1) * h(1/B) = (B^(n1-1) * f1(1/B))^2 and B^(n3-1) * h(-1/B) = |B^(n1-1) * f1(-1/B)|^2 hr_neg is cleared since h(-1/B) is never negative */ mpn_mul(v3pr, v1pr, k1, v1pr, k1); mpn_mul(v3mr, v1mr, k1, v1mr, k1); v3m_neg = 0; } /* compute 2 * B^(n3-1) * he(1/B^2) = B^(n3-1) * h(1/B) + B^(n3-1) * h(-1/B) and 2 * B^(n3-2) * ho(1/B^2) = B^(n3-1) * h(1/B) - B^(n3-1) * h(-1/B) */ if (v3m_neg) { mpn_sub_n(v3er, v3pr, v3mr, k3); mpn_add_n(v3or, v3pr, v3mr, k3); } else { mpn_add_n (v3er, v3pr, v3mr, k3); mpn_sub_n (v3or, v3pr, v3mr, k3); } /* ------------------------------------------------------------------------- combine "normal" and "reciprocal" information */ /* decompose he(B^2) and B^(2*(n3e-1)) * he(1/B^2) into base-B^2 digits */ _nmod_poly_KS2_unpack(zn, v3en, n3e + 1, 2 * b, 1); _nmod_poly_KS2_unpack(zr, v3er, n3e + 1, 2 * b, a3 + 1); /* combine he(B^2) and he(1/B^2) information to get even coefficients of h */ _nmod_poly_KS2_recover_reduce(res, 2, zn, zr, n3e, 2 * b, mod); /* decompose ho(B^2) and B^(2*(n3o-1)) * ho(1/B^2) into base-B^2 digits */ _nmod_poly_KS2_unpack(zn, v3on, n3o + 1, 2 * b, b + 1); _nmod_poly_KS2_unpack(zr, v3or, n3o + 1, 2 * b, b - a3 + 1); /* combine ho(B^2) and ho(1/B^2) information to get odd coefficients of h */ _nmod_poly_KS2_recover_reduce(res + 1, 2, zn, zr, n3o, 2 * b, mod); _nmod_vec_clear(z); _nmod_vec_clear(v1_buf0); } void nmod_poly_mul_KS4(nmod_poly_t res, const nmod_poly_t poly1, const nmod_poly_t poly2) { slong len_out; if ((poly1->length == 0) || (poly2->length == 0)) { nmod_poly_zero(res); return; } len_out = poly1->length + poly2->length - 1; if (res == poly1 || res == poly2) { nmod_poly_t temp; nmod_poly_init2_preinv(temp, poly1->mod.n, poly1->mod.ninv, len_out); if (poly1->length >= poly2->length) _nmod_poly_mul_KS4(temp->coeffs, poly1->coeffs, poly1->length, poly2->coeffs, poly2->length, poly1->mod); else _nmod_poly_mul_KS4(temp->coeffs, poly2->coeffs, poly2->length, poly1->coeffs, poly1->length, poly1->mod); nmod_poly_swap(res, temp); nmod_poly_clear(temp); } else { nmod_poly_fit_length(res, len_out); if (poly1->length >= poly2->length) _nmod_poly_mul_KS4(res->coeffs, poly1->coeffs, poly1->length, poly2->coeffs, poly2->length, poly1->mod); else _nmod_poly_mul_KS4(res->coeffs, poly2->coeffs, poly2->length, poly1->coeffs, poly1->length, poly1->mod); } res->length = len_out; _nmod_poly_normalise(res); }