pqc/external/flint-2.4.3/nmod_mat/mul_classical.c

/*=============================================================================

    This file is part of FLINT.

    FLINT is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    FLINT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with FLINT; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA

=============================================================================*/
/******************************************************************************

    Copyright (C) 2010,2012 Fredrik Johansson

******************************************************************************/

#include <stdlib.h>
#include <gmp.h>
#include "flint.h"
#include "nmod_mat.h"
#include "nmod_vec.h"

/*
with op = 0, computes D = A*B
with op = 1, computes D = C + A*B
with op = -1, computes D = C - A*B
*/

/*
    Straightforward kernel: every output entry is produced by one call to
    _nmod_vec_dot_ptr on row `row` of A and index `col` into the rows of B,
    over an inner length of k.

    D, C, A, B are arrays of row pointers; D is m x n.  C is only
    dereferenced when op is 1 or -1, so it may be NULL when op is 0.
    mod and nlimbs are passed through unchanged to the dot product helper.
*/
static __inline__ void
_nmod_mat_addmul_basic(mp_ptr * D, mp_ptr * const C, mp_ptr * const A,
    mp_ptr * const B, slong m, slong k, slong n, int op, nmod_t mod, int nlimbs)
{
    slong row, col;
    mp_limb_t entry;

    for (row = 0; row < m; row++)
    {
        for (col = 0; col < n; col++)
        {
            entry = _nmod_vec_dot_ptr(A[row], B, col, k, mod, nlimbs);

            /* Combine with C according to op; any other op stores the
               bare product entry. */
            switch (op)
            {
                case 1:
                    entry = nmod_add(C[row][col], entry, mod);
                    break;
                case -1:
                    entry = nmod_sub(C[row][col], entry, mod);
                    break;
                default:
                    break;
            }

            D[row][col] = entry;
        }
    }
}

/*
    Same contract as the basic kernel, but B (k x n) is first copied into a
    temporary buffer in transposed order, so that each column of B becomes a
    contiguous run of k limbs that can be handed to _nmod_vec_dot together
    with a row of A.

    C is only dereferenced when op is 1 or -1.  The temporary buffer is
    allocated with flint_malloc and released before returning.
*/
static __inline__ void
_nmod_mat_addmul_transpose(mp_ptr * D, const mp_ptr * C, const mp_ptr * A,
    const mp_ptr * B, slong m, slong k, slong n, int op, nmod_t mod, int nlimbs)
{
    mp_ptr Bt;
    mp_limb_t dot;
    slong r, s;

    Bt = flint_malloc(sizeof(mp_limb_t) * k * n);

    /* Column s of B is stored at Bt + s*k, ..., Bt + s*k + (k - 1). */
    for (r = 0; r < k; r++)
    {
        for (s = 0; s < n; s++)
        {
            Bt[s*k + r] = B[r][s];
        }
    }

    for (r = 0; r < m; r++)
    {
        for (s = 0; s < n; s++)
        {
            dot = _nmod_vec_dot(A[r], Bt + s*k, k, mod, nlimbs);

            if (op == -1)
            {
                dot = nmod_sub(C[r][s], dot, mod);
            }
            else if (op == 1)
            {
                dot = nmod_add(C[r][s], dot, mod);
            }

            D[r][s] = dot;
        }
    }

    flint_free(Bt);
}

/*
    requires nlimbs = 1

    Packed kernel.  Note the parameter naming: A is M x N, B is N x K and
    D (and C, when used) is M x K, i.e. N is the inner dimension here.
    The dispatcher in this file calls it with (m, k, n) in these positions.

    Idea: bound = N * (mod.n - 1)^2 is taken as the bound on an unreduced
    output entry.  If that bound needs field_bits bits, then
    per_word = FLINT_BITS / field_bits entries taken from consecutive
    columns of one row of B are OR-ed into a single limb, each shifted into
    its own field_bits-wide field.  Multiplying a row of A against such
    packed limbs and summing then accumulates up to per_word output entries
    at once, one per field; they are extracted with a shift and mask and
    reduced with NMOD_RED afterwards.

    NOTE(review): this relies on the entries of A and B being at most
    mod.n - 1 so that no field spills into its neighbour, and on bound
    being nonzero (field_bits is used as a divisor) -- presumably both are
    guaranteed by the caller together with the nlimbs = 1 requirement;
    confirm.  The nlimbs argument itself is not read.

    C is only dereferenced when op is 1 or -1.
*/
void
_nmod_mat_addmul_packed(mp_ptr * D, const mp_ptr * C, const mp_ptr * A,
    const mp_ptr * B, slong M, slong N, slong K, int op, nmod_t mod, int nlimbs)
{
    slong row, blk, t, first;
    slong nblocks;
    int per_word, field_bits;
    mp_limb_t acc, entry, field_mask;
    mp_ptr packed;
    mp_ptr arow, pcol;

    /* bound on an unreduced entry, and the field layout derived from it */
    acc = N * (mod.n-1) * (mod.n-1);
    field_bits = FLINT_BIT_COUNT(acc);
    per_word = FLINT_BITS / field_bits;
    nblocks = (K + per_word - 1) / per_word;

    /* a full-width field cannot be built with a shift by FLINT_BITS */
    field_mask = (field_bits == FLINT_BITS)
        ? UWORD(-1)
        : (UWORD(1) << field_bits) - 1;

    packed = _nmod_vec_init(nblocks * N);

    /* Pack and transpose B: block blk covers columns
       first, ..., first + per_word - 1 (clipped to K), and the packed
       limbs of that block for all N rows are stored contiguously. */
    for (blk = 0; blk < nblocks; blk++)
    {
        first = blk * per_word;

        for (t = 0; t < N; t++)
        {
            slong f;

            acc = B[t][first];

            for (f = 1; f < per_word && first + f < K; f++)
            {
                acc |= B[t][first + f] << (field_bits * f);
            }

            packed[blk * N + t] = acc;
        }
    }

    /* Multiply: one accumulation pass per (row of A, column block). */
    for (row = 0; row < M; row++)
    {
        arow = A[row];

        for (blk = 0; blk < nblocks; blk++)
        {
            first = blk * per_word;
            pcol = packed + blk * N;
            acc = 0;

            /* main part, four terms per iteration */
            t = 0;
            while (t + 4 <= N)
            {
                acc += arow[t + 0] * pcol[t + 0];
                acc += arow[t + 1] * pcol[t + 1];
                acc += arow[t + 2] * pcol[t + 2];
                acc += arow[t + 3] * pcol[t + 3];
                t += 4;
            }

            /* leftover terms */
            while (t < N)
            {
                acc += arow[t] * pcol[t];
                t++;
            }

            /* Unpack each field, reduce it, and combine with C. */
            for (t = 0; t < per_word && first + t < K; t++)
            {
                entry = (acc >> (t * field_bits)) & field_mask;
                NMOD_RED(entry, entry, mod);

                switch (op)
                {
                    case 1:
                        entry = nmod_add(C[row][first + t], entry, mod);
                        break;
                    case -1:
                        entry = nmod_sub(C[row][first + t], entry, mod);
                        break;
                    default:
                        break;
                }

                D[row][first + t] = entry;
            }
        }
    }

    _nmod_vec_clear(packed);
}


/*
    Dispatcher for the classical kernels in this file.

    With op = 0 sets D = A*B, with op = 1 sets D = C + A*B and with
    op = -1 sets D = C - A*B.  C is not accessed when op is 0, so NULL may
    be passed for it in that case.

    The limb count for the dot products is derived from A's modulus and the
    inner dimension; the kernels themselves are handed D's modulus.
*/
void
_nmod_mat_mul_classical(nmod_mat_t D, const nmod_mat_t C,
                                const nmod_mat_t A, const nmod_mat_t B, int op)
{
    slong m, k, n;
    int nlimbs;
    mp_ptr * c_rows;

    m = A->r;
    k = A->c;
    n = B->c;

    /* Empty inner dimension: the product contributes nothing. */
    if (k == 0)
    {
        if (op != 0)
            nmod_mat_set(D, C);
        else
            nmod_mat_zero(D);
        return;
    }

    nlimbs = _nmod_vec_dot_bound_limbs(k, A->mod);

    /* Only touch C when an addend is actually requested. */
    c_rows = (op == 0) ? NULL : C->rows;

    if (nlimbs == 1 && m > 10 && k > 10 && n > 10)
    {
        /* single-limb dot products and all dimensions above 10 */
        _nmod_mat_addmul_packed(D->rows, c_rows,
            A->rows, B->rows, m, k, n, op, D->mod, nlimbs);
    }
    else if (m >= NMOD_MAT_MUL_TRANSPOSE_CUTOFF
        && n >= NMOD_MAT_MUL_TRANSPOSE_CUTOFF
        && k >= NMOD_MAT_MUL_TRANSPOSE_CUTOFF)
    {
        /* every dimension reaches the cutoff: use the transposed copy of B */
        _nmod_mat_addmul_transpose(D->rows, c_rows,
            A->rows, B->rows, m, k, n, op, D->mod, nlimbs);
    }
    else
    {
        /* at least one dimension is below the cutoff */
        _nmod_mat_addmul_basic(D->rows, c_rows,
            A->rows, B->rows, m, k, n, op, D->mod, nlimbs);
    }
}


/*
    Sets C = A*B using the classical multiplication routines in this file.

    Forwards to _nmod_mat_mul_classical with op = 0; no addend matrix is
    involved, so NULL is passed in its place.

    NOTE(review): dimension compatibility and non-aliasing of C with A or B
    are not checked here -- presumably the caller's responsibility; confirm.
*/
void
nmod_mat_mul_classical(nmod_mat_t C, const nmod_mat_t A, const nmod_mat_t B)
{
    _nmod_mat_mul_classical(C, NULL, A, B, 0);
}
ALL: Add flint 2014-05-18 22:03:37 +00:00			`/*=============================================================================`

			`This file is part of FLINT.`

			`FLINT is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; either version 2 of the License, or`
			`(at your option) any later version.`

			`FLINT is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with FLINT; if not, write to the Free Software`
			`Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA`

			`=============================================================================*/`
			`/******************************************************************************`

			`Copyright (C) 2010,2012 Fredrik Johansson`

			`******************************************************************************/`

			`#include <stdlib.h>`
			`#include <gmp.h>`
			`#include "flint.h"`
			`#include "nmod_mat.h"`
			`#include "nmod_vec.h"`

			`/*`
			`with op = 0, computes D = A*B`
			`with op = 1, computes D = C + A*B`
			`with op = -1, computes D = C - A*B`
			`*/`

			`static __inline__ void`
			`_nmod_mat_addmul_basic(mp_ptr * D, mp_ptr * const C, mp_ptr * const A,`
			`mp_ptr * const B, slong m, slong k, slong n, int op, nmod_t mod, int nlimbs)`
			`{`
			`slong i, j;`
			`mp_limb_t c;`

			`for (i = 0; i < m; i++)`
			`{`
			`for (j = 0; j < n; j++)`
			`{`
			`c = _nmod_vec_dot_ptr(A[i], B, j, k, mod, nlimbs);`

			`if (op == 1)`
			`c = nmod_add(C[i][j], c, mod);`
			`else if (op == -1)`
			`c = nmod_sub(C[i][j], c, mod);`

			`D[i][j] = c;`
			`}`
			`}`
			`}`

			`static __inline__ void`
			`_nmod_mat_addmul_transpose(mp_ptr * D, const mp_ptr * C, const mp_ptr * A,`
			`const mp_ptr * B, slong m, slong k, slong n, int op, nmod_t mod, int nlimbs)`
			`{`
			`mp_ptr tmp;`
			`mp_limb_t c;`
			`slong i, j;`

			`tmp = flint_malloc(sizeof(mp_limb_t) * k * n);`

			`for (i = 0; i < k; i++)`
			`for (j = 0; j < n; j++)`
			`tmp[j*k + i] = B[i][j];`

			`for (i = 0; i < m; i++)`
			`{`
			`for (j = 0; j < n; j++)`
			`{`
			`c = _nmod_vec_dot(A[i], tmp + j*k, k, mod, nlimbs);`

			`if (op == 1)`
			`c = nmod_add(C[i][j], c, mod);`
			`else if (op == -1)`
			`c = nmod_sub(C[i][j], c, mod);`

			`D[i][j] = c;`
			`}`
			`}`

			`flint_free(tmp);`
			`}`

			`/* requires nlimbs = 1 */`
			`void`
			`_nmod_mat_addmul_packed(mp_ptr * D, const mp_ptr * C, const mp_ptr * A,`
			`const mp_ptr * B, slong M, slong N, slong K, int op, nmod_t mod, int nlimbs)`
			`{`
			`slong i, j, k;`
			`slong Kpack;`
			`int pack, pack_bits;`
			`mp_limb_t c, d, mask;`
			`mp_ptr tmp;`
			`mp_ptr Aptr, Tptr;`

			`/* bound unreduced entry */`
			`c = N * (mod.n-1) * (mod.n-1);`
			`pack_bits = FLINT_BIT_COUNT(c);`
			`pack = FLINT_BITS / pack_bits;`
			`Kpack = (K + pack - 1) / pack;`

			`if (pack_bits == FLINT_BITS)`
			`mask = UWORD(-1);`
			`else`
			`mask = (UWORD(1) << pack_bits) - 1;`

			`tmp = _nmod_vec_init(Kpack * N);`

			`/* pack and transpose B */`
			`for (i = 0; i < Kpack; i++)`
			`{`
			`for (k = 0; k < N; k++)`
			`{`
			`c = B[k][i * pack];`

			`for (j = 1; j < pack && i * pack + j < K; j++)`
			`c \|= B[k][i * pack + j] << (pack_bits * j);`

			`tmp[i * N + k] = c;`
			`}`
			`}`

			`/* multiply */`
			`for (i = 0; i < M; i++)`
			`{`
			`for (j = 0; j < Kpack; j++)`
			`{`
			`Aptr = A[i];`
			`Tptr = tmp + j * N;`

			`c = 0;`

			`/* unroll by 4 */`
			`for (k = 0; k + 4 <= N; k += 4)`
			`{`
			`c += Aptr[k + 0] * Tptr[k + 0];`
			`c += Aptr[k + 1] * Tptr[k + 1];`
			`c += Aptr[k + 2] * Tptr[k + 2];`
			`c += Aptr[k + 3] * Tptr[k + 3];`
			`}`

			`for ( ; k < N; k++)`
			`c += Aptr[k] * Tptr[k];`

			`/* unpack and reduce */`
			`for (k = 0; k < pack && j * pack + k < K; k++)`
			`{`
			`d = (c >> (k * pack_bits)) & mask;`
			`NMOD_RED(d, d, mod);`

			`if (op == 1)`
			`d = nmod_add(C[i][j * pack + k], d, mod);`
			`else if (op == -1)`
			`d = nmod_sub(C[i][j * pack + k], d, mod);`

			`D[i][j * pack + k] = d;`
			`}`
			`}`
			`}`

			`_nmod_vec_clear(tmp);`
			`}`



			`void`
			`_nmod_mat_mul_classical(nmod_mat_t D, const nmod_mat_t C,`
			`const nmod_mat_t A, const nmod_mat_t B, int op)`
			`{`
			`slong m, k, n;`
			`int nlimbs;`
			`nmod_t mod;`

			`mod = A->mod;`
			`m = A->r;`
			`k = A->c;`
			`n = B->c;`

			`if (k == 0)`
			`{`
			`if (op == 0)`
			`nmod_mat_zero(D);`
			`else`
			`nmod_mat_set(D, C);`
			`return;`
			`}`

			`nlimbs = _nmod_vec_dot_bound_limbs(k, mod);`

			`if (nlimbs == 1 && m > 10 && k > 10 && n > 10)`
			`{`
			`_nmod_mat_addmul_packed(D->rows, (op == 0) ? NULL : C->rows,`
			`A->rows, B->rows, m, k, n, op, D->mod, nlimbs);`
			`}`
			`else if (m < NMOD_MAT_MUL_TRANSPOSE_CUTOFF`
			`\|\| n < NMOD_MAT_MUL_TRANSPOSE_CUTOFF`
			`\|\| k < NMOD_MAT_MUL_TRANSPOSE_CUTOFF)`
			`{`
			`_nmod_mat_addmul_basic(D->rows, (op == 0) ? NULL : C->rows,`
			`A->rows, B->rows, m, k, n, op, D->mod, nlimbs);`
			`}`
			`else`
			`{`
			`_nmod_mat_addmul_transpose(D->rows, (op == 0) ? NULL : C->rows,`
			`A->rows, B->rows, m, k, n, op, D->mod, nlimbs);`
			`}`
			`}`


			`void`
			`nmod_mat_mul_classical(nmod_mat_t C, const nmod_mat_t A, const nmod_mat_t B)`
			`{`
			`_nmod_mat_mul_classical(C, NULL, A, B, 0);`
			`}`