/*=============================================================================

    This file is part of FLINT.

    FLINT is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    FLINT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with FLINT; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA

=============================================================================*/
/******************************************************************************

    Copyright (C) 2010,2012 Fredrik Johansson

******************************************************************************/

#include <stdlib.h>
#include <gmp.h>
#include "flint.h"
#include "nmod_mat.h"
#include "nmod_vec.h"

/*
with op = 0, computes D = A*B
with op = 1, computes D = C + A*B
with op = -1, computes D = C - A*B
*/

static __inline__ void
_nmod_mat_addmul_basic(mp_ptr * D, mp_ptr * const C, mp_ptr * const A,
    mp_ptr * const B, slong m, slong k, slong n, int op, nmod_t mod, int nlimbs)
{
    slong i, j;
    mp_limb_t c;

    for (i = 0; i < m; i++)
    {
        for (j = 0; j < n; j++)
        {
            c = _nmod_vec_dot_ptr(A[i], B, j, k, mod, nlimbs);

            if (op == 1)
                c = nmod_add(C[i][j], c, mod);
            else if (op == -1)
                c = nmod_sub(C[i][j], c, mod);

            D[i][j] = c;
        }
    }
}

static __inline__ void
_nmod_mat_addmul_transpose(mp_ptr * D, const mp_ptr * C, const mp_ptr * A,
    const mp_ptr * B, slong m, slong k, slong n, int op, nmod_t mod, int nlimbs)
{
    mp_ptr tmp;
    mp_limb_t c;
    slong i, j;

    tmp = flint_malloc(sizeof(mp_limb_t) * k * n);

    for (i = 0; i < k; i++)
        for (j = 0; j < n; j++)
            tmp[j*k + i] = B[i][j];

    for (i = 0; i < m; i++)
    {
        for (j = 0; j < n; j++)
        {
            c = _nmod_vec_dot(A[i], tmp + j*k, k, mod, nlimbs);

            if (op == 1)
                c = nmod_add(C[i][j], c, mod);
            else if (op == -1)
                c = nmod_sub(C[i][j], c, mod);

            D[i][j] = c;
        }
    }

    flint_free(tmp);
}

/* requires nlimbs = 1 */
void
_nmod_mat_addmul_packed(mp_ptr * D, const mp_ptr * C, const mp_ptr * A,
    const mp_ptr * B, slong M, slong N, slong K, int op, nmod_t mod, int nlimbs)
{
    slong i, j, k;
    slong Kpack;
    int pack, pack_bits;
    mp_limb_t c, d, mask;
    mp_ptr tmp;
    mp_ptr Aptr, Tptr;

    /* bound unreduced entry */
    c = N * (mod.n-1) * (mod.n-1);
    pack_bits = FLINT_BIT_COUNT(c);
    pack = FLINT_BITS / pack_bits;
    Kpack = (K + pack - 1) / pack;

    if (pack_bits == FLINT_BITS)
        mask = UWORD(-1);
    else
        mask = (UWORD(1) << pack_bits) - 1;

    tmp = _nmod_vec_init(Kpack * N);

    /* pack and transpose B */
    for (i = 0; i < Kpack; i++)
    {
        for (k = 0; k < N; k++)
        {
            c = B[k][i * pack];

            for (j = 1; j < pack && i * pack + j < K; j++)
                c |= B[k][i * pack + j] << (pack_bits * j);

            tmp[i * N + k] = c;
        }
    }

    /* multiply */
    for (i = 0; i < M; i++)
    {
        for (j = 0; j < Kpack; j++)
        {
            Aptr = A[i];
            Tptr = tmp + j * N;

            c = 0;

            /* unroll by 4 */
            for (k = 0; k + 4 <= N; k += 4)
            {
                c += Aptr[k + 0] * Tptr[k + 0];
                c += Aptr[k + 1] * Tptr[k + 1];
                c += Aptr[k + 2] * Tptr[k + 2];
                c += Aptr[k + 3] * Tptr[k + 3];
            }

            for ( ; k < N; k++)
                c += Aptr[k] * Tptr[k];

            /* unpack and reduce */
            for (k = 0; k < pack && j * pack + k < K; k++)
            {
                d = (c >> (k * pack_bits)) & mask;
                NMOD_RED(d, d, mod);

                if (op == 1)
                    d = nmod_add(C[i][j * pack + k], d, mod);
                else if (op == -1)
                    d = nmod_sub(C[i][j * pack + k], d, mod);

                D[i][j * pack + k] = d;
            }
        }
    }

    _nmod_vec_clear(tmp);
}



void
_nmod_mat_mul_classical(nmod_mat_t D, const nmod_mat_t C,
                                const nmod_mat_t A, const nmod_mat_t B, int op)
{
    slong m, k, n;
    int nlimbs;
    nmod_t mod;

    mod = A->mod;
    m = A->r;
    k = A->c;
    n = B->c;

    if (k == 0)
    {
        if (op == 0)
            nmod_mat_zero(D);
        else
            nmod_mat_set(D, C);
        return;
    }

    nlimbs = _nmod_vec_dot_bound_limbs(k, mod);

    if (nlimbs == 1 && m > 10 && k > 10 && n > 10)
    {
        _nmod_mat_addmul_packed(D->rows, (op == 0) ? NULL : C->rows,
            A->rows, B->rows, m, k, n, op, D->mod, nlimbs);
    }
    else if (m < NMOD_MAT_MUL_TRANSPOSE_CUTOFF
        || n < NMOD_MAT_MUL_TRANSPOSE_CUTOFF
        || k < NMOD_MAT_MUL_TRANSPOSE_CUTOFF)
    {
        _nmod_mat_addmul_basic(D->rows, (op == 0) ? NULL : C->rows,
            A->rows, B->rows, m, k, n, op, D->mod, nlimbs);
    }
    else
    {
        _nmod_mat_addmul_transpose(D->rows, (op == 0) ? NULL : C->rows,
            A->rows, B->rows, m, k, n, op, D->mod, nlimbs);
    }
}


void
nmod_mat_mul_classical(nmod_mat_t C, const nmod_mat_t A, const nmod_mat_t B)
{
    _nmod_mat_mul_classical(C, NULL, A, B, 0);
}