/*============================================================================

    Copyright 2006 Jason Papadopoulos.
    Copyright 2006, 2011 William Hart.

    This file is part of FLINT.

    FLINT is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    FLINT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with FLINT; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

===============================================================================

    Optionally, please be nice and tell me if you find this source to be
    useful. Again optionally, if you add to the functionality present here
    please consider making those additions public too, so that others may
    benefit from your work.

    --jasonp@boo.net 9/8/06

    The following modifications were made by William Hart:
        -added the utility function get_null_entry
        -reformatted original code so it would operate as a standalone
         filter and block Lanczos module

--------------------------------------------------------------------*/

#define ulong ulongxx /* interferes with system includes */
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#undef ulong
#define ulong mp_limb_t

#include <gmp.h>
#include "flint.h"
#include "ulong_extras.h"
#include "qsieve.h"

#define BIT(x) (((uint64_t)(1)) << (x))

static const uint64_t bitmask[64] = {
    BIT( 0), BIT( 1), BIT( 2), BIT( 3), BIT( 4), BIT( 5), BIT( 6), BIT( 7),
    BIT( 8), BIT( 9), BIT(10), BIT(11), BIT(12), BIT(13), BIT(14), BIT(15),
    BIT(16), BIT(17), BIT(18), BIT(19), BIT(20), BIT(21), BIT(22), BIT(23),
    BIT(24), BIT(25), BIT(26), BIT(27), BIT(28), BIT(29), BIT(30), BIT(31),
    BIT(32), BIT(33), BIT(34), BIT(35), BIT(36), BIT(37), BIT(38), BIT(39),
    BIT(40), BIT(41), BIT(42), BIT(43), BIT(44), BIT(45), BIT(46), BIT(47),
    BIT(48), BIT(49), BIT(50), BIT(51), BIT(52), BIT(53), BIT(54), BIT(55),
    BIT(56), BIT(57), BIT(58), BIT(59), BIT(60), BIT(61), BIT(62), BIT(63),
};

/*--------------------------------------------------------------------*/
uint64_t get_null_entry(uint64_t * nullrows, slong i, slong l) {

    /* Returns true if the entry with indices i,l is 1 in the
       supplied 64xN matrix. This is used to read the nullspace
       vectors which are output by the Lanczos routine */

    return nullrows[i] & bitmask[l];
}
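
/* Illustrative sketch (not part of the library): a caller that wants to read
   the l-th nullspace vector out of the 64 x N output of block_lanczos()
   typically walks the matrix columns and tests each bit, e.g.

       for (i = 0; i < ncols; i++)
           if (get_null_entry(nullrows, i, l))
               ;  -- relation i participates in dependency l --

   The loop bound ncols and the action taken for each participating relation
   are assumptions about the caller, not something defined in this file. */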

/*--------------------------------------------------------------------*/
void reduce_matrix(qs_t qs_inf, slong *nrows, slong *ncols, la_col_t *cols) {

    /* Perform light filtering on the nrows x ncols
       matrix specified by cols[]. The processing here is
       limited to deleting columns that contain a singleton
       row, then resizing the matrix to have a few more
       columns than rows. Because deleting a column reduces
       the counts in several different rows, the process
       must iterate to convergence.

       Note that this step is not intended to make the Lanczos
       iteration run any faster (though it will); it's just
       that if we don't go to this trouble then there are
       factorizations for which the matrix step will fail
       outright */

    slong r, c, i, j, k;
    slong passes;
    slong *counts;
    slong reduced_rows;
    slong reduced_cols;

    /* count the number of nonzero entries in each row */

    counts = (slong *) flint_calloc((size_t) *nrows, sizeof(slong));
    for (i = 0; i < *ncols; i++) {
        for (j = 0; j < cols[i].weight; j++)
            counts[cols[i].data[j]]++;
    }

    reduced_rows = *nrows;
    reduced_cols = *ncols;
    passes = 0;

    do {
        r = reduced_rows;

        /* remove any columns that contain the only entry
           in one or more rows, then update the row counts
           to reflect the missing column. Iterate until
           no more columns can be deleted */

        do {
            c = reduced_cols;
            for (i = j = 0; i < reduced_cols; i++) {
                la_col_t *col = cols + i;
                for (k = 0; k < col->weight; k++) {
                    if (counts[col->data[k]] < 2)
                        break;
                }

                if (k < col->weight) {
                    for (k = 0; k < col->weight; k++) {
                        counts[col->data[k]]--;
                    }
                    free_col(col);
                    clear_col(col);
                }
                else {
                    cols[j++] = cols[i];
                    if (j - 1 != i) clear_col(col);
                }
            }
            reduced_cols = j;
        } while (c != reduced_cols);

        /* count the number of rows that contain a
           nonzero entry */

        for (i = reduced_rows = 0; i < *nrows; i++) {
            if (counts[i])
                reduced_rows++;
        }

        /* Because deleting a column reduces the weight
           of many rows, the number of nonzero rows may
           be much less than the number of columns. Delete
           more columns until the matrix has the correct
           aspect ratio. Columns at the end of cols[] are
           the heaviest, so delete those (and update the
           row counts again) */

        if (reduced_cols > reduced_rows + qs_inf->extra_rels) {
            for (i = reduced_rows + qs_inf->extra_rels;
                 i < reduced_cols; i++) {

                la_col_t *col = cols + i;
                for (j = 0; j < col->weight; j++) {
                    counts[col->data[j]]--;
                }
                free_col(col);
                clear_col(col);
            }
            reduced_cols = reduced_rows + qs_inf->extra_rels;
        }

        /* if any columns were deleted in the previous step,
           then the matrix is less dense and more columns
           can be deleted; iterate until no further deletions
           are possible */

        passes++;

    } while (r != reduced_rows);

#if (QS_DEBUG & 128)
    flint_printf("reduce to %wd x %wd in %wd passes\n",
                 reduced_rows, reduced_cols, passes);
#endif

    flint_free(counts);

    /* record the final matrix size. Note that we can't touch
       nrows because all the column data (and the sieving relations
       that produced it) would have to be updated */

    *ncols = reduced_cols;
}
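
/* Why singleton rows justify deleting a whole column: a nullspace vector is a
   set of columns whose GF(2) sum is zero in every row.  If some row has
   exactly one nonzero entry, the column holding that entry can never belong
   to such a set, so it is dead weight and reduce_matrix() drops it.  Each
   deletion can create new singleton rows, which is why the inner do/while
   loop above runs until no further columns can be removed. */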

/*-------------------------------------------------------------------*/
static void mul_64x64_64x64(uint64_t *a, uint64_t *b, uint64_t *c) {

    /* c[][] = a[][] * b[][], where all operands are 64 x 64
       (i.e. contain 64 words of 64 bits each). The result
       may overwrite a or b. */

    uint64_t ai, bj, accum;
    uint64_t tmp[64];
    ulong i, j;

    for (i = 0; i < 64; i++) {
        j = 0;
        accum = 0;
        ai = a[i];

        while (ai) {
            bj = b[j];
            if (ai & 1)
                accum ^= bj;
            ai >>= 1;
            j++;
        }

        tmp[i] = accum;
    }
    memcpy(c, tmp, sizeof(tmp));
}
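
/* Reading the loop above: row i of the product is the XOR of exactly those
   rows b[j] for which bit j of a[i] is set.  For example, a[i] = 0x5 (bits 0
   and 2 set) gives tmp[i] = b[0] ^ b[2].  In particular, taking a to be the
   identity matrix bitmask[] would reproduce b unchanged; this is only an
   illustration of the convention, not a call made anywhere in this file. */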

/*-----------------------------------------------------------------------*/
static void precompute_Nx64_64x64(uint64_t *x, uint64_t *c) {

    /* Let x[][] be a 64 x 64 matrix in GF(2), represented
       as 64 words of 64 bits each. Let c[][] be an 8 x 256
       matrix of 64-bit words. This code fills c[][] with
       a bunch of "partial matrix multiplies". For 0 <= i < 256
       and 0 <= j < 8, entry i of row j of c[][] contains the
       matrix product

           ( i << (8*j) ) * x[][]

       where the quantity in parentheses is considered a
       1 x 64 vector of elements in GF(2). The resulting
       table can dramatically speed up matrix multiplies
       by x[][]. */

    uint64_t accum, xk;
    ulong i, j, k, index;

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 256; i++) {
            k = 0;
            index = i;
            accum = 0;
            while (index) {
                xk = x[k];
                if (index & 1)
                    accum ^= xk;
                index >>= 1;
                k++;
            }
            c[i] = accum;
        }

        x += 8;
        c += 256;
    }
}
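
/* How the table is consumed (see mul_Nx64_64x64_acc below): a 64-bit word v,
   viewed as a 1 x 64 vector over GF(2), satisfies

       v * x = c[0*256 + byte0(v)] ^ c[1*256 + byte1(v)] ^ ... ^ c[7*256 + byte7(v)]

   because v is the XOR of its eight bytes shifted into place and
   multiplication by x[][] is linear.  One 64 x 64 precomputation therefore
   turns every subsequent row-times-matrix product into eight table lookups
   and seven XORs. */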

/*-------------------------------------------------------------------*/
static void mul_Nx64_64x64_acc(uint64_t *v, uint64_t *x, uint64_t *c,
                               uint64_t *y, slong n) {

    /* Let v[][] be an n x 64 matrix with elements in GF(2),
       represented as an array of n 64-bit words. Let c[][]
       be an 8 x 256 scratch matrix of 64-bit words.
       This code multiplies v[][] by the 64x64 matrix
       x[][], then XORs the n x 64 result into y[][] */

    slong i;
    uint64_t word;

    precompute_Nx64_64x64(x, c);

    for (i = 0; i < n; i++) {
        word = v[i];
        y[i] ^= c[ 0*256 + ((word >>  0) & 0xff) ]
              ^ c[ 1*256 + ((word >>  8) & 0xff) ]
              ^ c[ 2*256 + ((word >> 16) & 0xff) ]
              ^ c[ 3*256 + ((word >> 24) & 0xff) ]
              ^ c[ 4*256 + ((word >> 32) & 0xff) ]
              ^ c[ 5*256 + ((word >> 40) & 0xff) ]
              ^ c[ 6*256 + ((word >> 48) & 0xff) ]
              ^ c[ 7*256 + ((word >> 56)       ) ];
    }
}

/*-------------------------------------------------------------------*/
static void mul_64xN_Nx64(uint64_t *x, uint64_t *y,
                          uint64_t *c, uint64_t *xy, slong n) {

    /* Let x and y be n x 64 matrices. This routine computes
       the 64 x 64 matrix xy[][] given by transpose(x) * y.
       c[][] is a 256 x 8 scratch matrix of 64-bit words. */

    slong i;

    memset(c, 0, 256 * 8 * sizeof(uint64_t));
    memset(xy, 0, 64 * sizeof(uint64_t));

    for (i = 0; i < n; i++) {
        uint64_t xi = x[i];
        uint64_t yi = y[i];
        c[ 0*256 + ( xi        & 0xff) ] ^= yi;
        c[ 1*256 + ((xi >>  8) & 0xff) ] ^= yi;
        c[ 2*256 + ((xi >> 16) & 0xff) ] ^= yi;
        c[ 3*256 + ((xi >> 24) & 0xff) ] ^= yi;
        c[ 4*256 + ((xi >> 32) & 0xff) ] ^= yi;
        c[ 5*256 + ((xi >> 40) & 0xff) ] ^= yi;
        c[ 6*256 + ((xi >> 48) & 0xff) ] ^= yi;
        c[ 7*256 + ((xi >> 56)       ) ] ^= yi;
    }

    for (i = 0; i < 8; i++) {

        ulong j;
        uint64_t a0, a1, a2, a3, a4, a5, a6, a7;

        a0 = a1 = a2 = a3 = 0;
        a4 = a5 = a6 = a7 = 0;

        for (j = 0; j < 256; j++) {
            if ((j >> i) & 1) {
                a0 ^= c[0*256 + j];
                a1 ^= c[1*256 + j];
                a2 ^= c[2*256 + j];
                a3 ^= c[3*256 + j];
                a4 ^= c[4*256 + j];
                a5 ^= c[5*256 + j];
                a6 ^= c[6*256 + j];
                a7 ^= c[7*256 + j];
            }
        }

        xy[ 0] = a0; xy[ 8] = a1; xy[16] = a2; xy[24] = a3;
        xy[32] = a4; xy[40] = a5; xy[48] = a6; xy[56] = a7;
        xy++;
    }
}
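
/* What the two passes above compute: row r = 8*b + i of xy[][] must be the
   XOR of every y[k] whose word x[k] has bit r set.  The first pass buckets
   each y[k] under the value of byte b of x[k] (eight buckets of 256 entries);
   the second pass then folds together, for each bit position i, exactly those
   bucket entries whose byte value has bit i set.  This trades a 64-way bit
   test per input word for eight table updates plus a recombination cost that
   is independent of n. */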

/*-------------------------------------------------------------------*/
static slong find_nonsingular_sub(uint64_t *t, slong *s,
                                  slong *last_s, slong last_dim,
                                  uint64_t *w) {

    /* given a 64x64 matrix t[][] (i.e. sixty-four
       64-bit words) and a list of 'last_dim' column
       indices enumerated in last_s[]:

         - find a submatrix of t that is invertible
         - invert it and copy to w[][]
         - enumerate in s[] the columns represented in w[][] */

    slong i, j;
    slong dim;
    slong cols[64];
    uint64_t M[64][2];
    uint64_t mask, *row_i, *row_j;
    uint64_t m0, m1;

    /* M = [t | I] for I the 64x64 identity matrix */

    for (i = 0; i < 64; i++) {
        M[i][0] = t[i];
        M[i][1] = bitmask[i];
    }

    /* put the column indices from last_s[] into the
       back of cols[], and copy to the beginning of cols[]
       any column indices not in last_s[] */

    mask = 0;
    for (i = 0; i < last_dim; i++) {
        cols[63 - i] = last_s[i];
        mask |= bitmask[last_s[i]];
    }
    for (i = j = 0; i < 64; i++) {
        if (!(mask & bitmask[i]))
            cols[j++] = i;
    }

    /* compute the inverse of t[][] */

    for (i = dim = 0; i < 64; i++) {

        /* find the next pivot row and put in row i */

        mask = bitmask[cols[i]];
        row_i = M[cols[i]];

        for (j = i; j < 64; j++) {
            row_j = M[cols[j]];
            if (row_j[0] & mask) {
                m0 = row_j[0];
                m1 = row_j[1];
                row_j[0] = row_i[0];
                row_j[1] = row_i[1];
                row_i[0] = m0;
                row_i[1] = m1;
                break;
            }
        }

        /* if a pivot row was found, eliminate the pivot
           column from all other rows */

        if (j < 64) {
            for (j = 0; j < 64; j++) {
                row_j = M[cols[j]];
                if ((row_i != row_j) && (row_j[0] & mask)) {
                    row_j[0] ^= row_i[0];
                    row_j[1] ^= row_i[1];
                }
            }

            /* add the pivot column to the list of
               accepted columns */

            s[dim++] = cols[i];
            continue;
        }

        /* otherwise, use the right-hand half of M[]
           to compensate for the absence of a pivot column */

        for (j = i; j < 64; j++) {
            row_j = M[cols[j]];
            if (row_j[1] & mask) {
                m0 = row_j[0];
                m1 = row_j[1];
                row_j[0] = row_i[0];
                row_j[1] = row_i[1];
                row_i[0] = m0;
                row_i[1] = m1;
                break;
            }
        }

        if (j == 64) {
#if (QS_DEBUG & 128)
            flint_printf("lanczos error: submatrix "
                         "is not invertible\n");
#endif
            return 0;
        }

        /* eliminate the pivot column from the other rows
           of the inverse */

        for (j = 0; j < 64; j++) {
            row_j = M[cols[j]];
            if ((row_i != row_j) && (row_j[1] & mask)) {
                row_j[0] ^= row_i[0];
                row_j[1] ^= row_i[1];
            }
        }

        /* wipe out the pivot row */

        row_i[0] = row_i[1] = 0;
    }

    /* the right-hand half of M[] is the desired inverse */

    for (i = 0; i < 64; i++)
        w[i] = M[i][1];

    /* The block Lanczos recurrence depends on all columns
       of t[][] appearing in s[] and/or last_s[].
       Verify that condition here */

    mask = 0;
    for (i = 0; i < dim; i++)
        mask |= bitmask[s[i]];
    for (i = 0; i < last_dim; i++)
        mask |= bitmask[last_s[i]];

    if (mask != (uint64_t)(-1)) {
#if (QS_DEBUG & 128)
        flint_printf("lanczos error: not all columns used\n");
#endif
        return 0;
    }

    return dim;
}
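
/* Context for the ordering of cols[] above: columns that were used in the
   previous iteration are placed at the back, so pivots are preferentially
   chosen among columns not used last time.  This matches the column selection
   rule of Montgomery-style block Lanczos over GF(2) (P. Montgomery, "A Block
   Lanczos Algorithm for Finding Dependencies over GF(2)", EUROCRYPT '95),
   which this module appears to follow; the recurrence requires every column
   to be active in at least one of two consecutive iterations, and that is
   the condition verified just before returning. */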

/*-------------------------------------------------------------------*/
void mul_MxN_Nx64(slong vsize, slong dense_rows,
                  slong ncols, la_col_t *A,
                  uint64_t *x, uint64_t *b) {

    /* Multiply the vector x[] by the matrix A (stored
       columnwise) and put the result in b[]. vsize
       refers to the number of uint64_t's allocated for
       x[] and b[]; vsize is probably different from ncols */

    slong i, j;

    memset(b, 0, vsize * sizeof(uint64_t));

    for (i = 0; i < ncols; i++) {
        la_col_t *col = A + i;
        slong *row_entries = col->data;
        uint64_t tmp = x[i];

        for (j = 0; j < col->weight; j++) {
            b[row_entries[j]] ^= tmp;
        }
    }

    if (dense_rows) {
        for (i = 0; i < ncols; i++) {
            la_col_t *col = A + i;
            slong *row_entries = col->data + col->weight;
            uint64_t tmp = x[i];

            for (j = 0; j < dense_rows; j++) {
                if (row_entries[j / 32] &
                    ((slong) 1 << (j % 32))) {
                    b[j] ^= tmp;
                }
            }
        }
    }
}
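
/* Storage convention relied on above (and in mul_trans_MxN_Nx64 below): each
   la_col_t keeps its sparse row indices in data[0 .. weight-1]; when
   dense_rows is nonzero, the words that follow are treated as a bitfield for
   rows 0 .. dense_rows-1, packed 32 flags per word (hence the j / 32 and
   j % 32 indexing).  This description is inferred from the indexing used here
   rather than from the definition of la_col_t itself. */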

/*-------------------------------------------------------------------*/
void mul_trans_MxN_Nx64(slong dense_rows, slong ncols,
                        la_col_t *A, uint64_t *x, uint64_t *b) {

    /* Multiply the vector x[] by the transpose of the
       matrix A and put the result in b[]. Since A is stored
       by columns, this is just a matrix-vector product */

    slong i, j;

    for (i = 0; i < ncols; i++) {
        la_col_t *col = A + i;
        slong *row_entries = col->data;
        uint64_t accum = 0;

        for (j = 0; j < col->weight; j++) {
            accum ^= x[row_entries[j]];
        }
        b[i] = accum;
    }

    if (dense_rows) {
        for (i = 0; i < ncols; i++) {
            la_col_t *col = A + i;
            slong *row_entries = col->data + col->weight;
            uint64_t accum = b[i];

            for (j = 0; j < dense_rows; j++) {
                if (row_entries[j / 32] &
                    ((slong) 1 << (j % 32))) {
                    accum ^= x[j];
                }
            }
            b[i] = accum;
        }
    }
}

/*-----------------------------------------------------------------------*/
static void transpose_vector(slong ncols, uint64_t *v, uint64_t **trans) {

    /* Hideously inefficient routine to transpose a
       vector v[] of 64-bit words into a 2-D array
       trans[][] of 64-bit words. Bits are only ever
       OR'd into trans[][], so the caller must ensure
       it starts out zeroed. */

    slong i, j;
    slong col;
    uint64_t mask, word;

    for (i = 0; i < ncols; i++) {
        col = i / 64;
        mask = bitmask[i % 64];
        word = v[i];
        j = 0;
        while (word) {
            if (word & 1)
                trans[j][col] |= mask;
            word = word >> 1;
            j++;
        }
    }
}

/*-----------------------------------------------------------------------*/
void combine_cols(slong ncols,
                  uint64_t *x, uint64_t *v,
                  uint64_t *ax, uint64_t *av) {

    /* Once the block Lanczos iteration has finished,
       x[] and v[] will contain mostly nullspace vectors
       between them, as well as possibly some columns
       that are linear combinations of nullspace vectors.
       Given vectors ax[] and av[] that are the result of
       multiplying x[] and v[] by the matrix, this routine
       will use Gauss elimination on the columns of [ax | av]
       to find all of the linearly dependent columns. The
       column operations needed to accomplish this are
       mirrored in [x | v] and the columns that are independent
       are skipped. Finally, the dependent columns are copied
       back into x[] and represent the nullspace vector output
       of the block Lanczos code.

       v[] and av[] can be NULL, in which case the elimination
       process assumes 64 dependencies instead of 128 */

    slong i, j, k, bitpos, col, col_words, num_deps;
    uint64_t mask;
    uint64_t *matrix[128], *amatrix[128], *tmp;

    num_deps = 128;
    if (v == NULL || av == NULL)
        num_deps = 64;

    col_words = (ncols + 63) / 64;

    for (i = 0; i < num_deps; i++) {
        matrix[i] = (uint64_t *) flint_calloc((size_t) col_words,
                                              sizeof(uint64_t));
        amatrix[i] = (uint64_t *) flint_calloc((size_t) col_words,
                                               sizeof(uint64_t));
    }

    /* operations on columns can more conveniently become
       operations on rows if all the vectors are first
       transposed */

    transpose_vector(ncols, x, matrix);
    transpose_vector(ncols, ax, amatrix);
    if (num_deps == 128) {
        transpose_vector(ncols, v, matrix + 64);
        transpose_vector(ncols, av, amatrix + 64);
    }

    /* Keep eliminating rows until the unprocessed part
       of amatrix[][] is all zero. The rows where this
       happens correspond to linearly dependent vectors
       in the nullspace */

    for (i = bitpos = 0; i < num_deps && bitpos < ncols; bitpos++) {

        /* find the next pivot row */

        mask = bitmask[bitpos % 64];
        col = bitpos / 64;
        for (j = i; j < num_deps; j++) {
            if (amatrix[j][col] & mask) {
                tmp = matrix[i];
                matrix[i] = matrix[j];
                matrix[j] = tmp;
                tmp = amatrix[i];
                amatrix[i] = amatrix[j];
                amatrix[j] = tmp;
                break;
            }
        }
        if (j == num_deps)
            continue;

        /* a pivot was found; eliminate it from the
           remaining rows */

        for (j++; j < num_deps; j++) {
            if (amatrix[j][col] & mask) {

                /* Note that the entire row, *not*
                   just the nonzero part of it, must
                   be eliminated; this is because the
                   corresponding (dense) row of matrix[][]
                   must have the same operation applied */

                for (k = 0; k < col_words; k++) {
                    amatrix[j][k] ^= amatrix[i][k];
                    matrix[j][k] ^= matrix[i][k];
                }
            }
        }
        i++;
    }

    /* transpose rows i through 63 back into x[] */

    for (j = 0; j < ncols; j++) {
        uint64_t word = 0;

        col = j / 64;
        mask = bitmask[j % 64];

        for (k = i; k < 64; k++) {
            if (matrix[k][col] & mask)
                word |= bitmask[k];
        }
        x[j] = word;
    }

    for (i = 0; i < num_deps; i++) {
        flint_free(matrix[i]);
        flint_free(amatrix[i]);
    }
}
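
/* Why the surviving rows are dependencies: after the elimination loop, rows
   i .. num_deps-1 of amatrix[][] are identically zero, and the same row
   operations were mirrored in matrix[][].  Each such zero row therefore
   records a combination of the original x/v columns that the matrix maps to
   zero, i.e. a genuine nullspace vector; the final loop packs rows i through
   63 of matrix[][] (at most 64 of these dependencies) back into the bit
   positions of x[]. */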

/*-----------------------------------------------------------------------*/
uint64_t * block_lanczos(flint_rand_t state, slong nrows,
                         slong dense_rows, slong ncols, la_col_t *B) {

    /* Solve Bx = 0 for some nonzero x; the computed
       solution, containing up to 64 of these nullspace
       vectors, is returned */

    uint64_t *vnext, *v[3], *x, *v0;
    uint64_t *winv[3];
    uint64_t *vt_a_v[2], *vt_a2_v[2];
    uint64_t *scratch;
    uint64_t *d, *e, *f, *f2;
    uint64_t *tmp;
    slong s[2][64];
    slong i, iter;
    slong n = ncols;
    slong dim0, dim1;
    uint64_t mask0, mask1;
    slong vsize;

    /* allocate all of the size-n variables. Note that because
       B has been preprocessed to ignore singleton rows, the
       number of rows may really be less than nrows and may
       be greater than ncols. vsize is the maximum of these
       two numbers */

    vsize = FLINT_MAX(nrows, ncols);
    v[0] = (uint64_t *) flint_malloc(vsize * sizeof(uint64_t));
    v[1] = (uint64_t *) flint_malloc(vsize * sizeof(uint64_t));
    v[2] = (uint64_t *) flint_malloc(vsize * sizeof(uint64_t));
    vnext = (uint64_t *) flint_malloc(vsize * sizeof(uint64_t));
    x = (uint64_t *) flint_malloc(vsize * sizeof(uint64_t));
    v0 = (uint64_t *) flint_malloc(vsize * sizeof(uint64_t));
    scratch = (uint64_t *) flint_malloc(FLINT_MAX(vsize, 256 * 8) * sizeof(uint64_t));

    /* allocate all the 64x64 variables */

    winv[0] = (uint64_t *) flint_malloc(64 * sizeof(uint64_t));
    winv[1] = (uint64_t *) flint_malloc(64 * sizeof(uint64_t));
    winv[2] = (uint64_t *) flint_malloc(64 * sizeof(uint64_t));
    vt_a_v[0] = (uint64_t *) flint_malloc(64 * sizeof(uint64_t));
    vt_a_v[1] = (uint64_t *) flint_malloc(64 * sizeof(uint64_t));
    vt_a2_v[0] = (uint64_t *) flint_malloc(64 * sizeof(uint64_t));
    vt_a2_v[1] = (uint64_t *) flint_malloc(64 * sizeof(uint64_t));
    d = (uint64_t *) flint_malloc(64 * sizeof(uint64_t));
    e = (uint64_t *) flint_malloc(64 * sizeof(uint64_t));
    f = (uint64_t *) flint_malloc(64 * sizeof(uint64_t));
    f2 = (uint64_t *) flint_malloc(64 * sizeof(uint64_t));

    /* The iteration computes v[0], vt_a_v[0],
       vt_a2_v[0], s[0] and winv[0]. Subscripts larger
       than zero represent past versions of these
       quantities, which start off empty (except for
       the past version of s[], which contains all
       the column indices) */

    memset(v[1], 0, vsize * sizeof(uint64_t));
    memset(v[2], 0, vsize * sizeof(uint64_t));
    for (i = 0; i < 64; i++) {
        s[1][i] = i;
        vt_a_v[1][i] = 0;
        vt_a2_v[1][i] = 0;
        winv[1][i] = 0;
        winv[2][i] = 0;
    }
    dim0 = 0;
    dim1 = 64;
    mask1 = (uint64_t)(-1);
    iter = 0;

    /* The computed solution 'x' starts off random,
       and v[0] starts off as B*x. This initial copy
       of v[0] must be saved off separately */

    for (i = 0; i < n; i++)
#if FLINT_BITS==64
        v[0][i] = (uint64_t) n_randlimb(state);
#else
        v[0][i] = (uint64_t) n_randlimb(state) + ((uint64_t) n_randlimb(state) << 32);
#endif

    memcpy(x, v[0], vsize * sizeof(uint64_t));
    mul_MxN_Nx64(vsize, dense_rows, ncols, B, v[0], scratch);
    mul_trans_MxN_Nx64(dense_rows, ncols, B, scratch, v[0]);
    memcpy(v0, v[0], vsize * sizeof(uint64_t));

    /* perform the iteration */

    while (1) {
        iter++;

        /* multiply the current v[0] by a symmetrized
           version of B, or B'B (apostrophe means
           transpose). Use "A" to refer to B'B */

        mul_MxN_Nx64(vsize, dense_rows, ncols, B, v[0], scratch);
        mul_trans_MxN_Nx64(dense_rows, ncols, B, scratch, vnext);

        /* compute v0'*A*v0 and (A*v0)'(A*v0) */

        mul_64xN_Nx64(v[0], vnext, scratch, vt_a_v[0], n);
        mul_64xN_Nx64(vnext, vnext, scratch, vt_a2_v[0], n);

        /* if the former is orthogonal to itself, then
           the iteration has finished */

        for (i = 0; i < 64; i++) {
            if (vt_a_v[0][i] != 0)
                break;
        }
        if (i == 64) {
            break;
        }

        /* Find the size-'dim0' nonsingular submatrix
           of v0'*A*v0, invert it, and list the column
           indices present in the submatrix */

        dim0 = find_nonsingular_sub(vt_a_v[0], s[0],
                                    s[1], dim1, winv[0]);
        if (dim0 == 0)
            break;

        /* mask0 contains one set bit for every column
           that participates in the inverted submatrix
           computed above */

        mask0 = 0;
        for (i = 0; i < dim0; i++)
            mask0 |= bitmask[s[0][i]];

        /* compute d */

        for (i = 0; i < 64; i++)
            d[i] = (vt_a2_v[0][i] & mask0) ^ vt_a_v[0][i];

        mul_64x64_64x64(winv[0], d, d);

        for (i = 0; i < 64; i++)
            d[i] = d[i] ^ bitmask[i];

        /* compute e */

        mul_64x64_64x64(winv[1], vt_a_v[0], e);

        for (i = 0; i < 64; i++)
            e[i] = e[i] & mask0;

        /* compute f */

        mul_64x64_64x64(vt_a_v[1], winv[1], f);

        for (i = 0; i < 64; i++)
            f[i] = f[i] ^ bitmask[i];

        mul_64x64_64x64(winv[2], f, f);

        for (i = 0; i < 64; i++)
            f2[i] = ((vt_a2_v[1][i] & mask1) ^
                     vt_a_v[1][i]) & mask0;

        mul_64x64_64x64(f, f2, f);

        /* compute the next v */

        for (i = 0; i < n; i++)
            vnext[i] = vnext[i] & mask0;

        mul_Nx64_64x64_acc(v[0], d, scratch, vnext, n);
        mul_Nx64_64x64_acc(v[1], e, scratch, vnext, n);
        mul_Nx64_64x64_acc(v[2], f, scratch, vnext, n);

        /* update the computed solution 'x' */

        mul_64xN_Nx64(v[0], v0, scratch, d, n);
        mul_64x64_64x64(winv[0], d, d);
        mul_Nx64_64x64_acc(v[0], d, scratch, x, n);

        /* rotate all the variables */

        tmp = v[2];
        v[2] = v[1];
        v[1] = v[0];
        v[0] = vnext;
        vnext = tmp;

        tmp = winv[2];
        winv[2] = winv[1];
        winv[1] = winv[0];
        winv[0] = tmp;

        tmp = vt_a_v[1]; vt_a_v[1] = vt_a_v[0]; vt_a_v[0] = tmp;

        tmp = vt_a2_v[1]; vt_a2_v[1] = vt_a2_v[0]; vt_a2_v[0] = tmp;

        memcpy(s[1], s[0], 64 * sizeof(slong));
        mask1 = mask0;
        dim1 = dim0;
    }

#if (QS_DEBUG & 128)
    flint_printf("lanczos halted after %wd iterations\n", iter);
#endif

    /* free unneeded storage */

    flint_free(vnext);
    flint_free(scratch);
    flint_free(v0);
    flint_free(vt_a_v[0]);
    flint_free(vt_a_v[1]);
    flint_free(vt_a2_v[0]);
    flint_free(vt_a2_v[1]);
    flint_free(winv[0]);
    flint_free(winv[1]);
    flint_free(winv[2]);
    flint_free(d);
    flint_free(e);
    flint_free(f);
    flint_free(f2);

    /* if a recoverable failure occurred, start everything
       over again */

    if (dim0 == 0) {
#if (QS_DEBUG & 128)
        flint_printf("linear algebra failed; retrying...\n");
#endif
        flint_free(x);
        flint_free(v[0]);
        flint_free(v[1]);
        flint_free(v[2]);
        return NULL;
    }

    /* convert the output of the iteration to an actual
       collection of nullspace vectors */

    mul_MxN_Nx64(vsize, dense_rows, ncols, B, x, v[1]);
    mul_MxN_Nx64(vsize, dense_rows, ncols, B, v[0], v[2]);

    combine_cols(ncols, x, v[0], v[1], v[2]);

    /* verify that these really are linear dependencies of B */

    mul_MxN_Nx64(vsize, dense_rows, ncols, B, x, v[0]);

    for (i = 0; i < ncols; i++) {
        if (v[0][i] != 0)
            break;
    }
    if (i < ncols) {
        flint_printf("lanczos error: dependencies don't work %wd\n", i);
        abort();
    }

    flint_free(v[0]);
    flint_free(v[1]);
    flint_free(v[2]);
    return x;
}
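
/* Typical call sequence (illustrative sketch, based on how the comments above
   describe the pipeline; the surrounding qsieve driver may differ in detail):

       reduce_matrix(qs_inf, &nrows, &ncols, cols);      -- filter singletons --
       do {
           nullrows = block_lanczos(state, nrows, dense_rows, ncols, cols);
       } while (nullrows == NULL);                       -- retry on failure --
       for (i = 0; i < ncols; i++)
           if (get_null_entry(nullrows, i, l))
               ;  -- relation i is part of dependency l, for some 0 <= l < 64 --

   Variable names such as qs_inf, cols, dense_rows and l are placeholders for
   whatever the caller uses; only the three function signatures come from this
   file. */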