pqc/external/flint-2.4.3/qsieve/block_lanczos.c
/*============================================================================
Copyright 2006 Jason Papadopoulos.
Copyright 2006, 2011 William Hart.
This file is part of FLINT.
FLINT is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
FLINT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with FLINT; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
===============================================================================
Optionally, please be nice and tell me if you find this source to be
useful. Again optionally, if you add to the functionality present here
please consider making those additions public too, so that others may
benefit from your work.
--jasonp@boo.net 9/8/06
The following modifications were made by William Hart:
-added the utility function get_null_entry
-reformatted original code so it would operate as a standalone
filter and block Lanczos module
--------------------------------------------------------------------*/
#define ulong ulongxx /* interferes with system includes */
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#undef ulong
#define ulong mp_limb_t
#include <gmp.h>
#include "flint.h"
#include "ulong_extras.h"
#include "qsieve.h"
#define BIT(x) (((uint64_t)(1)) << (x))
static const uint64_t bitmask[64] = {
BIT( 0), BIT( 1), BIT( 2), BIT( 3), BIT( 4), BIT( 5), BIT( 6), BIT( 7),
BIT( 8), BIT( 9), BIT(10), BIT(11), BIT(12), BIT(13), BIT(14), BIT(15),
BIT(16), BIT(17), BIT(18), BIT(19), BIT(20), BIT(21), BIT(22), BIT(23),
BIT(24), BIT(25), BIT(26), BIT(27), BIT(28), BIT(29), BIT(30), BIT(31),
BIT(32), BIT(33), BIT(34), BIT(35), BIT(36), BIT(37), BIT(38), BIT(39),
BIT(40), BIT(41), BIT(42), BIT(43), BIT(44), BIT(45), BIT(46), BIT(47),
BIT(48), BIT(49), BIT(50), BIT(51), BIT(52), BIT(53), BIT(54), BIT(55),
BIT(56), BIT(57), BIT(58), BIT(59), BIT(60), BIT(61), BIT(62), BIT(63),
};
/*--------------------------------------------------------------------*/
uint64_t get_null_entry(uint64_t * nullrows, slong i, slong l) {
/* Returns true if the entry with indices i,l is 1 in the
supplied 64xN matrix. This is used to read the nullspace
vectors which are output by the Lanczos routine */
return nullrows[i]&bitmask[l];
}
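/* Illustrative usage, a sketch rather than code from this file: when
block_lanczos() returns a non-NULL nullrows array, the l-th candidate
nullspace vector consists of those relation indices i for which
get_null_entry(nullrows, i, l) is nonzero, i.e. bit l of word i is set;
callers typically loop over 0 <= l < 64 and, for each l, over the columns i. */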
/*--------------------------------------------------------------------*/
void reduce_matrix(qs_t qs_inf, slong *nrows, slong *ncols, la_col_t *cols) {
/* Perform light filtering on the nrows x ncols
matrix specified by cols[]. The processing here is
limited to deleting columns that contain a singleton
row, then resizing the matrix to have a few more
columns than rows. Because deleting a column reduces
the counts in several different rows, the process
must iterate to convergence.
Note that this step is not intended to make the Lanczos
iteration run any faster (though it will); it's just
that if we don't go to this trouble then there are
factorizations for which the matrix step will fail
outright */
slong r, c, i, j, k;
slong passes;
slong *counts;
slong reduced_rows;
slong reduced_cols;
/* count the number of nonzero entries in each row */
counts = (slong *)flint_calloc((size_t)*nrows, sizeof(slong));
for (i = 0; i < *ncols; i++) {
for (j = 0; j < cols[i].weight; j++)
counts[cols[i].data[j]]++;
}
reduced_rows = *nrows;
reduced_cols = *ncols;
passes = 0;
do {
r = reduced_rows;
/* remove any columns that contain the only entry
in one or more rows, then update the row counts
to reflect the missing column. Iterate until
no more columns can be deleted */
do {
c = reduced_cols;
for (i = j = 0; i < reduced_cols; i++) {
la_col_t *col = cols + i;
for (k = 0; k < col->weight; k++) {
if (counts[col->data[k]] < 2)
break;
}
if (k < col->weight) {
for (k = 0; k < col->weight; k++) {
counts[col->data[k]]--;
}
free_col(col);
clear_col(col);
}
else {
cols[j++] = cols[i];
if (j-1 != i) clear_col(col);
}
}
reduced_cols = j;
} while (c != reduced_cols);
/* count the number of rows that contain a
nonzero entry */
for (i = reduced_rows = 0; i < *nrows; i++) {
if (counts[i])
reduced_rows++;
}
/* Because deleting a column reduces the weight
of many rows, the number of nonzero rows may
be much less than the number of columns. Delete
more columns until the matrix has the correct
aspect ratio. Columns at the end of cols[] are
the heaviest, so delete those (and update the
row counts again) */
if (reduced_cols > reduced_rows + qs_inf->extra_rels) {
for (i = reduced_rows + qs_inf->extra_rels;
i < reduced_cols; i++) {
la_col_t *col = cols + i;
for (j = 0; j < col->weight; j++) {
counts[col->data[j]]--;
}
free_col(col);
clear_col(col);
}
reduced_cols = reduced_rows + qs_inf->extra_rels;
}
/* if any columns were deleted in the previous step,
then the matrix is less dense and more columns
can be deleted; iterate until no further deletions
are possible */
passes++;
} while (r != reduced_rows);
#if (QS_DEBUG & 128)
flint_printf("reduce to %wd x %wd in %wd passes\n",
reduced_rows, reduced_cols, passes);
#endif
flint_free(counts);
/* record the final matrix size. Note that we can't touch
nrows because all the column data (and the sieving relations
that produced it) would have to be updated */
*ncols = reduced_cols;
}
/*-------------------------------------------------------------------*/
static void mul_64x64_64x64(uint64_t *a, uint64_t *b, uint64_t *c ) {
/* c[][] = a[][] * b[][], where all operands are 64 x 64
(i.e. contain 64 words of 64 bits each). The result
may overwrite a or b. */
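/* Concretely, row i of the product is the XOR of those rows b[j] for
which bit j of a[i] is set (bit 0 is the low bit), since addition in
GF(2) is XOR; the loop below simply walks the set bits of a[i]. */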
uint64_t ai, bj, accum;
uint64_t tmp[64];
ulong i, j;
for (i = 0; i < 64; i++) {
j = 0;
accum = 0;
ai = a[i];
while (ai) {
bj = b[j];
if( ai & 1 )
accum ^= bj;
ai >>= 1;
j++;
}
tmp[i] = accum;
}
memcpy(c, tmp, sizeof(tmp));
}
/*-----------------------------------------------------------------------*/
static void precompute_Nx64_64x64(uint64_t *x, uint64_t *c) {
/* Let x[][] be a 64 x 64 matrix in GF(2), represented
as 64 words of 64 bits each. Let c[][] be an 8 x 256
matrix of 64-bit words. This code fills c[][] with
a bunch of "partial matrix multiplies". For 0 <= i < 256,
the i_th entry of the j_th row of c[][] contains the matrix product
( i << (8*j) ) * x[][]
where the quantity in parentheses is considered a
1 x 64 vector of elements in GF(2). The resulting
table can dramatically speed up matrix multiplies
by x[][]. */
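/* For example, the table lets a 1 x 64 row vector w be multiplied by
x[][] with just eight lookups: writing the bytes of w as b0..b7 (b0 the
low byte),
w * x = c[0*256 + b0] ^ c[1*256 + b1] ^ ... ^ c[7*256 + b7],
which is exactly how mul_Nx64_64x64_acc() below consumes this table. */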
uint64_t accum, xk;
ulong i, j, k, index;
for (j = 0; j < 8; j++) {
for (i = 0; i < 256; i++) {
k = 0;
index = i;
accum = 0;
while (index) {
xk = x[k];
if (index & 1)
accum ^= xk;
index >>= 1;
k++;
}
c[i] = accum;
}
x += 8;
c += 256;
}
}
/*-------------------------------------------------------------------*/
static void mul_Nx64_64x64_acc(uint64_t *v, uint64_t *x, uint64_t *c,
uint64_t *y, slong n) {
/* Let v[][] be an n x 64 matrix with elements in GF(2),
represented as an array of n 64-bit words. Let c[][]
be an 8 x 256 scratch matrix of 64-bit words.
This code multiplies v[][] by the 64x64 matrix
x[][], then XORs the n x 64 result into y[][] */
slong i;
uint64_t word;
precompute_Nx64_64x64(x, c);
for (i = 0; i < n; i++) {
word = v[i];
y[i] ^= c[ 0*256 + ((word>> 0) & 0xff) ]
^ c[ 1*256 + ((word>> 8) & 0xff) ]
^ c[ 2*256 + ((word>>16) & 0xff) ]
^ c[ 3*256 + ((word>>24) & 0xff) ]
^ c[ 4*256 + ((word>>32) & 0xff) ]
^ c[ 5*256 + ((word>>40) & 0xff) ]
^ c[ 6*256 + ((word>>48) & 0xff) ]
^ c[ 7*256 + ((word>>56) ) ];
}
}
/*-------------------------------------------------------------------*/
static void mul_64xN_Nx64(uint64_t *x, uint64_t *y,
uint64_t *c, uint64_t *xy, slong n) {
/* Let x and y be n x 64 matrices. This routine computes
the 64 x 64 matrix xy[][] given by transpose(x) * y.
c[][] is an 8 x 256 scratch matrix of 64-bit words. */
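/* The computation runs in two passes. Pass 1 scatters each y[i] into
eight buckets, one per byte of x[i]. Pass 2 uses the fact that row
(8*p + i) of transpose(x)*y is the XOR of all y[k] for which bit
(8*p + i) of x[k] is set, and that quantity is exactly the XOR of the
buckets c[p*256 + j] taken over the byte values j having bit i set. */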
slong i;
memset(c, 0, 256 * 8 * sizeof(uint64_t));
memset(xy, 0, 64 * sizeof(uint64_t));
for (i = 0; i < n; i++) {
uint64_t xi = x[i];
uint64_t yi = y[i];
c[ 0*256 + ( xi & 0xff) ] ^= yi;
c[ 1*256 + ((xi >> 8) & 0xff) ] ^= yi;
c[ 2*256 + ((xi >> 16) & 0xff) ] ^= yi;
c[ 3*256 + ((xi >> 24) & 0xff) ] ^= yi;
c[ 4*256 + ((xi >> 32) & 0xff) ] ^= yi;
c[ 5*256 + ((xi >> 40) & 0xff) ] ^= yi;
c[ 6*256 + ((xi >> 48) & 0xff) ] ^= yi;
c[ 7*256 + ((xi >> 56) ) ] ^= yi;
}
for (i = 0; i < 8; i++) {
ulong j;
uint64_t a0, a1, a2, a3, a4, a5, a6, a7;
a0 = a1 = a2 = a3 = 0;
a4 = a5 = a6 = a7 = 0;
for (j = 0; j < 256; j++) {
if ((j >> i) & 1) {
a0 ^= c[0*256 + j];
a1 ^= c[1*256 + j];
a2 ^= c[2*256 + j];
a3 ^= c[3*256 + j];
a4 ^= c[4*256 + j];
a5 ^= c[5*256 + j];
a6 ^= c[6*256 + j];
a7 ^= c[7*256 + j];
}
}
xy[ 0] = a0; xy[ 8] = a1; xy[16] = a2; xy[24] = a3;
xy[32] = a4; xy[40] = a5; xy[48] = a6; xy[56] = a7;
xy++;
}
}
/*-------------------------------------------------------------------*/
static slong find_nonsingular_sub(uint64_t *t, slong *s,
slong *last_s, slong last_dim,
uint64_t *w) {
/* given a 64x64 matrix t[][] (i.e. sixty-four
64-bit words) and a list of 'last_dim' column
indices enumerated in last_s[]:
- find a submatrix of t that is invertible
- invert it and copy to w[][]
- enumerate in s[] the columns represented in w[][] */
slong i, j;
slong dim;
slong cols[64];
uint64_t M[64][2];
uint64_t mask, *row_i, *row_j;
uint64_t m0, m1;
/* M = [t | I] for I the 64x64 identity matrix */
for (i = 0; i < 64; i++) {
M[i][0] = t[i];
M[i][1] = bitmask[i];
}
/* put the column indices from last_s[] into the
back of cols[], and copy to the beginning of cols[]
any column indices not in last_s[] */
mask = 0;
for (i = 0; i < last_dim; i++) {
cols[63 - i] = last_s[i];
mask |= bitmask[last_s[i]];
}
for (i = j = 0; i < 64; i++) {
if (!(mask & bitmask[i]))
cols[j++] = i;
}
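/* Columns unused in the previous iteration are therefore tried as
pivots first; Montgomery's block Lanczos recurrence needs every column
index to appear in s[] or last_s[] across two consecutive iterations,
a condition that is checked explicitly at the end of this routine. */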
/* compute the inverse of t[][] */
for (i = dim = 0; i < 64; i++) {
/* find the next pivot row and put in row i */
mask = bitmask[cols[i]];
row_i = M[cols[i]];
for (j = i; j < 64; j++) {
row_j = M[cols[j]];
if (row_j[0] & mask) {
m0 = row_j[0];
m1 = row_j[1];
row_j[0] = row_i[0];
row_j[1] = row_i[1];
row_i[0] = m0;
row_i[1] = m1;
break;
}
}
/* if a pivot row was found, eliminate the pivot
column from all other rows */
if (j < 64) {
for (j = 0; j < 64; j++) {
row_j = M[cols[j]];
if ((row_i != row_j) && (row_j[0] & mask)) {
row_j[0] ^= row_i[0];
row_j[1] ^= row_i[1];
}
}
/* add the pivot column to the list of
accepted columns */
s[dim++] = cols[i];
continue;
}
/* otherwise, use the right-hand half of M[]
to compensate for the absence of a pivot column */
for (j = i; j < 64; j++) {
row_j = M[cols[j]];
if (row_j[1] & mask) {
m0 = row_j[0];
m1 = row_j[1];
row_j[0] = row_i[0];
row_j[1] = row_i[1];
row_i[0] = m0;
row_i[1] = m1;
break;
}
}
if (j == 64) {
#if (QS_DEBUG & 128)
flint_printf("lanczos error: submatrix "
"is not invertible\n");
#endif
return 0;
}
/* eliminate the pivot column from the other rows
of the inverse */
for (j = 0; j < 64; j++) {
row_j = M[cols[j]];
if ((row_i != row_j) && (row_j[1] & mask)) {
row_j[0] ^= row_i[0];
row_j[1] ^= row_i[1];
}
}
/* wipe out the pivot row */
row_i[0] = row_i[1] = 0;
}
/* the right-hand half of M[] is the desired inverse */
for (i = 0; i < 64; i++)
w[i] = M[i][1];
/* The block Lanczos recurrence depends on all columns
of t[][] appearing in s[] and/or last_s[].
Verify that condition here */
mask = 0;
for (i = 0; i < dim; i++)
mask |= bitmask[s[i]];
for (i = 0; i < last_dim; i++)
mask |= bitmask[last_s[i]];
if (mask != (uint64_t)(-1)) {
#if (QS_DEBUG & 128)
flint_printf("lanczos error: not all columns used\n");
#endif
return 0;
}
return dim;
}
/*-------------------------------------------------------------------*/
void mul_MxN_Nx64(slong vsize, slong dense_rows,
slong ncols, la_col_t *A,
uint64_t *x, uint64_t *b) {
/* Multiply the vector x[] by the matrix A (stored
columnwise) and put the result in b[]. vsize
refers to the number of uint64_t's allocated for
x[] and b[]; vsize is probably different from ncols */
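/* Column storage assumed here: col->data[] holds col->weight sparse row
indices; when dense_rows is nonzero, a bit-packed block of dense-row
entries follows (32 rows per word), which the second loop below decodes
via data[weight + j/32] and bit (j % 32). */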
slong i, j;
memset(b, 0, vsize * sizeof(uint64_t));
for (i = 0; i < ncols; i++) {
la_col_t *col = A + i;
slong *row_entries = col->data;
uint64_t tmp = x[i];
for (j = 0; j < col->weight; j++) {
b[row_entries[j]] ^= tmp;
}
}
if (dense_rows) {
for (i = 0; i < ncols; i++) {
la_col_t *col = A + i;
slong *row_entries = col->data + col->weight;
uint64_t tmp = x[i];
for (j = 0; j < dense_rows; j++) {
if (row_entries[j / 32] &
((slong)1 << (j % 32))) {
b[j] ^= tmp;
}
}
}
}
}
/*-------------------------------------------------------------------*/
void mul_trans_MxN_Nx64(slong dense_rows, slong ncols,
la_col_t *A, uint64_t *x, uint64_t *b) {
/* Multiply the vector x[] by the transpose of the
matrix A and put the result in b[]. Since A is stored
by columns, this is just a matrix-vector product */
slong i, j;
for (i = 0; i < ncols; i++) {
la_col_t *col = A + i;
slong *row_entries = col->data;
uint64_t accum = 0;
for (j = 0; j < col->weight; j++) {
accum ^= x[row_entries[j]];
}
b[i] = accum;
}
if (dense_rows) {
for (i = 0; i < ncols; i++) {
la_col_t *col = A + i;
slong *row_entries = col->data + col->weight;
uint64_t accum = b[i];
for (j = 0; j < dense_rows; j++) {
if (row_entries[j / 32] &
((slong)1 << (j % 32))) {
accum ^= x[j];
}
}
b[i] = accum;
}
}
}
/*-----------------------------------------------------------------------*/
static void transpose_vector(slong ncols, uint64_t *v, uint64_t **trans) {
/* Hideously inefficient routine to transpose a
vector v[] of 64-bit words into a 2-D array
trans[][] of 64-bit words */
slong i, j;
slong col;
uint64_t mask, word;
for (i = 0; i < ncols; i++) {
col = i / 64;
mask = bitmask[i % 64];
word = v[i];
j = 0;
while (word) {
if (word & 1)
trans[j][col] |= mask;
word = word >> 1;
j++;
}
}
}
/*-----------------------------------------------------------------------*/
void combine_cols(slong ncols,
uint64_t *x, uint64_t *v,
uint64_t *ax, uint64_t *av) {
/* Once the block Lanczos iteration has finished,
x[] and v[] will contain mostly nullspace vectors
between them, as well as possibly some columns
that are linear combinations of nullspace vectors.
Given vectors ax[] and av[] that are the result of
multiplying x[] and v[] by the matrix, this routine
will use Gauss elimination on the columns of [ax | av]
to find all of the linearly dependent columns. The
column operations needed to accomplish this are mirrored
in [x | v] and the columns that are independent
are skipped. Finally, the dependent columns are copied
back into x[] and represent the nullspace vector output
of the block Lanczos code.
v[] and av[] can be NULL, in which case the elimination
process assumes 64 dependencies instead of 128 */
slong i, j, k, bitpos, col, col_words, num_deps;
uint64_t mask;
uint64_t *matrix[128], *amatrix[128], *tmp;
num_deps = 128;
if (v == NULL || av == NULL)
num_deps = 64;
col_words = (ncols + 63) / 64;
for (i = 0; i < num_deps; i++) {
matrix[i] = (uint64_t *)flint_calloc((size_t)col_words,
sizeof(uint64_t));
amatrix[i] = (uint64_t *)flint_calloc((size_t)col_words,
sizeof(uint64_t));
}
/* operations on columns can more conveniently become
operations on rows if all the vectors are first
transposed */
transpose_vector(ncols, x, matrix);
transpose_vector(ncols, ax, amatrix);
if (num_deps == 128) {
transpose_vector(ncols, v, matrix + 64);
transpose_vector(ncols, av, amatrix + 64);
}
/* Keep eliminating rows until the unprocessed part
of amatrix[][] is all zero. The rows where this
happens correspond to linearly dependent vectors
in the nullspace */
for (i = bitpos = 0; i < num_deps && bitpos < ncols; bitpos++) {
/* find the next pivot row */
mask = bitmask[bitpos % 64];
col = bitpos / 64;
for (j = i; j < num_deps; j++) {
if (amatrix[j][col] & mask) {
tmp = matrix[i];
matrix[i] = matrix[j];
matrix[j] = tmp;
tmp = amatrix[i];
amatrix[i] = amatrix[j];
amatrix[j] = tmp;
break;
}
}
if (j == num_deps)
continue;
/* a pivot was found; eliminate it from the
remaining rows */
for (j++; j < num_deps; j++) {
if (amatrix[j][col] & mask) {
/* Note that the entire row, *not*
just the nonzero part of it, must
be eliminated; this is because the
corresponding (dense) row of matrix[][]
must have the same operation applied */
for (k = 0; k < col_words; k++) {
amatrix[j][k] ^= amatrix[i][k];
matrix[j][k] ^= matrix[i][k];
}
}
}
i++;
}
/* transpose rows i to 64 back into x[] */
for (j = 0; j < ncols; j++) {
uint64_t word = 0;
col = j / 64;
mask = bitmask[j % 64];
for (k = i; k < 64; k++) {
if (matrix[k][col] & mask)
word |= bitmask[k];
}
x[j] = word;
}
for (i = 0; i < num_deps; i++) {
flint_free(matrix[i]);
flint_free(amatrix[i]);
}
}
/*-----------------------------------------------------------------------*/
uint64_t * block_lanczos(flint_rand_t state, slong nrows,
slong dense_rows, slong ncols, la_col_t *B) {
/* Solve Bx = 0 for some nonzero x; the computed
solution, containing up to 64 of these nullspace
vectors, is returned */
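/* In outline: a random x is chosen, v = B'Bx is formed, and a
Montgomery-style block Lanczos iteration is run on the symmetric
matrix A = B'B (each pass applies B and then B-transpose). The
iteration produces vectors x[] and v[] whose columns lie mostly in the
nullspace of A but not necessarily of B, so combine_cols() is used at
the end to extract genuine dependencies of B, which are then verified
before being returned. */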
uint64_t *vnext, *v[3], *x, *v0;
uint64_t *winv[3];
uint64_t *vt_a_v[2], *vt_a2_v[2];
uint64_t *scratch;
uint64_t *d, *e, *f, *f2;
uint64_t *tmp;
slong s[2][64];
slong i, iter;
slong n = ncols;
slong dim0, dim1;
uint64_t mask0, mask1;
slong vsize;
/* allocate all of the size-n variables. Note that because
B has been preprocessed to ignore singleton rows, the
number of rows may really be less than nrows and may
be greater than ncols. vsize is the maximum of these
two numbers */
vsize = FLINT_MAX(nrows, ncols);
v[0] = (uint64_t *)flint_malloc(vsize * sizeof(uint64_t));
v[1] = (uint64_t *)flint_malloc(vsize * sizeof(uint64_t));
v[2] = (uint64_t *)flint_malloc(vsize * sizeof(uint64_t));
vnext = (uint64_t *)flint_malloc(vsize * sizeof(uint64_t));
x = (uint64_t *)flint_malloc(vsize * sizeof(uint64_t));
v0 = (uint64_t *)flint_malloc(vsize * sizeof(uint64_t));
scratch = (uint64_t *)flint_malloc(FLINT_MAX(vsize, 256 * 8) * sizeof(uint64_t));
/* allocate all the 64x64 variables */
winv[0] = (uint64_t *)flint_malloc(64 * sizeof(uint64_t));
winv[1] = (uint64_t *)flint_malloc(64 * sizeof(uint64_t));
winv[2] = (uint64_t *)flint_malloc(64 * sizeof(uint64_t));
vt_a_v[0] = (uint64_t *)flint_malloc(64 * sizeof(uint64_t));
vt_a_v[1] = (uint64_t *)flint_malloc(64 * sizeof(uint64_t));
vt_a2_v[0] = (uint64_t *)flint_malloc(64 * sizeof(uint64_t));
vt_a2_v[1] = (uint64_t *)flint_malloc(64 * sizeof(uint64_t));
d = (uint64_t *)flint_malloc(64 * sizeof(uint64_t));
e = (uint64_t *)flint_malloc(64 * sizeof(uint64_t));
f = (uint64_t *)flint_malloc(64 * sizeof(uint64_t));
f2 = (uint64_t *)flint_malloc(64 * sizeof(uint64_t));
/* The iteration computes v[0], vt_a_v[0],
vt_a2_v[0], s[0] and winv[0]. Subscripts larger
than zero represent past versions of these
quantities, which start off empty (except for
the past version of s[], which contains all
the column indices) */
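/* For reference (a sketch of the correspondence with Montgomery's
notation, not a proof): each pass of the loop below computes
v_new = A*v[0]*S*S' + v[0]*d + v[1]*e + v[2]*f
where S selects the columns of the invertible submatrix found by
find_nonsingular_sub() (the bits of mask0), and d, e, f are 64x64
update matrices built from winv[], vt_a_v[] and vt_a2_v[]. */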
memset(v[1], 0, vsize * sizeof(uint64_t));
memset(v[2], 0, vsize * sizeof(uint64_t));
for (i = 0; i < 64; i++) {
s[1][i] = i;
vt_a_v[1][i] = 0;
vt_a2_v[1][i] = 0;
winv[1][i] = 0;
winv[2][i] = 0;
}
dim0 = 0;
dim1 = 64;
mask1 = (uint64_t)(-1);
iter = 0;
/* The computed solution 'x' starts off random,
and v[0] starts off as B*x. This initial copy
of v[0] must be saved off separately */
for (i = 0; i < n; i++)
#if FLINT_BITS==64
v[0][i] = (uint64_t) n_randlimb(state);
#else
v[0][i] = (uint64_t) n_randlimb(state) + ((uint64_t) n_randlimb(state) << 32);
#endif
memcpy(x, v[0], vsize * sizeof(uint64_t));
mul_MxN_Nx64(vsize, dense_rows, ncols, B, v[0], scratch);
mul_trans_MxN_Nx64(dense_rows, ncols, B, scratch, v[0]);
memcpy(v0, v[0], vsize * sizeof(uint64_t));
/* perform the iteration */
while (1) {
iter++;
/* multiply the current v[0] by a symmetrized
version of B, or B'B (apostrophe means
transpose). Use "A" to refer to B'B */
mul_MxN_Nx64(vsize, dense_rows, ncols, B, v[0], scratch);
mul_trans_MxN_Nx64(dense_rows, ncols, B, scratch, vnext);
/* compute v0'*A*v0 and (A*v0)'(A*v0) */
mul_64xN_Nx64(v[0], vnext, scratch, vt_a_v[0], n);
mul_64xN_Nx64(vnext, vnext, scratch, vt_a2_v[0], n);
/* if v[0] is orthogonal to A*v[0], i.e. v0'*A*v0 is zero,
then the iteration has finished */
for (i = 0; i < 64; i++) {
if (vt_a_v[0][i] != 0)
break;
}
if (i == 64) {
break;
}
/* Find the size-'dim0' nonsingular submatrix
of v0'*A*v0, invert it, and list the column
indices present in the submatrix */
dim0 = find_nonsingular_sub(vt_a_v[0], s[0],
s[1], dim1, winv[0]);
if (dim0 == 0)
break;
/* mask0 contains one set bit for every column
that participates in the inverted submatrix
computed above */
mask0 = 0;
for (i = 0; i < dim0; i++)
mask0 |= bitmask[s[0][i]];
/* compute d */
for (i = 0; i < 64; i++)
d[i] = (vt_a2_v[0][i] & mask0) ^ vt_a_v[0][i];
mul_64x64_64x64(winv[0], d, d);
for (i = 0; i < 64; i++)
d[i] = d[i] ^ bitmask[i];
/* compute e */
mul_64x64_64x64(winv[1], vt_a_v[0], e);
for (i = 0; i < 64; i++)
e[i] = e[i] & mask0;
/* compute f */
mul_64x64_64x64(vt_a_v[1], winv[1], f);
for (i = 0; i < 64; i++)
f[i] = f[i] ^ bitmask[i];
mul_64x64_64x64(winv[2], f, f);
for (i = 0; i < 64; i++)
f2[i] = ((vt_a2_v[1][i] & mask1) ^
vt_a_v[1][i]) & mask0;
mul_64x64_64x64(f, f2, f);
/* compute the next v */
for (i = 0; i < n; i++)
vnext[i] = vnext[i] & mask0;
mul_Nx64_64x64_acc(v[0], d, scratch, vnext, n);
mul_Nx64_64x64_acc(v[1], e, scratch, vnext, n);
mul_Nx64_64x64_acc(v[2], f, scratch, vnext, n);
/* update the computed solution 'x' */
mul_64xN_Nx64(v[0], v0, scratch, d, n);
mul_64x64_64x64(winv[0], d, d);
mul_Nx64_64x64_acc(v[0], d, scratch, x, n);
/* rotate all the variables */
tmp = v[2];
v[2] = v[1];
v[1] = v[0];
v[0] = vnext;
vnext = tmp;
tmp = winv[2];
winv[2] = winv[1];
winv[1] = winv[0];
winv[0] = tmp;
tmp = vt_a_v[1]; vt_a_v[1] = vt_a_v[0]; vt_a_v[0] = tmp;
tmp = vt_a2_v[1]; vt_a2_v[1] = vt_a2_v[0]; vt_a2_v[0] = tmp;
memcpy(s[1], s[0], 64 * sizeof(slong));
mask1 = mask0;
dim1 = dim0;
}
#if (QS_DEBUG & 128)
flint_printf("lanczos halted after %wd iterations\n", iter);
#endif
/* free unneeded storage */
flint_free(vnext);
flint_free(scratch);
flint_free(v0);
flint_free(vt_a_v[0]);
flint_free(vt_a_v[1]);
flint_free(vt_a2_v[0]);
flint_free(vt_a2_v[1]);
flint_free(winv[0]);
flint_free(winv[1]);
flint_free(winv[2]);
flint_free(d);
flint_free(e);
flint_free(f);
flint_free(f2);
/* if a recoverable failure occurred, start everything
over again */
if (dim0 == 0) {
#if (QS_DEBUG & 128)
flint_printf("linear algebra failed; retrying...\n");
#endif
flint_free(x);
flint_free(v[0]);
flint_free(v[1]);
flint_free(v[2]);
return NULL;
}
/* convert the output of the iteration to an actual
collection of nullspace vectors */
mul_MxN_Nx64(vsize, dense_rows, ncols, B, x, v[1]);
mul_MxN_Nx64(vsize, dense_rows, ncols, B, v[0], v[2]);
combine_cols(ncols, x, v[0], v[1], v[2]);
/* verify that these really are linear dependencies of B */
mul_MxN_Nx64(vsize, dense_rows, ncols, B, x, v[0]);
for (i = 0; i < ncols; i++) {
if (v[0][i] != 0)
break;
}
if (i < ncols) {
flint_printf("lanczos error: dependencies don't work %wd\n",i);
abort();
}
flint_free(v[0]);
flint_free(v[1]);
flint_free(v[2]);
return x;
}