/*============================================================================

    Copyright 2006 Jason Papadopoulos.
    Copyright 2006, 2011 William Hart.

    This file is part of FLINT.

    FLINT is free software; you can redistribute it and/or modify it under
    the terms of the GNU General Public License as published by the Free
    Software Foundation; either version 2 of the License, or (at your option)
    any later version.

    FLINT is distributed in the hope that it will be useful, but WITHOUT ANY
    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
    FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
    details.

    You should have received a copy of the GNU General Public License along
    with FLINT; if not, write to the Free Software Foundation, Inc.,
    51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

===============================================================================

    Optionally, please be nice and tell me if you find this source to be
    useful. Again optionally, if you add to the functionality present here
    please consider making those additions public too, so that others may
    benefit from your work.

    --jasonp@boo.net 9/8/06

    The following modifications were made by William Hart:
        -added the utility function get_null_entry
        -reformatted original code so it would operate as a standalone
         filter and block Lanczos module

--------------------------------------------------------------------*/

#define ulong ulongxx /* interferes with system includes */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#undef ulong
#define ulong mp_limb_t

#include <gmp.h>
#include "flint.h"
#include "ulong_extras.h"
#include "qsieve.h"

#define BIT(x) (((uint64_t)(1)) << (x))

static const uint64_t bitmask[64] = {
    BIT( 0), BIT( 1), BIT( 2), BIT( 3), BIT( 4), BIT( 5), BIT( 6), BIT( 7),
    BIT( 8), BIT( 9), BIT(10), BIT(11), BIT(12), BIT(13), BIT(14), BIT(15),
    BIT(16), BIT(17), BIT(18), BIT(19), BIT(20), BIT(21), BIT(22), BIT(23),
    BIT(24), BIT(25), BIT(26), BIT(27), BIT(28), BIT(29), BIT(30), BIT(31),
    BIT(32), BIT(33), BIT(34), BIT(35), BIT(36), BIT(37), BIT(38), BIT(39),
    BIT(40), BIT(41), BIT(42), BIT(43), BIT(44), BIT(45), BIT(46), BIT(47),
    BIT(48), BIT(49), BIT(50), BIT(51), BIT(52), BIT(53), BIT(54), BIT(55),
    BIT(56), BIT(57), BIT(58), BIT(59), BIT(60), BIT(61), BIT(62), BIT(63),
};

/*--------------------------------------------------------------------*/
uint64_t get_null_entry(uint64_t * nullrows, slong i, slong l)
{
   /* Returns true if the entry with indices i,l is 1 in the
      supplied 64xN matrix. This is used to read the nullspace
      vectors which are output by the Lanczos routine */

   return nullrows[i] & bitmask[l];
}

/*--------------------------------------------------------------------*/
void reduce_matrix(qs_t qs_inf, slong *nrows, slong *ncols, la_col_t *cols)
{
   /* Perform light filtering on the nrows x ncols matrix
      specified by cols[]. The processing here is limited to
      deleting columns that contain a singleton row, then
      resizing the matrix to have a few more columns than rows.
      Because deleting a column reduces the counts in several
      different rows, the process must iterate to convergence.
Note that this step is not intended to make the Lanczos iteration run any faster (though it will); it's just that if we don't go to this trouble then there are factorizations for which the matrix step will fail outright */ slong r, c, i, j, k; slong passes; slong *counts; slong reduced_rows; slong reduced_cols; /* count the number of nonzero entries in each row */ counts = (slong *)flint_calloc((size_t)*nrows, sizeof(slong)); for (i = 0; i < *ncols; i++) { for (j = 0; j < cols[i].weight; j++) counts[cols[i].data[j]]++; } reduced_rows = *nrows; reduced_cols = *ncols; passes = 0; do { r = reduced_rows; /* remove any columns that contain the only entry in one or more rows, then update the row counts to reflect the missing column. Iterate until no more columns can be deleted */ do { c = reduced_cols; for (i = j = 0; i < reduced_cols; i++) { la_col_t *col = cols + i; for (k = 0; k < col->weight; k++) { if (counts[col->data[k]] < 2) break; } if (k < col->weight) { for (k = 0; k < col->weight; k++) { counts[col->data[k]]--; } free_col(col); clear_col(col); } else { cols[j++] = cols[i]; if (j-1 != i) clear_col(col); } } reduced_cols = j; } while (c != reduced_cols); /* count the number of rows that contain a nonzero entry */ for (i = reduced_rows = 0; i < *nrows; i++) { if (counts[i]) reduced_rows++; } /* Because deleting a column reduces the weight of many rows, the number of nonzero rows may be much less than the number of columns. Delete more columns until the matrix has the correct aspect ratio. Columns at the end of cols[] are the heaviest, so delete those (and update the row counts again) */ if (reduced_cols > reduced_rows + qs_inf->extra_rels) { for (i = reduced_rows + qs_inf->extra_rels; i < reduced_cols; i++) { la_col_t *col = cols + i; for (j = 0; j < col->weight; j++) { counts[col->data[j]]--; } free_col(col); clear_col(col); } reduced_cols = reduced_rows + qs_inf->extra_rels; } /* if any columns were deleted in the previous step, then the matrix is less dense and more columns can be deleted; iterate until no further deletions are possible */ passes++; } while (r != reduced_rows); #if (QS_DEBUG & 128) flint_printf("reduce to %wd x %wd in %wd passes\n", reduced_rows, reduced_cols, passes); #endif flint_free(counts); /* record the final matrix size. Note that we can't touch nrows because all the column data (and the sieving relations that produced it) would have to be updated */ *ncols = reduced_cols; } /*-------------------------------------------------------------------*/ static void mul_64x64_64x64(uint64_t *a, uint64_t *b, uint64_t *c ) { /* c[][] = x[][] * y[][], where all operands are 64 x 64 (i.e. contain 64 words of 64 bits each). The result may overwrite a or b. */ uint64_t ai, bj, accum; uint64_t tmp[64]; ulong i, j; for (i = 0; i < 64; i++) { j = 0; accum = 0; ai = a[i]; while (ai) { bj = b[j]; if( ai & 1 ) accum ^= bj; ai >>= 1; j++; } tmp[i] = accum; } memcpy(c, tmp, sizeof(tmp)); } /*-----------------------------------------------------------------------*/ static void precompute_Nx64_64x64(uint64_t *x, uint64_t *c) { /* Let x[][] be a 64 x 64 matrix in GF(2), represented as 64 words of 64 bits each. Let c[][] be an 8 x 256 matrix of 64-bit words. This code fills c[][] with a bunch of "partial matrix multiplies". For 0<=i<256, the j_th row of c[][] contains the matrix product ( i << (8*j) ) * x[][] where the quantity in parentheses is considered a 1 x 64 vector of elements in GF(2). The resulting table can dramatically speed up matrix multiplies by x[][]. 
*/ uint64_t accum, xk; ulong i, j, k, index; for (j = 0; j < 8; j++) { for (i = 0; i < 256; i++) { k = 0; index = i; accum = 0; while (index) { xk = x[k]; if (index & 1) accum ^= xk; index >>= 1; k++; } c[i] = accum; } x += 8; c += 256; } } /*-------------------------------------------------------------------*/ static void mul_Nx64_64x64_acc(uint64_t *v, uint64_t *x, uint64_t *c, uint64_t *y, slong n) { /* let v[][] be a n x 64 matrix with elements in GF(2), represented as an array of n 64-bit words. Let c[][] be an 8 x 256 scratch matrix of 64-bit words. This code multiplies v[][] by the 64x64 matrix x[][], then XORs the n x 64 result into y[][] */ slong i; uint64_t word; precompute_Nx64_64x64(x, c); for (i = 0; i < n; i++) { word = v[i]; y[i] ^= c[ 0*256 + ((word>> 0) & 0xff) ] ^ c[ 1*256 + ((word>> 8) & 0xff) ] ^ c[ 2*256 + ((word>>16) & 0xff) ] ^ c[ 3*256 + ((word>>24) & 0xff) ] ^ c[ 4*256 + ((word>>32) & 0xff) ] ^ c[ 5*256 + ((word>>40) & 0xff) ] ^ c[ 6*256 + ((word>>48) & 0xff) ] ^ c[ 7*256 + ((word>>56) ) ]; } } /*-------------------------------------------------------------------*/ static void mul_64xN_Nx64(uint64_t *x, uint64_t *y, uint64_t *c, uint64_t *xy, slong n) { /* Let x and y be n x 64 matrices. This routine computes the 64 x 64 matrix xy[][] given by transpose(x) * y. c[][] is a 256 x 8 scratch matrix of 64-bit words. */ slong i; memset(c, 0, 256 * 8 * sizeof(uint64_t)); memset(xy, 0, 64 * sizeof(uint64_t)); for (i = 0; i < n; i++) { uint64_t xi = x[i]; uint64_t yi = y[i]; c[ 0*256 + ( xi & 0xff) ] ^= yi; c[ 1*256 + ((xi >> 8) & 0xff) ] ^= yi; c[ 2*256 + ((xi >> 16) & 0xff) ] ^= yi; c[ 3*256 + ((xi >> 24) & 0xff) ] ^= yi; c[ 4*256 + ((xi >> 32) & 0xff) ] ^= yi; c[ 5*256 + ((xi >> 40) & 0xff) ] ^= yi; c[ 6*256 + ((xi >> 48) & 0xff) ] ^= yi; c[ 7*256 + ((xi >> 56) ) ] ^= yi; } for(i = 0; i < 8; i++) { ulong j; uint64_t a0, a1, a2, a3, a4, a5, a6, a7; a0 = a1 = a2 = a3 = 0; a4 = a5 = a6 = a7 = 0; for (j = 0; j < 256; j++) { if ((j >> i) & 1) { a0 ^= c[0*256 + j]; a1 ^= c[1*256 + j]; a2 ^= c[2*256 + j]; a3 ^= c[3*256 + j]; a4 ^= c[4*256 + j]; a5 ^= c[5*256 + j]; a6 ^= c[6*256 + j]; a7 ^= c[7*256 + j]; } } xy[ 0] = a0; xy[ 8] = a1; xy[16] = a2; xy[24] = a3; xy[32] = a4; xy[40] = a5; xy[48] = a6; xy[56] = a7; xy++; } } /*-------------------------------------------------------------------*/ static slong find_nonsingular_sub(uint64_t *t, slong *s, slong *last_s, slong last_dim, uint64_t *w) { /* given a 64x64 matrix t[][] (i.e. 
sixty-four 64-bit words) and a list of 'last_dim' column indices enumerated in last_s[]: - find a submatrix of t that is invertible - invert it and copy to w[][] - enumerate in s[] the columns represented in w[][] */ slong i, j; slong dim; slong cols[64]; uint64_t M[64][2]; uint64_t mask, *row_i, *row_j; uint64_t m0, m1; /* M = [t | I] for I the 64x64 identity matrix */ for (i = 0; i < 64; i++) { M[i][0] = t[i]; M[i][1] = bitmask[i]; } /* put the column indices from last_s[] into the back of cols[], and copy to the beginning of cols[] any column indices not in last_s[] */ mask = 0; for (i = 0; i < last_dim; i++) { cols[63 - i] = last_s[i]; mask |= bitmask[last_s[i]]; } for (i = j = 0; i < 64; i++) { if (!(mask & bitmask[i])) cols[j++] = i; } /* compute the inverse of t[][] */ for (i = dim = 0; i < 64; i++) { /* find the next pivot row and put in row i */ mask = bitmask[cols[i]]; row_i = M[cols[i]]; for (j = i; j < 64; j++) { row_j = M[cols[j]]; if (row_j[0] & mask) { m0 = row_j[0]; m1 = row_j[1]; row_j[0] = row_i[0]; row_j[1] = row_i[1]; row_i[0] = m0; row_i[1] = m1; break; } } /* if a pivot row was found, eliminate the pivot column from all other rows */ if (j < 64) { for (j = 0; j < 64; j++) { row_j = M[cols[j]]; if ((row_i != row_j) && (row_j[0] & mask)) { row_j[0] ^= row_i[0]; row_j[1] ^= row_i[1]; } } /* add the pivot column to the list of accepted columns */ s[dim++] = cols[i]; continue; } /* otherwise, use the right-hand half of M[] to compensate for the absence of a pivot column */ for (j = i; j < 64; j++) { row_j = M[cols[j]]; if (row_j[1] & mask) { m0 = row_j[0]; m1 = row_j[1]; row_j[0] = row_i[0]; row_j[1] = row_i[1]; row_i[0] = m0; row_i[1] = m1; break; } } if (j == 64) { #if (QS_DEBUG & 128) flint_printf("lanczos error: submatrix " "is not invertible\n"); #endif return 0; } /* eliminate the pivot column from the other rows of the inverse */ for (j = 0; j < 64; j++) { row_j = M[cols[j]]; if ((row_i != row_j) && (row_j[1] & mask)) { row_j[0] ^= row_i[0]; row_j[1] ^= row_i[1]; } } /* wipe out the pivot row */ row_i[0] = row_i[1] = 0; } /* the right-hand half of M[] is the desired inverse */ for (i = 0; i < 64; i++) w[i] = M[i][1]; /* The block Lanczos recurrence depends on all columns of t[][] appearing in s[] and/or last_s[]. Verify that condition here */ mask = 0; for (i = 0; i < dim; i++) mask |= bitmask[s[i]]; for (i = 0; i < last_dim; i++) mask |= bitmask[last_s[i]]; if (mask != (uint64_t)(-1)) { #if (QS_DEBUG & 128) flint_printf("lanczos error: not all columns used\n"); #endif return 0; } return dim; } /*-------------------------------------------------------------------*/ void mul_MxN_Nx64(slong vsize, slong dense_rows, slong ncols, la_col_t *A, uint64_t *x, uint64_t *b) { /* Multiply the vector x[] by the matrix A (stored columnwise) and put the result in b[]. 
vsize refers to the number of uint64_t's allocated for x[] and b[]; vsize is probably different from ncols */ slong i, j; memset(b, 0, vsize * sizeof(uint64_t)); for (i = 0; i < ncols; i++) { la_col_t *col = A + i; slong *row_entries = col->data; uint64_t tmp = x[i]; for (j = 0; j < col->weight; j++) { b[row_entries[j]] ^= tmp; } } if (dense_rows) { for (i = 0; i < ncols; i++) { la_col_t *col = A + i; slong *row_entries = col->data + col->weight; uint64_t tmp = x[i]; for (j = 0; j < dense_rows; j++) { if (row_entries[j / 32] & ((slong)1 << (j % 32))) { b[j] ^= tmp; } } } } } /*-------------------------------------------------------------------*/ void mul_trans_MxN_Nx64(slong dense_rows, slong ncols, la_col_t *A, uint64_t *x, uint64_t *b) { /* Multiply the vector x[] by the transpose of the matrix A and put the result in b[]. Since A is stored by columns, this is just a matrix-vector product */ slong i, j; for (i = 0; i < ncols; i++) { la_col_t *col = A + i; slong *row_entries = col->data; uint64_t accum = 0; for (j = 0; j < col->weight; j++) { accum ^= x[row_entries[j]]; } b[i] = accum; } if (dense_rows) { for (i = 0; i < ncols; i++) { la_col_t *col = A + i; slong *row_entries = col->data + col->weight; uint64_t accum = b[i]; for (j = 0; j < dense_rows; j++) { if (row_entries[j / 32] & ((slong)1 << (j % 32))) { accum ^= x[j]; } } b[i] = accum; } } } /*-----------------------------------------------------------------------*/ static void transpose_vector(slong ncols, uint64_t *v, uint64_t **trans) { /* Hideously inefficent routine to transpose a vector v[] of 64-bit words into a 2-D array trans[][] of 64-bit words */ slong i, j; slong col; uint64_t mask, word; for (i = 0; i < ncols; i++) { col = i / 64; mask = bitmask[i % 64]; word = v[i]; j = 0; while (word) { if (word & 1) trans[j][col] |= mask; word = word >> 1; j++; } } } /*-----------------------------------------------------------------------*/ void combine_cols(slong ncols, uint64_t *x, uint64_t *v, uint64_t *ax, uint64_t *av) { /* Once the block Lanczos iteration has finished, x[] and v[] will contain mostly nullspace vectors between them, as well as possibly some columns that are linear combinations of nullspace vectors. Given vectors ax[] and av[] that are the result of multiplying x[] and v[] by the matrix, this routine will use Gauss elimination on the columns of [ax | av] to find all of the linearly dependent columns. The column operations needed to accomplish this are mir- rored in [x | v] and the columns that are independent are skipped. Finally, the dependent columns are copied back into x[] and represent the nullspace vector output of the block Lanczos code. v[] and av[] can be NULL, in which case the elimination process assumes 64 dependencies instead of 128 */ slong i, j, k, bitpos, col, col_words, num_deps; uint64_t mask; uint64_t *matrix[128], *amatrix[128], *tmp; num_deps = 128; if (v == NULL || av == NULL) num_deps = 64; col_words = (ncols + 63) / 64; for (i = 0; i < num_deps; i++) { matrix[i] = (uint64_t *)flint_calloc((size_t)col_words, sizeof(uint64_t)); amatrix[i] = (uint64_t *)flint_calloc((size_t)col_words, sizeof(uint64_t)); } /* operations on columns can more conveniently become operations on rows if all the vectors are first transposed */ transpose_vector(ncols, x, matrix); transpose_vector(ncols, ax, amatrix); if (num_deps == 128) { transpose_vector(ncols, v, matrix + 64); transpose_vector(ncols, av, amatrix + 64); } /* Keep eliminating rows until the unprocessed part of amatrix[][] is all zero. 
The rows where this happens correspond to linearly dependent vectors in the nullspace */ for (i = bitpos = 0; i < num_deps && bitpos < ncols; bitpos++) { /* find the next pivot row */ mask = bitmask[bitpos % 64]; col = bitpos / 64; for (j = i; j < num_deps; j++) { if (amatrix[j][col] & mask) { tmp = matrix[i]; matrix[i] = matrix[j]; matrix[j] = tmp; tmp = amatrix[i]; amatrix[i] = amatrix[j]; amatrix[j] = tmp; break; } } if (j == num_deps) continue; /* a pivot was found; eliminate it from the remaining rows */ for (j++; j < num_deps; j++) { if (amatrix[j][col] & mask) { /* Note that the entire row, *not* just the nonzero part of it, must be eliminated; this is because the corresponding (dense) row of matrix[][] must have the same operation applied */ for (k = 0; k < col_words; k++) { amatrix[j][k] ^= amatrix[i][k]; matrix[j][k] ^= matrix[i][k]; } } } i++; } /* transpose rows i to 64 back into x[] */ for (j = 0; j < ncols; j++) { uint64_t word = 0; col = j / 64; mask = bitmask[j % 64]; for (k = i; k < 64; k++) { if (matrix[k][col] & mask) word |= bitmask[k]; } x[j] = word; } for (i = 0; i < num_deps; i++) { flint_free(matrix[i]); flint_free(amatrix[i]); } } /*-----------------------------------------------------------------------*/ uint64_t * block_lanczos(flint_rand_t state, slong nrows, slong dense_rows, slong ncols, la_col_t *B) { /* Solve Bx = 0 for some nonzero x; the computed solution, containing up to 64 of these nullspace vectors, is returned */ uint64_t *vnext, *v[3], *x, *v0; uint64_t *winv[3]; uint64_t *vt_a_v[2], *vt_a2_v[2]; uint64_t *scratch; uint64_t *d, *e, *f, *f2; uint64_t *tmp; slong s[2][64]; slong i, iter; slong n = ncols; slong dim0, dim1; uint64_t mask0, mask1; slong vsize; /* allocate all of the size-n variables. Note that because B has been preprocessed to ignore singleton rows, the number of rows may really be less than nrows and may be greater than ncols. vsize is the maximum of these two numbers */ vsize = FLINT_MAX(nrows, ncols); v[0] = (uint64_t *)flint_malloc(vsize * sizeof(uint64_t)); v[1] = (uint64_t *)flint_malloc(vsize * sizeof(uint64_t)); v[2] = (uint64_t *)flint_malloc(vsize * sizeof(uint64_t)); vnext = (uint64_t *)flint_malloc(vsize * sizeof(uint64_t)); x = (uint64_t *)flint_malloc(vsize * sizeof(uint64_t)); v0 = (uint64_t *)flint_malloc(vsize * sizeof(uint64_t)); scratch = (uint64_t *)flint_malloc(FLINT_MAX(vsize, 256 * 8) * sizeof(uint64_t)); /* allocate all the 64x64 variables */ winv[0] = (uint64_t *)flint_malloc(64 * sizeof(uint64_t)); winv[1] = (uint64_t *)flint_malloc(64 * sizeof(uint64_t)); winv[2] = (uint64_t *)flint_malloc(64 * sizeof(uint64_t)); vt_a_v[0] = (uint64_t *)flint_malloc(64 * sizeof(uint64_t)); vt_a_v[1] = (uint64_t *)flint_malloc(64 * sizeof(uint64_t)); vt_a2_v[0] = (uint64_t *)flint_malloc(64 * sizeof(uint64_t)); vt_a2_v[1] = (uint64_t *)flint_malloc(64 * sizeof(uint64_t)); d = (uint64_t *)flint_malloc(64 * sizeof(uint64_t)); e = (uint64_t *)flint_malloc(64 * sizeof(uint64_t)); f = (uint64_t *)flint_malloc(64 * sizeof(uint64_t)); f2 = (uint64_t *)flint_malloc(64 * sizeof(uint64_t)); /* The iterations computes v[0], vt_a_v[0], vt_a2_v[0], s[0] and winv[0]. 
Subscripts larger than zero represent past versions of these quantities, which start off empty (except for the past version of s[], which contains all the column indices */ memset(v[1], 0, vsize * sizeof(uint64_t)); memset(v[2], 0, vsize * sizeof(uint64_t)); for (i = 0; i < 64; i++) { s[1][i] = i; vt_a_v[1][i] = 0; vt_a2_v[1][i] = 0; winv[1][i] = 0; winv[2][i] = 0; } dim0 = 0; dim1 = 64; mask1 = (uint64_t)(-1); iter = 0; /* The computed solution 'x' starts off random, and v[0] starts off as B*x. This initial copy of v[0] must be saved off separately */ for (i = 0; i < n; i++) #if FLINT_BITS==64 v[0][i] = (uint64_t) n_randlimb(state); #else v[0][i] = (uint64_t) n_randlimb(state) + ((uint64_t) n_randlimb(state) << 32); #endif memcpy(x, v[0], vsize * sizeof(uint64_t)); mul_MxN_Nx64(vsize, dense_rows, ncols, B, v[0], scratch); mul_trans_MxN_Nx64(dense_rows, ncols, B, scratch, v[0]); memcpy(v0, v[0], vsize * sizeof(uint64_t)); /* perform the iteration */ while (1) { iter++; /* multiply the current v[0] by a symmetrized version of B, or B'B (apostrophe means transpose). Use "A" to refer to B'B */ mul_MxN_Nx64(vsize, dense_rows, ncols, B, v[0], scratch); mul_trans_MxN_Nx64(dense_rows, ncols, B, scratch, vnext); /* compute v0'*A*v0 and (A*v0)'(A*v0) */ mul_64xN_Nx64(v[0], vnext, scratch, vt_a_v[0], n); mul_64xN_Nx64(vnext, vnext, scratch, vt_a2_v[0], n); /* if the former is orthogonal to itself, then the iteration has finished */ for (i = 0; i < 64; i++) { if (vt_a_v[0][i] != 0) break; } if (i == 64) { break; } /* Find the size-'dim0' nonsingular submatrix of v0'*A*v0, invert it, and list the column indices present in the submatrix */ dim0 = find_nonsingular_sub(vt_a_v[0], s[0], s[1], dim1, winv[0]); if (dim0 == 0) break; /* mask0 contains one set bit for every column that participates in the inverted submatrix computed above */ mask0 = 0; for (i = 0; i < dim0; i++) mask0 |= bitmask[s[0][i]]; /* compute d */ for (i = 0; i < 64; i++) d[i] = (vt_a2_v[0][i] & mask0) ^ vt_a_v[0][i]; mul_64x64_64x64(winv[0], d, d); for (i = 0; i < 64; i++) d[i] = d[i] ^ bitmask[i]; /* compute e */ mul_64x64_64x64(winv[1], vt_a_v[0], e); for (i = 0; i < 64; i++) e[i] = e[i] & mask0; /* compute f */ mul_64x64_64x64(vt_a_v[1], winv[1], f); for (i = 0; i < 64; i++) f[i] = f[i] ^ bitmask[i]; mul_64x64_64x64(winv[2], f, f); for (i = 0; i < 64; i++) f2[i] = ((vt_a2_v[1][i] & mask1) ^ vt_a_v[1][i]) & mask0; mul_64x64_64x64(f, f2, f); /* compute the next v */ for (i = 0; i < n; i++) vnext[i] = vnext[i] & mask0; mul_Nx64_64x64_acc(v[0], d, scratch, vnext, n); mul_Nx64_64x64_acc(v[1], e, scratch, vnext, n); mul_Nx64_64x64_acc(v[2], f, scratch, vnext, n); /* update the computed solution 'x' */ mul_64xN_Nx64(v[0], v0, scratch, d, n); mul_64x64_64x64(winv[0], d, d); mul_Nx64_64x64_acc(v[0], d, scratch, x, n); /* rotate all the variables */ tmp = v[2]; v[2] = v[1]; v[1] = v[0]; v[0] = vnext; vnext = tmp; tmp = winv[2]; winv[2] = winv[1]; winv[1] = winv[0]; winv[0] = tmp; tmp = vt_a_v[1]; vt_a_v[1] = vt_a_v[0]; vt_a_v[0] = tmp; tmp = vt_a2_v[1]; vt_a2_v[1] = vt_a2_v[0]; vt_a2_v[0] = tmp; memcpy(s[1], s[0], 64 * sizeof(slong)); mask1 = mask0; dim1 = dim0; } #if (QS_DEBUG & 128) flint_printf("lanczos halted after %wd iterations\n", iter); #endif /* free unneeded storage */ flint_free(vnext); flint_free(scratch); flint_free(v0); flint_free(vt_a_v[0]); flint_free(vt_a_v[1]); flint_free(vt_a2_v[0]); flint_free(vt_a2_v[1]); flint_free(winv[0]); flint_free(winv[1]); flint_free(winv[2]); flint_free(d); flint_free(e); flint_free(f); 
flint_free(f2); /* if a recoverable failure occurred, start everything over again */ if (dim0 == 0) { #if (QS_DEBUG & 128) flint_printf("linear algebra failed; retrying...\n"); #endif flint_free(x); flint_free(v[0]); flint_free(v[1]); flint_free(v[2]); return NULL; } /* convert the output of the iteration to an actual collection of nullspace vectors */ mul_MxN_Nx64(vsize, dense_rows, ncols, B, x, v[1]); mul_MxN_Nx64(vsize, dense_rows, ncols, B, v[0], v[2]); combine_cols(ncols, x, v[0], v[1], v[2]); /* verify that these really are linear dependencies of B */ mul_MxN_Nx64(vsize, dense_rows, ncols, B, x, v[0]); for (i = 0; i < ncols; i++) { if (v[0][i] != 0) break; } if (i < ncols) { flint_printf("lanczos error: dependencies don't work %wd\n",i); abort(); } flint_free(v[0]); flint_free(v[1]); flint_free(v[2]); return x; }
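
/*-----------------------------------------------------------------------*/

/* Illustrative sketch only (not part of the original module), showing one
   plausible way a quadratic sieve driver might consume the routines above:
   filter the matrix, run block Lanczos (retrying on the recoverable NULL
   failure), then read the nullspace vectors back with get_null_entry.
   The function name and the "combine relation" step are hypothetical
   placeholders; only the calls to reduce_matrix, block_lanczos and
   get_null_entry refer to code in this file. */

#if 0
static void example_read_nullspace(flint_rand_t state, qs_t qs_inf,
                slong nrows, slong dense_rows, slong ncols, la_col_t * cols)
{
    slong i, l;
    uint64_t * nullrows = NULL;

    /* delete singleton columns and trim the matrix to shape */
    reduce_matrix(qs_inf, &nrows, &ncols, cols);

    /* block_lanczos returns NULL on a recoverable failure, so retry */
    while (nullrows == NULL)
        nullrows = block_lanczos(state, nrows, dense_rows, ncols, cols);

    /* bit l of word i is set iff column i of the matrix participates in
       the l-th nullspace vector (at most 64 vectors are returned) */
    for (l = 0; l < 64; l++)
        for (i = 0; i < ncols; i++)
            if (get_null_entry(nullrows, i, l))
            {
                /* combine relation i into dependency l here */
            }

    flint_free(nullrows);
}
#endif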