#include <string.h>
#include "qsieve.h"
#define BIT(x) (((uint64_t)(1)) << (x))
/* bitmask[i] is a 64-bit word with only bit i set; used throughout the
   block Lanczos routines below for single-bit tests over GF(2). */
static const uint64_t bitmask[64] = {
BIT( 0), BIT( 1), BIT( 2), BIT( 3), BIT( 4), BIT( 5), BIT( 6), BIT( 7),
BIT( 8), BIT( 9), BIT(10), BIT(11), BIT(12), BIT(13), BIT(14), BIT(15),
BIT(16), BIT(17), BIT(18), BIT(19), BIT(20), BIT(21), BIT(22), BIT(23),
BIT(24), BIT(25), BIT(26), BIT(27), BIT(28), BIT(29), BIT(30), BIT(31),
BIT(32), BIT(33), BIT(34), BIT(35), BIT(36), BIT(37), BIT(38), BIT(39),
BIT(40), BIT(41), BIT(42), BIT(43), BIT(44), BIT(45), BIT(46), BIT(47),
BIT(48), BIT(49), BIT(50), BIT(51), BIT(52), BIT(53), BIT(54), BIT(55),
BIT(56), BIT(57), BIT(58), BIT(59), BIT(60), BIT(61), BIT(62), BIT(63),
};
/* Return bit l of row i of the bit-packed nullspace matrix: nonzero
   exactly when nullspace vector l includes relation i. */
uint64_t get_null_entry(uint64_t * nullrows, slong i, slong l) {
    uint64_t row = nullrows[i];
    return row & bitmask[l];
}
/* Prune the relation matrix before running block Lanczos.

   cols holds *ncols sparse columns over *nrows rows.  A column with an
   entry in a "singleton" row (a row hit by fewer than two columns) can
   never take part in a nullspace dependency, so such columns are deleted
   repeatedly until none remain.  Columns in excess of
   reduced_rows + qs_inf->extra_rels are also dropped, and the surviving
   rows are renumbered densely.  *nrows and *ncols are updated to the
   reduced dimensions. */
void reduce_matrix(qs_t qs_inf, slong *nrows, slong *ncols, la_col_t *cols) {
    slong r, c, i, j, k;
    slong *counts;          /* counts[r] = number of columns with an entry in row r */
    slong reduced_rows;
    slong reduced_cols;
#if QS_DEBUG
    slong passes = 0;
#endif
    /* tally how many columns touch each row */
    counts = (slong *)flint_calloc((size_t)*nrows, sizeof(slong));
    for (i = 0; i < *ncols; i++) {
        for (j = 0; j < cols[i].weight; j++)
            counts[cols[i].data[j]]++;
    }
    reduced_rows = *nrows;
    reduced_cols = *ncols;
    do {
        r = reduced_rows;
        do {
            c = reduced_cols;
            /* drop every column containing a singleton row, compacting
               the survivors to the front of the array */
            for (i = j = 0; i < reduced_cols; i++) {
                la_col_t *col = cols + i;
                for (k = 0; k < col->weight; k++) {
                    if (counts[col->data[k]] < 2)
                        break;
                }
                if (k < col->weight) {
                    /* column hits a singleton row: delete it and undo
                       its contribution to the row counts */
                    for (k = 0; k < col->weight; k++) {
                        counts[col->data[k]]--;
                    }
                    free_col(col);
                    clear_col(col);
                }
                else {
                    cols[j++] = cols[i];
                    /* reset the vacated slot so its data pointer is not
                       aliased twice (clear_col is defined elsewhere --
                       presumably it just zeroes the struct) */
                    if (j-1 != i) clear_col(col);
                }
            }
            reduced_cols = j;
        } while (c != reduced_cols);   /* repeat until no column was removed */
        /* count the rows that still have entries */
        for (i = reduced_rows = 0; i < *nrows; i++) {
            if (counts[i])
                reduced_rows++;
        }
        /* keep only extra_rels more columns than rows -- enough to
           guarantee nontrivial dependencies exist */
        if (reduced_cols > reduced_rows + qs_inf->extra_rels) {
            for (i = reduced_rows + qs_inf->extra_rels;
                i < reduced_cols; i++) {
                la_col_t *col = cols + i;
                for (j = 0; j < col->weight; j++) {
                    counts[col->data[j]]--;
                }
                free_col(col);
                clear_col(col);
            }
            reduced_cols = reduced_rows + qs_inf->extra_rels;
        }
#if QS_DEBUG
        passes++;
#endif
    } while (r != reduced_rows);   /* trimming may create new singletons */
#if QS_DEBUG
    flint_printf("reduce to %wd x %wd in %wd passes\n",
        reduced_rows, reduced_cols, passes);
#endif
    /* renumber surviving rows densely; counts[] becomes an old->new map */
    for (i = 0, j = 0; i < *nrows; i++)
    {
        if (counts[i] > 0)
            counts[i] = j++;
    }
    /* rewrite the row indices stored in each remaining column */
    for (i = 0; i < reduced_cols; i++)
    {
        la_col_t *col = cols + i;
        for (j = 0; j < col->weight; j++)
            col->data[j] = counts[col->data[j]];
    }
    flint_free(counts);
    *ncols = reduced_cols;
    *nrows = reduced_rows;
}
/* 64x64 GF(2) matrix product c = a * b, one 64-bit row per word:
   row i of c is the XOR of the rows b[j] for which bit j of a[i] is
   set.  A temporary is used so c may alias a or b. */
static void mul_64x64_64x64(uint64_t *a, uint64_t *b, uint64_t *c ) {
    uint64_t prod[64];
    ulong i, j;
    for (i = 0; i < 64; i++) {
        uint64_t row = a[i];
        uint64_t acc = 0;
        for (j = 0; row != 0; j++, row >>= 1) {
            if (row & 1)
                acc ^= b[j];
        }
        prod[i] = acc;
    }
    memcpy(c, prod, sizeof(prod));
}
/* Build eight 256-entry lookup tables for fast multiplication by the
   64x64 GF(2) matrix x.  Table g, entry i, holds the XOR of the rows
   x[8*g + k] for each bit k set in the byte i.  c must have room for
   8 * 256 words. */
static void precompute_Nx64_64x64(uint64_t *x, uint64_t *c) {
    ulong group, i, k;
    for (group = 0; group < 8; group++) {
        for (i = 0; i < 256; i++) {
            uint64_t acc = 0;
            ulong bits = i;
            for (k = 0; bits != 0; k++, bits >>= 1) {
                if (bits & 1)
                    acc ^= x[k];
            }
            c[i] = acc;
        }
        /* advance to the next 8 rows of x and the next table */
        x += 8;
        c += 256;
    }
}
/* Accumulate y ^= v * x over GF(2), where v is an n x 64 bit matrix
   (one row per word) and x is 64x64.  c is scratch for the eight
   byte-lookup tables built by precompute_Nx64_64x64; each row of the
   product is assembled one byte of v[i] at a time. */
static void mul_Nx64_64x64_acc(uint64_t *v, uint64_t *x, uint64_t *c,
            uint64_t *y, slong n) {
    slong i;
    ulong byte;
    precompute_Nx64_64x64(x, c);
    for (i = 0; i < n; i++) {
        uint64_t w = v[i];
        uint64_t acc = 0;
        for (byte = 0; byte < 8; byte++)
            acc ^= c[byte * 256 + ((w >> (8 * byte)) & 0xff)];
        y[i] ^= acc;
    }
}
/* Compute the 64x64 GF(2) product xy = x^T * y, where x and y are
   n x 64 bit matrices stored one 64-bit row per word: xy[r] is the XOR
   of y[i] over all i for which bit r of x[i] is set.  c is scratch
   space for 8 tables of 256 words. */
static void mul_64xN_Nx64(uint64_t *x, uint64_t *y,
             uint64_t *c, uint64_t *xy, slong n) {
    slong i;
    memset(c, 0, 256 * 8 * sizeof(uint64_t));
    memset(xy, 0, 64 * sizeof(uint64_t));
    /* bucket pass: for each byte position g of x[i], XOR y[i] into the
       bucket of group g indexed by that byte's value */
    for (i = 0; i < n; i++) {
        uint64_t xi = x[i];
        uint64_t yi = y[i];
        c[ 0*256 + ( xi & 0xff) ] ^= yi;
        c[ 1*256 + ((xi >> 8) & 0xff) ] ^= yi;
        c[ 2*256 + ((xi >> 16) & 0xff) ] ^= yi;
        c[ 3*256 + ((xi >> 24) & 0xff) ] ^= yi;
        c[ 4*256 + ((xi >> 32) & 0xff) ] ^= yi;
        c[ 5*256 + ((xi >> 40) & 0xff) ] ^= yi;
        c[ 6*256 + ((xi >> 48) & 0xff) ] ^= yi;
        c[ 7*256 + ((xi >> 56) ) ] ^= yi;
    }
    /* combine pass: output row 8*g + i is the XOR of group g's buckets
       whose index has bit i set (xy is advanced once per i) */
    for(i = 0; i < 8; i++) {
        ulong j;
        uint64_t a0, a1, a2, a3, a4, a5, a6, a7;
        a0 = a1 = a2 = a3 = 0;
        a4 = a5 = a6 = a7 = 0;
        for (j = 0; j < 256; j++) {
            if ((j >> i) & 1) {
                a0 ^= c[0*256 + j];
                a1 ^= c[1*256 + j];
                a2 ^= c[2*256 + j];
                a3 ^= c[3*256 + j];
                a4 ^= c[4*256 + j];
                a5 ^= c[5*256 + j];
                a6 ^= c[6*256 + j];
                a7 ^= c[7*256 + j];
            }
        }
        xy[ 0] = a0; xy[ 8] = a1; xy[16] = a2; xy[24] = a3;
        xy[32] = a4; xy[40] = a5; xy[48] = a6; xy[56] = a7;
        xy++;
    }
}
/* Find a maximal nonsingular submatrix of the 64x64 GF(2) matrix t.

   s receives the indices of the columns making up the submatrix and the
   return value is its dimension (0 on failure).  last_s/last_dim are the
   columns chosen in the previous Lanczos iteration; they are deliberately
   processed last so that, as far as possible, different columns get
   chosen this time (required for the block recurrence to converge).
   w receives a 64x64 matrix which, restricted to the chosen columns,
   inverts the submatrix -- it is accumulated in the identity half of the
   augmented matrix M during Gauss-Jordan elimination. */
static slong find_nonsingular_sub(uint64_t *t, slong *s,
             slong *last_s, slong last_dim,
             uint64_t *w) {
    slong i, j;
    slong dim;
    slong cols[64];        /* column processing order */
    uint64_t M[64][2];     /* [0] = working copy of t, [1] = identity half */
    uint64_t mask, *row_i, *row_j;
    uint64_t m0, m1;
    /* augment t with the identity matrix */
    for (i = 0; i < 64; i++) {
        M[i][0] = t[i];
        M[i][1] = bitmask[i];
    }
    /* previously used columns go to the end of the ordering */
    mask = 0;
    for (i = 0; i < last_dim; i++) {
        cols[63 - i] = last_s[i];
        mask |= bitmask[last_s[i]];
    }
    for (i = j = 0; i < 64; i++) {
        if (!(mask & bitmask[i]))
            cols[j++] = i;
    }
    /* Gauss-Jordan elimination in the chosen column order */
    for (i = dim = 0; i < 64; i++) {
        mask = bitmask[cols[i]];
        row_i = M[cols[i]];
        /* search for a pivot in the matrix half */
        for (j = i; j < 64; j++) {
            row_j = M[cols[j]];
            if (row_j[0] & mask) {
                m0 = row_j[0];
                m1 = row_j[1];
                row_j[0] = row_i[0];
                row_j[1] = row_i[1];
                row_i[0] = m0;
                row_i[1] = m1;
                break;
            }
        }
        if (j < 64) {
            /* pivot found: clear this column from all other rows and
               accept the column into the submatrix */
            for (j = 0; j < 64; j++) {
                row_j = M[cols[j]];
                if ((row_i != row_j) && (row_j[0] & mask)) {
                    row_j[0] ^= row_i[0];
                    row_j[1] ^= row_i[1];
                }
            }
            s[dim++] = cols[i];
            continue;
        }
        /* no pivot in the matrix half: pivot on the identity half
           instead, excluding the column from the submatrix */
        for (j = i; j < 64; j++) {
            row_j = M[cols[j]];
            if (row_j[1] & mask) {
                m0 = row_j[0];
                m1 = row_j[1];
                row_j[0] = row_i[0];
                row_j[1] = row_i[1];
                row_i[0] = m0;
                row_i[1] = m1;
                break;
            }
        }
        if (j == 64) {
#if QS_DEBUG
            flint_printf("lanczos error: submatrix "
                "is not invertible\n");
#endif
            return 0;
        }
        /* eliminate the column elsewhere, then zero the pivot row so it
           cannot take part in any later pivot */
        for (j = 0; j < 64; j++) {
            row_j = M[cols[j]];
            if ((row_i != row_j) && (row_j[1] & mask)) {
                row_j[0] ^= row_i[0];
                row_j[1] ^= row_i[1];
            }
        }
        row_i[0] = row_i[1] = 0;
    }
    /* the identity half now holds the (partial) inverse */
    for (i = 0; i < 64; i++)
        w[i] = M[i][1];
    /* sanity check: together with last iteration's columns, all 64
       columns must have been used */
    mask = 0;
    for (i = 0; i < dim; i++)
        mask |= bitmask[s[i]];
    for (i = 0; i < last_dim; i++)
        mask |= bitmask[last_s[i]];
    if (mask != (uint64_t)(-1)) {
#if QS_DEBUG
        flint_printf("lanczos error: not all columns used\n");
#endif
        return 0;
    }
    return dim;
}
/* Compute b = A * x over GF(2), where A has ncols sparse columns (row
   indices in data[0..weight)) plus dense_rows dense rows bit-packed 32
   per word after the sparse entries.  b has vsize words and is zeroed
   first; b[r] ends up as the XOR of x[i] over all columns i with an
   entry in row r. */
static void mul_MxN_Nx64(slong vsize, slong dense_rows,
            slong ncols, la_col_t *A,
            uint64_t *x, uint64_t *b) {
    slong i, j;
    memset(b, 0, vsize * sizeof(uint64_t));
    /* sparse part: scatter x[i] into the rows listed in column i */
    for (i = 0; i < ncols; i++) {
        slong *entries = A[i].data;
        uint64_t xi = x[i];
        for (j = 0; j < A[i].weight; j++)
            b[entries[j]] ^= xi;
    }
    /* dense part: one bit per (row, column) pair, 32 bits per word */
    if (dense_rows) {
        for (i = 0; i < ncols; i++) {
            slong *bits = A[i].data + A[i].weight;
            uint64_t xi = x[i];
            for (j = 0; j < dense_rows; j++) {
                if (bits[j / 32] & ((slong)1 << (j % 32)))
                    b[j] ^= xi;
            }
        }
    }
}
/* Compute b = A^T * x over GF(2) for the same column-major sparse +
   packed-dense representation as mul_MxN_Nx64: b[i] is the XOR of x
   over every row with an entry in column i. */
static void mul_trans_MxN_Nx64(slong dense_rows, slong ncols,
            la_col_t *A, uint64_t *x, uint64_t *b) {
    slong i, j;
    /* sparse part: gather x along each column's row list */
    for (i = 0; i < ncols; i++) {
        slong *entries = A[i].data;
        uint64_t acc = 0;
        for (j = 0; j < A[i].weight; j++)
            acc ^= x[entries[j]];
        b[i] = acc;
    }
    /* dense part: 32 row bits per word after the sparse entries */
    if (dense_rows) {
        for (i = 0; i < ncols; i++) {
            slong *bits = A[i].data + A[i].weight;
            uint64_t acc = b[i];
            for (j = 0; j < dense_rows; j++) {
                if (bits[j / 32] & ((slong)1 << (j % 32)))
                    acc ^= x[j];
            }
            b[i] = acc;
        }
    }
}
/* Transpose the n x 64 bit matrix v into 64 bit-packed rows: after the
   call, bit (i mod 64) of trans[j][i/64] is set exactly when bit j of
   v[i] is set.  The rows of trans must be zeroed by the caller. */
static void transpose_vector(slong ncols, uint64_t *v, uint64_t **trans) {
    slong i, j;
    for (i = 0; i < ncols; i++) {
        slong word_idx = i / 64;
        uint64_t bit = bitmask[i % 64];
        uint64_t w = v[i];
        for (j = 0; w != 0; j++, w >>= 1) {
            if (w & 1)
                trans[j][word_idx] |= bit;
        }
    }
}
/* Combine the 64 candidate dependencies in x (and a further 64 in v,
   when v and av are non-NULL) into vectors genuinely in the nullspace
   of the matrix B.  ax and av hold B*x and B*v respectively.  The
   candidates are transposed into bit-packed rows, Gaussian elimination
   over GF(2) is run on the image rows (amatrix) with the same row
   operations mirrored on matrix, and the combinations whose image
   became zero are packed back into the bits of x. */
static void combine_cols(slong ncols,
            uint64_t *x, uint64_t *v,
            uint64_t *ax, uint64_t *av) {
    slong i, j, k, bitpos, col, col_words, num_deps;
    uint64_t mask;
    uint64_t *matrix[128], *amatrix[128], *tmp;
    num_deps = 128;
    if (v == NULL || av == NULL)
        num_deps = 64;
    col_words = (ncols + 63) / 64;   /* words per bit-packed row */
    for (i = 0; i < num_deps; i++) {
        matrix[i] = (uint64_t *)flint_calloc((size_t)col_words,
            sizeof(uint64_t));
        amatrix[i] = (uint64_t *)flint_calloc((size_t)col_words,
            sizeof(uint64_t));
    }
    /* rows 0..63 come from x/ax, rows 64..127 (if any) from v/av */
    transpose_vector(ncols, x, matrix);
    transpose_vector(ncols, ax, amatrix);
    if (num_deps == 128) {
        transpose_vector(ncols, v, matrix + 64);
        transpose_vector(ncols, av, amatrix + 64);
    }
    /* Gaussian elimination on amatrix, mirrored on matrix; after the
       loop, rows 0..i-1 have nonzero image and rows i.. have zero image */
    for (i = bitpos = 0; i < num_deps && bitpos < ncols; bitpos++) {
        mask = bitmask[bitpos % 64];
        col = bitpos / 64;
        /* find a pivot row for this bit position (swap via pointers) */
        for (j = i; j < num_deps; j++) {
            if (amatrix[j][col] & mask) {
                tmp = matrix[i];
                matrix[i] = matrix[j];
                matrix[j] = tmp;
                tmp = amatrix[i];
                amatrix[i] = amatrix[j];
                amatrix[j] = tmp;
                break;
            }
        }
        if (j == num_deps)
            continue;
        /* clear the pivot bit from all later rows */
        for (j++; j < num_deps; j++) {
            if (amatrix[j][col] & mask) {
                for (k = 0; k < col_words; k++) {
                    amatrix[j][k] ^= amatrix[i][k];
                    matrix[j][k] ^= matrix[i][k];
                }
            }
        }
        i++;
    }
    /* transpose rows i..63 (those with zero image, at most 64 can be
       returned) back into the bits of x */
    for (j = 0; j < ncols; j++) {
        uint64_t word = 0;
        col = j / 64;
        mask = bitmask[j % 64];
        for (k = i; k < 64; k++) {
            if (matrix[k][col] & mask)
                word |= bitmask[k];
        }
        x[j] = word;
    }
    for (i = 0; i < num_deps; i++) {
        flint_free(matrix[i]);
        flint_free(amatrix[i]);
    }
}
/* Block Lanczos over GF(2) (Montgomery's algorithm) applied to
   A = B^T B, where B is the nrows x ncols relation matrix (sparse
   columns plus dense_rows bit-packed dense rows).  Returns a newly
   allocated array of ncols words whose 64 bit-columns are candidate
   nullspace vectors of B (some may be zero), or NULL if the iteration
   broke down -- the caller should then retry with fresh randomness. */
uint64_t * block_lanczos(flint_rand_t state, slong nrows,
        slong dense_rows, slong ncols, la_col_t *B) {
    uint64_t *vnext, *v[3], *x, *v0;     /* iteration vectors (n x 64 bit blocks) */
    uint64_t *winv[3];                   /* inverses from the last three iterations */
    uint64_t *vt_a_v[2], *vt_a2_v[2];    /* V^T A V and V^T A^2 V, current/previous */
    uint64_t *scratch;                   /* workspace for the byte-table multiplies */
    uint64_t *d, *e, *f, *f2;            /* 64x64 coefficient matrices of the recurrence */
    uint64_t *tmp;
    slong s[2][64];                      /* columns of the nonsingular submatrix, current/previous */
    slong i;
    slong n = ncols;
    slong dim0, dim1;
    uint64_t mask0, mask1;
    slong vsize;
    vsize = FLINT_MAX(nrows, ncols);
    v[0] = flint_malloc(vsize * sizeof(uint64_t));
    v[1] = flint_calloc(vsize, sizeof(uint64_t));
    v[2] = flint_calloc(vsize, sizeof(uint64_t));
    vnext = flint_malloc(vsize * sizeof(uint64_t));
    x = flint_malloc(vsize * sizeof(uint64_t));
    v0 = flint_malloc(vsize * sizeof(uint64_t));
    scratch = flint_malloc(FLINT_MAX(vsize, 256 * 8) * sizeof(uint64_t));
    winv[0] = flint_malloc(64 * sizeof(uint64_t));
    winv[1] = flint_calloc(64, sizeof(uint64_t));
    winv[2] = flint_calloc(64, sizeof(uint64_t));
    vt_a_v[0] = flint_malloc(64 * sizeof(uint64_t));
    vt_a_v[1] = flint_calloc(64, sizeof(uint64_t));
    vt_a2_v[0] = flint_malloc(64 * sizeof(uint64_t));
    vt_a2_v[1] = flint_calloc(64, sizeof(uint64_t));
    d = flint_malloc(64 * sizeof(uint64_t));
    e = flint_malloc(64 * sizeof(uint64_t));
    f = flint_malloc(64 * sizeof(uint64_t));
    f2 = flint_malloc(64 * sizeof(uint64_t));
    /* "previous iteration" starts out as all 64 columns */
    for (i = 0; i < 64; i++)
        s[1][i] = i;
    dim0 = 0;
    dim1 = 64;
    mask1 = (uint64_t)(-1);
#if QS_DEBUG
    slong iter = 0;
#endif
    /* random start block; on 32-bit limbs combine two limbs per word */
    for (i = 0; i < n; i++)
#if FLINT_BITS==64
        v[0][i] = (uint64_t) n_randlimb(state);
#else
        v[0][i] = (uint64_t) n_randlimb(state) + ((uint64_t) n_randlimb(state) << 32);
#endif
    memcpy(x, v[0], vsize * sizeof(uint64_t));
    /* v[0] = A * (random block), with A = B^T B; keep a copy in v0 */
    mul_MxN_Nx64(vsize, dense_rows, ncols, B, v[0], scratch);
    mul_trans_MxN_Nx64(dense_rows, ncols, B, scratch, v[0]);
    memcpy(v0, v[0], vsize * sizeof(uint64_t));
    while (1) {
#if QS_DEBUG
        iter++;
#endif
        /* vnext = A v[0]; form V^T A V and V^T A^2 V */
        mul_MxN_Nx64(vsize, dense_rows, ncols, B, v[0], scratch);
        mul_trans_MxN_Nx64(dense_rows, ncols, B, scratch, vnext);
        mul_64xN_Nx64(v[0], vnext, scratch, vt_a_v[0], n);
        mul_64xN_Nx64(vnext, vnext, scratch, vt_a2_v[0], n);
        /* V^T A V == 0 means the iteration has converged */
        for (i = 0; i < 64; i++) {
            if (vt_a_v[0][i] != 0)
                break;
        }
        if (i == 64) {
            break;
        }
        /* invert a maximal nonsingular submatrix of V^T A V, avoiding
           the columns used last time */
        dim0 = find_nonsingular_sub(vt_a_v[0], s[0],
            s[1], dim1, winv[0]);
        if (dim0 == 0)
            break;   /* breakdown: fall through to the failure path */
        /* mask0 selects the columns kept this iteration */
        mask0 = 0;
        for (i = 0; i < dim0; i++)
            mask0 |= bitmask[s[0][i]];
        /* build the three 64x64 update matrices d, e, f of Montgomery's
           three-term recurrence */
        for (i = 0; i < 64; i++)
            d[i] = (vt_a2_v[0][i] & mask0) ^ vt_a_v[0][i];
        mul_64x64_64x64(winv[0], d, d);
        for (i = 0; i < 64; i++)
            d[i] = d[i] ^ bitmask[i];
        mul_64x64_64x64(winv[1], vt_a_v[0], e);
        for (i = 0; i < 64; i++)
            e[i] = e[i] & mask0;
        mul_64x64_64x64(vt_a_v[1], winv[1], f);
        for (i = 0; i < 64; i++)
            f[i] = f[i] ^ bitmask[i];
        mul_64x64_64x64(winv[2], f, f);
        for (i = 0; i < 64; i++)
            f2[i] = ((vt_a2_v[1][i] & mask1) ^
                vt_a_v[1][i]) & mask0;
        mul_64x64_64x64(f, f2, f);
        /* next block: A v[0] restricted to the kept columns, plus the
           recurrence contributions of the last three blocks */
        for (i = 0; i < n; i++)
            vnext[i] = vnext[i] & mask0;
        mul_Nx64_64x64_acc(v[0], d, scratch, vnext, n);
        mul_Nx64_64x64_acc(v[1], e, scratch, vnext, n);
        mul_Nx64_64x64_acc(v[2], f, scratch, vnext, n);
        /* accumulate the solution: x += v[0] * winv[0] * (v[0]^T v0) */
        mul_64xN_Nx64(v[0], v0, scratch, d, n);
        mul_64x64_64x64(winv[0], d, d);
        mul_Nx64_64x64_acc(v[0], d, scratch, x, n);
        /* rotate the iteration history (pointer swaps, no copies) */
        tmp = v[2];
        v[2] = v[1];
        v[1] = v[0];
        v[0] = vnext;
        vnext = tmp;
        tmp = winv[2];
        winv[2] = winv[1];
        winv[1] = winv[0];
        winv[0] = tmp;
        tmp = vt_a_v[1]; vt_a_v[1] = vt_a_v[0]; vt_a_v[0] = tmp;
        tmp = vt_a2_v[1]; vt_a2_v[1] = vt_a2_v[0]; vt_a2_v[0] = tmp;
        memcpy(s[1], s[0], 64 * sizeof(slong));
        mask1 = mask0;
        dim1 = dim0;
    }
#if QS_DEBUG
    flint_printf("lanczos halted after %wd iterations\n", iter);
#endif
    /* free everything except x and v[0..2], still needed below */
    flint_free(vnext);
    flint_free(scratch);
    flint_free(v0);
    flint_free(vt_a_v[0]);
    flint_free(vt_a_v[1]);
    flint_free(vt_a2_v[0]);
    flint_free(vt_a2_v[1]);
    flint_free(winv[0]);
    flint_free(winv[1]);
    flint_free(winv[2]);
    flint_free(d);
    flint_free(e);
    flint_free(f);
    flint_free(f2);
    if (dim0 == 0) {
        /* breakdown: release the rest and signal the caller to retry */
#if QS_DEBUG
        flint_printf("linear algebra failed; retrying...\n");
#endif
        flint_free(x);
        flint_free(v[0]);
        flint_free(v[1]);
        flint_free(v[2]);
        return NULL;
    }
    /* x and v[0] span the nullspace of A; combine them into vectors in
       the nullspace of B itself */
    mul_MxN_Nx64(vsize, dense_rows, ncols, B, x, v[1]);
    mul_MxN_Nx64(vsize, dense_rows, ncols, B, v[0], v[2]);
    combine_cols(ncols, x, v[0], v[1], v[2]);
    /* verify that B x is zero before handing back the dependencies */
    mul_MxN_Nx64(vsize, dense_rows, ncols, B, x, v[0]);
    for (i = 0; i < ncols; i++) {
        if (v[0][i] != 0)
            break;
    }
    if (i < ncols)
        flint_throw(FLINT_ERROR, "lanczos error: dependencies don't work %wd\n", i);
    flint_free(v[0]);
    flint_free(v[1]);
    flint_free(v[2]);
    return x;
}