#ifndef GCD_UNSIGNED_H
#define GCD_UNSIGNED_H
//threshold is 0 to calculate the normal gcd
template<int size> void gcd_unsigned_slow(
    array<fixed_integer<uint64, size>, 2>& ab,
    array<fixed_integer<uint64, size>, 2>& uv,
    int& parity,
    const fixed_integer<uint64, size>& threshold=fixed_integer<uint64, size>(integer(0))
) {
    assert(ab[0]>threshold);

    while (ab[1]>threshold) {
        fixed_integer<uint64, size> q(ab[0]/ab[1]);
        fixed_integer<uint64, size> r(ab[0]%ab[1]);

        ab[0]=ab[1];
        ab[1]=r;

        //this is the absolute value of the cofactor matrix
        auto u1_new=uv[0] + q*uv[1];
        uv[0]=uv[1];
        uv[1]=u1_new;

        parity=-parity;
    }
}
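//usage sketch (illustration only; nothing below calls it this way): with threshold=0 this is the plain
//euclidean algorithm. starting from ab={12,8}, uv={1,0}, parity=1 the loop runs two iterations
//(quotients 1 and 2) and ends with ab={4,0}, uv={1,2}, parity back at its starting value; a second run
//with uv={0,1} ends with uv={1,3}. the leading entries (1 and 1) are |u| and |v| in the extended-gcd
//identity 1*12 - 1*8 = 4 = gcd(12,8), with the signs alternating once per quotient, which is what parity tracks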
//todo
//test this by constructing two numbers that have a specified quotient sequence (big quotients can be added to stress it)
//to generate numbers with a given quotient sequence, run the euclidean recurrence backwards:
//forward step: q=a/b ; a'=b ; b'=a-q*b ; terminates when b'=0
//start from the terminal state where b'=0 and all of the quotients are known
//last step backwards: b'=a-q*b=0 so a=q*b ; pick some b and that determines a
//each earlier step backwards: since a'=b and b'=a-q*a', recover a as a=b'+q*a'
//a sketch of this generator follows below
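//minimal sketch of that construction (illustrative only; make_inputs_from_quotients is a made-up name and
//is not referenced anywhere else). it uses plain uint64 for brevity, so it only works while the result fits
//in 64 bits; an actual test would use fixed_integer or integer. quotients must be >=1, and the last one >=2
//when there is more than one quotient (otherwise the forward run merges the final two steps)
inline array<uint64, 2> make_inputs_from_quotients(const vector<uint64>& quotients, uint64 g=1) {
    uint64 a=g; //terminal state of the euclidean algorithm: gcd in a, 0 in b
    uint64 b=0;
    for (int x=int(quotients.size())-1;x>=0;--x) {
        //undo one division step: forward is (a,b) -> (b, a-q*b), so backwards (a,b) -> (q*a+b, a)
        uint64 a_new=quotients[x]*a+b;
        b=a;
        a=a_new;
    }
    return {a, b}; //gcd(a,b)==g and dividing a by b reproduces the quotient list
}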
//uv is <1,0> to calculate |u| and <0,1> to calculate |v|
//parity is negated for each quotient
template<int size> void gcd_unsigned(
    array<fixed_integer<uint64, size>, 2>& ab,
    array<fixed_integer<uint64, size>, 2>& uv,
    int& parity,
    fixed_integer<uint64, size> threshold=fixed_integer<uint64, size>(integer(0))
) {
    typedef fixed_integer<uint64, size> int_t;

    static int test_asm_counter=0;
    ++test_asm_counter;

    bool test_asm_run=true;
    bool test_asm_print=(test_asm_counter%1000==0);
    bool debug_output=false;

    assert(ab[0]>=ab[1] && !ab[1].is_negative());
    assert(!ab[0].is_negative() && !ab[1].is_negative());
    assert(!uv[0].is_negative() && !uv[1].is_negative());

    auto ab_start=ab;
    auto uv_start=uv;
    int parity_start=parity;

    int a_num_bits_old=-1;
    int iter=0;

    vector<array<array<uint64, 2>, 2>> matricies;
    vector<int> local_parities;

    bool valid=true;
    while (true) {
        assert(ab[0]>=ab[1] && !ab[1].is_negative());

        if (debug_output) {
            print( "" );
            print( "" );
            print( "====================================" );
            for (int x=0;x<size;++x) print( "a limb", x, ab[0][x] );
            print( "" );
            for (int x=0;x<size;++x) print( "b limb", x, ab[1][x] );
            print( "" );
            for (int x=0;x<size;++x) print( "threshold limb", x, threshold[x] );
            print( "" );
        }
        if (ab[0]<=threshold) {
            valid=false;
            print( " gcd_unsigned slow 1" );
            break;
        }

        if (ab[1]<=threshold) {
            if (debug_output) print( "ab[1]<=threshold" );
            break;
        }
        //(notes for the asm implementation) there is a cached num limbs for a; the num limbs for b and ab_threshold is never larger
        //to calculate the new cached num limbs:
        //-look at the previous value. if that limb is 0, go on to the next lower limb. a cannot be 0 but this should still be tolerated without crashing
        //-unroll this two times
        //-if more than 2 iterations are required, use a loop
        //-a can only decrease in size, so its true size can't be larger than the cached size
        //-this also calculates the head limbs of a. the top 3 head limbs are needed; they are 0-padded if a has fewer than 3 nonzero limbs
        //-the 3 head limbs are used to do the shift
        //-this also truncates threshold and compares a[1] with the truncated value, exiting if they are equal. this is not
        // exactly the same as the c++ code
        //-should probably implement this in c++ first, to make the two codes the same
        int a_num_bits=ab[0].num_bits();

        int shift_amount=a_num_bits-128; //todo //changed this to 128 bits
        if (shift_amount<0) {
            shift_amount=0;
        }

        //print( "gcd_unsigned", a_num_bits, a_num_bits_old-a_num_bits );
        a_num_bits_old=a_num_bits;
        array<uint128, 2> ab_head={
            uint128(ab[0].window(shift_amount)) | (uint128(ab[0].window(shift_amount+64))<<64),
            uint128(ab[1].window(shift_amount)) | (uint128(ab[1].window(shift_amount+64))<<64)
        };
        //assert((ab_head[0]>>127)==0);
        //assert((ab_head[1]>>127)==0);

        uint128 threshold_head=uint128(threshold.window(shift_amount)) | (uint128(threshold.window(shift_amount+64))<<64);
        //assert((threshold_head>>127)==0);

        //don't actually need to do this
        //it will compare threshold_head with > so it will already exit if they are equal
        //if (shift_amount!=0) {
        //    ++threshold_head;
        //}
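        //worked example of the windowing above: if a_num_bits is 200 then shift_amount is 72, so ab_head[0]
        //holds bits [72,200) of a and ab_head[1] the same bit range of b (b<=a, so any higher bits of b are zero);
        //threshold_head is the corresponding bits of threshold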
if (debug_output) print( "a_num_bits:", a_num_bits );
if (debug_output) print( "a last index:", (a_num_bits+63/64)-1 );
if (debug_output) print( "shift_amount:", shift_amount );
if (debug_output) print( "ab_head[0]:", uint64(ab_head[0]), uint64(ab_head[0]>>64) );
if (debug_output) print( "ab_head[1]:", uint64(ab_head[1]), uint64(ab_head[1]>>64) );
if (debug_output) print( "threshold_head:", uint64(threshold_head), uint64(threshold_head>>64) );
        array<array<uint64, 2>, 2> uv_uint64;
        int local_parity; //1 if odd, 0 if even
        if (gcd_128(ab_head, uv_uint64, local_parity, shift_amount!=0, threshold_head)) {
            //int local_parity=(uv_double[1][1]<0)? 1 : 0; //sign bit
            bool even=(local_parity==0);

            if (debug_output) print( "u:", uv_uint64[0][0], uv_uint64[1][0] );
            if (debug_output) print( "v:", uv_uint64[0][1], uv_uint64[1][1] );
            if (debug_output) print( "local parity:", local_parity );

            uint64 uv_00=uv_uint64[0][0];
            uint64 uv_01=uv_uint64[0][1];
            uint64 uv_10=uv_uint64[1][0];
            uint64 uv_11=uv_uint64[1][1];
            //can use a_num_bits to make these smaller. this is at most a 2x speedup for these multiplications, which probably doesn't matter
            //can do this with an unsigned subtraction and just swap the pointers
            //
            //this is an unsigned subtraction with the input pointers swapped to make the result nonnegative
            //
            //this uses mulx/adox/adcx if available for the multiplication
            //will unroll the multiplication loop but early-exit based on the number of limbs in a (calculated before). this gives each
            //branch its own branch predictor entry. each branch is at a multiple of 4 limbs. don't need to pad a
            int_t a_new_1=ab[0]; a_new_1*=uv_00; a_new_1.set_negative(!even);
            int_t a_new_2=ab[1]; a_new_2*=uv_01; a_new_2.set_negative(even);

            int_t b_new_1=ab[0]; b_new_1*=uv_10; b_new_1.set_negative(even);
            int_t b_new_2=ab[1]; b_new_2*=uv_11; b_new_2.set_negative(!even);
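            //written out, with the matrix [[uv_00,uv_01],[uv_10,uv_11]] holding the absolute values of the cofactors:
            // even case: a_new=uv_00*a-uv_01*b ; b_new=uv_11*b-uv_10*a
            // odd case:  a_new=uv_01*b-uv_00*a ; b_new=uv_10*a-uv_11*b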
            //both of these are subtractions; the signs determine the direction. the result is nonnegative
            int_t a_new;
            int_t b_new;
            if (!even) {
                a_new=int_t(a_new_2 + a_new_1);
                b_new=int_t(b_new_1 + b_new_2);
            } else {
                a_new=int_t(a_new_1 + a_new_2);
                b_new=int_t(b_new_2 + b_new_1);
            }

            //this allows the add function to be optimized
            assert(!a_new.is_negative());
            assert(!b_new.is_negative());
            //do not do any of this stuff; instead return an array of matrices
            //the array is processed while it is being generated so it is cache line aligned, has a counter, etc
            ab[0]=a_new;
            ab[1]=b_new;

            //bx and by are nonnegative
            auto dot=[&](uint64 ax, uint64 ay, int_t bx, int_t by) -> int_t {
                bx*=ax;
                by*=ay;
                return int_t(bx+by);
            };
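            //uv is updated by the same matrix of absolute values: new uv[0]=uv_00*uv[0]+uv_01*uv[1] and
            //new uv[1]=uv_10*uv[0]+uv_11*uv[1]; the signs are tracked separately through parity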
            int_t new_uv_0=dot(uv_00, uv_01, uv[0], uv[1]);
            int_t new_uv_1=dot(uv_10, uv_11, uv[0], uv[1]);

            uv[0]=new_uv_0;
            uv[1]=new_uv_1;

            //local_parity is 0 even, 1 odd
            //want 1 even, -1 odd
            //todo: don't do this; just make it 0 even, 1 odd
            parity*=1-local_parity-local_parity;

            matricies.push_back(uv_uint64);
            local_parities.push_back(local_parity);
        } else {
            //can just make the gcd fail if this happens in the asm code
            print( " gcd_unsigned slow" );
            //todo assert(false); //very unlikely to happen if there are no bugs
            valid=false;
            break;

            /*had_slow=true;
            fixed_integer<uint64, size> q(ab[0]/ab[1]);
            fixed_integer<uint64, size> r(ab[0]%ab[1]);
            ab[0]=ab[1];
            ab[1]=r;
            //this is the absolute value of the cofactor matrix
            auto u1_new=uv[0] + q*uv[1];
            uv[0]=uv[1];
            uv[1]=u1_new;
            parity=-parity;*/
        }

        ++iter;
    }
    {
        auto ab2=ab_start;
        auto uv2=uv_start;
        int parity2=parity_start;
        gcd_unsigned_slow(ab2, uv2, parity2, threshold);

        if (valid) {
            assert(integer(ab[0]) == integer(ab2[0]));
            assert(integer(ab[1]) == integer(ab2[1]));
            assert(integer(uv[0]) == integer(uv2[0]));
            assert(integer(uv[1]) == integer(uv2[1]));
            assert(parity==parity2);
        } else {
            ab=ab2;
            uv=uv2;
            parity=parity2;
        }
    }
    #ifdef TEST_ASM
    if (test_asm_run) {
        if (test_asm_print) {
            print( "test asm gcd_unsigned", test_asm_counter );
        }

        asm_code::asm_func_gcd_unsigned_data asm_data;

        const int asm_size=gcd_size;
        const int asm_max_iter=gcd_max_iterations;
        assert(size>=1 && size<=asm_size);

        fixed_integer<uint64, asm_size> asm_a(ab_start[0]);
        fixed_integer<uint64, asm_size> asm_b(ab_start[1]);
        fixed_integer<uint64, asm_size> asm_a_2;
        fixed_integer<uint64, asm_size> asm_b_2;
        fixed_integer<uint64, asm_size> asm_threshold(threshold);

        uint64 asm_uv_counter_start=1234;
        uint64 asm_uv_counter=asm_uv_counter_start;

        array<array<uint64, 8>, asm_max_iter+1> asm_uv;
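        //assumed record layout, inferred from the asm_get_* accessors below: each iteration appends one
        //8-word (cache line sized) record holding |u00|, |u10|, |u01|, |u11|, the local parity, and an exit
        //flag, with the remaining two words apparently unused here; out_uv_addr points at asm_uv[1]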
        asm_data.a=&asm_a[0];
        asm_data.b=&asm_b[0];
        asm_data.a_2=&asm_a_2[0];
        asm_data.b_2=&asm_b_2[0];
        asm_data.threshold=&asm_threshold[0];

        asm_data.uv_counter_start=asm_uv_counter_start;
        asm_data.out_uv_counter_addr=&asm_uv_counter;
        asm_data.out_uv_addr=(uint64*)&asm_uv[1];
        asm_data.iter=-2; //uninitialized
        asm_data.a_end_index=size-1;

        int error_code=hasAVX2()?
            asm_code::asm_avx2_func_gcd_unsigned(&asm_data):
            asm_code::asm_cel_func_gcd_unsigned(&asm_data);
        auto asm_get_uv=[&](int i) {
            array<array<uint64, 2>, 2> res;
            res[0][0]=asm_uv[i+1][0];
            res[1][0]=asm_uv[i+1][1];
            res[0][1]=asm_uv[i+1][2];
            res[1][1]=asm_uv[i+1][3];
            return res;
        };

        auto asm_get_parity=[&](int i) {
            uint64 r=asm_uv[i+1][4];
            assert(r==0 || r==1);
            return bool(r);
        };

        auto asm_get_exit_flag=[&](int i) {
            uint64 r=asm_uv[i+1][5];
            assert(r==0 || r==1);
            return bool(r);
        };
        if (error_code==0) {
            assert(valid);
            assert(asm_data.iter>=0 && asm_data.iter<=asm_max_iter); //total number of iterations performed

            bool is_even=((asm_data.iter-1)&1)==0; //parity of last iteration (can be -1)
            fixed_integer<uint64, asm_size>& asm_a_res=(is_even)? asm_a_2 : asm_a;
            fixed_integer<uint64, asm_size>& asm_b_res=(is_even)? asm_b_2 : asm_b;

            assert(integer(asm_a_res) == integer(ab[0]));
            assert(integer(asm_b_res) == integer(ab[1]));

            for (int x=0;x<=matricies.size();++x) {
                assert( asm_get_exit_flag(x-1) == (x==matricies.size()) );
                if (x!=matricies.size()) {
                    assert(asm_get_parity(x)==local_parities[x]);
                    assert(asm_get_uv(x)==matricies[x]);
                }
            }

            assert(matricies.size()==asm_data.iter);
            assert(asm_uv_counter==asm_uv_counter_start+asm_data.iter-1); //the last iteration that updated the counter is iter-1
        } else {
            if (!valid) {
                print( "test asm gcd_unsigned error", error_code );
            }
        }
    }
    #endif
    assert(integer(ab[0])>integer(threshold));
    assert(integer(ab[1])<=integer(threshold));
}

#endif //end of header guard GCD_UNSIGNED_H