#include <gmp.h>
#include "ulong_extras.h"
#include "fft_small.h"
#include "profiler.h"
void flint_print_d_fixed(double x, ulong l)
{
ulong i;
TMP_INIT;
TMP_START;
char* s = TMP_ARRAY_ALLOC(l + 1, char);
for (i = 0; i < l; i++)
s[i] = ' ';
s[l] = 0;
ulong y = fabs(rint(x));
while (l > 0)
{
s[--l] = '0' + (y%10);
y = y/10;
if (y == 0)
break;
}
printf("%s", s);
TMP_END;
}
void flint_print_d_fixed_dot(double x, ulong l, ulong r)
{
ulong i;
TMP_INIT;
TMP_START;
char* s = TMP_ARRAY_ALLOC(l + r + 2, char);
for (i = 0; i < l + r + 1; i++)
s[i] = ' ';
s[l + r + 1] = 0;
s[l] = '.';
ulong y = fabs(rint(x*n_pow(10, r)));
while (r > 0)
{
--r;
s[l+1+r] = '0' + (y%10);
y = y/10;
}
while (l > 0)
{
--l;
s[l] = '0' + (y%10);
y = y/10;
if (y == 0)
break;
}
printf("%s", s);
TMP_END;
}
void profile_mul(
mpn_ctx_t Q,
int min_lg_bit_size, int max_lg_bit_size,
int for_latex,
int with_precomp)
{
timeit_t timer;
int first = 1;
min_lg_bit_size = FLINT_MAX(min_lg_bit_size, 12);
for (int lg_bit_size = min_lg_bit_size; lg_bit_size < max_lg_bit_size; lg_bit_size++)
{
ulong max_len = n_pow2(1+lg_bit_size-6);
ulong* data = FLINT_ARRAY_ALLOC(max_len*2, ulong);
for (ulong i = 0; i < max_len; i++)
data[i] = -(ulong)(1+i);
double total_precomp = 0;
double max_precomp = 0;
ulong nprecomp_samples = 0;
for (int ci = 0; ci < 10; ci++)
{
ulong cn = max_len/2*pow(2.0, ci*1.0/10);
ulong cbits = cn*64;
double lgcbits = log2(cbits);
ulong nsamples = 0;
double total_time = 0;
double max_time = 0;
double min_time = 1.0e100;
for (ulong an = (cn+1)/2; an <= 3*cn/4; an += 1+an/12)
{
ulong bn = cn - an;
if (!(bn <= an && an < cn))
continue;
nsamples++;
ulong* a = data;
ulong* b = data + an;
ulong* c = data + max_len;
double precomp = 0;
if (with_precomp)
{
timeit_start_us(timer);
mpn_ctx_mpn_mul(Q, c, a, an, b, bn);
timeit_stop_us(timer);
precomp = (double)timer->wall;
nprecomp_samples++;
}
ulong nreps = 1 + 30000000/(cn*n_clog2(cn));
timeit_start_us(timer);
for (ulong rep = 0; rep < nreps; rep++)
mpn_ctx_mpn_mul(Q, c, a, an, b, bn);
timeit_stop_us(timer);
double time = ((double)timer->wall)/nreps;
precomp -= time;
precomp = precomp*1e5/(lgcbits*cbits);
max_precomp = FLINT_MAX(max_precomp, precomp);
total_precomp += precomp;
time = time*1e5/(lgcbits*cbits);
total_time += time;
min_time = FLINT_MIN(min_time, time);
max_time = FLINT_MAX(max_time, time);
}
if (nsamples < 1)
continue;
if (for_latex)
{
if (!first)
flint_printf(",");
flint_printf("(%f,%f),", lgcbits, min_time);
flint_printf("(%f,%f),", lgcbits, total_time/nsamples);
flint_printf("(%f,%f)", lgcbits, max_time);
}
else
{
flint_print_d_fixed_dot(lgcbits, 2, 2);
flint_printf(":");
flint_print_d_fixed_dot(total_time/nsamples, 3, 3);
flint_print_d_fixed_dot(min_time, 3, 3);
flint_print_d_fixed_dot(max_time, 3, 3);
flint_printf("\n");
}
fflush(stdout);
first = 0;
}
if (!for_latex && nprecomp_samples > 0)
{
flint_printf("precomp: avg %f, max %f\n", total_precomp/nprecomp_samples, max_precomp);
}
flint_free(data);
}
}
void profile_mul_gmp(
int min_lg_bit_size, int max_lg_bit_size,
int for_latex,
int with_precomp)
{
timeit_t timer;
int first = 1;
min_lg_bit_size = FLINT_MAX(min_lg_bit_size, 12);
for (int lg_bit_size = min_lg_bit_size; lg_bit_size < max_lg_bit_size; lg_bit_size++)
{
ulong max_len = n_pow2(1+lg_bit_size-6);
ulong* data = FLINT_ARRAY_ALLOC(max_len*2, ulong);
for (ulong i = 0; i < max_len; i++)
data[i] = -(ulong)(1+i);
double total_precomp = 0;
double max_precomp = 0;
ulong nprecomp_samples = 0;
for (int ci = 0; ci < 10; ci++)
{
ulong cn = max_len/2*pow(2.0, ci*1.0/10);
ulong cbits = cn*64;
double lgcbits = log2(cbits);
ulong nsamples = 0;
double total_time = 0;
double max_time = 0;
double min_time = 1.0e100;
for (ulong an = (cn+1)/2; an <= 3*cn/4; an += 1+an/12)
{
ulong bn = cn - an;
if (!(bn <= an && an < cn))
continue;
nsamples++;
ulong* a = data;
ulong* b = data + an;
ulong* c = data + max_len;
double precomp = 0;
if (with_precomp)
{
timeit_start(timer);
mpn_mul(c, a, an, b, bn);
timeit_stop(timer);
precomp = (double)timer->wall;
nprecomp_samples++;
}
ulong nreps = 1 + 20000000/(cn*n_clog2(cn));
timeit_start(timer);
for (ulong rep = 0; rep < nreps; rep++)
mpn_mul(c, a, an, b, bn);
timeit_stop(timer);
precomp -= ((double)timer->wall)/nreps;
max_precomp = FLINT_MAX(max_precomp, precomp*1e8/(lgcbits*cbits));
total_precomp += precomp*1e8/(lgcbits*cbits);
double time = ((double)timer->wall)*1e8/(lgcbits*cbits*nreps);
total_time += time;
min_time = FLINT_MIN(min_time, time);
max_time = FLINT_MAX(max_time, time);
}
if (nsamples < 1)
continue;
if (for_latex)
{
if (!first)
flint_printf(",");
flint_printf("(%f,%f),", lgcbits, min_time);
flint_printf("(%f,%f),", lgcbits, total_time/nsamples);
flint_printf("(%f,%f)", lgcbits, max_time);
}
else
{
flint_print_d_fixed_dot(lgcbits, 2, 2);
flint_printf(":");
flint_print_d_fixed_dot(total_time/nsamples, 3, 3);
flint_print_d_fixed_dot(min_time, 3, 3);
flint_print_d_fixed_dot(max_time, 3, 3);
flint_printf("\n");
}
fflush(stdout);
first = 0;
}
if (!for_latex && nprecomp_samples > 0)
{
flint_printf("precomp: %f, %f\n", total_precomp/nprecomp_samples, max_precomp);
}
flint_free(data);
}
}
int main(void)
{
mpn_ctx_t R;
#if 0#endif
mpn_ctx_init(R, UWORD(0x0003f00000000001));
flint_printf(" --- fft_small 1 thread --- \n");
flint_set_num_threads(1);
profile_mul(R, 20, 26, 0, 1);
#if 0#endif
#if 0#endif
mpn_ctx_clear(R);
return 0;
}