namespace asm_code {
const double range_check_range=double((1ull<<53)-1);
const uint64 double_sign_mask=(1ull<<63);
const uint64 double_abs_mask=~double_sign_mask;
void range_check(
reg_vector v, reg_vector range, reg_vector c_double_abs_mask,
string out_of_range_label
) {
EXPAND_MACROS_SCOPE;
m.bind(range, "range");
m.bind(c_double_abs_mask, "double_abs_mask");
m.bind(v, "tmp");
APPEND_M(str( "ANDPD `tmp, `double_abs_mask" ));
APPEND_M(str( "CMPNLEPD `tmp, `range" ));
APPEND_M(str( "PTEST `tmp, `tmp" ));
APPEND_M(str( "JNZ #", out_of_range_label ));
}
void dot_product_exact(
array<reg_vector, 2> a, array<reg_vector, 2> b, reg_vector v, reg_vector range, reg_vector c_double_abs_mask,
string out_of_range_label, bool result_always_in_range=false
) {
EXPAND_MACROS_SCOPE;
m.bind(a, "a");
m.bind(b, "b");
m.bind(v, "v");
APPEND_M(str( "MULPD `b_0, `a_0" ));
APPEND_M(str( "MOVAPD `v, `b_0" ));
range_check(b[0], range, c_double_abs_mask, out_of_range_label);
if (enable_all_instructions) {
APPEND_M(str( "VFMADD231PD `v, `b_1, `a_1" ));
} else {
APPEND_M(str( "MULPD `b_1, `a_1" ));
APPEND_M(str( "ADDPD `v, `b_1" ));
range_check(b[1], range, c_double_abs_mask, out_of_range_label);
}
if (!result_always_in_range) {
APPEND_M(str( "MOVAPD `b_0, `v" ));
range_check(b[0], range, c_double_abs_mask, out_of_range_label);
}
}
void gcd_base_continued_fraction(
reg_alloc regs,
reg_vector ab, reg_vector u, reg_vector v, reg_vector is_lehmer, reg_vector ab_threshold,
string no_progress_label
) {
EXPAND_MACROS_SCOPE;
track_asm( "gcd_base" );
static double_table<continued_fraction> c_table=generate_table(gcd_table_num_exponent_bits, gcd_table_num_fraction_bits);
static bool outputted_table=false;
if (!outputted_table) {
#ifdef CHIAOSX
APPEND_M(str( ".text " ));
#else
APPEND_M(str( ".text 1" ));
#endif
APPEND_M(str( ".balign 64" ));
APPEND_M(asmprefix+str( "gcd_base_table:" ));
string table_data;
auto out_double=[&](double v) {
if (!table_data.empty()) {
table_data += ", ";
}
table_data+=to_hex(*(uint64*)&v);
};
for (continued_fraction c : c_table.data) {
matrix2 mat=c.get_matrix();
out_double(mat[0][0]); out_double(mat[1][0]); out_double(mat[0][1]); out_double(mat[1][1]);
APPEND_M(str( ".quad #", table_data ));
table_data.clear();
}
APPEND_M(str( ".text" ));
outputted_table=true;
}
m.bind(ab, "ab");
m.bind(u, "u");
m.bind(v, "v");
m.bind(is_lehmer, "is_lehmer");
m.bind(ab_threshold, "ab_threshold");
reg_vector m_0=regs.bind_vector(m, "m_0");
reg_vector m_1=regs.bind_vector(m, "m_1");
reg_vector new_ab=regs.bind_vector(m, "new_ab");
reg_vector new_ab_1=regs.bind_vector(m, "new_ab_1");
reg_vector tmp=regs.bind_vector(m, "tmp");
reg_vector tmp2=regs.bind_vector(m, "tmp2");
reg_vector new_u=regs.bind_vector(m, "new_u");
reg_vector new_v=regs.bind_vector(m, "new_v");
reg_vector q=regs.bind_vector(m, "q");
reg_vector c_range_check_range=regs.bind_vector(m, "range_check_range");
reg_vector c_double_abs_mask=regs.bind_vector(m, "double_abs_mask");
reg_scalar q_scalar=regs.bind_scalar(m, "q_scalar");
reg_scalar q_scalar_2=regs.bind_scalar(m, "q_scalar_2");
reg_scalar q_scalar_3=regs.bind_scalar(m, "q_scalar_3");
reg_scalar loop_counter=regs.bind_scalar(m, "loop_counter");
reg_scalar c_table_delta_minus_1=regs.bind_scalar(m, "c_table_delta_minus_1");
APPEND_M(str( "MOV `c_table_delta_minus_1, #", constant_address_uint64(c_table.delta-1, c_table.delta-1) ));
string exit_label=m.alloc_label();
string loop_label=m.alloc_label();
APPEND_M(str( "MOV `loop_counter, #", to_hex(gcd_base_max_iter) ));
APPEND_M(str( "MOVAPD `u, #", constant_address_double(1.0, 0.0) ));
APPEND_M(str( "MOVAPD `v, #", constant_address_double(0.0, 1.0) ));
APPEND_M(str( "MOVAPD `range_check_range, #", constant_address_double(range_check_range, range_check_range) ));
APPEND_M(str( "MOVAPD `double_abs_mask, #", constant_address_uint64(double_abs_mask, double_abs_mask) ));
APPEND_M(str( "MOVAPD `tmp, `ab" ));
APPEND_M(str( "SHUFPD `tmp, `tmp, 3" )); APPEND_M(str( "MOVAPD `q, `ab" ));
APPEND_M(str( "DIVSD `q, `tmp" ));
{
APPEND_M(str( "#:", loop_label ));
track_asm( "gcd_base iter" );
string no_table_label=m.alloc_label();
APPEND_M( "#gcd_base loop start" );
APPEND_M(str( "MOVQ `q_scalar, `q" ));
APPEND_M(str( "MOV `q_scalar_2, `q_scalar" ));
APPEND_M(str( "MOV `q_scalar_3, `q_scalar" ));
assert(c_table.right_shift_amount>5);
APPEND_M(str( "SHR `q_scalar, #", to_hex(c_table.right_shift_amount-5) ));
APPEND_M(str( "AND `q_scalar, -32" ));
APPEND_M(str( "SUB `q_scalar, #", to_hex(c_table.range_start_shifted<<5) ));
APPEND_M(str( "JB #", track_asm( "gcd_base below table start", no_table_label ) ));
APPEND_M(str( "CMP `q_scalar, #", to_hex((c_table.range_end_shifted-c_table.range_start_shifted)<<5) ));
APPEND_M(str( "JAE #", track_asm( "gcd_base after table end", no_table_label ) ));
#ifdef CHIAOSX
APPEND_M(str( "LEA RSI,[RIP+")+asmprefix+str("gcd_base_table]"));
APPEND_M(str( "MOVAPD `m_0, [`q_scalar+RSI]" ));
APPEND_M(str( "MOVAPD `m_1, [16+`q_scalar+RSI]" ));
#else
APPEND_M(str( "MOVAPD `m_0, [")+asmprefix+str("gcd_base_table+`q_scalar]" ));
APPEND_M(str( "MOVAPD `m_1, [")+asmprefix+str("gcd_base_table+16+`q_scalar]" ));
#endif
APPEND_M(str( "MOVAPD `tmp, `ab" ));
APPEND_M(str( "CMPLEPD `tmp, `ab_threshold" )); APPEND_M(str( "PTEST `tmp, `tmp" ));
APPEND_M(str( "JNZ #", track_asm( "gcd_base ab[1]<=ab_threshold", exit_label ) ));
APPEND_M(str( "AND `q_scalar_2, `c_table_delta_minus_1" ));
APPEND_M(str( "JZ #", track_asm( "gcd_base on slot boundary", no_table_label ) ));
APPEND_M(str( "CMP `q_scalar_2, `c_table_delta_minus_1" ));
APPEND_M(str( "JE #", track_asm( "gcd_base on slot boundary", no_table_label ) ));
auto calculate_using_m=[&](string fail_label) {
APPEND_M(str( "MOVAPD `tmp, `ab" ));
APPEND_M(str( "SHUFPD `tmp, `tmp, 0" ));
APPEND_M(str( "MOVAPD `tmp2, `ab" ));
APPEND_M(str( "SHUFPD `tmp2, `tmp2, 3" ));
dot_product_exact(
{m_0, m_1}, {tmp, tmp2}, new_ab, c_range_check_range, c_double_abs_mask,
track_asm( "gcd_base ab range check failed", fail_label),
true
);
APPEND_M(str( "MOVAPD `new_ab_1, `new_ab" ));
APPEND_M(str( "SHUFPD `new_ab_1, `new_ab_1, 3" ));
APPEND_M(str( "MOVAPD `q, `new_ab" ));
APPEND_M(str( "DIVSD `q, `new_ab_1" ));
APPEND_M(str( "MOVAPD `tmp, `u" ));
APPEND_M(str( "SHUFPD `tmp, `tmp, 0" ));
APPEND_M(str( "MOVAPD `tmp2, `u" ));
APPEND_M(str( "SHUFPD `tmp2, `tmp2, 3" ));
dot_product_exact(
{m_0, m_1}, {tmp, tmp2}, new_u, c_range_check_range, c_double_abs_mask,
track_asm( "gcd_base uv range check failed", fail_label)
);
APPEND_M(str( "MOVAPD `tmp, `v" ));
APPEND_M(str( "SHUFPD `tmp, `tmp, 0" ));
APPEND_M(str( "MOVAPD `tmp2, `v" ));
APPEND_M(str( "SHUFPD `tmp2, `tmp2, 3" ));
dot_product_exact(
{m_0, m_1}, {tmp, tmp2}, new_v, c_range_check_range, c_double_abs_mask,
track_asm( "gcd_base uv range check failed", fail_label)
);
};
calculate_using_m(no_table_label);
APPEND_M(str( "UCOMISD `new_ab, `ab_threshold" ));
APPEND_M(str( "JBE #", track_asm( "gcd_base new_ab[0]<=ab_threshold for table", no_table_label ) ));
string lehmer_label=m.alloc_label();
APPEND_M(str( "JMP #", lehmer_label ));
APPEND_M(str( "#:", no_table_label ));
APPEND_M( "#gcd_base no table" );
{
track_asm( "gcd_base iter no table" );
APPEND_M(str( "MOVAPD `tmp, `ab" ));
APPEND_M(str( "CMPLEPD `tmp, `ab_threshold" )); APPEND_M(str( "PTEST `tmp, `tmp" ));
APPEND_M(str( "JNZ #", track_asm( "gcd_base ab[1]<=ab_threshold", exit_label ) ));
APPEND_M(str( "MOVQ `q, `q_scalar_3" ));
APPEND_M(str( "ROUNDSD `q, `q, 1" ));
APPEND_M(str( "MOVAPD `m_0, #", constant_address_double(0.0, 1.0) ));
APPEND_M(str( "MOVAPD `m_1, `m_0" )); APPEND_M(str( "SUBSD `m_1, `q" )); APPEND_M(str( "SHUFPD `m_1, `m_1, 1" ));
calculate_using_m(exit_label);
}
APPEND_M(str( "#:", lehmer_label ));
APPEND_M( "#gcd_base end no table" );
APPEND_M(str( "MOVAPD `m_0, `new_u" ));
APPEND_M(str( "SHUFPD `m_0, `new_v, 0" ));
APPEND_M(str( "MOVAPD `m_1, `new_u" ));
APPEND_M(str( "SHUFPD `m_1, `new_v, 3" ));
APPEND_M(str( "MOVAPD `tmp, `new_ab" ));
APPEND_M(str( "SHUFPD `tmp, `tmp, 0" ));
APPEND_M(str( "SUBPD `tmp, `new_ab_1" ));
APPEND_M(str( "ADDPD `tmp, `m_0" ));
APPEND_M(str( "CMPLTPD `tmp, `m_1" ));
APPEND_M(str( "XORPD `m_1, #", constant_address_uint64(double_sign_mask, double_sign_mask) ));
APPEND_M(str( "CMPLTPD `new_ab_1, `m_1" ));
APPEND_M(str( "ORPD `tmp, `new_ab_1" )); APPEND_M(str( "ANDPD `tmp, `is_lehmer" )); APPEND_M(str( "PTEST `tmp, `tmp" ));
APPEND_M(str( "JNZ #", track_asm( "gcd_base lehmer failed", exit_label ) ));
APPEND_M(str( "MOVAPD `ab, `new_ab" ));
APPEND_M(str( "MOVAPD `u, `new_u" ));
APPEND_M(str( "MOVAPD `v, `new_v" ));
track_asm( "gcd_base good iter" );
APPEND_M(str( "DEC `loop_counter" ));
APPEND_M(str( "JNZ #", loop_label ));
APPEND_M( "#gcd_base loop end" );
}
track_asm( "gcd_base good exit" );
APPEND_M(str( "#:", exit_label ));
APPEND_M(str( "CMP `loop_counter, #", to_hex(gcd_base_max_iter) ));
APPEND_M(str( "JE #", track_asm( "gcd_base no progress", no_progress_label ) ));
}
}