namespace asm_code {
struct asm_integer {
reg_scalar addr_base;
reg_spill addr_base_spill;
int addr_offset=0;
bool is_signed=false;
int size=0;
asm_integer() {}
asm_integer(reg_spill t_spill, int t_size) {
addr_base_spill=t_spill;
size=t_size;
}
string operator[](int pos) {
assert(pos>=0 && pos<size);
return str( "[#+#]", addr_base.name(), to_hex(addr_offset+pos*8) );
}
bool is_null() {
return size==0;
}
void update_end_index(reg_alloc regs, reg_scalar end_index) {
EXPAND_MACROS_SCOPE;
assert(size%4==0);
assert(addr_offset==0);
m.bind(end_index, "end_index");
m.bind(addr_base, "addr_base");
reg_scalar tmp_value=regs.bind_scalar(m, "tmp_value");
reg_scalar tmp_0=regs.bind_scalar(m, "tmp_0");
reg_scalar tmp_8=regs.bind_scalar(m, "tmp_8");
APPEND_M(str( "LEA `end_index, [`addr_base+`end_index*8]" ));
APPEND_M(str( "XOR `tmp_0, `tmp_0" ));
APPEND_M(str( "MOV `tmp_8, 8" ));
string loop_label=m.alloc_label();
const int num_unroll=2;
assert(num_unroll>=1);
for (int x=0;x<num_unroll;++x) {
if (x==num_unroll-1) {
APPEND_M(str( "#:", loop_label ));
}
APPEND_M(str( "MOV `tmp_value, [`end_index]" ));
APPEND_M(str( "CMP `tmp_value, `tmp_0" ));
APPEND_M(str( "MOV `tmp_value, `tmp_0" ));
APPEND_M(str( "CMOVE `tmp_value, `tmp_8" ));
APPEND_M(str( "CMP `end_index, `addr_base" ));
APPEND_M(str( "CMOVE `tmp_value, `tmp_0" ));
APPEND_M(str( "SUB `end_index, `tmp_value" ));
if (x==1) {
APPEND_M(str( "CMP `tmp_value, `tmp_0" ));
APPEND_M(str( "JNE #", track_asm( "update_end_index loop", loop_label ) ));
}
}
APPEND_M(str( "SUB `end_index, `addr_base" ));
APPEND_M(str( "SHR `end_index, 3" ));
}
void calculate_head_start(reg_alloc regs, reg_scalar end_index) {
EXPAND_MACROS_SCOPE;
assert(size%4==0);
m.bind(end_index, "end_index");
reg_scalar tmp=regs.bind_scalar(m, "tmp");
APPEND_M(str( "XOR `tmp, `tmp" ));
APPEND_M(str( "SUB `end_index, 2" ));
APPEND_M(str( "CMOVB `end_index, `tmp" ));
}
void extract_head_at(reg_alloc regs, reg_scalar head_start, array<reg_scalar, 3> res) {
EXPAND_MACROS_SCOPE;
assert(size%4==0);
m.bind(addr_base, "addr_base");
m.bind(head_start, "head_start");
m.bind(res, "res");
reg_scalar tmp_addr=regs.bind_scalar(m, "tmp_addr");
APPEND_M(str( "LEA `tmp_addr, [`addr_base+`head_start*8+#]", to_hex(addr_offset) ));
APPEND_M(str( "MOV `res_0, [`tmp_addr]" ));
APPEND_M(str( "MOV `res_1, [`tmp_addr+8]" ));
APPEND_M(str( "MOV `res_2, [`tmp_addr+16]" ));
}
void mul_add_bmi(
reg_alloc regs, asm_integer a, reg_scalar b, asm_integer c, bool invert_output, bool carry_in_is_1
) {
EXPAND_MACROS_SCOPE;
m.bind(b, "b");
reg_scalar mul_low_0=regs.bind_scalar(m, "mul_low_0");
reg_scalar mul_low_1=regs.bind_scalar(m, "mul_low_1");
reg_scalar mul_high_0=regs.bind_scalar(m, "mul_high_0");
reg_scalar mul_high_1=regs.bind_scalar(m, "mul_high_1");
reg_scalar rdx=regs.bind_scalar(m, "rdx", reg_rdx);
APPEND_M(str( "XOR RDX, RDX" ));
if (carry_in_is_1) {
APPEND_M(str( "STC" ));
}
APPEND_M(str( "MOV RDX, `b" ));
for (int pos=0;pos<size;pos+=2) {
bool first=(pos==0);
APPEND_M(str( "MULX `mul_high_0, `mul_low_0, #", a[pos] ));
if (!first) {
APPEND_M(str( "ADOX `mul_low_0, `mul_high_1" ));
}
APPEND_M(str( "MULX `mul_high_1, `mul_low_1, #", a[pos+1] ));
APPEND_M(str( "ADOX `mul_low_1, `mul_high_0" ));
if (!c.is_null()) {
APPEND_M(str( "ADCX `mul_low_0, #", c[pos] ));
APPEND_M(str( "ADCX `mul_low_1, #", c[pos+1] ));
}
if (invert_output) {
APPEND_M(str( "NOT `mul_low_0" ));
APPEND_M(str( "NOT `mul_low_1" ));
}
APPEND_M(str( "MOV #, `mul_low_0", (*this)[pos] ));
APPEND_M(str( "MOV #, `mul_low_1", (*this)[pos+1] ));
}
}
void mul_add_slow(
reg_alloc regs, asm_integer a, reg_scalar b, asm_integer c, bool invert_output, bool carry_in_is_1
) {
EXPAND_MACROS_SCOPE;
m.bind(b, "b");
reg_scalar mul_carry=regs.bind_scalar(m, "mul_carry");
reg_scalar add_carry=regs.bind_scalar(m, "add_carry");
reg_scalar mul_high_4_previous=regs.bind_scalar(m, "mul_high_4_previous");
reg_scalar mul_low_0=regs.bind_scalar(m, "mul_low_0");
reg_scalar mul_low_1=regs.bind_scalar(m, "mul_low_1");
reg_scalar mul_low_2=regs.bind_scalar(m, "mul_low_2");
reg_scalar mul_low_3=regs.bind_scalar(m, "mul_low_3", reg_rax);
reg_scalar mul_high_0=regs.bind_scalar(m, "mul_high_0");
reg_scalar mul_high_1=regs.bind_scalar(m, "mul_high_1");
reg_scalar mul_high_2=regs.bind_scalar(m, "mul_high_2");
reg_scalar mul_high_3=regs.bind_scalar(m, "mul_high_3", reg_rdx);
for (int pos=0;pos<size;pos+=4) {
bool first=(pos==0);
bool last=(pos==size-4);
for (int x=0;x<4;++x) {
APPEND_M(str( "MOV RAX, `b" ));
APPEND_M(str( "MUL QWORD PTR #", a[pos+x] ));
if (x==3) {
assert(mul_low_3.value==reg_rax.value);
assert(mul_high_3.value==reg_rdx.value);
} else {
APPEND_M(str( "MOV `mul_low_#, RAX", x ));
APPEND_M(str( "MOV `mul_high_#, RDX", x ));
}
}
if (first) {
APPEND_M(str( "ADD `mul_low_1, `mul_high_0" ));
} else {
APPEND_M(str( "ADD `mul_carry, 1" )); APPEND_M(str( "ADC `mul_low_0, `mul_high_4_previous" ));
APPEND_M(str( "ADC `mul_low_1, `mul_high_0" ));
}
APPEND_M(str( "ADC `mul_low_2, `mul_high_1" ));
APPEND_M(str( "ADC `mul_low_3, `mul_high_2" ));
if (!last) {
APPEND_M(str( "MOV `mul_high_4_previous, `mul_high_3" ));
APPEND_M(str( "SBB `mul_carry, `mul_carry" )); }
if (!c.is_null()) {
if (first) {
if (carry_in_is_1) {
APPEND_M(str( "STC" ));
APPEND_M(str( "ADC `mul_low_0, #", c[pos] ));
} else {
APPEND_M(str( "ADD `mul_low_0, #", c[pos] ));
}
} else {
APPEND_M(str( "ADD `add_carry, 1" )); APPEND_M(str( "ADC `mul_low_0, #", c[pos] ));
}
for (int x=1;x<4;++x) {
APPEND_M(str( "ADC `mul_low_#, #", x, c[pos+x] ));
}
if (!last) {
APPEND_M(str( "SBB `add_carry, `add_carry" )); }
}
for (int x=0;x<4;++x) {
if (invert_output) {
APPEND_M(str( "NOT `mul_low_#", x ));
}
APPEND_M(str( "MOV #, `mul_low_#", (*this)[pos+x], x ));
}
}
}
void mul_add(
const reg_alloc& regs, asm_integer a, reg_scalar b, asm_integer c, bool invert_output, bool carry_in_is_1
) {
EXPAND_MACROS_SCOPE;
assert(!carry_in_is_1 || !c.is_null());
assert(size%4==0);
assert(size==a.size && (c.is_null() || size==c.size));
if (enable_all_instructions) {
mul_add_bmi(regs, a, b, c, invert_output, carry_in_is_1);
} else {
mul_add_slow(regs, a, b, c, invert_output, carry_in_is_1);
}
}
};
// Emits code that sets `res` to the bit length of the top limb `limbs_2`:
// (index of its highest set bit)+1, or 0 when `limbs_2` is zero.
// Only `limbs_2` is read by the emitted code; `tmp` is a scratch register taken from `regs`.
void calculate_shift_amount(reg_alloc regs, array<reg_scalar, 3> limbs, reg_scalar res) {
EXPAND_MACROS_SCOPE;
m.bind(limbs, "limbs");
m.bind(res, "res");
reg_scalar tmp=regs.bind_scalar(m, "tmp");
// BSR yields the index of the highest set bit; its destination is undefined for a zero source.
APPEND_M(str( "BSR `res, `limbs_2" ));
APPEND_M(str( "INC `res" ));
// INC has overwritten the zero flag set by BSR, so the zero case is re-tested explicitly
// and `res` is forced to 0 for it.
APPEND_M(str( "XOR `tmp, `tmp" ));
APPEND_M(str( "CMP `limbs_2, `tmp" ));
APPEND_M(str( "CMOVE `res, `tmp" ));
}
// Emits code that shifts the three-limb value limbs_2:limbs_1:limbs_0 right by `amount` bits and
// leaves the low two limbs of the result in limbs_0 and limbs_1. limbs_2 is not modified.
// The emitted code handles amounts 0..63 through SHRD and the amount 64 through the CMOVE pair;
// larger amounts are not handled.
void shift_right(reg_alloc regs, array<reg_scalar, 3> limbs, reg_scalar amount) {
EXPAND_MACROS_SCOPE;
m.bind(limbs, "limbs");
m.bind(amount, "amount");
// RCX is overwritten below because SHRD takes a variable count in CL.
// NOTE(review): presumably this call reserves RCX in `regs` so no bound register aliases it - confirm.
regs.get_scalar(reg_rcx);
APPEND_M(str( "MOV RCX, `amount" ));
APPEND_M(str( "SHRD `limbs_0, `limbs_1, CL" ));
APPEND_M(str( "SHRD `limbs_1, `limbs_2, CL" ));
// For 64-bit operands the CPU masks the count to 6 bits, so an amount of 64 shifts by 0 above.
// That case is a whole-limb move, done here.
APPEND_M(str( "CMP `amount, 64" ));
APPEND_M(str( "CMOVE `limbs_0, `limbs_1" ));
APPEND_M(str( "CMOVE `limbs_1, `limbs_2" ));
}
// Emits the assembly for the outer loop of an unsigned multi-limb GCD reduction.
//
// Shape of the emitted code (execution starts at loop_start, skipping the first two steps):
//   loop:        gcd_128(...) works on the 128-bit heads and leaves cofactors u_0,u_1,v_0,v_1
//                and a parity flag in spill slots;
//                dispatch on spill_a_end_index to one multiply_uv_size_N block, which computes
//                the next a and b from the current ones and the cofactors;
//   loop_start:  refresh spill_a_end_index, extract and align the heads of a, b and the
//                threshold into the 128-bit spill slots, run sanity checks, write one output
//                record, advance the iteration counter, then exit or jump back to loop.
//
// Parameters:
//   regs_parent                 allocator that every inner scope copies before taking registers
//   spill_a, spill_b            first buffer pair; addressed through addr_base_spill
//   spill_a_2, spill_b_2        second buffer pair; a and b alternate between the two pairs
//                               according to the low bit of the iteration counter
//   spill_threshold             the loop exits once the head of b is <= the head of the threshold
//   spill_uv_counter_start      value added to the iteration counter before it is published
//   spill_out_uv_counter_addr   address that receives spill_uv_counter_start + iter each pass
//   spill_out_uv_addr           base address of the 64-byte output records (indexed by iter)
//   spill_iter                  storage for the iteration counter; initialised to -1 here
//   spill_a_end_index           limb index used as the scan start for a's top limb; read and
//                               rewritten every pass
//   max_iterations              reaching this iteration count jumps to an error label
//
// All integer descriptors must have addr_offset 0, an unset addr_base (value -1) and equal sizes.
void gcd_unsigned(
    reg_alloc regs_parent,
    asm_integer spill_a, asm_integer spill_b, asm_integer spill_a_2, asm_integer spill_b_2, asm_integer spill_threshold,
    reg_spill spill_uv_counter_start, reg_spill spill_out_uv_counter_addr, reg_spill spill_out_uv_addr,
    reg_spill spill_iter, reg_spill spill_a_end_index, int max_iterations
) {
    EXPAND_MACROS_SCOPE_PUBLIC;

    track_asm( "gcd_unsigned" );

    int int_size=spill_a.size;

    assert(spill_a.addr_offset==0 && spill_b.addr_offset==0 && spill_threshold.addr_offset==0);
    assert(spill_a.addr_base.value==-1 && spill_b.addr_base.value==-1 && spill_threshold.addr_base.value==-1);

    assert(spill_a_2.addr_offset==0 && spill_b_2.addr_offset==0);
    assert(spill_a_2.addr_base.value==-1 && spill_b_2.addr_base.value==-1);

    assert(spill_a.size==int_size && spill_b.size==int_size && spill_threshold.size==int_size);
    assert(spill_a_2.size==int_size && spill_b_2.size==int_size);

    m.bind(spill_a.addr_base_spill, "spill_a_addr_base");
    m.bind(spill_a_2.addr_base_spill, "spill_a_2_addr_base");
    m.bind(spill_b.addr_base_spill, "spill_b_addr_base");
    m.bind(spill_b_2.addr_base_spill, "spill_b_2_addr_base");
    m.bind(spill_threshold.addr_base_spill, "spill_threshold_addr_base");

    m.bind(spill_iter, "spill_iter");
    m.bind(spill_uv_counter_start, "spill_uv_counter_start");
    m.bind(spill_out_uv_addr, "spill_out_uv_addr");
    m.bind(spill_out_uv_counter_addr, "spill_out_uv_counter_addr");
    m.bind(spill_a_end_index, "spill_a_end_index");

    // Cofactors, parity and the lehmer flag exchanged with gcd_128.
    reg_spill spill_u_0=regs_parent.bind_spill(m, "spill_u_0");
    reg_spill spill_u_1=regs_parent.bind_spill(m, "spill_u_1");
    reg_spill spill_v_0=regs_parent.bind_spill(m, "spill_v_0");
    reg_spill spill_v_1=regs_parent.bind_spill(m, "spill_v_1");
    reg_spill spill_parity=regs_parent.bind_spill(m, "spill_parity");
    reg_spill spill_is_lehmer=regs_parent.bind_spill(m, "spill_is_lehmer");

    // Two-limb head values; the "_8" names address the upper limb of each slot.
    reg_spill spill_a_128=regs_parent.bind_spill(m, "spill_a_128", 16, 8);
    reg_spill spill_b_128=regs_parent.bind_spill(m, "spill_b_128", 16, 8);
    reg_spill spill_threshold_128=regs_parent.bind_spill(m, "spill_threshold_128", 16, 8);

    m.bind(spill_a_128+8, "spill_a_128_8");
    m.bind(spill_b_128+8, "spill_b_128_8");
    m.bind(spill_threshold_128+8, "spill_threshold_128_8");

    APPEND_M(str( "MOV QWORD PTR `spill_iter, -1" ));

    string loop_start=m.alloc_label();
    string loop=m.alloc_label();
    string loop_exit=m.alloc_label();

    // The first pass only prepares the heads, so it enters below the gcd_128 / multiply code.
    APPEND_M(str( "JMP #", loop_start ));

    APPEND_M(str( "#:", loop ));

    gcd_128(
        regs_parent,
        {spill_a_128, spill_b_128}, {spill_u_0, spill_u_1}, {spill_v_0, spill_v_1},
        spill_parity, spill_is_lehmer, spill_threshold_128,
        track_asm( "gcd_unsigned error: gcd 128 stuck", m.alloc_error_label() )
    );

    string exit_multiply_uv=m.alloc_label();

    // Dispatch on spill_a_end_index to the multiply block for the limb count end_index+1,
    // rounded up to a multiple of 4.
    {
        EXPAND_MACROS_SCOPE;
        reg_alloc regs=regs_parent;

        reg_scalar tmp=regs.bind_scalar(m, "tmp");

        string jump_table_label=m.alloc_label();

#ifdef CHIAOSX
        APPEND_M(str( ".text " ));
#else
        APPEND_M(str( ".text 1" ));
#endif
        APPEND_M(str( ".balign 8" ));
        APPEND_M(str( "#:", jump_table_label ));

#ifdef CHIAOSX
        // No jump table on this target: a compare/jump chain selects the block instead.
        APPEND_M(str( ".text" ));
        APPEND_M(str( "MOV `tmp, `spill_a_end_index" ));

        for (int end_index=0;end_index<int_size;++end_index) {
            int size=end_index+1;
            int mapped_size=size;
            while (mapped_size==0 || mapped_size%4!=0) {
                ++mapped_size;
            }

            // tmp holds an end index, so it is compared against end_index (not the limb count).
            // This gives the same mapping as entry number end_index of the jump table in the
            // other branch.
            APPEND_M(str( "CMP `tmp, #", end_index ));
            APPEND_M(str( "JE ")+asmprefix+str("multiply_uv_size_#", mapped_size ));
        }
#else
        // Jump table with one entry per possible end index.
        for (int end_index=0;end_index<int_size;++end_index) {
            int size=end_index+1;
            int mapped_size=size;
            while (mapped_size==0 || mapped_size%4!=0) {
                ++mapped_size;
            }

            APPEND_M(str( ".quad ")+asmprefix+str("multiply_uv_size_#", mapped_size ));
        }

        APPEND_M(str( ".text" ));

        APPEND_M(str( "MOV `tmp, `spill_a_end_index" ));
        APPEND_M(str( "JMP QWORD PTR [#+`tmp*8]", jump_table_label ));
#endif
    }

    // One multiply block per multiple of 4 limbs. Each block computes
    //     new_a = a*u_0 - b*v_0      new_b = b*v_1 - a*u_1
    // (with a/b and u/v exchanged when the parity is nonzero) and writes the results into the
    // buffer pair that is not currently in use.
    for (int size=4;size<=int_size;size+=4) {
        EXPAND_MACROS_SCOPE;
        reg_alloc regs=regs_parent;

        APPEND_M(asmprefix+str( "multiply_uv_size_#:", size ));

        track_asm(str( "gcd_unsigned multiply uv size #", size ));

        reg_scalar addr_a=regs.bind_scalar(m, "addr_a");
        reg_scalar addr_b=regs.bind_scalar(m, "addr_b");
        reg_scalar addr_new=regs.bind_scalar(m, "addr_new");
        reg_scalar tmp=regs.bind_scalar(m, "tmp");

        reg_spill spill_mod_u_0=regs.bind_spill(m, "spill_mod_u_0");
        reg_spill spill_mod_u_1=regs.bind_spill(m, "spill_mod_u_1");
        reg_spill spill_mod_v_0=regs.bind_spill(m, "spill_mod_v_0");
        reg_spill spill_mod_v_1=regs.bind_spill(m, "spill_mod_v_1");
        reg_spill spill_addr_b_new=regs.bind_spill(m, "spill_addr_b_new");

        APPEND_M(str( "MOV `tmp, `spill_parity" ));
        APPEND_M(str( "CMP `tmp, 0" ));

        // Copy u_x/v_x into spill_mod_u_x/spill_mod_v_x, exchanged when parity!=0.
        // addr_a/addr_b/addr_new serve as scratch here; MOV keeps the flags of the CMP above.
        for (int x=0;x<2;++x) {
            APPEND_M(str( "MOV `addr_a, `spill_u_#", x ));
            APPEND_M(str( "MOV `addr_b, `spill_v_#", x ));
            APPEND_M(str( "MOV `addr_new, `addr_a" ));
            APPEND_M(str( "CMOVNE `addr_a, `addr_b" ));
            APPEND_M(str( "CMOVNE `addr_b, `addr_new" ));
            APPEND_M(str( "MOV `spill_mod_u_#, `addr_a", x ));
            APPEND_M(str( "MOV `spill_mod_v_#, `addr_b", x ));
        }

        // Source buffers: the first pair when iter is even, the second pair when it is odd.
        APPEND_M(str( "MOV `addr_new, `spill_iter" ));
        APPEND_M(str( "TEST `addr_new, 1" ));

        APPEND_M(str( "MOV `addr_a, `spill_a_addr_base" ));
        APPEND_M(str( "CMOVNZ `addr_a, `spill_a_2_addr_base" ));

        APPEND_M(str( "MOV `addr_b, `spill_b_addr_base" ));
        APPEND_M(str( "CMOVNZ `addr_b, `spill_b_2_addr_base" ));

        // Exchange the two sources when parity!=0, matching the exchange of u and v above.
        APPEND_M(str( "CMP `tmp, 0" ));
        APPEND_M(str( "MOV `addr_new, `addr_a" ));
        APPEND_M(str( "CMOVNE `addr_a, `addr_b" ));
        APPEND_M(str( "CMOVNE `addr_b, `addr_new" ));

        // Destination buffers: the pair that is not the source. The destination of b is
        // parked in a spill slot and addr_new first points at the destination of a.
        APPEND_M(str( "MOV `addr_new, `spill_iter" ));
        APPEND_M(str( "TEST `addr_new, 1" ));
        APPEND_M(str( "MOV `addr_new, `spill_b_2_addr_base" ));
        APPEND_M(str( "CMOVNZ `addr_new, `spill_b_addr_base" ));
        APPEND_M(str( "MOV `spill_addr_b_new, `addr_new" ));

        APPEND_M(str( "MOV `addr_new, `spill_a_2_addr_base" ));
        APPEND_M(str( "CMOVNZ `addr_new, `spill_a_addr_base" ));

        // NOTE(review): the descriptors below use int_size rather than the loop variable
        // `size`, so every multiply_uv_size_N block processes all int_size limbs and the blocks
        // differ only in their label. Confirm whether size-specialised blocks were intended.
        asm_integer a;
        a.size=int_size;
        a.addr_base=addr_a;

        asm_integer b;
        b.size=int_size;
        b.addr_base=addr_b;

        asm_integer new_ab;
        new_ab.size=int_size;
        new_ab.addr_base=addr_new;

        // Two stack temporaries addressed relative to RSP.
        reg_spill tmp0_spill=regs.get_spill(int_size*8, 8);
        asm_integer tmp0;
        tmp0.size=int_size;
        tmp0.addr_base=reg_rsp;
        tmp0.addr_offset=tmp0_spill.get_rsp_offset();

        reg_spill tmp1_spill=regs.get_spill(int_size*8, 8);
        asm_integer tmp1;
        tmp1.size=int_size;
        tmp1.addr_base=reg_rsp;
        tmp1.addr_offset=tmp1_spill.get_rsp_offset();

        // tmp0 = ~(b*v_0), tmp1 = ~(a*u_1). Adding an inverted value with a carry-in of 1 is a
        // two's-complement subtraction, which is how the differences below are formed.
        APPEND_M(str( "MOV `tmp, `spill_mod_v_0" ));
        tmp0.mul_add(regs, b, tmp, asm_integer(), true, false);

        APPEND_M(str( "MOV `tmp, `spill_mod_u_1" ));
        tmp1.mul_add(regs, a, tmp, asm_integer(), true, false);

        // new_a = a*u_0 + ~(b*v_0) + 1
        APPEND_M(str( "MOV `tmp, `spill_mod_u_0" ));
        new_ab.mul_add(regs, a, tmp, tmp0, false, true);

        // new_b = b*v_1 + ~(a*u_1) + 1, written to the destination of b.
        APPEND_M(str( "MOV `addr_new, `spill_addr_b_new" ));
        APPEND_M(str( "MOV `tmp, `spill_mod_v_1" ));
        new_ab.mul_add(regs, b, tmp, tmp1, false, true);

        APPEND_M(str( "JMP #", exit_multiply_uv ));
    }

    APPEND_M(str( "#:", exit_multiply_uv ));

    // Registers that stay bound for the rest of the function.
    reg_scalar iter=regs_parent.bind_scalar(m, "iter");
    reg_scalar is_lehmer=regs_parent.bind_scalar(m, "is_lehmer");
    reg_scalar a_head_0=regs_parent.bind_scalar(m, "a_head_0");
    reg_scalar a_head_1=regs_parent.bind_scalar(m, "a_head_1");
    reg_scalar b_head_0=regs_parent.bind_scalar(m, "b_head_0");
    reg_scalar b_head_1=regs_parent.bind_scalar(m, "b_head_1");
    reg_scalar a_head_start=regs_parent.bind_scalar(m, "a_head_start");
    reg_scalar shift_right_amount=regs_parent.bind_scalar(m, "shift_right_amount");

    APPEND_M(str( "#:", loop_start ));

    // Locate the top of a and extract the aligned two-limb heads of a and b.
    {
        EXPAND_MACROS_SCOPE;
        reg_alloc regs=regs_parent;

        reg_scalar addr_a=regs.bind_scalar(m, "addr_a", reg_rax);
        reg_scalar addr_b=regs.bind_scalar(m, "addr_b", reg_rdx);
        reg_scalar b_head_2=regs.bind_scalar(m, "b_head_2");
        reg_scalar a_head_2=regs.bind_scalar(m, "a_head_2");

        // Current buffers: the first pair when iter is odd, the second pair when it is even.
        // iter is incremented at the end of this pass, so this is the pair that the multiply
        // blocks above use as their destination. On entry iter is -1, which selects the first
        // pair.
        APPEND_M(str( "MOV `iter, `spill_iter" ));
        APPEND_M(str( "TEST `iter, 1" ));
        APPEND_M(str( "MOV `addr_a, `spill_a_2_addr_base" ));
        APPEND_M(str( "CMOVNZ `addr_a, `spill_a_addr_base" ));

        APPEND_M(str( "MOV `addr_b, `spill_b_2_addr_base" ));
        APPEND_M(str( "CMOVNZ `addr_b, `spill_b_addr_base" ));

        asm_integer a;
        a.size=int_size;
        a.addr_base=addr_a;

        asm_integer b;
        b.size=int_size;
        b.addr_base=addr_b;

        // Lower the stored end index past any limbs of a that have become zero.
        APPEND_M(str( "MOV `a_head_start, `spill_a_end_index" ));
        a.update_end_index(regs, a_head_start);
        APPEND_M(str( "MOV `spill_a_end_index, `a_head_start" ));

        // is_lehmer = (end index >= 2) ? 1 : 0
        APPEND_M(str( "XOR `is_lehmer, `is_lehmer" ));
        APPEND_M(str( "CMP `a_head_start, 2" ));
        APPEND_M(str( "SETAE `is_lehmer_8" ));
        APPEND_M(str( "MOV `spill_is_lehmer, `is_lehmer" ));

        // The head is the three limbs ending at the end index (starting at limb 0 for short a).
        a.calculate_head_start(regs, a_head_start);

        // Shift by the bit length of a's top limb so that a's head fits in two limbs; b is
        // extracted at the same position and shifted by the same amount.
        a.extract_head_at(regs, a_head_start, {a_head_0, a_head_1, a_head_2});
        calculate_shift_amount(regs, {a_head_0, a_head_1, a_head_2}, shift_right_amount);
        shift_right(regs, {a_head_0, a_head_1, a_head_2}, shift_right_amount);

        b.extract_head_at(regs, a_head_start, {b_head_0, b_head_1, b_head_2});
        shift_right(regs, {b_head_0, b_head_1, b_head_2}, shift_right_amount);

        APPEND_M(str( "MOV `spill_a_128, `a_head_0" ));
        APPEND_M(str( "MOV `spill_a_128_8, `a_head_1" ));
        APPEND_M(str( "MOV `spill_b_128, `b_head_0" ));
        APPEND_M(str( "MOV `spill_b_128_8, `b_head_1" ));
    }

    reg_scalar exit_flag=regs_parent.bind_scalar(m, "exit_flag");

    // Extract the threshold head the same way, then compare it with the heads of a and b.
    {
        EXPAND_MACROS_SCOPE;
        reg_alloc regs=regs_parent;

        reg_scalar addr_threshold=regs.bind_scalar(m, "addr_threshold", reg_rax);
        reg_scalar threshold_head_0=regs.bind_scalar(m, "threshold_head_0", reg_rdx);
        reg_scalar threshold_head_1=regs.bind_scalar(m, "threshold_head_1");
        reg_scalar threshold_head_2=regs.bind_scalar(m, "threshold_head_2");

        APPEND_M(str( "MOV `addr_threshold, `spill_threshold_addr_base" ));

        asm_integer threshold;
        threshold.size=int_size;
        threshold.addr_base=addr_threshold;

        threshold.extract_head_at(regs, a_head_start, {threshold_head_0, threshold_head_1, threshold_head_2});
        shift_right(regs, {threshold_head_0, threshold_head_1, threshold_head_2}, shift_right_amount);

        APPEND_M(str( "MOV `spill_threshold_128, `threshold_head_0" ));
        APPEND_M(str( "MOV `spill_threshold_128_8, `threshold_head_1" ));

        // threshold_head - a_head must borrow, i.e. a_head > threshold_head.
        // addr_threshold and threshold_head_2 are reused as scratch for the difference.
        APPEND_M(str( "MOV `addr_threshold, `threshold_head_0" ));
        APPEND_M(str( "MOV `threshold_head_2, `threshold_head_1" ));
        APPEND_M(str( "SUB `addr_threshold, `a_head_0" ));
        APPEND_M(str( "SBB `threshold_head_2, `a_head_1" ));
        APPEND_M(str( "JNC #", track_asm( "gcd_unsigned error: a_head<=threshold_head", m.alloc_error_label() ) ));

        // exit_flag = 1 when threshold_head - b_head does not borrow, i.e. b_head <= threshold_head.
        APPEND_M(str( "XOR `exit_flag, `exit_flag" ));
        APPEND_M(str( "SUB `threshold_head_0, `b_head_0" ));
        APPEND_M(str( "SBB `threshold_head_1, `b_head_1" ));
        APPEND_M(str( "SETNC `exit_flag_8" ));

        // The OR of both difference limbs is zero only when b_head==threshold_head. DEC turns
        // is_lehmer from 1/0 into 0/-1, so the combined value is zero only when the heads are
        // equal and is_lehmer was 1; that combination is reported as an error.
        APPEND_M(str( "OR `threshold_head_0, `threshold_head_1" ));
        APPEND_M(str( "DEC `is_lehmer" ));
        APPEND_M(str( "OR `threshold_head_0, `is_lehmer" ));
        APPEND_M(str( "JZ #", track_asm( "gcd_unsigned error: b_head==threshold_head and is_lehmer", m.alloc_error_label() ) ));
    }

    // Publish the record for this pass and advance the iteration counter.
    {
        EXPAND_MACROS_SCOPE;
        reg_alloc regs=regs_parent;

        reg_scalar out_uv_addr=regs.bind_scalar(m, "out_uv_addr");
        reg_scalar tmp=regs.bind_scalar(m, "tmp");

        // Record address = spill_out_uv_addr + iter*64.
        // Layout: u_0, u_1, v_0, v_1, parity, exit_flag at byte offsets 0, 8, 16, 24, 32, 40.
        // NOTE(review): iter is -1 on the first pass and gcd_128 has not run yet, so that pass
        // writes a record at spill_out_uv_addr-64 from whatever the spill slots contain;
        // presumably the caller reserves and ignores that slot - confirm.
        APPEND_M(str( "MOV `out_uv_addr, `iter" ));
        APPEND_M(str( "SHL `out_uv_addr, 6" ));
        APPEND_M(str( "ADD `out_uv_addr, `spill_out_uv_addr" ));

        APPEND_M(str( "MOV `tmp, `spill_u_0" ));
        APPEND_M(str( "MOV [`out_uv_addr], `tmp" ));
        APPEND_M(str( "MOV `tmp, `spill_u_1" ));
        APPEND_M(str( "MOV [`out_uv_addr+8], `tmp" ));
        APPEND_M(str( "MOV `tmp, `spill_v_0" ));
        APPEND_M(str( "MOV [`out_uv_addr+16], `tmp" ));
        APPEND_M(str( "MOV `tmp, `spill_v_1" ));
        APPEND_M(str( "MOV [`out_uv_addr+24], `tmp" ));
        APPEND_M(str( "MOV `tmp, `spill_parity" ));
        APPEND_M(str( "MOV [`out_uv_addr+32], `tmp" ));
        APPEND_M(str( "MOV [`out_uv_addr+40], `exit_flag" ));

        // Publish the counter only after the record itself has been written.
        APPEND_M(str( "MOV `tmp, `spill_uv_counter_start" ));
        APPEND_M(str( "ADD `tmp, `iter" ));
        APPEND_M(str( "MOV `out_uv_addr, `spill_out_uv_counter_addr" ));
        APPEND_M(str( "MOV [`out_uv_addr], `tmp" ));

        APPEND_M(str( "INC `iter" ));
        APPEND_M(str( "MOV `spill_iter, `iter" ));

        APPEND_M(str( "CMP `exit_flag, 0" ));
        APPEND_M(str( "JNE #", loop_exit ));

        // Signed comparison of the already incremented counter.
        APPEND_M(str( "CMP `iter, #", to_hex(max_iterations) ));
        APPEND_M(str( "JGE #", track_asm( "gcd_unsigned error: max_iterations exceeded", m.alloc_error_label() ) ));
    }

    APPEND_M(str( "JMP #", loop ));

    APPEND_M(str( "#:", loop_exit ));
}
}