use crate::{
Assembler, AssemblerData, CHOICE_BOTH, CHOICE_LEFT, CHOICE_RIGHT, IMM_REG,
OFFSET, REGISTER_LIMIT, mmap::Mmap, point::PointAssembler, reg,
};
use dynasmrt::{DynasmApi, DynasmError, dynasm};
const STACK_SIZE: u32 = 0xb0;
#[expect(clippy::useless_conversion)]
impl Assembler for PointAssembler {
type Data = f32;
/// Builds a fresh assembler on top of `mmap` and emits the function prologue.
///
/// After `prepare_stack(slot_count, STACK_SIZE)` has run, the prologue stores
/// `x29`/`x30` at `[sp, 0x0]`, points `x29` at `sp`, and stores `d8`..`d15`
/// at `[sp, 0x10]`..`[sp, 0x48]`.
fn init(mmap: Mmap, slot_count: usize) -> Self {
    let mut data = AssemblerData::new(mmap);
    data.prepare_stack(slot_count, STACK_SIZE as usize);

    // Frame record: save frame pointer + link register, then set up x29
    dynasm!(data.ops
        ; stp x29, x30, [sp, 0x0]
        ; mov x29, sp
    );

    // Floating-point registers d8-d15 (restored again in `finalize`)
    dynasm!(data.ops
        ; stp d8, d9, [sp, 0x10]
        ; stp d10, d11, [sp, 0x20]
        ; stp d12, d13, [sp, 0x30]
        ; stp d14, d15, [sp, 0x40]
    );

    Self(data)
}
/// Returns the per-clause byte estimate for this assembler (always 10).
// NOTE(review): presumably used by the caller to size the code buffer —
// confirm against the `Assembler` trait documentation.
fn bytes_per_clause() -> usize {
    const BYTES_PER_CLAUSE: usize = 10;
    BYTES_PER_CLAUSE
}
/// Emits a load of the 32-bit float at stack slot `src_mem` into `dst_reg`.
///
/// The slot's position comes from `stack_pos(src_mem)`, shifted past the
/// fixed `STACK_SIZE` region at the bottom of the frame.
///
/// # Panics
/// Panics if `dst_reg` is not below `REGISTER_LIMIT`, or if the resulting
/// stack offset cannot be encoded in a single `ldr` instruction.
fn build_load(&mut self, dst_reg: u8, src_mem: u32) {
    // `ldr St, [sp, #imm]` takes a 12-bit unsigned immediate scaled by 4, so
    // the largest reachable byte offset is 4095 * 4 = 16380 (not 16384).
    const MAX_LDR_S_OFFSET: u32 = 4095 * 4;

    assert!((dst_reg as usize) < REGISTER_LIMIT);
    let sp_offset = self.0.stack_pos(src_mem) + STACK_SIZE;
    assert!(sp_offset <= MAX_LDR_S_OFFSET);
    dynasm!(self.0.ops ; ldr S(reg(dst_reg)), [sp, sp_offset])
}
/// Emits a store of the 32-bit float in `src_reg` to stack slot `dst_mem`.
///
/// The slot's position comes from `stack_pos(dst_mem)`, shifted past the
/// fixed `STACK_SIZE` region at the bottom of the frame.
///
/// # Panics
/// Panics if `src_reg` is not below `REGISTER_LIMIT`, or if the resulting
/// stack offset cannot be encoded in a single `str` instruction.
fn build_store(&mut self, dst_mem: u32, src_reg: u8) {
    // `str St, [sp, #imm]` takes a 12-bit unsigned immediate scaled by 4, so
    // the largest reachable byte offset is 4095 * 4 = 16380 (not 16384).
    const MAX_STR_S_OFFSET: u32 = 4095 * 4;

    assert!((src_reg as usize) < REGISTER_LIMIT);
    let sp_offset = self.0.stack_pos(dst_mem) + STACK_SIZE;
    assert!(sp_offset <= MAX_STR_S_OFFSET);
    dynasm!(self.0.ops ; str S(reg(src_reg)), [sp, sp_offset])
}
fn build_input(&mut self, out_reg: u8, src_arg: u32) {
assert!(src_arg < 16384 / 4);
dynasm!(self.0.ops
; ldr S(reg(out_reg)), [x0, src_arg * 4]
);
}
fn build_output(&mut self, arg_reg: u8, output_index: u32) {
assert!(output_index < 16384 / 4);
dynasm!(self.0.ops
; str S(reg(arg_reg)), [x3, output_index * 4]
);
}
fn build_copy(&mut self, out_reg: u8, lhs_reg: u8) {
dynasm!(self.0.ops ; fmov S(reg(out_reg)), S(reg(lhs_reg)))
}
/// Emits `out = sin(lhs)` by calling out to a Rust function.
fn build_sin(&mut self, out_reg: u8, lhs_reg: u8) {
    // C-ABI shim so the generated code can `blr` to it
    extern "C" fn sin_f32(x: f32) -> f32 {
        f32::sin(x)
    }
    self.call_fn_unary(out_reg, lhs_reg, sin_f32);
}
/// Emits `out = cos(lhs)` by calling out to a Rust function.
fn build_cos(&mut self, out_reg: u8, lhs_reg: u8) {
    // C-ABI shim so the generated code can `blr` to it
    extern "C" fn cos_f32(x: f32) -> f32 {
        f32::cos(x)
    }
    self.call_fn_unary(out_reg, lhs_reg, cos_f32);
}
/// Emits `out = tan(lhs)` by calling out to a Rust function.
fn build_tan(&mut self, out_reg: u8, lhs_reg: u8) {
    // C-ABI shim so the generated code can `blr` to it
    extern "C" fn tan_f32(x: f32) -> f32 {
        f32::tan(x)
    }
    self.call_fn_unary(out_reg, lhs_reg, tan_f32);
}
/// Emits `out = asin(lhs)` by calling out to a Rust function.
fn build_asin(&mut self, out_reg: u8, lhs_reg: u8) {
    // C-ABI shim so the generated code can `blr` to it
    extern "C" fn asin_f32(x: f32) -> f32 {
        f32::asin(x)
    }
    self.call_fn_unary(out_reg, lhs_reg, asin_f32);
}
/// Emits `out = acos(lhs)` by calling out to a Rust function.
fn build_acos(&mut self, out_reg: u8, lhs_reg: u8) {
    // C-ABI shim so the generated code can `blr` to it
    extern "C" fn acos_f32(x: f32) -> f32 {
        f32::acos(x)
    }
    self.call_fn_unary(out_reg, lhs_reg, acos_f32);
}
/// Emits `out = atan(lhs)` by calling out to a Rust function.
fn build_atan(&mut self, out_reg: u8, lhs_reg: u8) {
    // C-ABI shim so the generated code can `blr` to it
    extern "C" fn atan_f32(x: f32) -> f32 {
        f32::atan(x)
    }
    self.call_fn_unary(out_reg, lhs_reg, atan_f32);
}
/// Emits `out = exp(lhs)` by calling out to a Rust function.
fn build_exp(&mut self, out_reg: u8, lhs_reg: u8) {
    // C-ABI shim so the generated code can `blr` to it
    extern "C" fn exp_f32(x: f32) -> f32 {
        f32::exp(x)
    }
    self.call_fn_unary(out_reg, lhs_reg, exp_f32);
}
/// Emits `out = ln(lhs)` (natural logarithm) by calling out to a Rust
/// function.
fn build_ln(&mut self, out_reg: u8, lhs_reg: u8) {
    // C-ABI shim so the generated code can `blr` to it
    extern "C" fn ln_f32(x: f32) -> f32 {
        f32::ln(x)
    }
    self.call_fn_unary(out_reg, lhs_reg, ln_f32);
}
fn build_neg(&mut self, out_reg: u8, lhs_reg: u8) {
dynasm!(self.0.ops ; fneg S(reg(out_reg)), S(reg(lhs_reg)))
}
fn build_abs(&mut self, out_reg: u8, lhs_reg: u8) {
dynasm!(self.0.ops ; fabs S(reg(out_reg)), S(reg(lhs_reg)))
}
fn build_recip(&mut self, out_reg: u8, lhs_reg: u8) {
dynasm!(self.0.ops
; fmov s7, 1.0
; fdiv S(reg(out_reg)), s7, S(reg(lhs_reg))
)
}
fn build_sqrt(&mut self, out_reg: u8, lhs_reg: u8) {
dynasm!(self.0.ops ; fsqrt S(reg(out_reg)), S(reg(lhs_reg)))
}
fn build_square(&mut self, out_reg: u8, lhs_reg: u8) {
dynasm!(self.0.ops ; fmul S(reg(out_reg)), S(reg(lhs_reg)), S(reg(lhs_reg)))
}
fn build_floor(&mut self, out_reg: u8, lhs_reg: u8) {
dynasm!(self.0.ops
; fcmeq s6, S(reg(lhs_reg)), S(reg(lhs_reg))
; mvn v6.b8, v6.b8
; fcvtms S(reg(out_reg)), S(reg(lhs_reg))
; scvtf S(reg(out_reg)), S(reg(out_reg))
; orr V(reg(out_reg)).B8, V(reg(out_reg)).B8, v6.b8
);
}
fn build_ceil(&mut self, out_reg: u8, lhs_reg: u8) {
dynasm!(self.0.ops
; fcmeq s6, S(reg(lhs_reg)), S(reg(lhs_reg))
; mvn v6.b8, v6.b8
; fcvtps S(reg(out_reg)), S(reg(lhs_reg))
; scvtf S(reg(out_reg)), S(reg(out_reg))
; orr V(reg(out_reg)).B8, V(reg(out_reg)).B8, v6.b8
);
}
fn build_round(&mut self, out_reg: u8, lhs_reg: u8) {
dynasm!(self.0.ops
; fcmeq s6, S(reg(lhs_reg)), S(reg(lhs_reg))
; mvn v6.b8, v6.b8
; fcvtas S(reg(out_reg)), S(reg(lhs_reg))
; scvtf S(reg(out_reg)), S(reg(out_reg))
; orr V(reg(out_reg)).B8, V(reg(out_reg)).B8, v6.b8
);
}
fn build_add(&mut self, out_reg: u8, lhs_reg: u8, rhs_reg: u8) {
dynasm!(self.0.ops
; fadd S(reg(out_reg)), S(reg(lhs_reg)), S(reg(rhs_reg))
)
}
fn build_sub(&mut self, out_reg: u8, lhs_reg: u8, rhs_reg: u8) {
dynasm!(self.0.ops
; fsub S(reg(out_reg)), S(reg(lhs_reg)), S(reg(rhs_reg))
)
}
fn build_mul(&mut self, out_reg: u8, lhs_reg: u8, rhs_reg: u8) {
dynasm!(self.0.ops
; fmul S(reg(out_reg)), S(reg(lhs_reg)), S(reg(rhs_reg))
)
}
fn build_div(&mut self, out_reg: u8, lhs_reg: u8, rhs_reg: u8) {
dynasm!(self.0.ops
; fdiv S(reg(out_reg)), S(reg(lhs_reg)), S(reg(rhs_reg))
)
}
/// Emits `out = atan2(lhs, rhs)` by calling out to a Rust function, with
/// `lhs` as the `y` argument and `rhs` as the `x` argument.
fn build_atan2(&mut self, out_reg: u8, lhs_reg: u8, rhs_reg: u8) {
    // C-ABI shim so the generated code can `blr` to it
    extern "C" fn atan2_f32(y: f32, x: f32) -> f32 {
        f32::atan2(y, x)
    }
    self.call_fn_binary(out_reg, lhs_reg, rhs_reg, atan2_f32);
}
/// Emits `out = max(lhs, rhs)` and records which operand was selected.
///
/// The byte at `[x1]` is loaded, OR-ed with `CHOICE_LEFT`, `CHOICE_RIGHT`
/// or `CHOICE_BOTH`, and stored back with a post-increment of `x1`. When
/// exactly one side wins, the same byte is also stored to `[x2]`.
///
/// The branch targets are hard-coded byte offsets relative to the branch
/// instruction itself (every instruction is 4 bytes), so adding or removing
/// an instruction requires recomputing them.
fn build_max(&mut self, out_reg: u8, lhs_reg: u8, rhs_reg: u8) {
dynasm!(self.0.ops
// Load the current choice byte
; ldrb w14, [x1]
; fcmp S(reg(lhs_reg)), S(reg(rhs_reg))
// lhs < rhs: jump to the RHS case; lhs > rhs: jump to the LHS case
; b.mi 20 ; b.gt 32
// Fall-through: operands are equal or unordered (NaN); let `fmax` decide
; fmax S(reg(out_reg)), S(reg(lhs_reg)), S(reg(rhs_reg))
; orr w14, w14, CHOICE_BOTH
// Skip both single-sided cases and go straight to the final store
; b 32
// RHS case: rhs is the larger value
; fmov S(reg(out_reg)), S(reg(rhs_reg))
; orr w14, w14, CHOICE_RIGHT
// Also write the choice byte to `[x2]`, then jump to the final store
; strb w14, [x2, 0] ; b 16
// LHS case: lhs is the larger value
; fmov S(reg(out_reg)), S(reg(lhs_reg))
; orr w14, w14, CHOICE_LEFT
; strb w14, [x2, 0]
// Final store: write back the choice byte and advance `x1` by one
; strb w14, [x1], 1 )
}
/// Emits `out = min(lhs, rhs)` and records which operand was selected.
///
/// The byte at `[x1]` is loaded, OR-ed with `CHOICE_LEFT`, `CHOICE_RIGHT`
/// or `CHOICE_BOTH`, and stored back with a post-increment of `x1`. When
/// exactly one side wins, the same byte is also stored to `[x2]`.
///
/// The branch targets are hard-coded byte offsets relative to the branch
/// instruction itself (every instruction is 4 bytes), so adding or removing
/// an instruction requires recomputing them.
fn build_min(&mut self, out_reg: u8, lhs_reg: u8, rhs_reg: u8) {
dynasm!(self.0.ops
// Load the current choice byte
; ldrb w14, [x1]
; fcmp S(reg(lhs_reg)), S(reg(rhs_reg))
// lhs < rhs: jump to the LHS case
; b.mi 20
// lhs > rhs: jump to the RHS case
; b.gt 32
// Fall-through: operands are equal or unordered (NaN); let `fmin` decide
; fmin S(reg(out_reg)), S(reg(lhs_reg)), S(reg(rhs_reg))
; orr w14, w14, CHOICE_BOTH
// Skip both single-sided cases and go straight to the final store
; b 32
// LHS case: lhs is the smaller value
; fmov S(reg(out_reg)), S(reg(lhs_reg))
; orr w14, w14, CHOICE_LEFT
// Also write the choice byte to `[x2]`, then jump to the final store
; strb w14, [x2, 0] ; b 16
// RHS case: rhs is the smaller value
; fmov S(reg(out_reg)), S(reg(rhs_reg))
; orr w14, w14, CHOICE_RIGHT
; strb w14, [x2, 0]
// Final store: write back the choice byte and advance `x1` by one
; strb w14, [x1], 1 )
}
fn build_mod(&mut self, out_reg: u8, lhs_reg: u8, rhs_reg: u8) {
dynasm!(self.0.ops
; fabs s6, S(reg(rhs_reg))
; fdiv s7, S(reg(lhs_reg)), s6
; frintm s7, s7 ; fmul s7, s7, s6
; fsub S(reg(out_reg)), S(reg(lhs_reg)), s7
)
}
fn build_not(&mut self, out_reg: u8, arg_reg: u8) {
dynasm!(self.0.ops
; fcmeq s6, S(reg(arg_reg)), 0.0
; fmov S(reg(out_reg)), 1.0
; and V(reg(out_reg)).b8, V(reg(out_reg)).b8, v6.b8
);
}
/// Emits a short-circuit AND: `out = lhs` if `lhs == 0.0`, otherwise
/// `out = rhs`, and records which operand was selected.
///
/// The byte at `[x1]` is OR-ed with `CHOICE_LEFT` (lhs selected) or
/// `CHOICE_RIGHT` (rhs selected), stored back with a post-increment of `x1`,
/// and also stored to `[x2]`. Scratch registers: `v5`, `v6`, `w9`, `w10`,
/// `w11`, `w14`.
fn build_and(&mut self, out_reg: u8, lhs_reg: u8, rhs_reg: u8) {
dynasm!(self.0.ops
// s6 = all-ones iff lhs == 0.0
; fcmeq s6, S(reg(lhs_reg)), 0.0
// w9 = CHOICE_LEFT where the mask is set
; fmov w10, s6 ; mov w9, CHOICE_LEFT
; and w9, w9, w10
// w11 = CHOICE_RIGHT where the mask is clear, combined with w9
; mvn w10, w10
; mov w11, CHOICE_RIGHT
; and w11, w11, w10 ; orr w11, w11, w9
// Merge into the choice byte, write it back (advancing x1), and also
// write it to `[x2]`
; ldrb w14, [x1]
; orr w14, w14, w11
; strb w14, [x1], 1
; strb w14, [x2, 0]
// out = (mask & lhs) | (!mask & rhs)
; and v5.b8, v6.b8, V(reg(lhs_reg)).b8
; mvn v6.b8, v6.b8
; and v6.b8, v6.b8, V(reg(rhs_reg)).b8
; orr V(reg(out_reg)).b8, v5.b8, v6.b8
);
}
/// Emits a short-circuit OR: `out = lhs` if `lhs` does not compare equal to
/// `0.0`, otherwise `out = rhs`, and records which operand was selected.
///
/// The byte at `[x1]` is OR-ed with `CHOICE_LEFT` (lhs selected) or
/// `CHOICE_RIGHT` (rhs selected), stored back with a post-increment of `x1`,
/// and also stored to `[x2]`. Scratch registers: `v5`, `v6`, `w9`, `w10`,
/// `w11`, `w14`.
fn build_or(&mut self, out_reg: u8, lhs_reg: u8, rhs_reg: u8) {
dynasm!(self.0.ops
// s6 = all-ones iff lhs == 0.0, then inverted: all-ones iff lhs != 0.0
; fcmeq s6, S(reg(lhs_reg)), 0.0
; mvn v6.b8, v6.b8
// w9 = CHOICE_LEFT where the mask is set
; fmov w10, s6 ; mov w9, CHOICE_LEFT
; and w9, w9, w10
// w11 = CHOICE_RIGHT where the mask is clear, combined with w9
; mvn w10, w10
; mov w11, CHOICE_RIGHT
; and w11, w11, w10 ; orr w11, w11, w9
// Merge into the choice byte, write it back (advancing x1), and also
// write it to `[x2]`
; ldrb w14, [x1]
; orr w14, w14, w11
; strb w14, [x1], 1
; strb w14, [x2, 0]
// out = (mask & lhs) | (!mask & rhs)
; and v5.b8, v6.b8, V(reg(lhs_reg)).b8
; mvn v6.b8, v6.b8
; and v6.b8, v6.b8, V(reg(rhs_reg)).b8
; orr V(reg(out_reg)).b8, v5.b8, v6.b8
);
}
/// Emits a three-way comparison: `out = -1.0` if `lhs < rhs`, `1.0` if
/// `lhs > rhs`, `0.0` if they are equal, and NaN if either operand is NaN.
///
/// Both operands are fully read into scratch masks (`v4`..`v7`) before
/// `out` is first written. `w9` is also used as scratch.
fn build_compare(&mut self, out_reg: u8, lhs_reg: u8, rhs_reg: u8) {
dynasm!(self.0.ops
// v6 = all-ones iff either operand is NaN (self-comparison fails for NaN)
; fcmeq s6, S(reg(lhs_reg)), S(reg(lhs_reg))
; fcmeq s7, S(reg(rhs_reg)), S(reg(rhs_reg))
; and v6.b8, v6.b8, v7.b8
; mvn v6.b8, v6.b8
// v4 = mask for lhs < rhs, v5 = mask for lhs > rhs
; fcmgt s4, S(reg(rhs_reg)), S(reg(lhs_reg))
; fcmgt s5, S(reg(lhs_reg)), S(reg(rhs_reg))
// out = -1.0 where lhs < rhs
; fmov s7, -1.0
; and V(reg(out_reg)).B8, v4.B8, v7.B8
// out |= 1.0 where lhs > rhs
; fmov s7, 1.0
; and v5.B8, v5.B8, v7.B8
; orr V(reg(out_reg)).B8, V(reg(out_reg)).B8, v5.B8
// out |= NaN bit pattern where either operand is NaN
; mov w9, f32::NAN.to_bits()
; fmov s7, w9
; and v7.b8, v7.b8, v6.b8
; orr V(reg(out_reg)).B8, V(reg(out_reg)).B8, v7.b8
);
}
/// Emits code that materialises the constant `imm` in `IMM_REG` and returns
/// that register's index with `OFFSET` subtracted (wrapping).
///
/// The bit pattern is assembled in `w9` from 16-bit halves; a half that is
/// zero is skipped, so at most two integer moves are emitted before the
/// transfer into the floating-point register.
fn load_imm(&mut self, imm: f32) -> u8 {
    let bits = imm.to_bits();
    let upper = bits >> 16;
    let lower = bits & 0xFFFF;

    match (upper, lower) {
        // Low half is zero: a single shifted `movz` is enough
        (_, 0) => {
            dynasm!(self.0.ops ; movz w9, upper, lsl 16);
        }
        // High half is zero: a single unshifted `movz` is enough
        (0, _) => {
            dynasm!(self.0.ops ; movz w9, lower);
        }
        // Both halves are populated
        _ => {
            dynasm!(self.0.ops
                ; movz w9, upper, lsl 16
                ; movk w9, lower
            );
        }
    }

    // Move the assembled bits into the immediate register
    dynasm!(self.0.ops ; fmov S(IMM_REG), w9);

    IMM_REG.wrapping_sub(OFFSET)
}
/// Emits the register-restoring part of the epilogue and hands the buffer to
/// `AssemblerData::finalize`.
///
/// `x20`..`x23` are reloaded only if `ensure_callee_regs_saved` stored them
/// earlier; `x29`/`x30` and `d8`..`d15` are always reloaded from the slots
/// written by `init`.
// NOTE(review): no `ret` or stack-pointer adjustment is emitted here —
// presumably `AssemblerData::finalize` takes care of both; confirm.
fn finalize(mut self) -> Result<Mmap, DynasmError> {
    let restore_integer_regs = self.0.saved_callee_regs;
    if restore_integer_regs {
        dynasm!(self.0.ops
            ; ldp x20, x21, [sp, 0x90]
            ; ldr x22, [sp, 0xa0]
            ; ldr x23, [sp, 0xa8]
        );
    }

    // Frame pointer and link register
    dynasm!(self.0.ops ; ldp x29, x30, [sp, 0x0]);

    // Floating-point registers d8-d15
    dynasm!(self.0.ops
        ; ldp d8, d9, [sp, 0x10]
        ; ldp d10, d11, [sp, 0x20]
        ; ldp d12, d13, [sp, 0x30]
        ; ldp d14, d15, [sp, 0x40]
    );

    self.0.finalize()
}
}
#[expect(clippy::useless_conversion)]
impl PointAssembler {
/// Emits, at most once per assembler, the stores that preserve `x20`..`x23`
/// on the stack (at `0x90`..`0xa8`), and remembers that it has done so.
///
/// `finalize` checks the same flag to decide whether to reload them.
fn ensure_callee_regs_saved(&mut self) {
    if self.0.saved_callee_regs {
        // Already emitted earlier in this function body
        return;
    }
    dynasm!(self.0.ops
        ; stp x20, x21, [sp, 0x90]
        ; str x22, [sp, 0xa0]
        ; str x23, [sp, 0xa8]
    );
    self.0.saved_callee_regs = true;
}
/// Emits a call to the single-argument function `f`, passing `arg_reg` in
/// `s0` and moving the result from `s0` into `out_reg`.
///
/// Around the call, `x0`..`x3` are parked in `x20`..`x23` (which
/// `ensure_callee_regs_saved` has preserved on the stack) and `s16`..`s31`
/// are spilled to `[sp, 0x50]`..`[sp, 0x8c]`, then everything is restored.
fn call_fn_unary(
&mut self,
out_reg: u8,
arg_reg: u8,
f: extern "C" fn(f32) -> f32,
) {
self.ensure_callee_regs_saved();
let addr = f as usize;
dynasm!(self.0.ops
// Park the pointer arguments in x20-x23 for the duration of the call
; mov x20, x0
; mov x21, x1
; mov x22, x2
; mov x23, x3
// Spill s16-s31 to the stack
; stp s16, s17, [sp, 0x50]
; stp s18, s19, [sp, 0x58]
; stp s20, s21, [sp, 0x60]
; stp s22, s23, [sp, 0x68]
; stp s24, s25, [sp, 0x70]
; stp s26, s27, [sp, 0x78]
; stp s28, s29, [sp, 0x80]
; stp s30, s31, [sp, 0x88]
// Build the 64-bit function address in x0, 16 bits at a time
; movz x0, (addr >> 48) as u32 & 0xFFFF, lsl 48
; movk x0, (addr >> 32) as u32 & 0xFFFF, lsl 32
; movk x0, (addr >> 16) as u32 & 0xFFFF, lsl 16
; movk x0, addr as u32 & 0xFFFF
// Place the argument in s0 and call
; fmov s0, S(reg(arg_reg))
; blr x0
// Reload s16-s31
; ldp s16, s17, [sp, 0x50]
; ldp s18, s19, [sp, 0x58]
; ldp s20, s21, [sp, 0x60]
; ldp s22, s23, [sp, 0x68]
; ldp s24, s25, [sp, 0x70]
; ldp s26, s27, [sp, 0x78]
; ldp s28, s29, [sp, 0x80]
; ldp s30, s31, [sp, 0x88]
// The return value arrives in s0
; fmov S(reg(out_reg)), s0
// NOTE(review): these loads read s0-s2 from 0x90/0x98, the same slots where
// `ensure_callee_regs_saved` stores x20/x21 — confirm they are still needed.
; ldp s0, s1, [sp, 0x90]
; ldr s2, [sp, 0x98]
// Restore the pointer arguments
; mov x0, x20
; mov x1, x21
; mov x2, x22
; mov x3, x23
);
}
/// Emits a call to the two-argument function `f`, passing `lhs_reg` in `s0`
/// and `rhs_reg` in `s1`, and moving the result from `s0` into `out_reg`.
///
/// Around the call, `x0`..`x3` are parked in `x20`..`x23` (which
/// `ensure_callee_regs_saved` has preserved on the stack) and `s16`..`s31`
/// are spilled to `[sp, 0x50]`..`[sp, 0x8c]`, then everything is restored.
///
/// `x3` must be preserved just like `x0`..`x2`: it is a caller-saved
/// register that the callee is free to clobber, and `build_output` uses it
/// as the base address for stores emitted after this call.
fn call_fn_binary(
    &mut self,
    out_reg: u8,
    lhs_reg: u8,
    rhs_reg: u8,
    f: extern "C" fn(f32, f32) -> f32,
) {
    self.ensure_callee_regs_saved();
    let addr = f as usize;
    dynasm!(self.0.ops
        // Park the pointer arguments in x20-x23 for the duration of the call
        ; mov x20, x0
        ; mov x21, x1
        ; mov x22, x2
        ; mov x23, x3

        // Spill s16-s31 to the stack
        ; stp s16, s17, [sp, 0x50]
        ; stp s18, s19, [sp, 0x58]
        ; stp s20, s21, [sp, 0x60]
        ; stp s22, s23, [sp, 0x68]
        ; stp s24, s25, [sp, 0x70]
        ; stp s26, s27, [sp, 0x78]
        ; stp s28, s29, [sp, 0x80]
        ; stp s30, s31, [sp, 0x88]

        // Build the 64-bit function address in x0, 16 bits at a time
        ; movz x0, (addr >> 48) as u32 & 0xFFFF, lsl 48
        ; movk x0, (addr >> 32) as u32 & 0xFFFF, lsl 32
        ; movk x0, (addr >> 16) as u32 & 0xFFFF, lsl 16
        ; movk x0, addr as u32 & 0xFFFF

        // Place the arguments in s0/s1 and call
        ; fmov s0, S(reg(lhs_reg))
        ; fmov s1, S(reg(rhs_reg))
        ; blr x0

        // Reload s16-s31
        ; ldp s16, s17, [sp, 0x50]
        ; ldp s18, s19, [sp, 0x58]
        ; ldp s20, s21, [sp, 0x60]
        ; ldp s22, s23, [sp, 0x68]
        ; ldp s24, s25, [sp, 0x70]
        ; ldp s26, s27, [sp, 0x78]
        ; ldp s28, s29, [sp, 0x80]
        ; ldp s30, s31, [sp, 0x88]

        // The return value arrives in s0
        ; fmov S(reg(out_reg)), s0

        // NOTE(review): these loads read s0-s2 from 0x90/0x98, the same
        // slots where `ensure_callee_regs_saved` stores x20/x21 — confirm
        // they are still needed.
        ; ldp s0, s1, [sp, 0x90]
        ; ldr s2, [sp, 0x98]

        // Restore the pointer arguments
        ; mov x0, x20
        ; mov x1, x21
        ; mov x2, x22
        ; mov x3, x23
    );
}
}