use std::arch::asm;
// Generates a `pub(crate)` function that runs a single floating-point
// instruction while MXCSR is temporarily replaced by a fixed value, then
// restores the previous MXCSR before returning.
//
// Invocation shape:
//     impl_op_round!(Type, name(x, y, ...), "asm template", rd | ru);
//
// * `Type`  - type of every parameter and of the return value; it is bound to
//             `xmm_reg` operands, so it must be a type the compiler accepts in
//             an XMM register.
// * `name`  - name of the generated function.
// * `x`     - first parameter; it is the `inout` operand, so the instruction
//             must leave its result in `{x}`.
// * `y...`  - zero or more additional input-only parameters.
// * template - the instruction to execute, written in terms of `{x}`, `{y}`, ...
// * `rd`/`ru` - selects the temporary MXCSR value (see the first two arms).
//
// NOTE(review): the sequence uses VEX-encoded `vstmxcsr`/`vldmxcsr`, so the
// generated functions need AVX at run time. No `cfg`/`target_feature` gate is
// visible in this part of the file - presumably the enclosing module provides
// one; verify.
macro_rules! impl_op_round {
// `rd` arm: forwards to the literal arm with MXCSR = 16256 (0x3F80). Per the
// Intel SDM MXCSR layout that is: all six exception mask bits set, rounding
// control = 01b (round toward negative infinity), DAZ/FTZ clear.
($t:ty, $f:ident ($x:ident $(,$y:ident)*), $inst:literal, rd) => {
impl_op_round!($t, $f ($x $(,$y)*), $inst, "16256"); };
// `ru` arm: forwards to the literal arm with MXCSR = 24448 (0x5F80), i.e. the
// same mask bits with rounding control = 10b (round toward positive infinity).
($t:ty, $f:ident ($x:ident $(,$y:ident)*), $inst:literal, ru) => {
impl_op_round!($t, $f ($x $(,$y)*), $inst, "24448"); };
// Literal arm: emits the function. `$mxcsr` is a string literal holding the
// decimal MXCSR value; it is spliced into the assembly text via `concat!`.
($t:ty, $f:ident ($x:ident $(,$y:ident)*), $inst:literal, $mxcsr:literal) => {
pub(crate) fn $f(mut $x: $t, $($y: $t,)*) -> $t {
// SAFETY (as far as this block shows): the push/pop pair is balanced, so the
// stack pointer is unchanged on exit; the original MXCSR is reloaded before
// the block ends; and only the declared operands are written. This relies on
// `$inst` touching nothing but the operands it names.
unsafe {
asm!(
// Reserve an 8-byte scratch slot on the stack by pushing the scratch GPR
// (its value is irrelevant), then save the current MXCSR in the low 4 bytes.
"push {rax}", "vstmxcsr [rsp]",
// Write the temporary MXCSR value into the upper 4 bytes of the slot...
concat!("mov dword ptr [rsp + 4], ", $mxcsr),
// ...and make it the active MXCSR.
"vldmxcsr [rsp + 4]",
// The caller-supplied instruction, executed under the temporary MXCSR.
$inst,
// Reload the saved MXCSR. Because the whole register is reloaded, any
// exception flags raised by `$inst` are discarded as well.
"vldmxcsr [rsp]",
// Release the scratch slot. `$x` is both input and output of the instruction.
"pop {rax}", $x = inout(xmm_reg) $x,
// Remaining parameters are read-only XMM inputs.
$($y = in(xmm_reg) $y,)*
// `rax` is only a template operand name: `out(reg) _` lets the compiler pick
// any general-purpose register and treats it as clobbered.
// `nostack` is not specified, which matches the push/pop use of the stack.
// NOTE(review): `nomem` relies on the private stack slot not counting as
// memory accessible outside the block - confirm against the inline asm rules.
rax = out(reg) _, options(pure, nomem, preserves_flags)
);
}
// `$x` now holds the instruction's result.
$x
}
};
}
// Scalar double-precision operations on `f64` (`...sd` instructions).
// `sqrt1_rd(x)`: square root of `x`, rounded toward negative infinity.
impl_op_round!(f64, sqrt1_rd(x), "vsqrtsd {x}, {x}, {x}", rd);
// `sqrt1_ru(x)`: square root of `x`, rounded toward positive infinity.
impl_op_round!(f64, sqrt1_ru(x), "vsqrtsd {x}, {x}, {x}", ru);
// `sub1_ru(x, y)`: `x - y`, rounded toward positive infinity.
impl_op_round!(f64, sub1_ru(x, y), "vsubsd {x}, {x}, {y}", ru);
// Packed double-precision operations (`...pd` instructions) on `super::F64X2`,
// all rounded toward positive infinity.
// NOTE(review): `F64X2` is defined in the parent module and is presumably a
// two-lane f64 vector held in one XMM register - confirm.
// `add_ru(x, y)`: lane-wise `x + y`.
impl_op_round!(super::F64X2, add_ru(x, y), "vaddpd {x}, {x}, {y}", ru);
// `sub_ru(x, y)`: lane-wise `x - y`.
impl_op_round!(super::F64X2, sub_ru(x, y), "vsubpd {x}, {x}, {y}", ru);
// `mul_ru(x, y)`: lane-wise `x * y`.
impl_op_round!(super::F64X2, mul_ru(x, y), "vmulpd {x}, {x}, {y}", ru);
// `div_ru(x, y)`: lane-wise `x / y`.
impl_op_round!(super::F64X2, div_ru(x, y), "vdivpd {x}, {x}, {y}", ru);
// `mul_add_ru(x, y, z)`: lane-wise fused multiply-add. The 213 operand order
// computes `{x} = {y} * {x} + {z}`, i.e. `x * y + z` with a single rounding.
impl_op_round!(
super::F64X2,
mul_add_ru(x, y, z),
"vfmadd213pd {x}, {y}, {z}",
ru
);