use proc_macro2::TokenStream;
use quote::quote;
/// Builds the token stream for `notw_2_v4_sse2_f32_soa`, a size-2 butterfly
/// over split real/imaginary `f32` arrays that processes four lanes at once
/// with SSE2.
///
/// The emitted function is gated on `target_arch = "x86_64"` or
/// `target_arch = "x86"`, and imports its intrinsics from the `core::arch`
/// module matching the target being compiled, so it builds on both.
pub(super) fn gen_sse2_f32_v4_size2_soa() -> TokenStream {
    quote! {
        /// Size-2 butterfly on four lanes of split real/imaginary `f32` data.
        ///
        /// With `x0 = in[0..4]` and `x1 = in[4..8]` (lane-wise, for the real
        /// and imaginary arrays independently), writes `out[0..4] = x0 + x1`
        /// and `out[4..8] = x0 - x1`.
        ///
        /// # Safety
        ///
        /// - The CPU must support SSE2.
        /// - `re_in` and `im_in` must each be valid for reading 8 `f32` values.
        /// - `re_out` and `im_out` must each be valid for writing 8 `f32` values.
        ///
        /// No alignment beyond that of `f32` is required: every access is an
        /// unaligned load or store.
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        #[target_feature(enable = "sse2")]
        pub unsafe fn notw_2_v4_sse2_f32_soa(
            re_in: *const f32,
            im_in: *const f32,
            re_out: *mut f32,
            im_out: *mut f32,
        ) {
            // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
            // intrinsics module that matches the compilation target.
            #[cfg(target_arch = "x86")]
            use core::arch::x86::*;
            #[cfg(target_arch = "x86_64")]
            use core::arch::x86_64::*;

            // Load both inputs (4 lanes each) before any store is issued.
            let re0 = _mm_loadu_ps(re_in);
            let im0 = _mm_loadu_ps(im_in);
            let re1 = _mm_loadu_ps(re_in.add(4));
            let im1 = _mm_loadu_ps(im_in.add(4));

            // Output 0: sum of the two inputs.
            _mm_storeu_ps(re_out, _mm_add_ps(re0, re1));
            _mm_storeu_ps(im_out, _mm_add_ps(im0, im1));

            // Output 1: difference of the two inputs.
            _mm_storeu_ps(re_out.add(4), _mm_sub_ps(re0, re1));
            _mm_storeu_ps(im_out.add(4), _mm_sub_ps(im0, im1));
        }
    }
}
/// Builds the token stream for `notw_4_v4_sse2_f32_soa`, a size-4 butterfly
/// over split real/imaginary `f32` arrays that processes four lanes at once
/// with SSE2.
///
/// The emitted function is gated on `target_arch = "x86_64"` or
/// `target_arch = "x86"`, and imports its intrinsics from the `core::arch`
/// module matching the target being compiled, so it builds on both.
pub(super) fn gen_sse2_f32_v4_size4_soa() -> TokenStream {
    quote! {
        /// Size-4 butterfly on four lanes of split real/imaginary `f32` data.
        ///
        /// With complex inputs `x_k` taken lane-wise from `in[4*k .. 4*k + 4]`
        /// (`k = 0..4`), the outputs stored at `out[4*k .. 4*k + 4]` are:
        ///
        /// - `y0 = (x0 + x2) + (x1 + x3)`
        /// - `y1 = (x0 - x2) - i * (x1 - x3)`
        /// - `y2 = (x0 + x2) - (x1 + x3)`
        /// - `y3 = (x0 - x2) + i * (x1 - x3)`
        ///
        /// # Safety
        ///
        /// - The CPU must support SSE2.
        /// - `re_in` and `im_in` must each be valid for reading 16 `f32` values.
        /// - `re_out` and `im_out` must each be valid for writing 16 `f32` values.
        ///
        /// No alignment beyond that of `f32` is required: every access is an
        /// unaligned load or store.
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        #[target_feature(enable = "sse2")]
        pub unsafe fn notw_4_v4_sse2_f32_soa(
            re_in: *const f32,
            im_in: *const f32,
            re_out: *mut f32,
            im_out: *mut f32,
        ) {
            // `core::arch::x86_64` does not exist on 32-bit x86, so pick the
            // intrinsics module that matches the compilation target.
            #[cfg(target_arch = "x86")]
            use core::arch::x86::*;
            #[cfg(target_arch = "x86_64")]
            use core::arch::x86_64::*;

            // Load all four inputs (4 lanes each) before any store is issued.
            let re0 = _mm_loadu_ps(re_in);
            let im0 = _mm_loadu_ps(im_in);
            let re1 = _mm_loadu_ps(re_in.add(4));
            let im1 = _mm_loadu_ps(im_in.add(4));
            let re2 = _mm_loadu_ps(re_in.add(8));
            let im2 = _mm_loadu_ps(im_in.add(8));
            let re3 = _mm_loadu_ps(re_in.add(12));
            let im3 = _mm_loadu_ps(im_in.add(12));

            // First stage: pair inputs (0, 2) and (1, 3).
            let t0_re = _mm_add_ps(re0, re2);
            let t0_im = _mm_add_ps(im0, im2);
            let t1_re = _mm_sub_ps(re0, re2);
            let t1_im = _mm_sub_ps(im0, im2);
            let t2_re = _mm_add_ps(re1, re3);
            let t2_im = _mm_add_ps(im1, im3);
            let t3_re = _mm_sub_ps(re1, re3);
            let t3_im = _mm_sub_ps(im1, im3);

            // Multiply t3 by -i: (re, im) -> (im, -re). XOR with -0.0 flips
            // only the sign bit, which negates each lane.
            let neg_mask = _mm_set1_ps(-0.0_f32);
            let t3rot_re = t3_im;
            let t3rot_im = _mm_xor_ps(t3_re, neg_mask);

            // Second stage: combine the partial results into the four outputs.
            _mm_storeu_ps(re_out, _mm_add_ps(t0_re, t2_re));
            _mm_storeu_ps(im_out, _mm_add_ps(t0_im, t2_im));
            _mm_storeu_ps(re_out.add(4), _mm_add_ps(t1_re, t3rot_re));
            _mm_storeu_ps(im_out.add(4), _mm_add_ps(t1_im, t3rot_im));
            _mm_storeu_ps(re_out.add(8), _mm_sub_ps(t0_re, t2_re));
            _mm_storeu_ps(im_out.add(8), _mm_sub_ps(t0_im, t2_im));
            _mm_storeu_ps(re_out.add(12), _mm_sub_ps(t1_re, t3rot_re));
            _mm_storeu_ps(im_out.add(12), _mm_sub_ps(t1_im, t3rot_im));
        }
    }
}