use proc_macro2::TokenStream;
use quote::quote;
/// Generates the size-2 AVX2 `f32` kernel `notw_2_v8_avx2_f32_soa`.
///
/// The emitted function reads two complex inputs stored as separate real and
/// imaginary arrays (eight `f32` lanes per input) and writes their sum and
/// difference in the same layout.
pub(super) fn gen_avx2_f32_v8_size2_soa() -> TokenStream {
    quote! {
        /// Size-2 butterfly applied to eight independent lanes (split re/im layout).
        ///
        /// Input `k` occupies elements `8 * k .. 8 * k + 8` of `re_in` / `im_in`;
        /// output `k` is written to the same element range of `re_out` / `im_out`.
        /// Output 0 is `in0 + in1` and output 1 is `in0 - in1`.
        ///
        /// # Safety
        ///
        /// - The executing CPU must support AVX2.
        /// - `re_in` and `im_in` must be valid for reads of 16 `f32` values.
        /// - `re_out` and `im_out` must be valid for writes of 16 `f32` values.
        ///
        /// Unaligned loads and stores are used, so 32-byte alignment is not required.
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        #[target_feature(enable = "avx2")]
        pub unsafe fn notw_2_v8_avx2_f32_soa(
            re_in: *const f32,
            im_in: *const f32,
            re_out: *mut f32,
            im_out: *mut f32,
        ) {
            // The function is enabled for both x86 flavours, so the intrinsics
            // must come from the module that exists on the current target.
            #[cfg(target_arch = "x86_64")]
            use core::arch::x86_64::*;
            #[cfg(target_arch = "x86")]
            use core::arch::x86::*;

            // All loads are issued before the first store.
            let re0 = _mm256_loadu_ps(re_in);
            let im0 = _mm256_loadu_ps(im_in);
            let re1 = _mm256_loadu_ps(re_in.add(8));
            let im1 = _mm256_loadu_ps(im_in.add(8));

            // Output 0: in0 + in1.
            _mm256_storeu_ps(re_out, _mm256_add_ps(re0, re1));
            _mm256_storeu_ps(im_out, _mm256_add_ps(im0, im1));
            // Output 1: in0 - in1.
            _mm256_storeu_ps(re_out.add(8), _mm256_sub_ps(re0, re1));
            _mm256_storeu_ps(im_out.add(8), _mm256_sub_ps(im0, im1));
        }
    }
}
/// Generates the size-4 AVX2 `f32` kernel `notw_4_v8_avx2_f32_soa`.
///
/// The emitted function reads four complex inputs stored as separate real and
/// imaginary arrays (eight `f32` lanes per input) and writes four outputs in
/// the same layout.
pub(super) fn gen_avx2_f32_v8_size4_soa() -> TokenStream {
    quote! {
        /// Size-4 butterfly applied to eight independent lanes (split re/im layout).
        ///
        /// Input `k` occupies elements `8 * k .. 8 * k + 8` of `re_in` / `im_in`;
        /// output `k` is written to the same element range of `re_out` / `im_out`.
        ///
        /// With `i` the imaginary unit, the outputs are:
        ///
        /// - `out0 = (in0 + in2) + (in1 + in3)`
        /// - `out1 = (in0 - in2) - i * (in1 - in3)`
        /// - `out2 = (in0 + in2) - (in1 + in3)`
        /// - `out3 = (in0 - in2) + i * (in1 - in3)`
        ///
        /// # Safety
        ///
        /// - The executing CPU must support AVX2.
        /// - `re_in` and `im_in` must be valid for reads of 32 `f32` values.
        /// - `re_out` and `im_out` must be valid for writes of 32 `f32` values.
        ///
        /// Unaligned loads and stores are used, so 32-byte alignment is not required.
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        #[target_feature(enable = "avx2")]
        pub unsafe fn notw_4_v8_avx2_f32_soa(
            re_in: *const f32,
            im_in: *const f32,
            re_out: *mut f32,
            im_out: *mut f32,
        ) {
            // The function is enabled for both x86 flavours, so the intrinsics
            // must come from the module that exists on the current target.
            #[cfg(target_arch = "x86_64")]
            use core::arch::x86_64::*;
            #[cfg(target_arch = "x86")]
            use core::arch::x86::*;

            // All loads are issued before the first store.
            let re0 = _mm256_loadu_ps(re_in);
            let im0 = _mm256_loadu_ps(im_in);
            let re1 = _mm256_loadu_ps(re_in.add(8));
            let im1 = _mm256_loadu_ps(im_in.add(8));
            let re2 = _mm256_loadu_ps(re_in.add(16));
            let im2 = _mm256_loadu_ps(im_in.add(16));
            let re3 = _mm256_loadu_ps(re_in.add(24));
            let im3 = _mm256_loadu_ps(im_in.add(24));

            // First stage: pair input 0 with 2 and input 1 with 3.
            let t0_re = _mm256_add_ps(re0, re2);
            let t0_im = _mm256_add_ps(im0, im2);
            let t1_re = _mm256_sub_ps(re0, re2);
            let t1_im = _mm256_sub_ps(im0, im2);
            let t2_re = _mm256_add_ps(re1, re3);
            let t2_im = _mm256_add_ps(im1, im3);
            let t3_re = _mm256_sub_ps(re1, re3);
            let t3_im = _mm256_sub_ps(im1, im3);

            // Multiply t3 by -i: (re, im) -> (im, -re). `-0.0` has only the
            // sign bit set, so XOR-ing with it negates every lane.
            let neg_mask = _mm256_set1_ps(-0.0_f32);
            let t3rot_re = t3_im;
            let t3rot_im = _mm256_xor_ps(t3_re, neg_mask);

            // Second stage: combine the partial sums into the four outputs.
            _mm256_storeu_ps(re_out, _mm256_add_ps(t0_re, t2_re));
            _mm256_storeu_ps(im_out, _mm256_add_ps(t0_im, t2_im));
            _mm256_storeu_ps(re_out.add(8), _mm256_add_ps(t1_re, t3rot_re));
            _mm256_storeu_ps(im_out.add(8), _mm256_add_ps(t1_im, t3rot_im));
            _mm256_storeu_ps(re_out.add(16), _mm256_sub_ps(t0_re, t2_re));
            _mm256_storeu_ps(im_out.add(16), _mm256_sub_ps(t0_im, t2_im));
            _mm256_storeu_ps(re_out.add(24), _mm256_sub_ps(t1_re, t3rot_re));
            _mm256_storeu_ps(im_out.add(24), _mm256_sub_ps(t1_im, t3rot_im));
        }
    }
}
/// Generates the size-8 AVX2 `f32` kernel `notw_8_v8_avx2_f32_soa`.
///
/// The emitted function reads eight complex inputs stored as separate real and
/// imaginary arrays (eight `f32` lanes per input) and writes eight outputs in
/// the same layout.
// The kernel is fully unrolled straight-line code; splitting it up would only
// obscure the data flow, so the length lint is silenced here and on the
// emitted function.
#[allow(clippy::too_many_lines)]
pub(super) fn gen_avx2_f32_v8_size8_soa() -> TokenStream {
    quote! {
        /// Size-8 butterfly applied to eight independent lanes (split re/im layout).
        ///
        /// Input `k` occupies elements `8 * k .. 8 * k + 8` of `re_in` / `im_in`;
        /// output `k` is written to the same element range of `re_out` / `im_out`.
        ///
        /// The computation is split into a first stage that pairs input `k` with
        /// input `k + 4`, a rotation of the differences by multiples of 45 degrees,
        /// and two size-4 butterflies producing the even and odd outputs.
        ///
        /// # Safety
        ///
        /// - The executing CPU must support AVX2.
        /// - `re_in` and `im_in` must be valid for reads of 64 `f32` values.
        /// - `re_out` and `im_out` must be valid for writes of 64 `f32` values.
        ///
        /// Unaligned loads and stores are used, so 32-byte alignment is not required.
        #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
        #[target_feature(enable = "avx2")]
        #[allow(clippy::too_many_lines)]
        pub unsafe fn notw_8_v8_avx2_f32_soa(
            re_in: *const f32,
            im_in: *const f32,
            re_out: *mut f32,
            im_out: *mut f32,
        ) {
            // The function is enabled for both x86 flavours, so the intrinsics
            // must come from the module that exists on the current target.
            #[cfg(target_arch = "x86_64")]
            use core::arch::x86_64::*;
            #[cfg(target_arch = "x86")]
            use core::arch::x86::*;

            // 1 / sqrt(2), taken from the standard constant rather than a
            // hand-typed literal.
            let inv_sqrt2 = _mm256_set1_ps(core::f32::consts::FRAC_1_SQRT_2);
            // `-0.0` has only the sign bit set; XOR-ing with it negates every lane.
            let neg_mask = _mm256_set1_ps(-0.0_f32);

            // All loads are issued before the first store.
            let re0 = _mm256_loadu_ps(re_in);
            let im0 = _mm256_loadu_ps(im_in);
            let re1 = _mm256_loadu_ps(re_in.add(8));
            let im1 = _mm256_loadu_ps(im_in.add(8));
            let re2 = _mm256_loadu_ps(re_in.add(16));
            let im2 = _mm256_loadu_ps(im_in.add(16));
            let re3 = _mm256_loadu_ps(re_in.add(24));
            let im3 = _mm256_loadu_ps(im_in.add(24));
            let re4 = _mm256_loadu_ps(re_in.add(32));
            let im4 = _mm256_loadu_ps(im_in.add(32));
            let re5 = _mm256_loadu_ps(re_in.add(40));
            let im5 = _mm256_loadu_ps(im_in.add(40));
            let re6 = _mm256_loadu_ps(re_in.add(48));
            let im6 = _mm256_loadu_ps(im_in.add(48));
            let re7 = _mm256_loadu_ps(re_in.add(56));
            let im7 = _mm256_loadu_ps(im_in.add(56));

            // First stage, sums: a_k = in_k + in_{k+4}.
            let a0r = _mm256_add_ps(re0, re4);
            let a0i = _mm256_add_ps(im0, im4);
            let a1r = _mm256_add_ps(re1, re5);
            let a1i = _mm256_add_ps(im1, im5);
            let a2r = _mm256_add_ps(re2, re6);
            let a2i = _mm256_add_ps(im2, im6);
            let a3r = _mm256_add_ps(re3, re7);
            let a3i = _mm256_add_ps(im3, im7);

            // First stage, differences: b_k = in_k - in_{k+4}.
            let b0r = _mm256_sub_ps(re0, re4);
            let b0i = _mm256_sub_ps(im0, im4);
            let b1r = _mm256_sub_ps(re1, re5);
            let b1i = _mm256_sub_ps(im1, im5);
            let b2r = _mm256_sub_ps(re2, re6);
            let b2i = _mm256_sub_ps(im2, im6);
            let b3r = _mm256_sub_ps(re3, re7);
            let b3i = _mm256_sub_ps(im3, im7);

            // b1 * (1 - i) / sqrt(2) = ((re + im) + i * (im - re)) / sqrt(2).
            let b1tr = _mm256_mul_ps(_mm256_add_ps(b1r, b1i), inv_sqrt2);
            let b1ti = _mm256_mul_ps(_mm256_sub_ps(b1i, b1r), inv_sqrt2);
            // b2 * (-i) = im - i * re.
            let b2tr = b2i;
            let b2ti = _mm256_xor_ps(b2r, neg_mask);
            // b3 * (-1 - i) / sqrt(2) = ((im - re) - i * (re + im)) / sqrt(2).
            let b3tr = _mm256_mul_ps(_mm256_sub_ps(b3i, b3r), inv_sqrt2);
            let neg_b3_sum = _mm256_xor_ps(_mm256_add_ps(b3r, b3i), neg_mask);
            let b3ti = _mm256_mul_ps(neg_b3_sum, inv_sqrt2);

            // Size-4 butterfly on the sums (feeds the even outputs).
            let c0r = _mm256_add_ps(a0r, a2r);
            let c0i = _mm256_add_ps(a0i, a2i);
            let c2r = _mm256_sub_ps(a0r, a2r);
            let c2i = _mm256_sub_ps(a0i, a2i);
            let c1r = _mm256_add_ps(a1r, a3r);
            let c1i = _mm256_add_ps(a1i, a3i);
            let d3r = _mm256_sub_ps(a1r, a3r);
            let d3i = _mm256_sub_ps(a1i, a3i);
            // c3 = -i * (a1 - a3).
            let c3r = d3i;
            let c3i = _mm256_xor_ps(d3r, neg_mask);

            // Size-4 butterfly on the rotated differences (feeds the odd outputs).
            let e0r = _mm256_add_ps(b0r, b2tr);
            let e0i = _mm256_add_ps(b0i, b2ti);
            let e2r = _mm256_sub_ps(b0r, b2tr);
            let e2i = _mm256_sub_ps(b0i, b2ti);
            let e1r = _mm256_add_ps(b1tr, b3tr);
            let e1i = _mm256_add_ps(b1ti, b3ti);
            let f3r = _mm256_sub_ps(b1tr, b3tr);
            let f3i = _mm256_sub_ps(b1ti, b3ti);
            // e3 = -i * (b1t - b3t).
            let e3r = f3i;
            let e3i = _mm256_xor_ps(f3r, neg_mask);

            // Outputs 0..=3 use the sums of each pair, outputs 4..=7 the differences.
            _mm256_storeu_ps(re_out, _mm256_add_ps(c0r, c1r));
            _mm256_storeu_ps(im_out, _mm256_add_ps(c0i, c1i));
            _mm256_storeu_ps(re_out.add(8), _mm256_add_ps(e0r, e1r));
            _mm256_storeu_ps(im_out.add(8), _mm256_add_ps(e0i, e1i));
            _mm256_storeu_ps(re_out.add(16), _mm256_add_ps(c2r, c3r));
            _mm256_storeu_ps(im_out.add(16), _mm256_add_ps(c2i, c3i));
            _mm256_storeu_ps(re_out.add(24), _mm256_add_ps(e2r, e3r));
            _mm256_storeu_ps(im_out.add(24), _mm256_add_ps(e2i, e3i));
            _mm256_storeu_ps(re_out.add(32), _mm256_sub_ps(c0r, c1r));
            _mm256_storeu_ps(im_out.add(32), _mm256_sub_ps(c0i, c1i));
            _mm256_storeu_ps(re_out.add(40), _mm256_sub_ps(e0r, e1r));
            _mm256_storeu_ps(im_out.add(40), _mm256_sub_ps(e0i, e1i));
            _mm256_storeu_ps(re_out.add(48), _mm256_sub_ps(c2r, c3r));
            _mm256_storeu_ps(im_out.add(48), _mm256_sub_ps(c2i, c3i));
            _mm256_storeu_ps(re_out.add(56), _mm256_sub_ps(e2r, e3r));
            _mm256_storeu_ps(im_out.add(56), _mm256_sub_ps(e2i, e3i));
        }
    }
}