use crate::bigint::Nat;
use std::intrinsics::transmute;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64 as march;
#[cfg(target_arch = "x86")]
use std::arch::x86 as march;
impl Nat {
/// Replaces `self` with the product `self * rhs`.
///
/// This is schoolbook multiplication on 32-bit limbs. AVX2 is used to form
/// four 32x32 -> 64-bit partial products per instruction; the partial
/// products are then summed column by column with carry propagation.
///
/// Limbs are handled least-significant first: column `i` of the result
/// collects every partial product `min[a] * max[b]` with `a + b == i`, and
/// carries flow from column `i` into column `i + 1`.
///
/// `min` and `max` are whatever `Self::min_max` returns (presumably the
/// shorter and the longer operand — the arithmetic below is correct either
/// way). Both operands are expected to hold at least one limb: the column
/// count `min.len() + padded - 1` underflows if both are empty.
///
/// # Safety
///
/// The executing CPU must support AVX2. Calling this function on a CPU
/// without AVX2 is undefined behaviour.
#[target_feature(enable = "avx2")]
unsafe fn mul_manuual_by_avx2(&mut self, rhs: &Self) {
    const MASK: u64 = u32::MAX as u64;

    // Work on a private copy of `self`: the destination vector is overwritten
    // below while the operands are still borrowed as slices.
    let lhs = self.deep_clone();
    let (min, max, _) = Self::min_max(&lhs, rhs);
    let (min, max) = (min.as_slice(), max.as_slice());

    // Pack `max` four limbs per vector, one limb in the low half of each
    // 64-bit lane (the layout `_mm256_mul_epu32` reads). A short final chunk
    // is padded with zero limbs, which only contribute zero products.
    let mut rpkg = Vec::with_capacity((max.len() + 3) / 4);
    for chunk in max.chunks(4) {
        let mut lanes = [0u64; 4];
        for (lane, &limb) in lanes.iter_mut().zip(chunk) {
            *lane = u64::from(limb);
        }
        // Unaligned load from a local, fully initialised 32-byte array.
        rpkg.push(march::_mm256_loadu_si256(lanes.as_ptr().cast::<march::__m256i>()));
    }

    // Number of result columns, counting the zero padding of `max`.
    let padded = (max.len() + 3) & !3;
    let v_len = min.len() + padded - 1;

    // Per-column running sum modulo 2^64, and how many times that sum wrapped.
    // Together they hold the exact column total `wraps * 2^64 + sums`.
    let mut sums = vec![0u64; v_len];
    let mut wraps = vec![0u64; v_len];
    let mut products = [0u64; 4];
    for (i, &limb) in min.iter().enumerate() {
        // Broadcast the current limb of `min` into every 32-bit lane.
        let l = march::_mm256_set1_epi32(limb as i32);
        for (k, &r) in rpkg.iter().enumerate() {
            let prod = march::_mm256_mul_epu32(l, r);
            // Unaligned store into a local 32-byte scratch array.
            march::_mm256_storeu_si256(products.as_mut_ptr().cast::<march::__m256i>(), prod);
            // The highest column touched is
            // (min.len() - 1) + (padded - 4) + 3 == v_len - 1, so this
            // range is always in bounds.
            let cols = (i + 4 * k)..(i + 4 * k + 4);
            let targets = sums[cols.clone()].iter_mut().zip(wraps[cols].iter_mut());
            for ((sum, wrap), &p) in targets.zip(&products) {
                let (total, overflowed) = sum.overflowing_add(p);
                *sum = total;
                *wrap += u64::from(overflowed);
            }
        }
    }

    let v = self.as_mut_vec();
    v.clear();
    // One slot per column plus one for a possible final carry limb.
    v.reserve(v_len + 1);

    // `carry` is the incoming column total shifted right by 32 bits.
    let mut carry = 0u64;
    for (&sum, &wrap) in sums.iter().zip(&wraps) {
        let (total, overflowed) = carry.overflowing_add(sum);
        let high = wrap + u64::from(overflowed);
        v.push((total & MASK) as u32);
        // The full column value is `high * 2^64 + total`; drop the limb just
        // emitted and carry the rest into the next column.
        carry = (total >> 32) | (high << 32);
    }
    if carry > 0 {
        v.push((carry & MASK) as u32);
    }
    self.trim_head_zero();
}
/// Multiplies `self` by `max` in place.
///
/// Trivial operands are handled without a full multiplication:
/// * if either operand equals 0, `self` becomes the single limb `0`;
/// * if `self` equals 1, the limbs of `max` are copied into `self` (skipped
///   when both already share the same limb buffer);
/// * if `max` equals 1, `self` is left untouched.
///
/// Every other case uses the AVX2 routine when the running CPU supports AVX2
/// and a portable scalar routine otherwise.
pub(super) fn mul_inner(&mut self, max: &Self) {
    if self == &0u32 || max == &0u32 {
        self.clear();
        self.as_mut_vec().push(0);
        return;
    }
    if self == &1u32 {
        if self.as_vec().as_ptr() != max.as_vec().as_ptr() {
            self.clear();
            self.as_mut_vec().extend_from_slice(max.as_slice());
        }
        return;
    }
    if max == &1u32 {
        return;
    }
    if is_x86_feature_detected!("avx2") {
        // SAFETY: AVX2 support was verified at run time just above, which is
        // the only requirement of `mul_manuual_by_avx2`.
        unsafe {
            self.mul_manuual_by_avx2(max);
        }
    } else {
        self.mul_schoolbook_scalar(max);
    }
}

/// Portable schoolbook multiplication used when AVX2 is unavailable:
/// replaces `self` with `self * rhs`, limbs least-significant first.
fn mul_schoolbook_scalar(&mut self, rhs: &Self) {
    // Copy `self` first: the destination is overwritten at the end, and `rhs`
    // may share its limb buffer with `self`.
    let lhs = self.deep_clone();
    let (a, b) = (lhs.as_slice(), rhs.as_slice());
    let mut acc = vec![0u32; a.len() + b.len()];
    for (i, &x) in a.iter().enumerate() {
        let mut carry = 0u64;
        for (j, &y) in b.iter().enumerate() {
            // (2^32 - 1)^2 + 2 * (2^32 - 1) == 2^64 - 1, so this never overflows.
            let t = u64::from(x) * u64::from(y) + u64::from(acc[i + j]) + carry;
            acc[i + j] = t as u32;
            carry = t >> 32;
        }
        // This slot has not been written by any earlier row.
        acc[i + b.len()] = carry as u32;
    }
    let v = self.as_mut_vec();
    v.clear();
    v.extend_from_slice(&acc);
    self.trim_head_zero();
}
}