#![cfg(target_feature="sse")]
use super::*;
use core::ops::*;
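/// A 128-bit SIMD register of four `f32` lanes (wraps `__m128`).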
#[derive(Clone, Copy)]
#[allow(nonstandard_style)]
#[repr(transparent)]
pub struct m128(pub __m128);
unsafe impl Zeroable for m128 {}
unsafe impl Pod for m128 {}
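// The formatting impls print the four lanes in order, lane 0 first.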
impl core::fmt::Debug for m128 {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
let a: [f32; 4] = cast(self.0);
f.write_str("m128(")?;
core::fmt::Debug::fmt(&a[0], f)?;
f.write_str(", ")?;
core::fmt::Debug::fmt(&a[1], f)?;
f.write_str(", ")?;
core::fmt::Debug::fmt(&a[2], f)?;
f.write_str(", ")?;
core::fmt::Debug::fmt(&a[3], f)?;
f.write_str(")")
}
}
impl core::fmt::Display for m128 {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
let a: [f32; 4] = cast(self.0);
f.write_str("m128(")?;
core::fmt::Display::fmt(&a[0], f)?;
f.write_str(", ")?;
core::fmt::Display::fmt(&a[1], f)?;
f.write_str(", ")?;
core::fmt::Display::fmt(&a[2], f)?;
f.write_str(", ")?;
core::fmt::Display::fmt(&a[3], f)?;
f.write_str(")")
}
}
impl core::fmt::LowerExp for m128 {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
let a: [f32; 4] = cast(self.0);
f.write_str("m128(")?;
core::fmt::LowerExp::fmt(&a[0], f)?;
f.write_str(", ")?;
core::fmt::LowerExp::fmt(&a[1], f)?;
f.write_str(", ")?;
core::fmt::LowerExp::fmt(&a[2], f)?;
f.write_str(", ")?;
core::fmt::LowerExp::fmt(&a[3], f)?;
f.write_str(")")
}
}
impl core::fmt::UpperExp for m128 {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
let a: [f32; 4] = cast(self.0);
f.write_str("m128(")?;
core::fmt::UpperExp::fmt(&a[0], f)?;
f.write_str(", ")?;
core::fmt::UpperExp::fmt(&a[1], f)?;
f.write_str(", ")?;
core::fmt::UpperExp::fmt(&a[2], f)?;
f.write_str(", ")?;
core::fmt::UpperExp::fmt(&a[3], f)?;
f.write_str(")")
}
}
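// Operator impls: each forwards lane-wise to the matching SSE intrinsic.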
impl Add for m128 {
type Output = Self;
#[inline(always)]
fn add(self, rhs: Self) -> Self {
Self(unsafe { _mm_add_ps(self.0, rhs.0) })
}
}
impl AddAssign for m128 {
#[inline(always)]
fn add_assign(&mut self, rhs: Self) {
self.0 = unsafe { _mm_add_ps(self.0, rhs.0) };
}
}
impl BitAnd for m128 {
type Output = Self;
#[inline(always)]
fn bitand(self, rhs: Self) -> Self {
Self(unsafe { _mm_and_ps(self.0, rhs.0) })
}
}
impl BitAndAssign for m128 {
#[inline(always)]
fn bitand_assign(&mut self, rhs: Self) {
self.0 = unsafe { _mm_and_ps(self.0, rhs.0) };
}
}
impl Div for m128 {
type Output = Self;
#[inline(always)]
fn div(self, rhs: Self) -> Self {
Self(unsafe { _mm_div_ps(self.0, rhs.0) })
}
}
impl DivAssign for m128 {
#[inline(always)]
fn div_assign(&mut self, rhs: Self) {
self.0 = unsafe { _mm_div_ps(self.0, rhs.0) };
}
}
impl Mul for m128 {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: Self) -> Self {
Self(unsafe { _mm_mul_ps(self.0, rhs.0) })
}
}
impl MulAssign for m128 {
#[inline(always)]
fn mul_assign(&mut self, rhs: Self) {
self.0 = unsafe { _mm_mul_ps(self.0, rhs.0) };
}
}
impl Sub for m128 {
type Output = Self;
#[inline(always)]
fn sub(self, rhs: Self) -> Self {
Self(unsafe { _mm_sub_ps(self.0, rhs.0) })
}
}
impl SubAssign for m128 {
#[inline(always)]
fn sub_assign(&mut self, rhs: Self) {
self.0 = unsafe { _mm_sub_ps(self.0, rhs.0) };
}
}
impl BitOr for m128 {
type Output = Self;
#[inline(always)]
fn bitor(self, rhs: Self) -> Self {
Self(unsafe { _mm_or_ps(self.0, rhs.0) })
}
}
impl BitOrAssign for m128 {
#[inline(always)]
fn bitor_assign(&mut self, rhs: Self) {
self.0 = unsafe { _mm_or_ps(self.0, rhs.0) };
}
}
impl BitXor for m128 {
type Output = Self;
#[inline(always)]
fn bitxor(self, rhs: Self) -> Self {
Self(unsafe { _mm_xor_ps(self.0, rhs.0) })
}
}
impl BitXorAssign for m128 {
#[inline(always)]
fn bitxor_assign(&mut self, rhs: Self) {
self.0 = unsafe { _mm_xor_ps(self.0, rhs.0) };
}
}
impl Neg for m128 {
type Output = Self;
#[inline(always)]
fn neg(self) -> Self {
// Negate via `0.0 - self`; note this maps a +0.0 lane to +0.0, not -0.0.
Self(unsafe { _mm_sub_ps(_mm_setzero_ps(), self.0) })
}
}
impl Not for m128 {
type Output = Self;
#[inline(always)]
fn not(self) -> Self {
// All-ones bit pattern: `-1_i32` reinterpreted as an `f32` (a NaN pattern).
let f: f32 = cast(-1_i32);
let b = m128::splat(f);
// XOR against all ones flips every bit.
self ^ b
}
}
impl m128 {
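// Convention: methods with a `0` suffix operate on lane 0 only, with lanes
// 1-3 copied through from `self`.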
#[inline(always)]
pub fn add0(self, rhs: Self) -> Self {
Self(unsafe { _mm_add_ss(self.0, rhs.0) })
}
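/// Bitwise `(!self) & rhs`; note that `self` is the inverted operand.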
#[inline(always)]
pub fn andnot(self, rhs: Self) -> Self {
Self(unsafe { _mm_andnot_ps(self.0, rhs.0) })
}
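// Comparisons fill a lane with all 1 bits for "true" and all 0 bits for
// "false". `cmp_ordinary` is true when neither input is NaN; `cmp_nan` is
// true when either input is NaN.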
#[inline(always)]
pub fn cmp_eq(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpeq_ps(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_eq0(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpeq_ss(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_ge(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpge_ps(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_ge0(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpge_ss(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_gt(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpgt_ps(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_gt0(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpgt_ss(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_le(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmple_ps(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_le0(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmple_ss(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_lt(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmplt_ps(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_lt0(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmplt_ss(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_ne(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpneq_ps(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_ne0(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpneq_ss(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_nge(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpnge_ps(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_nge0(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpnge_ss(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_ngt(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpngt_ps(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_ngt0(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpngt_ss(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_nle(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpnle_ps(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_nle0(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpnle_ss(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_nlt(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpnlt_ps(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_nlt0(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpnlt_ss(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_ordinary(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpord_ps(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_ordinary0(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpord_ss(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_nan(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpunord_ps(self.0, rhs.0) })
}
#[inline(always)]
pub fn cmp_nan0(self, rhs: Self) -> Self {
Self(unsafe { _mm_cmpunord_ss(self.0, rhs.0) })
}
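// The `cmpi_*` methods compare lane 0 and return the result as a plain
// `i32`: 1 for true, 0 for false.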
#[inline(always)]
pub fn cmpi_eq0(self, rhs: Self) -> i32 {
unsafe { _mm_comieq_ss(self.0, rhs.0) }
}
#[inline(always)]
pub fn cmpi_ge0(self, rhs: Self) -> i32 {
unsafe { _mm_comige_ss(self.0, rhs.0) }
}
#[inline(always)]
pub fn cmpi_gt0(self, rhs: Self) -> i32 {
unsafe { _mm_comigt_ss(self.0, rhs.0) }
}
#[inline(always)]
pub fn cmpi_le0(self, rhs: Self) -> i32 {
unsafe { _mm_comile_ss(self.0, rhs.0) }
}
#[inline(always)]
pub fn cmpi_lt0(self, rhs: Self) -> i32 {
unsafe { _mm_comilt_ss(self.0, rhs.0) }
}
#[inline(always)]
pub fn cmpi_ne0(self, rhs: Self) -> i32 {
unsafe { _mm_comineq_ss(self.0, rhs.0) }
}
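// Conversions: `round_*` use the current rounding mode (round-to-nearest-even
// by default); `truncate_*` always round toward zero.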
#[inline(always)]
pub fn round_replace0_i32(self, rhs: i32) -> Self {
Self(unsafe { _mm_cvt_si2ss(self.0, rhs) })
}
#[inline(always)]
pub fn round_extract0_i32(self) -> i32 {
unsafe { _mm_cvt_ss2si(self.0) }
}
#[inline(always)]
#[cfg(target_arch = "x86_64")]
pub fn round_replace0_i64(self, rhs: i64) -> Self {
Self(unsafe { _mm_cvtsi64_ss(self.0, rhs) })
}
#[inline(always)]
pub fn extract0_f32(self) -> f32 {
unsafe { _mm_cvtss_f32(self.0) }
}
#[inline(always)]
#[cfg(target_arch = "x86_64")]
pub fn round_extract0_i64(self) -> i64 {
unsafe { _mm_cvtss_si64(self.0) }
}
#[inline(always)]
pub fn truncate_extract0_i32(self) -> i32 {
unsafe { _mm_cvtt_ss2si(self.0) }
}
#[inline(always)]
#[cfg(target_arch = "x86_64")]
pub fn truncate_extract0_i64(self) -> i64 {
unsafe { _mm_cvttss_si64(self.0) }
}
#[inline(always)]
pub fn div0(self, rhs: Self) -> Self {
Self(unsafe { _mm_div_ss(self.0, rhs.0) })
}
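// Loads: `load` and `load_reverse` require 16-byte alignment (enforced by the
// `Align16` wrapper); `load_unaligned` accepts any alignment.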
#[inline(always)]
pub fn load(addr: &Align16<[f32; 4]>) -> Self {
let ptr: *const f32 = addr as *const Align16<[f32; 4]> as *const f32;
Self(unsafe { _mm_load_ps(ptr) })
}
#[allow(clippy::trivially_copy_pass_by_ref)]
#[inline(always)]
pub fn load_splat(addr: &f32) -> Self {
Self(unsafe { _mm_load_ps1(addr) })
}
#[allow(clippy::trivially_copy_pass_by_ref)]
#[inline(always)]
pub fn load0(addr: &f32) -> Self {
Self(unsafe { _mm_load_ss(addr) })
}
#[inline(always)]
pub fn load_reverse(addr: &Align16<[f32; 4]>) -> Self {
let ptr: *const f32 = addr as *const Align16<[f32; 4]> as *const f32;
Self(unsafe { _mm_loadr_ps(ptr) })
}
#[inline(always)]
pub fn load_unaligned(addr: &[f32; 4]) -> Self {
let ptr: *const f32 = addr as *const [f32; 4] as *const f32;
Self(unsafe { _mm_loadu_ps(ptr) })
}
#[inline(always)]
pub fn max(self, rhs: Self) -> Self {
Self(unsafe { _mm_max_ps(self.0, rhs.0) })
}
#[inline(always)]
pub fn max0(self, rhs: Self) -> Self {
Self(unsafe { _mm_max_ss(self.0, rhs.0) })
}
#[inline(always)]
pub fn min(self, rhs: Self) -> Self {
Self(unsafe { _mm_min_ps(self.0, rhs.0) })
}
#[inline(always)]
pub fn min0(self, rhs: Self) -> Self {
Self(unsafe { _mm_min_ss(self.0, rhs.0) })
}
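/// Lane 0 from `rhs`, lanes 1-3 from `self`.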
#[inline(always)]
pub fn copy0(self, rhs: Self) -> Self {
Self(unsafe { _mm_move_ss(self.0, rhs.0) })
}
#[inline(always)]
pub fn copy_high_low(self, rhs: Self) -> Self {
Self(unsafe { _mm_movehl_ps(self.0, rhs.0) })
}
#[inline(always)]
pub fn copy_low_high(self, rhs: Self) -> Self {
Self(unsafe { _mm_movelh_ps(self.0, rhs.0) })
}
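/// Packs the sign bit of each lane into the low 4 bits of the result.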
#[inline(always)]
pub fn move_mask(self) -> i32 {
unsafe { _mm_movemask_ps(self.0) }
}
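// `reciprocal*` and `reciprocal_sqrt*` are fast hardware approximations
// (roughly 12 bits of precision), not exact results.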
#[inline(always)]
pub fn reciprocal(self) -> Self {
Self(unsafe { _mm_rcp_ps(self.0) })
}
#[inline(always)]
pub fn reciprocal0(self) -> Self {
Self(unsafe { _mm_rcp_ss(self.0) })
}
#[inline(always)]
pub fn reciprocal_sqrt(self) -> Self {
Self(unsafe { _mm_rsqrt_ps(self.0) })
}
#[inline(always)]
pub fn reciprocal_sqrt0(self) -> Self {
Self(unsafe { _mm_rsqrt_ss(self.0) })
}
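/// Note the argument order: `a` becomes the highest lane and `d` lane 0, so
/// `set(3.0, 2.0, 1.0, 0.0)` produces lanes `[0.0, 1.0, 2.0, 3.0]`. Use
/// `set_reverse` for ascending lane order.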
#[inline(always)]
pub fn set(a: f32, b: f32, c: f32, d: f32) -> Self {
Self(unsafe { _mm_set_ps(a, b, c, d) })
}
#[inline(always)]
pub fn splat(a: f32) -> Self {
Self(unsafe { _mm_set1_ps(a) })
}
#[inline(always)]
pub fn set0(a: f32) -> Self {
Self(unsafe { _mm_set_ss(a) })
}
#[inline(always)]
pub fn set_reverse(a: f32, b: f32, c: f32, d: f32) -> Self {
Self(unsafe { _mm_setr_ps(a, b, c, d) })
}
#[inline(always)]
pub fn sqrt(self) -> Self {
Self(unsafe { _mm_sqrt_ps(self.0) })
}
#[inline(always)]
pub fn sqrt0(self) -> Self {
Self(unsafe { _mm_sqrt_ss(self.0) })
}
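// Stores mirror the loads: `store`, `store0_all`, and `store_reverse` require
// the 16-byte aligned `Align16` wrapper; `store_unaligned` does not.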
#[inline(always)]
pub fn store(self, addr: &mut Align16<[f32; 4]>) {
let ptr: *mut f32 = addr as *mut Align16<[f32; 4]> as *mut f32;
unsafe { _mm_store_ps(ptr, self.0) }
}
#[inline(always)]
pub fn store0_all(self, addr: &mut Align16<[f32; 4]>) {
let ptr: *mut f32 = addr as *mut Align16<[f32; 4]> as *mut f32;
unsafe { _mm_store_ps1(ptr, self.0) }
}
#[inline(always)]
pub fn store0(self, addr: &mut f32) {
unsafe { _mm_store_ss(addr, self.0) }
}
#[inline(always)]
pub fn store_reverse(self, addr: &mut Align16<[f32; 4]>) {
let ptr: *mut f32 = addr as *mut Align16<[f32; 4]> as *mut f32;
unsafe { _mm_storer_ps(ptr, self.0) }
}
#[inline(always)]
pub fn store_unaligned(self, addr: &mut [f32; 4]) {
let ptr: *mut f32 = addr as *mut [f32; 4] as *mut f32;
unsafe { _mm_storeu_ps(ptr, self.0) }
}
#[inline(always)]
pub fn sub0(self, rhs: Self) -> Self {
Self(unsafe { _mm_sub_ss(self.0, rhs.0) })
}
#[inline(always)]
pub fn unpack_high(self, rhs: Self) -> Self {
Self(unsafe { _mm_unpackhi_ps(self.0, rhs.0) })
}
#[inline(always)]
pub fn unpack_low(self, rhs: Self) -> Self {
Self(unsafe { _mm_unpacklo_ps(self.0, rhs.0) })
}
}
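// Prefetches pull the cache line containing `ptr` toward the core: T0 targets
// all cache levels, T1 and T2 progressively further levels, and NTA requests a
// non-temporal fetch that minimizes cache pollution. These are hints only; the
// CPU is free to ignore them.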
#[inline(always)]
pub fn prefetch0(ptr: *const impl Sized) {
// Current `core::arch` takes the prefetch hint as a const generic parameter.
unsafe { _mm_prefetch::<_MM_HINT_T0>(ptr as *const i8) }
}
#[inline(always)]
pub fn prefetch1(ptr: *const impl Sized) {
unsafe { _mm_prefetch::<_MM_HINT_T1>(ptr as *const i8) }
}
#[inline(always)]
pub fn prefetch2(ptr: *const impl Sized) {
unsafe { _mm_prefetch::<_MM_HINT_T2>(ptr as *const i8) }
}
#[inline(always)]
pub fn prefetch_nta(ptr: *const impl Sized) {
unsafe { _mm_prefetch::<_MM_HINT_NTA>(ptr as *const i8) }
}
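/// Transposes the 4x4 matrix held across the four registers, in place.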
#[inline(always)]
pub fn transpose4(r0: &mut m128, r1: &mut m128, r2: &mut m128, r3: &mut m128) {
unsafe { _MM_TRANSPOSE4_PS(&mut r0.0, &mut r1.0, &mut r2.0, &mut r3.0) }
}
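/// Shuffles lanes: `shuffle128!(a, b, [i0, i1, i2, i3])` builds a register
/// whose lanes 0 and 1 are lanes `i0` and `i1` of `a`, and whose lanes 2 and 3
/// are lanes `i2` and `i3` of `b`. For example, `shuffle128!(a, b, [0, 0, 3, 3])`
/// yields `[a0, a0, b3, b3]`. Indices are masked to the range `0..=3`.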
#[macro_export]
macro_rules! shuffle128 {
($a:expr, $b:expr, [$i0a:literal,$i1a:literal,$i2b:literal,$i3b:literal]) => {{
const I0A: u32 = $i0a & 0b11;
const I1A: u32 = $i1a & 0b11;
const I2B: u32 = $i2b & 0b11;
const I3B: u32 = $i3b & 0b11;
const IMM: i32 = (I0A | I1A << 2 | I2B << 4 | I3B << 6) as i32;
#[cfg(target_arch = "x86")]
use core::arch::x86::_mm_shuffle_ps;
#[cfg(target_arch = "x86")]
use $crate::arch::x86::m128;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::_mm_shuffle_ps;
#[cfg(target_arch = "x86_64")]
use $crate::arch::x86_64::m128;
// Current `core::arch` takes the shuffle control byte as a const generic.
m128(unsafe { _mm_shuffle_ps::<IMM>($a.0, $b.0) })
}};
}
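// A minimal usage sketch, assuming tests run on an SSE-capable x86/x86_64
// target and that `m128` is in scope via `super::*`. The expected values
// follow directly from the lane semantics documented above; the module name
// `sse_usage_tests` is illustrative.
#[cfg(test)]
mod sse_usage_tests {
  use super::*;

  #[test]
  fn lane_wise_math() {
    // `set_reverse` takes lanes in ascending order: lanes are [1, 2, 3, 4].
    let a = m128::set_reverse(1.0, 2.0, 3.0, 4.0);
    let b = m128::splat(2.0);
    let mut out = [0.0_f32; 4];
    (a * b).store_unaligned(&mut out);
    assert_eq!(out, [2.0, 4.0, 6.0, 8.0]);
  }

  #[test]
  fn move_mask_packs_sign_bits() {
    // Lanes 0 and 2 are negative, so bits 0 and 2 of the mask are set.
    let v = m128::set_reverse(-1.0, 2.0, -3.0, 4.0);
    assert_eq!(v.move_mask(), 0b0101);
  }
}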