use crate::{Level, arch_types::ArchTypes, prelude::*, seal::Seal};
use crate::{
f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4,
mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32,
u32x4, u32x8, u32x16,
};
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
#[doc = "The SIMD token for the \"SSE4.2\" level."]
#[derive(Clone, Copy, Debug)]
pub struct Sse4_2 {
pub sse4_2: crate::core_arch::x86::Sse4_2,
}
impl Sse4_2 {
#[doc = r" Create a SIMD token."]
#[doc = r""]
#[doc = r" # Safety"]
#[doc = r""]
#[doc = r" The SSE4.2 CPU feature must be available."]
#[inline]
pub const unsafe fn new_unchecked() -> Self {
Sse4_2 {
sse4_2: unsafe { crate::core_arch::x86::Sse4_2::new_unchecked() },
}
}
}
impl Seal for Sse4_2 {}
impl ArchTypes for Sse4_2 {
type f32x4 = crate::support::Aligned128<__m128>;
type i8x16 = crate::support::Aligned128<__m128i>;
type u8x16 = crate::support::Aligned128<__m128i>;
type mask8x16 = crate::support::Aligned128<__m128i>;
type i16x8 = crate::support::Aligned128<__m128i>;
type u16x8 = crate::support::Aligned128<__m128i>;
type mask16x8 = crate::support::Aligned128<__m128i>;
type i32x4 = crate::support::Aligned128<__m128i>;
type u32x4 = crate::support::Aligned128<__m128i>;
type mask32x4 = crate::support::Aligned128<__m128i>;
type f64x2 = crate::support::Aligned128<__m128d>;
type mask64x2 = crate::support::Aligned128<__m128i>;
type f32x8 = crate::support::Aligned256<[__m128; 2usize]>;
type i8x32 = crate::support::Aligned256<[__m128i; 2usize]>;
type u8x32 = crate::support::Aligned256<[__m128i; 2usize]>;
type mask8x32 = crate::support::Aligned256<[__m128i; 2usize]>;
type i16x16 = crate::support::Aligned256<[__m128i; 2usize]>;
type u16x16 = crate::support::Aligned256<[__m128i; 2usize]>;
type mask16x16 = crate::support::Aligned256<[__m128i; 2usize]>;
type i32x8 = crate::support::Aligned256<[__m128i; 2usize]>;
type u32x8 = crate::support::Aligned256<[__m128i; 2usize]>;
type mask32x8 = crate::support::Aligned256<[__m128i; 2usize]>;
type f64x4 = crate::support::Aligned256<[__m128d; 2usize]>;
type mask64x4 = crate::support::Aligned256<[__m128i; 2usize]>;
type f32x16 = crate::support::Aligned512<[__m128; 4usize]>;
type i8x64 = crate::support::Aligned512<[__m128i; 4usize]>;
type u8x64 = crate::support::Aligned512<[__m128i; 4usize]>;
type mask8x64 = crate::support::Aligned512<[__m128i; 4usize]>;
type i16x32 = crate::support::Aligned512<[__m128i; 4usize]>;
type u16x32 = crate::support::Aligned512<[__m128i; 4usize]>;
type mask16x32 = crate::support::Aligned512<[__m128i; 4usize]>;
type i32x16 = crate::support::Aligned512<[__m128i; 4usize]>;
type u32x16 = crate::support::Aligned512<[__m128i; 4usize]>;
type mask32x16 = crate::support::Aligned512<[__m128i; 4usize]>;
type f64x8 = crate::support::Aligned512<[__m128d; 4usize]>;
type mask64x8 = crate::support::Aligned512<[__m128i; 4usize]>;
}
impl Simd for Sse4_2 {
type f32s = f32x4<Self>;
type f64s = f64x2<Self>;
type u8s = u8x16<Self>;
type i8s = i8x16<Self>;
type u16s = u16x8<Self>;
type i16s = i16x8<Self>;
type u32s = u32x4<Self>;
type i32s = i32x4<Self>;
type mask8s = mask8x16<Self>;
type mask16s = mask16x8<Self>;
type mask32s = mask32x4<Self>;
type mask64s = mask64x2<Self>;
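// With AVX2+FMA enabled at compile time, the baseline level is at least
// as capable as this token, so report it instead of downgrading to SSE4.2.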
#[inline(always)]
fn level(self) -> Level {
#[cfg(not(all(target_feature = "avx2", target_feature = "fma")))]
return Level::Sse4_2(self);
#[cfg(all(target_feature = "avx2", target_feature = "fma"))]
{
Level::baseline()
}
}
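// Funneling the closure through an inner `#[target_feature]` function lets
// the compiler assume SSE4.2 (plus cmpxchg16b/popcnt) inside the inlined
// body of `f`. SAFETY: owning `self` proves the features are available.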
#[inline]
fn vectorize<F: FnOnce() -> R, R>(self, f: F) -> R {
#[target_feature(enable = "sse4.2,cmpxchg16b,popcnt")]
unsafe fn vectorize_sse4_2<F: FnOnce() -> R, R>(f: F) -> R {
f()
}
unsafe { vectorize_sse4_2(f) }
}
#[inline(always)]
fn splat_f32x4(self, val: f32) -> f32x4<Self> {
unsafe { _mm_set1_ps(val).simd_into(self) }
}
#[inline(always)]
fn load_array_f32x4(self, val: [f32; 4usize]) -> f32x4<Self> {
f32x4 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_f32x4(self, val: &[f32; 4usize]) -> f32x4<Self> {
f32x4 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_f32x4(self, a: f32x4<Self>) -> [f32; 4usize] {
unsafe { core::mem::transmute::<__m128, [f32; 4usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_f32x4(self, a: &f32x4<Self>) -> &[f32; 4usize] {
unsafe { core::mem::transmute::<&__m128, &[f32; 4usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_f32x4(self, a: &mut f32x4<Self>) -> &mut [f32; 4usize] {
unsafe { core::mem::transmute::<&mut __m128, &mut [f32; 4usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_f32x4(self, a: f32x4<Self>, dest: &mut [f32; 4usize]) {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const f32,
dest.as_mut_ptr(),
4usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_f32x4(self, a: u8x16<Self>) -> f32x4<Self> {
unsafe {
f32x4 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
unsafe {
u8x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
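// The slide family concatenates b:a and extracts 16 bytes at a runtime
// byte offset via `dyn_alignr_128` (a helper defined elsewhere in this
// crate), shifting lanes of `a` out while shifting lanes of `b` in.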
#[inline(always)]
fn slide_f32x4<const SHIFT: usize>(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
unsafe {
if SHIFT >= 4usize {
return b;
}
let result = dyn_alignr_128(
self.cvt_to_bytes_f32x4(b).val.0,
self.cvt_to_bytes_f32x4(a).val.0,
SHIFT * 4usize,
);
self.cvt_from_bytes_f32x4(u8x16 {
val: crate::support::Aligned128(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_f32x4<const SHIFT: usize>(
self,
a: f32x4<Self>,
b: f32x4<Self>,
) -> f32x4<Self> {
self.slide_f32x4::<SHIFT>(a, b)
}
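// abs/neg operate on the f32 sign bit directly: andnot with -0.0 clears
// it, xor with -0.0 flips it.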
#[inline(always)]
fn abs_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) }
}
#[inline(always)]
fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) }
}
#[inline(always)]
fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
unsafe { _mm_sqrt_ps(a.into()).simd_into(self) }
}
#[inline(always)]
fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn mul_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn div_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) }
}
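// copysign: sign bit from `b`, magnitude bits from `a`.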
#[inline(always)]
fn copysign_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
unsafe {
let mask = _mm_set1_ps(-0.0);
_mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self)
}
}
#[inline(always)]
fn simd_eq_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(self) }
}
#[inline(always)]
fn simd_lt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(self) }
}
#[inline(always)]
fn simd_le_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(self) }
}
#[inline(always)]
fn simd_ge_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(self) }
}
#[inline(always)]
fn simd_gt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(self) }
}
#[inline(always)]
fn zip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn zip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) }
}
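// unzip: the shuffle immediates gather even-indexed (0b10_00_10_00) or
// odd-indexed (0b11_01_11_01) lanes, from `a` into the low half and from
// `b` into the high half.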
#[inline(always)]
fn unzip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn unzip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn max_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn min_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) }
}
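// `_mm_max_ps`/`_mm_min_ps` return the second operand when either input
// is NaN; blending `a` back into lanes where `b` is NaN makes the result
// prefer the non-NaN operand.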
#[inline(always)]
fn max_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
unsafe {
let intermediate = _mm_max_ps(a.into(), b.into());
let b_is_nan = _mm_cmpunord_ps(b.into(), b.into());
_mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self)
}
}
#[inline(always)]
fn min_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
unsafe {
let intermediate = _mm_min_ps(a.into(), b.into());
let b_is_nan = _mm_cmpunord_ps(b.into(), b.into());
_mm_blendv_ps(intermediate, a.into(), b_is_nan).simd_into(self)
}
}
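// No FMA at this level: mul_add/mul_sub lower to a separate multiply and
// add, so the intermediate product is rounded (unlike a fused FMA).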
#[inline(always)]
fn mul_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
a * b + c
}
#[inline(always)]
fn mul_sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
a * b - c
}
#[inline(always)]
fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
unsafe {
_mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
}
}
#[inline(always)]
fn ceil_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
unsafe {
_mm_round_ps::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
}
}
#[inline(always)]
fn round_ties_even_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
unsafe {
_mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
.simd_into(self)
}
}
#[inline(always)]
fn fract_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
a - self.trunc_f32x4(a)
}
#[inline(always)]
fn trunc_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
unsafe {
_mm_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
}
}
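// `_mm_blendv_*` picks from its second source where the mask's top bit is
// set, hence the (if-false, if-true, mask) argument order.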
#[inline(always)]
fn select_f32x4(self, a: mask32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
unsafe { _mm_blendv_ps(c.into(), b.into(), _mm_castsi128_ps(a.into())).simd_into(self) }
}
#[inline(always)]
fn combine_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x8<Self> {
f32x8 {
val: crate::support::Aligned256([a.val.0, b.val.0]),
simd: self,
}
}
#[inline(always)]
fn reinterpret_f64_f32x4(self, a: f32x4<Self>) -> f64x2<Self> {
unsafe { _mm_castps_pd(a.into()).simd_into(self) }
}
#[inline(always)]
fn reinterpret_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
unsafe { _mm_castps_si128(a.into()).simd_into(self) }
}
#[inline(always)]
fn reinterpret_u8_f32x4(self, a: f32x4<Self>) -> u8x16<Self> {
unsafe { _mm_castps_si128(a.into()).simd_into(self) }
}
#[inline(always)]
fn reinterpret_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
unsafe { _mm_castps_si128(a.into()).simd_into(self) }
}
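// `_mm_cvttps_epi32` only produces signed results (lanes >= 2^31 become
// 0x8000_0000). For those lanes, convert the excess above 2^31 separately
// and add it back; the wrapping add reconstructs the correct unsigned
// bit pattern.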
#[inline(always)]
fn cvt_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
unsafe {
let mut converted = _mm_cvttps_epi32(a.into());
let in_range = _mm_cmplt_ps(a.into(), _mm_set1_ps(2147483648.0));
let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
if !all_in_range {
let excess = _mm_sub_ps(a.into(), _mm_set1_ps(2147483648.0));
let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess));
converted = _mm_add_epi32(converted, excess_converted);
}
converted.simd_into(self)
}
}
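// Precise variant: `_mm_max_ps` with zero clamps negatives (and NaN) to
// zero first; lanes above 4294967040.0 (the largest f32 below 2^32)
// saturate to u32::MAX.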
#[inline(always)]
fn cvt_u32_precise_f32x4(self, a: f32x4<Self>) -> u32x4<Self> {
unsafe {
let a = _mm_max_ps(a.into(), _mm_setzero_ps());
let mut converted = _mm_cvttps_epi32(a);
let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0));
let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
if !all_in_range {
let exceeds_unsigned_range =
_mm_castps_si128(_mm_cmplt_ps(_mm_set1_ps(4294967040.0), a));
let excess = _mm_sub_ps(a, _mm_set1_ps(2147483648.0));
let excess_converted = _mm_cvttps_epi32(_mm_andnot_ps(in_range, excess));
converted = _mm_add_epi32(converted, excess_converted);
converted = _mm_blendv_epi8(
converted,
_mm_set1_epi32(u32::MAX.cast_signed()),
exceeds_unsigned_range,
);
}
converted.simd_into(self)
}
}
#[inline(always)]
fn cvt_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
unsafe { _mm_cvttps_epi32(a.into()).simd_into(self) }
}
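// Precise variant: lanes above i32::MAX saturate to i32::MAX and NaN
// lanes become zero (CVTTPS2DQ alone would return 0x8000_0000 for both);
// negative overflow already saturates to i32::MIN.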
#[inline(always)]
fn cvt_i32_precise_f32x4(self, a: f32x4<Self>) -> i32x4<Self> {
unsafe {
let a = a.into();
let mut converted = _mm_cvttps_epi32(a);
let in_range = _mm_cmplt_ps(a, _mm_set1_ps(2147483648.0));
let all_in_range = _mm_movemask_ps(in_range) == 0b1111;
if !all_in_range {
converted = _mm_blendv_epi8(
_mm_set1_epi32(i32::MAX),
converted,
_mm_castps_si128(in_range),
);
let is_not_nan = _mm_castps_si128(_mm_cmpord_ps(a, a));
converted = _mm_and_si128(converted, is_not_nan);
}
converted.simd_into(self)
}
}
#[inline(always)]
fn splat_i8x16(self, val: i8) -> i8x16<Self> {
unsafe { _mm_set1_epi8(val).simd_into(self) }
}
#[inline(always)]
fn load_array_i8x16(self, val: [i8; 16usize]) -> i8x16<Self> {
i8x16 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_i8x16(self, val: &[i8; 16usize]) -> i8x16<Self> {
i8x16 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_i8x16(self, a: i8x16<Self>) -> [i8; 16usize] {
unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_i8x16(self, a: &i8x16<Self>) -> &[i8; 16usize] {
unsafe { core::mem::transmute::<&__m128i, &[i8; 16usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_i8x16(self, a: &mut i8x16<Self>) -> &mut [i8; 16usize] {
unsafe { core::mem::transmute::<&mut __m128i, &mut [i8; 16usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_i8x16(self, a: i8x16<Self>, dest: &mut [i8; 16usize]) {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i8,
dest.as_mut_ptr(),
16usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_i8x16(self, a: u8x16<Self>) -> i8x16<Self> {
unsafe {
i8x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
unsafe {
u8x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_i8x16<const SHIFT: usize>(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
unsafe {
if SHIFT >= 16usize {
return b;
}
let result = dyn_alignr_128(
self.cvt_to_bytes_i8x16(b).val.0,
self.cvt_to_bytes_i8x16(a).val.0,
SHIFT,
);
self.cvt_from_bytes_i8x16(u8x16 {
val: crate::support::Aligned128(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_i8x16<const SHIFT: usize>(
self,
a: i8x16<Self>,
b: i8x16<Self>,
) -> i8x16<Self> {
self.slide_i8x16::<SHIFT>(a, b)
}
#[inline(always)]
fn add_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn sub_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
unsafe {
// x86 has no 8-bit multiply: form the even- and odd-byte products in
// 16-bit lanes, then recombine the low byte of each product.
let dst_even = _mm_mullo_epi16(a.into(), b.into());
let dst_odd =
_mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
_mm_or_si128(
_mm_slli_epi16::<8>(dst_odd),
_mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
)
.simd_into(self)
}
}
#[inline(always)]
fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn or_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn xor_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn not_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
a ^ !0
}
#[inline(always)]
fn shl_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
unsafe {
// No 8-bit shifts on x86: widen to 16-bit lanes, shift, then keep only
// the low byte of each lane so overflowing bits wrap like the scalar
// `<<` used by shlv_i8x16 (a saturating pack would clamp them instead).
let val = a.into();
let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
let lo_shifted = _mm_and_si128(_mm_sll_epi16(lo_16, shift_count), _mm_set1_epi16(0xFF));
let hi_shifted = _mm_and_si128(_mm_sll_epi16(hi_16, shift_count), _mm_set1_epi16(0xFF));
_mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
}
}
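// No per-lane variable shifts below AVX2; the shlv/shrv variants fall
// back to scalar shifts lane by lane.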
#[inline(always)]
fn shlv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
}
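// Arithmetic 8-bit shift: sign-extend to 16-bit lanes (via cmpgt with
// zero), shift, then pack; results always fit in i8, so the saturating
// pack is exact.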
#[inline(always)]
fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
unsafe {
let val = a.into();
let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
let lo_16 = _mm_unpacklo_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
let hi_16 = _mm_unpackhi_epi8(val, _mm_cmpgt_epi8(_mm_setzero_si128(), val));
let lo_shifted = _mm_sra_epi16(lo_16, shift_count);
let hi_shifted = _mm_sra_epi16(hi_16, shift_count);
_mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self)
}
}
#[inline(always)]
fn shrv_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
}
#[inline(always)]
fn simd_eq_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn simd_lt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
unsafe { _mm_cmpgt_epi8(b.into(), a.into()).simd_into(self) }
}
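// SSE has no integer cmple/cmpge: use min(a, b) == a for a <= b and
// max(a, b) == a for a >= b.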
#[inline(always)]
fn simd_le_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
unsafe { _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_ge_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
unsafe { _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_gt_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> mask8x16<Self> {
unsafe { _mm_cmpgt_epi8(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn zip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn zip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
}
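// unzip: shuffle each source so even-indexed bytes land in the low half
// and odd-indexed bytes in the high half, then combine the halves with a
// 64-bit unpack.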
#[inline(always)]
fn unzip_low_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
unsafe {
let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
let t1 = _mm_shuffle_epi8(a.into(), mask);
let t2 = _mm_shuffle_epi8(b.into(), mask);
_mm_unpacklo_epi64(t1, t2).simd_into(self)
}
}
#[inline(always)]
fn unzip_high_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
unsafe {
let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
let t1 = _mm_shuffle_epi8(a.into(), mask);
let t2 = _mm_shuffle_epi8(b.into(), mask);
_mm_unpackhi_epi64(t1, t2).simd_into(self)
}
}
#[inline(always)]
fn select_i8x16(self, a: mask8x16<Self>, b: i8x16<Self>, c: i8x16<Self>) -> i8x16<Self> {
unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
}
#[inline(always)]
fn min_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn max_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn combine_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x32<Self> {
i8x32 {
val: crate::support::Aligned256([a.val.0, b.val.0]),
simd: self,
}
}
#[inline(always)]
fn neg_i8x16(self, a: i8x16<Self>) -> i8x16<Self> {
unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(self) }
}
#[inline(always)]
fn reinterpret_u8_i8x16(self, a: i8x16<Self>) -> u8x16<Self> {
__m128i::from(a).simd_into(self)
}
#[inline(always)]
fn reinterpret_u32_i8x16(self, a: i8x16<Self>) -> u32x4<Self> {
__m128i::from(a).simd_into(self)
}
#[inline(always)]
fn splat_u8x16(self, val: u8) -> u8x16<Self> {
unsafe { _mm_set1_epi8(val.cast_signed()).simd_into(self) }
}
#[inline(always)]
fn load_array_u8x16(self, val: [u8; 16usize]) -> u8x16<Self> {
u8x16 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_u8x16(self, val: &[u8; 16usize]) -> u8x16<Self> {
u8x16 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_u8x16(self, a: u8x16<Self>) -> [u8; 16usize] {
unsafe { core::mem::transmute::<__m128i, [u8; 16usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_u8x16(self, a: &u8x16<Self>) -> &[u8; 16usize] {
unsafe { core::mem::transmute::<&__m128i, &[u8; 16usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_u8x16(self, a: &mut u8x16<Self>) -> &mut [u8; 16usize] {
unsafe { core::mem::transmute::<&mut __m128i, &mut [u8; 16usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_u8x16(self, a: u8x16<Self>, dest: &mut [u8; 16usize]) {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const u8,
dest.as_mut_ptr(),
16usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
unsafe {
u8x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
unsafe {
u8x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_u8x16<const SHIFT: usize>(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
unsafe {
if SHIFT >= 16usize {
return b;
}
let result = dyn_alignr_128(
self.cvt_to_bytes_u8x16(b).val.0,
self.cvt_to_bytes_u8x16(a).val.0,
SHIFT,
);
self.cvt_from_bytes_u8x16(u8x16 {
val: crate::support::Aligned128(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_u8x16<const SHIFT: usize>(
self,
a: u8x16<Self>,
b: u8x16<Self>,
) -> u8x16<Self> {
self.slide_u8x16::<SHIFT>(a, b)
}
#[inline(always)]
fn add_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn sub_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
unsafe {
// Same even/odd 16-bit multiply trick as mul_i8x16; the low byte of
// each product is the same for signed and unsigned operands.
let dst_even = _mm_mullo_epi16(a.into(), b.into());
let dst_odd =
_mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
_mm_or_si128(
_mm_slli_epi16::<8>(dst_odd),
_mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
)
.simd_into(self)
}
}
#[inline(always)]
fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn or_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn xor_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn not_u8x16(self, a: u8x16<Self>) -> u8x16<Self> {
a ^ !0
}
#[inline(always)]
fn shl_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
unsafe {
// Widen to 16-bit lanes, shift, then keep only the low byte of each
// lane so overflowing bits wrap like the scalar `<<` used by
// shlv_u8x16 (packus alone would saturate them to 255).
let val = a.into();
let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
let lo_shifted = _mm_and_si128(_mm_sll_epi16(lo_16, shift_count), _mm_set1_epi16(0xFF));
let hi_shifted = _mm_and_si128(_mm_sll_epi16(hi_16, shift_count), _mm_set1_epi16(0xFF));
_mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
}
}
#[inline(always)]
fn shlv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
}
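// Logical 8-bit shift: zero-extend to 16-bit lanes, shift, then pack;
// results stay within 0..=255, so the unsigned saturating pack is exact.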
#[inline(always)]
fn shr_u8x16(self, a: u8x16<Self>, shift: u32) -> u8x16<Self> {
unsafe {
let val = a.into();
let shift_count = _mm_cvtsi32_si128(shift.cast_signed());
let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128());
let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128());
let lo_shifted = _mm_srl_epi16(lo_16, shift_count);
let hi_shifted = _mm_srl_epi16(hi_16, shift_count);
_mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self)
}
}
#[inline(always)]
fn shrv_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
}
#[inline(always)]
fn simd_eq_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
}
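// Unsigned compares via the sign-flip trick: xor-ing the sign bit maps
// unsigned order onto signed order, where cmpgt is available.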
#[inline(always)]
fn simd_lt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
unsafe {
let sign_bit = _mm_set1_epi8(0x80u8.cast_signed());
let a_signed = _mm_xor_si128(a.into(), sign_bit);
let b_signed = _mm_xor_si128(b.into(), sign_bit);
_mm_cmpgt_epi8(b_signed, a_signed).simd_into(self)
}
}
#[inline(always)]
fn simd_le_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
unsafe { _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_ge_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
unsafe { _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_gt_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> mask8x16<Self> {
unsafe {
let sign_bit = _mm_set1_epi8(0x80u8.cast_signed());
let a_signed = _mm_xor_si128(a.into(), sign_bit);
let b_signed = _mm_xor_si128(b.into(), sign_bit);
_mm_cmpgt_epi8(a_signed, b_signed).simd_into(self)
}
}
#[inline(always)]
fn zip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn zip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn unzip_low_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
unsafe {
let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
let t1 = _mm_shuffle_epi8(a.into(), mask);
let t2 = _mm_shuffle_epi8(b.into(), mask);
_mm_unpacklo_epi64(t1, t2).simd_into(self)
}
}
#[inline(always)]
fn unzip_high_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
unsafe {
let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
let t1 = _mm_shuffle_epi8(a.into(), mask);
let t2 = _mm_shuffle_epi8(b.into(), mask);
_mm_unpackhi_epi64(t1, t2).simd_into(self)
}
}
#[inline(always)]
fn select_u8x16(self, a: mask8x16<Self>, b: u8x16<Self>, c: u8x16<Self>) -> u8x16<Self> {
unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
}
#[inline(always)]
fn min_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn max_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn combine_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x32<Self> {
u8x32 {
val: crate::support::Aligned256([a.val.0, b.val.0]),
simd: self,
}
}
#[inline(always)]
fn widen_u8x16(self, a: u8x16<Self>) -> u16x16<Self> {
unsafe {
let raw = a.into();
// `_mm_cvtepu8_epi16` widens the low eight bytes; shift the upper
// eight bytes down to widen those as well.
let lo = _mm_cvtepu8_epi16(raw).simd_into(self);
let hi = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(raw)).simd_into(self);
self.combine_u16x8(lo, hi)
}
}
#[inline(always)]
fn reinterpret_u32_u8x16(self, a: u8x16<Self>) -> u32x4<Self> {
__m128i::from(a).simd_into(self)
}
#[inline(always)]
fn splat_mask8x16(self, val: i8) -> mask8x16<Self> {
unsafe { _mm_set1_epi8(val).simd_into(self) }
}
#[inline(always)]
fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self> {
mask8x16 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_mask8x16(self, val: &[i8; 16usize]) -> mask8x16<Self> {
mask8x16 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_mask8x16(self, a: mask8x16<Self>) -> [i8; 16usize] {
unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_mask8x16(self, a: &mask8x16<Self>) -> &[i8; 16usize] {
unsafe { core::mem::transmute::<&__m128i, &[i8; 16usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_mask8x16(self, a: &mut mask8x16<Self>) -> &mut [i8; 16usize] {
unsafe { core::mem::transmute::<&mut __m128i, &mut [i8; 16usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_mask8x16(self, a: mask8x16<Self>, dest: &mut [i8; 16usize]) {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i8,
dest.as_mut_ptr(),
16usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_mask8x16(self, a: u8x16<Self>) -> mask8x16<Self> {
unsafe {
mask8x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_mask8x16(self, a: mask8x16<Self>) -> u8x16<Self> {
unsafe {
u8x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_mask8x16<const SHIFT: usize>(
self,
a: mask8x16<Self>,
b: mask8x16<Self>,
) -> mask8x16<Self> {
unsafe {
if SHIFT >= 16usize {
return b;
}
let result = dyn_alignr_128(
self.cvt_to_bytes_mask8x16(b).val.0,
self.cvt_to_bytes_mask8x16(a).val.0,
SHIFT,
);
self.cvt_from_bytes_mask8x16(u8x16 {
val: crate::support::Aligned128(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_mask8x16<const SHIFT: usize>(
self,
a: mask8x16<Self>,
b: mask8x16<Self>,
) -> mask8x16<Self> {
self.slide_mask8x16::<SHIFT>(a, b)
}
#[inline(always)]
fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn or_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn xor_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn not_mask8x16(self, a: mask8x16<Self>) -> mask8x16<Self> {
a ^ !0
}
#[inline(always)]
fn select_mask8x16(
self,
a: mask8x16<Self>,
b: mask8x16<Self>,
c: mask8x16<Self>,
) -> mask8x16<Self> {
unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_eq_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
}
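// `_mm_movemask_epi8` gathers the top bit of every byte, giving a 16-bit
// bitmask to test against 0 or 0xffff.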
#[inline(always)]
fn any_true_mask8x16(self, a: mask8x16<Self>) -> bool {
unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 }
}
#[inline(always)]
fn all_true_mask8x16(self, a: mask8x16<Self>) -> bool {
unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff }
}
#[inline(always)]
fn any_false_mask8x16(self, a: mask8x16<Self>) -> bool {
unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff }
}
#[inline(always)]
fn all_false_mask8x16(self, a: mask8x16<Self>) -> bool {
unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 }
}
#[inline(always)]
fn combine_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x32<Self> {
mask8x32 {
val: crate::support::Aligned256([a.val.0, b.val.0]),
simd: self,
}
}
#[inline(always)]
fn splat_i16x8(self, val: i16) -> i16x8<Self> {
unsafe { _mm_set1_epi16(val).simd_into(self) }
}
#[inline(always)]
fn load_array_i16x8(self, val: [i16; 8usize]) -> i16x8<Self> {
i16x8 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_i16x8(self, val: &[i16; 8usize]) -> i16x8<Self> {
i16x8 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_i16x8(self, a: i16x8<Self>) -> [i16; 8usize] {
unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_i16x8(self, a: &i16x8<Self>) -> &[i16; 8usize] {
unsafe { core::mem::transmute::<&__m128i, &[i16; 8usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_i16x8(self, a: &mut i16x8<Self>) -> &mut [i16; 8usize] {
unsafe { core::mem::transmute::<&mut __m128i, &mut [i16; 8usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_i16x8(self, a: i16x8<Self>, dest: &mut [i16; 8usize]) {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i16,
dest.as_mut_ptr(),
8usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_i16x8(self, a: u8x16<Self>) -> i16x8<Self> {
unsafe {
i16x8 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
unsafe {
u8x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_i16x8<const SHIFT: usize>(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
unsafe {
if SHIFT >= 8usize {
return b;
}
let result = dyn_alignr_128(
self.cvt_to_bytes_i16x8(b).val.0,
self.cvt_to_bytes_i16x8(a).val.0,
SHIFT * 2usize,
);
self.cvt_from_bytes_i16x8(u8x16 {
val: crate::support::Aligned128(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_i16x8<const SHIFT: usize>(
self,
a: i16x8<Self>,
b: i16x8<Self>,
) -> i16x8<Self> {
self.slide_i16x8::<SHIFT>(a, b)
}
#[inline(always)]
fn add_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn sub_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn mul_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn and_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn or_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn xor_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn not_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
a ^ !0
}
#[inline(always)]
fn shl_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
}
#[inline(always)]
fn shlv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
}
#[inline(always)]
fn shr_i16x8(self, a: i16x8<Self>, shift: u32) -> i16x8<Self> {
unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
}
#[inline(always)]
fn shrv_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
}
#[inline(always)]
fn simd_eq_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn simd_lt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
unsafe { _mm_cmpgt_epi16(b.into(), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_le_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
unsafe { _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_ge_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
unsafe { _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_gt_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> mask16x8<Self> {
unsafe { _mm_cmpgt_epi16(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn zip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn zip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
}
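// unzip: shuffle each source so even-indexed 16-bit elements land in the
// low half and odd-indexed ones in the high half, then combine with a
// 64-bit unpack.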
#[inline(always)]
fn unzip_low_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
unsafe {
let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
let t1 = _mm_shuffle_epi8(a.into(), mask);
let t2 = _mm_shuffle_epi8(b.into(), mask);
_mm_unpacklo_epi64(t1, t2).simd_into(self)
}
}
#[inline(always)]
fn unzip_high_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
unsafe {
let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
let t1 = _mm_shuffle_epi8(a.into(), mask);
let t2 = _mm_shuffle_epi8(b.into(), mask);
_mm_unpackhi_epi64(t1, t2).simd_into(self)
}
}
#[inline(always)]
fn select_i16x8(self, a: mask16x8<Self>, b: i16x8<Self>, c: i16x8<Self>) -> i16x8<Self> {
unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
}
#[inline(always)]
fn min_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn max_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn combine_i16x8(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x16<Self> {
i16x16 {
val: crate::support::Aligned256([a.val.0, b.val.0]),
simd: self,
}
}
#[inline(always)]
fn neg_i16x8(self, a: i16x8<Self>) -> i16x8<Self> {
unsafe { _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(self) }
}
#[inline(always)]
fn reinterpret_u8_i16x8(self, a: i16x8<Self>) -> u8x16<Self> {
__m128i::from(a).simd_into(self)
}
#[inline(always)]
fn reinterpret_u32_i16x8(self, a: i16x8<Self>) -> u32x4<Self> {
__m128i::from(a).simd_into(self)
}
#[inline(always)]
fn splat_u16x8(self, val: u16) -> u16x8<Self> {
unsafe { _mm_set1_epi16(val.cast_signed()).simd_into(self) }
}
#[inline(always)]
fn load_array_u16x8(self, val: [u16; 8usize]) -> u16x8<Self> {
u16x8 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_u16x8(self, val: &[u16; 8usize]) -> u16x8<Self> {
u16x8 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_u16x8(self, a: u16x8<Self>) -> [u16; 8usize] {
unsafe { core::mem::transmute::<__m128i, [u16; 8usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_u16x8(self, a: &u16x8<Self>) -> &[u16; 8usize] {
unsafe { core::mem::transmute::<&__m128i, &[u16; 8usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_u16x8(self, a: &mut u16x8<Self>) -> &mut [u16; 8usize] {
unsafe { core::mem::transmute::<&mut __m128i, &mut [u16; 8usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_u16x8(self, a: u16x8<Self>, dest: &mut [u16; 8usize]) {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const u16,
dest.as_mut_ptr(),
8usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_u16x8(self, a: u8x16<Self>) -> u16x8<Self> {
unsafe {
u16x8 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
unsafe {
u8x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_u16x8<const SHIFT: usize>(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
unsafe {
if SHIFT >= 8usize {
return b;
}
let result = dyn_alignr_128(
self.cvt_to_bytes_u16x8(b).val.0,
self.cvt_to_bytes_u16x8(a).val.0,
SHIFT * 2usize,
);
self.cvt_from_bytes_u16x8(u8x16 {
val: crate::support::Aligned128(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_u16x8<const SHIFT: usize>(
self,
a: u16x8<Self>,
b: u16x8<Self>,
) -> u16x8<Self> {
self.slide_u16x8::<SHIFT>(a, b)
}
#[inline(always)]
fn add_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn sub_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn mul_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn and_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn or_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn xor_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn not_u16x8(self, a: u16x8<Self>) -> u16x8<Self> {
a ^ !0
}
#[inline(always)]
fn shl_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
}
#[inline(always)]
fn shlv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
}
#[inline(always)]
fn shr_u16x8(self, a: u16x8<Self>, shift: u32) -> u16x8<Self> {
unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
}
#[inline(always)]
fn shrv_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
}
#[inline(always)]
fn simd_eq_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn simd_lt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
unsafe {
let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed());
let a_signed = _mm_xor_si128(a.into(), sign_bit);
let b_signed = _mm_xor_si128(b.into(), sign_bit);
_mm_cmpgt_epi16(b_signed, a_signed).simd_into(self)
}
}
#[inline(always)]
fn simd_le_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
unsafe { _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_ge_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
unsafe { _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_gt_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> mask16x8<Self> {
unsafe {
let sign_bit = _mm_set1_epi16(0x8000u16.cast_signed());
let a_signed = _mm_xor_si128(a.into(), sign_bit);
let b_signed = _mm_xor_si128(b.into(), sign_bit);
_mm_cmpgt_epi16(a_signed, b_signed).simd_into(self)
}
}
#[inline(always)]
fn zip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn zip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn unzip_low_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
unsafe {
let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
let t1 = _mm_shuffle_epi8(a.into(), mask);
let t2 = _mm_shuffle_epi8(b.into(), mask);
_mm_unpacklo_epi64(t1, t2).simd_into(self)
}
}
#[inline(always)]
fn unzip_high_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
unsafe {
let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
let t1 = _mm_shuffle_epi8(a.into(), mask);
let t2 = _mm_shuffle_epi8(b.into(), mask);
_mm_unpackhi_epi64(t1, t2).simd_into(self)
}
}
#[inline(always)]
fn select_u16x8(self, a: mask16x8<Self>, b: u16x8<Self>, c: u16x8<Self>) -> u16x8<Self> {
unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
}
#[inline(always)]
fn min_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn max_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn combine_u16x8(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x16<Self> {
u16x16 {
val: crate::support::Aligned256([a.val.0, b.val.0]),
simd: self,
}
}
#[inline(always)]
fn reinterpret_u8_u16x8(self, a: u16x8<Self>) -> u8x16<Self> {
__m128i::from(a).simd_into(self)
}
#[inline(always)]
fn reinterpret_u32_u16x8(self, a: u16x8<Self>) -> u32x4<Self> {
__m128i::from(a).simd_into(self)
}
#[inline(always)]
fn splat_mask16x8(self, val: i16) -> mask16x8<Self> {
unsafe { _mm_set1_epi16(val).simd_into(self) }
}
#[inline(always)]
fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self> {
mask16x8 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_mask16x8(self, val: &[i16; 8usize]) -> mask16x8<Self> {
mask16x8 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_mask16x8(self, a: mask16x8<Self>) -> [i16; 8usize] {
unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_mask16x8(self, a: &mask16x8<Self>) -> &[i16; 8usize] {
unsafe { core::mem::transmute::<&__m128i, &[i16; 8usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_mask16x8(self, a: &mut mask16x8<Self>) -> &mut [i16; 8usize] {
unsafe { core::mem::transmute::<&mut __m128i, &mut [i16; 8usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_mask16x8(self, a: mask16x8<Self>, dest: &mut [i16; 8usize]) {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i16,
dest.as_mut_ptr(),
8usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_mask16x8(self, a: u8x16<Self>) -> mask16x8<Self> {
unsafe {
mask16x8 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_mask16x8(self, a: mask16x8<Self>) -> u8x16<Self> {
unsafe {
u8x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_mask16x8<const SHIFT: usize>(
self,
a: mask16x8<Self>,
b: mask16x8<Self>,
) -> mask16x8<Self> {
unsafe {
if SHIFT >= 8usize {
return b;
}
let result = dyn_alignr_128(
self.cvt_to_bytes_mask16x8(b).val.0,
self.cvt_to_bytes_mask16x8(a).val.0,
SHIFT * 2usize,
);
self.cvt_from_bytes_mask16x8(u8x16 {
val: crate::support::Aligned128(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_mask16x8<const SHIFT: usize>(
self,
a: mask16x8<Self>,
b: mask16x8<Self>,
) -> mask16x8<Self> {
self.slide_mask16x8::<SHIFT>(a, b)
}
#[inline(always)]
fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn or_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn xor_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn not_mask16x8(self, a: mask16x8<Self>) -> mask16x8<Self> {
a ^ !0
}
#[inline(always)]
fn select_mask16x8(
self,
a: mask16x8<Self>,
b: mask16x8<Self>,
c: mask16x8<Self>,
) -> mask16x8<Self> {
unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_eq_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
}
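// Mask lanes are all-ones or all-zeros, so the byte-granular movemask
// works for 16-bit masks as well.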
#[inline(always)]
fn any_true_mask16x8(self, a: mask16x8<Self>) -> bool {
unsafe { _mm_movemask_epi8(a.into()) as u32 != 0 }
}
#[inline(always)]
fn all_true_mask16x8(self, a: mask16x8<Self>) -> bool {
unsafe { _mm_movemask_epi8(a.into()) as u32 == 0xffff }
}
#[inline(always)]
fn any_false_mask16x8(self, a: mask16x8<Self>) -> bool {
unsafe { _mm_movemask_epi8(a.into()) as u32 != 0xffff }
}
#[inline(always)]
fn all_false_mask16x8(self, a: mask16x8<Self>) -> bool {
unsafe { _mm_movemask_epi8(a.into()) as u32 == 0 }
}
#[inline(always)]
fn combine_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x16<Self> {
mask16x16 {
val: crate::support::Aligned256([a.val.0, b.val.0]),
simd: self,
}
}
#[inline(always)]
fn splat_i32x4(self, val: i32) -> i32x4<Self> {
unsafe { _mm_set1_epi32(val).simd_into(self) }
}
#[inline(always)]
fn load_array_i32x4(self, val: [i32; 4usize]) -> i32x4<Self> {
i32x4 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_i32x4(self, val: &[i32; 4usize]) -> i32x4<Self> {
i32x4 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_i32x4(self, a: i32x4<Self>) -> [i32; 4usize] {
unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_i32x4(self, a: &i32x4<Self>) -> &[i32; 4usize] {
unsafe { core::mem::transmute::<&__m128i, &[i32; 4usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_i32x4(self, a: &mut i32x4<Self>) -> &mut [i32; 4usize] {
unsafe { core::mem::transmute::<&mut __m128i, &mut [i32; 4usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_i32x4(self, a: i32x4<Self>, dest: &mut [i32; 4usize]) {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i32,
dest.as_mut_ptr(),
4usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_i32x4(self, a: u8x16<Self>) -> i32x4<Self> {
unsafe {
i32x4 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
unsafe {
u8x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_i32x4<const SHIFT: usize>(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
unsafe {
if SHIFT >= 4usize {
return b;
}
let result = dyn_alignr_128(
self.cvt_to_bytes_i32x4(b).val.0,
self.cvt_to_bytes_i32x4(a).val.0,
SHIFT * 4usize,
);
self.cvt_from_bytes_i32x4(u8x16 {
val: crate::support::Aligned128(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_i32x4<const SHIFT: usize>(
self,
a: i32x4<Self>,
b: i32x4<Self>,
) -> i32x4<Self> {
self.slide_i32x4::<SHIFT>(a, b)
}
#[inline(always)]
fn add_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn sub_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn mul_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn and_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn or_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn xor_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn not_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
a ^ !0
}
#[inline(always)]
fn shl_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
}
#[inline(always)]
fn shlv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
}
#[inline(always)]
fn shr_i32x4(self, a: i32x4<Self>, shift: u32) -> i32x4<Self> {
unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
}
#[inline(always)]
fn shrv_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
}
#[inline(always)]
fn simd_eq_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn simd_lt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_cmpgt_epi32(b.into(), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_le_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_ge_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_gt_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_cmpgt_epi32(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn zip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn zip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
}
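// unzip: `_mm_shuffle_epi32` with 0b11_01_10_00 reorders each source to
// (even, even, odd, odd); the 64-bit unpacks then concatenate the even
// or odd halves.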
#[inline(always)]
fn unzip_low_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
unsafe {
let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
_mm_unpacklo_epi64(t1, t2).simd_into(self)
}
}
#[inline(always)]
fn unzip_high_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
unsafe {
let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
_mm_unpackhi_epi64(t1, t2).simd_into(self)
}
}
#[inline(always)]
fn select_i32x4(self, a: mask32x4<Self>, b: i32x4<Self>, c: i32x4<Self>) -> i32x4<Self> {
unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
}
#[inline(always)]
fn min_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn max_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn combine_i32x4(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x8<Self> {
i32x8 {
val: crate::support::Aligned256([a.val.0, b.val.0]),
simd: self,
}
}
#[inline(always)]
fn neg_i32x4(self, a: i32x4<Self>) -> i32x4<Self> {
unsafe { _mm_sub_epi32(_mm_setzero_si128(), a.into()).simd_into(self) }
}
#[inline(always)]
fn reinterpret_u8_i32x4(self, a: i32x4<Self>) -> u8x16<Self> {
__m128i::from(a).simd_into(self)
}
#[inline(always)]
fn reinterpret_u32_i32x4(self, a: i32x4<Self>) -> u32x4<Self> {
__m128i::from(a).simd_into(self)
}
#[inline(always)]
fn cvt_f32_i32x4(self, a: i32x4<Self>) -> f32x4<Self> {
unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) }
}
#[inline(always)]
fn splat_u32x4(self, val: u32) -> u32x4<Self> {
unsafe { _mm_set1_epi32(val.cast_signed()).simd_into(self) }
}
#[inline(always)]
fn load_array_u32x4(self, val: [u32; 4usize]) -> u32x4<Self> {
u32x4 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_u32x4(self, val: &[u32; 4usize]) -> u32x4<Self> {
u32x4 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_u32x4(self, a: u32x4<Self>) -> [u32; 4usize] {
unsafe { core::mem::transmute::<__m128i, [u32; 4usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_u32x4(self, a: &u32x4<Self>) -> &[u32; 4usize] {
unsafe { core::mem::transmute::<&__m128i, &[u32; 4usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_u32x4(self, a: &mut u32x4<Self>) -> &mut [u32; 4usize] {
unsafe { core::mem::transmute::<&mut __m128i, &mut [u32; 4usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_u32x4(self, a: u32x4<Self>, dest: &mut [u32; 4usize]) {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const u32,
dest.as_mut_ptr(),
4usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_u32x4(self, a: u8x16<Self>) -> u32x4<Self> {
unsafe {
u32x4 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
unsafe {
u8x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_u32x4<const SHIFT: usize>(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
unsafe {
if SHIFT >= 4usize {
return b;
}
let result = dyn_alignr_128(
self.cvt_to_bytes_u32x4(b).val.0,
self.cvt_to_bytes_u32x4(a).val.0,
SHIFT * 4usize,
);
self.cvt_from_bytes_u32x4(u8x16 {
val: crate::support::Aligned128(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_u32x4<const SHIFT: usize>(
self,
a: u32x4<Self>,
b: u32x4<Self>,
) -> u32x4<Self> {
self.slide_u32x4::<SHIFT>(a, b)
}
#[inline(always)]
fn add_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn sub_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn mul_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn and_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn or_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn xor_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
}
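    // Bitwise NOT is XOR against an all-ones vector, spelled here through the
    // overloaded `^` operator with a splatted `!0`.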
#[inline(always)]
fn not_u32x4(self, a: u32x4<Self>) -> u32x4<Self> {
a ^ !0
}
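    // `_mm_sll_epi32`/`_mm_srl_epi32` shift every lane by the count held in
    // the low 64 bits of an XMM register. Per-lane variable shifts only
    // arrive with AVX2, so the `shlv`/`shrv` variants below fall back to a
    // scalar shift per element.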
#[inline(always)]
fn shl_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
}
#[inline(always)]
fn shlv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
core::array::from_fn(|i| core::ops::Shl::shl(a[i], b[i])).simd_into(self)
}
#[inline(always)]
fn shr_u32x4(self, a: u32x4<Self>, shift: u32) -> u32x4<Self> {
unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift.cast_signed())).simd_into(self) }
}
#[inline(always)]
fn shrv_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
core::array::from_fn(|i| core::ops::Shr::shr(a[i], b[i])).simd_into(self)
}
#[inline(always)]
fn simd_eq_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
}
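    // SSE only has signed integer compares. Flipping the sign bit of both
    // operands maps unsigned order onto signed order, after which a single
    // `pcmpgtd` suffices.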
#[inline(always)]
fn simd_lt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
unsafe {
let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed());
let a_signed = _mm_xor_si128(a.into(), sign_bit);
let b_signed = _mm_xor_si128(b.into(), sign_bit);
_mm_cmpgt_epi32(b_signed, a_signed).simd_into(self)
}
}
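    // Unsigned <= and >= avoid the sign-flip dance: `min(a, b) == a` iff
    // `a <= b`, and `max(a, b) == a` iff `a >= b`, using the SSE4.1 unsigned
    // min/max instructions.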
#[inline(always)]
fn simd_le_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_ge_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_gt_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> mask32x4<Self> {
unsafe {
let sign_bit = _mm_set1_epi32(0x80000000u32.cast_signed());
let a_signed = _mm_xor_si128(a.into(), sign_bit);
let b_signed = _mm_xor_si128(b.into(), sign_bit);
_mm_cmpgt_epi32(a_signed, b_signed).simd_into(self)
}
}
#[inline(always)]
fn zip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn zip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) }
}
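    // Deinterleave: the `0b11_01_10_00` shuffle reorders each input to
    // [x0, x2, x1, x3], so a 64-bit unpack of the low (resp. high) halves
    // yields the even (resp. odd) lanes of the concatenated inputs.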
#[inline(always)]
fn unzip_low_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
unsafe {
let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
_mm_unpacklo_epi64(t1, t2).simd_into(self)
}
}
#[inline(always)]
fn unzip_high_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
unsafe {
let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into());
let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into());
_mm_unpackhi_epi64(t1, t2).simd_into(self)
}
}
#[inline(always)]
fn select_u32x4(self, a: mask32x4<Self>, b: u32x4<Self>, c: u32x4<Self>) -> u32x4<Self> {
unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
}
#[inline(always)]
fn min_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn max_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn combine_u32x4(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x8<Self> {
u32x8 {
val: crate::support::Aligned256([a.val.0, b.val.0]),
simd: self,
}
}
#[inline(always)]
fn reinterpret_u8_u32x4(self, a: u32x4<Self>) -> u8x16<Self> {
__m128i::from(a).simd_into(self)
}
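    // Exact u32 -> f32 conversion without a native unsigned-convert
    // instruction. Each 16-bit half of every lane is merged into a float with
    // a known exponent: the low 16 bits against 0x4B000000 (2^23) and the
    // high 16 bits against 0x53000000 (2^39). Subtracting the magic constant
    // 0x53000080 (2^39 + 2^23) from the high part cancels both biases, and
    // the final add is the only step that rounds, matching a correctly
    // rounded direct conversion.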
#[inline(always)]
fn cvt_f32_u32x4(self, a: u32x4<Self>) -> f32x4<Self> {
unsafe {
let a = a.into();
let lo = _mm_blend_epi16::<0xAA>(a, _mm_set1_epi32(0x4B000000));
let hi = _mm_blend_epi16::<0xAA>(_mm_srli_epi32::<16>(a), _mm_set1_epi32(0x53000000));
let fhi = _mm_sub_ps(
_mm_castsi128_ps(hi),
_mm_set1_ps(f32::from_bits(0x53000080)),
);
let result = _mm_add_ps(_mm_castsi128_ps(lo), fhi);
result.simd_into(self)
}
}
#[inline(always)]
fn splat_mask32x4(self, val: i32) -> mask32x4<Self> {
unsafe { _mm_set1_epi32(val).simd_into(self) }
}
#[inline(always)]
fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self> {
mask32x4 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_mask32x4(self, val: &[i32; 4usize]) -> mask32x4<Self> {
mask32x4 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_mask32x4(self, a: mask32x4<Self>) -> [i32; 4usize] {
unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_mask32x4(self, a: &mask32x4<Self>) -> &[i32; 4usize] {
unsafe { core::mem::transmute::<&__m128i, &[i32; 4usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_mask32x4(self, a: &mut mask32x4<Self>) -> &mut [i32; 4usize] {
unsafe { core::mem::transmute::<&mut __m128i, &mut [i32; 4usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_mask32x4(self, a: mask32x4<Self>, dest: &mut [i32; 4usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i32,
dest.as_mut_ptr(),
4usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_mask32x4(self, a: u8x16<Self>) -> mask32x4<Self> {
unsafe {
mask32x4 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_mask32x4(self, a: mask32x4<Self>) -> u8x16<Self> {
unsafe {
u8x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_mask32x4<const SHIFT: usize>(
self,
a: mask32x4<Self>,
b: mask32x4<Self>,
) -> mask32x4<Self> {
unsafe {
if SHIFT >= 4usize {
return b;
}
let result = dyn_alignr_128(
self.cvt_to_bytes_mask32x4(b).val.0,
self.cvt_to_bytes_mask32x4(a).val.0,
SHIFT * 4usize,
);
self.cvt_from_bytes_mask32x4(u8x16 {
val: crate::support::Aligned128(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_mask32x4<const SHIFT: usize>(
self,
a: mask32x4<Self>,
b: mask32x4<Self>,
) -> mask32x4<Self> {
self.slide_mask32x4::<SHIFT>(a, b)
}
#[inline(always)]
fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn or_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn xor_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn not_mask32x4(self, a: mask32x4<Self>) -> mask32x4<Self> {
a ^ !0
}
#[inline(always)]
fn select_mask32x4(
self,
a: mask32x4<Self>,
b: mask32x4<Self>,
c: mask32x4<Self>,
) -> mask32x4<Self> {
unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_eq_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) }
}
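    // Mask reductions go through `movemask`, which packs the sign bit of each
    // 32-bit lane into the low four bits of a scalar.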
#[inline(always)]
fn any_true_mask32x4(self, a: mask32x4<Self>) -> bool {
unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0 }
}
#[inline(always)]
fn all_true_mask32x4(self, a: mask32x4<Self>) -> bool {
unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0b1111 }
}
#[inline(always)]
fn any_false_mask32x4(self, a: mask32x4<Self>) -> bool {
unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 != 0b1111 }
}
#[inline(always)]
fn all_false_mask32x4(self, a: mask32x4<Self>) -> bool {
unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 == 0 }
}
#[inline(always)]
fn combine_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x8<Self> {
mask32x8 {
val: crate::support::Aligned256([a.val.0, b.val.0]),
simd: self,
}
}
#[inline(always)]
fn splat_f64x2(self, val: f64) -> f64x2<Self> {
unsafe { _mm_set1_pd(val).simd_into(self) }
}
#[inline(always)]
fn load_array_f64x2(self, val: [f64; 2usize]) -> f64x2<Self> {
f64x2 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_f64x2(self, val: &[f64; 2usize]) -> f64x2<Self> {
f64x2 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_f64x2(self, a: f64x2<Self>) -> [f64; 2usize] {
unsafe { core::mem::transmute::<__m128d, [f64; 2usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_f64x2(self, a: &f64x2<Self>) -> &[f64; 2usize] {
unsafe { core::mem::transmute::<&__m128d, &[f64; 2usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_f64x2(self, a: &mut f64x2<Self>) -> &mut [f64; 2usize] {
unsafe { core::mem::transmute::<&mut __m128d, &mut [f64; 2usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_f64x2(self, a: f64x2<Self>, dest: &mut [f64; 2usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const f64,
dest.as_mut_ptr(),
2usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_f64x2(self, a: u8x16<Self>) -> f64x2<Self> {
unsafe {
f64x2 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_f64x2(self, a: f64x2<Self>) -> u8x16<Self> {
unsafe {
u8x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_f64x2<const SHIFT: usize>(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
unsafe {
if SHIFT >= 2usize {
return b;
}
let result = dyn_alignr_128(
self.cvt_to_bytes_f64x2(b).val.0,
self.cvt_to_bytes_f64x2(a).val.0,
SHIFT * 8usize,
);
self.cvt_from_bytes_f64x2(u8x16 {
val: crate::support::Aligned128(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_f64x2<const SHIFT: usize>(
self,
a: f64x2<Self>,
b: f64x2<Self>,
) -> f64x2<Self> {
self.slide_f64x2::<SHIFT>(a, b)
}
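    // Float abs/neg are pure bit tricks: `-0.0` splats a vector with only the
    // sign bits set, so ANDNOT clears them and XOR flips them.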
#[inline(always)]
fn abs_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) }
}
#[inline(always)]
fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) }
}
#[inline(always)]
fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
unsafe { _mm_sqrt_pd(a.into()).simd_into(self) }
}
#[inline(always)]
fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn mul_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn div_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) }
}
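    // copysign: keep the sign bit of `b` and everything but the sign bit of
    // `a`, then OR the two halves together.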
#[inline(always)]
fn copysign_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
unsafe {
let mask = _mm_set1_pd(-0.0);
_mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self)
}
}
#[inline(always)]
fn simd_eq_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
unsafe { _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(self) }
}
#[inline(always)]
fn simd_lt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
unsafe { _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(self) }
}
#[inline(always)]
fn simd_le_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
unsafe { _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(self) }
}
#[inline(always)]
fn simd_ge_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
unsafe { _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(self) }
}
#[inline(always)]
fn simd_gt_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> mask64x2<Self> {
unsafe { _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(self) }
}
#[inline(always)]
fn zip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn zip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn unzip_low_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn unzip_high_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn max_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn min_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) }
}
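    // `_mm_max_pd`/`_mm_min_pd` return their second operand whenever either
    // input is NaN, so a NaN `a` already falls through to `b`. The unordered
    // self-compare on `b` catches the remaining case, blending `a` back in
    // when `b` is NaN; the result is NaN only if both inputs are.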
#[inline(always)]
fn max_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
unsafe {
let intermediate = _mm_max_pd(a.into(), b.into());
let b_is_nan = _mm_cmpunord_pd(b.into(), b.into());
_mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self)
}
}
#[inline(always)]
fn min_precise_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
unsafe {
let intermediate = _mm_min_pd(a.into(), b.into());
let b_is_nan = _mm_cmpunord_pd(b.into(), b.into());
_mm_blendv_pd(intermediate, a.into(), b_is_nan).simd_into(self)
}
}
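    // SSE4.2 has no FMA, so `mul_add`/`mul_sub` lower to a separate multiply
    // and add with an intermediate rounding step.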
#[inline(always)]
fn mul_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
a * b + c
}
#[inline(always)]
fn mul_sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
a * b - c
}
#[inline(always)]
fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
unsafe {
_mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
}
}
#[inline(always)]
fn ceil_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
unsafe {
_mm_round_pd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
}
}
#[inline(always)]
fn round_ties_even_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
unsafe {
_mm_round_pd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a.into())
.simd_into(self)
}
}
#[inline(always)]
fn fract_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
a - self.trunc_f64x2(a)
}
#[inline(always)]
fn trunc_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
unsafe {
_mm_round_pd::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
}
}
#[inline(always)]
fn select_f64x2(self, a: mask64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
unsafe { _mm_blendv_pd(c.into(), b.into(), _mm_castsi128_pd(a.into())).simd_into(self) }
}
#[inline(always)]
fn combine_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x4<Self> {
f64x4 {
val: crate::support::Aligned256([a.val.0, b.val.0]),
simd: self,
}
}
#[inline(always)]
fn reinterpret_f32_f64x2(self, a: f64x2<Self>) -> f32x4<Self> {
unsafe { _mm_castpd_ps(a.into()).simd_into(self) }
}
#[inline(always)]
fn splat_mask64x2(self, val: i64) -> mask64x2<Self> {
unsafe { _mm_set1_epi64x(val).simd_into(self) }
}
#[inline(always)]
fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self> {
mask64x2 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_mask64x2(self, val: &[i64; 2usize]) -> mask64x2<Self> {
mask64x2 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize] {
unsafe { core::mem::transmute::<__m128i, [i64; 2usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_mask64x2(self, a: &mask64x2<Self>) -> &[i64; 2usize] {
unsafe { core::mem::transmute::<&__m128i, &[i64; 2usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_mask64x2(self, a: &mut mask64x2<Self>) -> &mut [i64; 2usize] {
unsafe { core::mem::transmute::<&mut __m128i, &mut [i64; 2usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_mask64x2(self, a: mask64x2<Self>, dest: &mut [i64; 2usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i64,
dest.as_mut_ptr(),
2usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_mask64x2(self, a: u8x16<Self>) -> mask64x2<Self> {
unsafe {
mask64x2 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_mask64x2(self, a: mask64x2<Self>) -> u8x16<Self> {
unsafe {
u8x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_mask64x2<const SHIFT: usize>(
self,
a: mask64x2<Self>,
b: mask64x2<Self>,
) -> mask64x2<Self> {
unsafe {
if SHIFT >= 2usize {
return b;
}
let result = dyn_alignr_128(
self.cvt_to_bytes_mask64x2(b).val.0,
self.cvt_to_bytes_mask64x2(a).val.0,
SHIFT * 8usize,
);
self.cvt_from_bytes_mask64x2(u8x16 {
val: crate::support::Aligned128(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_mask64x2<const SHIFT: usize>(
self,
a: mask64x2<Self>,
b: mask64x2<Self>,
) -> mask64x2<Self> {
self.slide_mask64x2::<SHIFT>(a, b)
}
#[inline(always)]
fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn or_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn xor_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn not_mask64x2(self, a: mask64x2<Self>) -> mask64x2<Self> {
a ^ !0
}
#[inline(always)]
fn select_mask64x2(
self,
a: mask64x2<Self>,
b: mask64x2<Self>,
c: mask64x2<Self>,
) -> mask64x2<Self> {
unsafe { _mm_blendv_epi8(c.into(), b.into(), a.into()).simd_into(self) }
}
#[inline(always)]
fn simd_eq_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
unsafe { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(self) }
}
#[inline(always)]
fn any_true_mask64x2(self, a: mask64x2<Self>) -> bool {
unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0 }
}
#[inline(always)]
fn all_true_mask64x2(self, a: mask64x2<Self>) -> bool {
unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0b11 }
}
#[inline(always)]
fn any_false_mask64x2(self, a: mask64x2<Self>) -> bool {
unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 != 0b11 }
}
#[inline(always)]
fn all_false_mask64x2(self, a: mask64x2<Self>) -> bool {
unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 == 0 }
}
#[inline(always)]
fn combine_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x4<Self> {
mask64x4 {
val: crate::support::Aligned256([a.val.0, b.val.0]),
simd: self,
}
}
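    // The 8-lane (and wider) method bodies that follow have no native 256-bit
    // register at this level: each operation splits the vector into two
    // 128-bit halves, applies the corresponding 128-bit implementation to
    // each, and recombines.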
#[inline(always)]
fn splat_f32x8(self, val: f32) -> f32x8<Self> {
let half = self.splat_f32x4(val);
self.combine_f32x4(half, half)
}
#[inline(always)]
fn load_array_f32x8(self, val: [f32; 8usize]) -> f32x8<Self> {
f32x8 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_f32x8(self, val: &[f32; 8usize]) -> f32x8<Self> {
f32x8 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_f32x8(self, a: f32x8<Self>) -> [f32; 8usize] {
unsafe { core::mem::transmute::<[__m128; 2usize], [f32; 8usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_f32x8(self, a: &f32x8<Self>) -> &[f32; 8usize] {
unsafe { core::mem::transmute::<&[__m128; 2usize], &[f32; 8usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_f32x8(self, a: &mut f32x8<Self>) -> &mut [f32; 8usize] {
unsafe { core::mem::transmute::<&mut [__m128; 2usize], &mut [f32; 8usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_f32x8(self, a: f32x8<Self>, dest: &mut [f32; 8usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const f32,
dest.as_mut_ptr(),
8usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_f32x8(self, a: u8x32<Self>) -> f32x8<Self> {
unsafe {
f32x8 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
unsafe {
u8x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
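    // Cross-register slides use `cross_block_alignr_128x2` (a helper defined
    // elsewhere in this module), which appears to perform the same
    // concatenate-and-extract as `dyn_alignr_128` but across a pair of
    // 128-bit blocks.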
#[inline(always)]
fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
unsafe {
if SHIFT >= 8usize {
return b;
}
let result = cross_block_alignr_128x2(
self.cvt_to_bytes_f32x8(b).val.0,
self.cvt_to_bytes_f32x8(a).val.0,
SHIFT * 4usize,
);
self.cvt_from_bytes_f32x8(u8x32 {
val: crate::support::Aligned256(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_f32x8<const SHIFT: usize>(
self,
a: f32x8<Self>,
b: f32x8<Self>,
) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_f32x4(
self.slide_within_blocks_f32x4::<SHIFT>(a0, b0),
self.slide_within_blocks_f32x4::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1))
}
#[inline(always)]
fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1))
}
#[inline(always)]
fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1))
}
#[inline(always)]
fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1))
}
#[inline(always)]
fn sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1))
}
#[inline(always)]
fn mul_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1))
}
#[inline(always)]
fn div_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1))
}
#[inline(always)]
fn copysign_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1))
}
#[inline(always)]
fn simd_eq_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1))
}
#[inline(always)]
fn simd_lt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1))
}
#[inline(always)]
fn simd_le_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1))
}
#[inline(always)]
fn simd_ge_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1))
}
#[inline(always)]
fn simd_gt_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1))
}
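    // Interleaving the two low 128-bit halves consumes all of `a0` and `b0`,
    // so `zip_low` of the 8-lane vector is `zip_low`/`zip_high` of the low
    // halves combined; `zip_high` below mirrors this with the high halves.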
#[inline(always)]
fn zip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
let (a0, _) = self.split_f32x8(a);
let (b0, _) = self.split_f32x8(b);
self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0))
}
#[inline(always)]
fn zip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
let (_, a1) = self.split_f32x8(a);
let (_, b1) = self.split_f32x8(b);
self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1))
}
#[inline(always)]
fn unzip_low_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1))
}
#[inline(always)]
fn unzip_high_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1))
}
#[inline(always)]
fn max_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1))
}
#[inline(always)]
fn min_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1))
}
#[inline(always)]
fn max_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_f32x4(
self.max_precise_f32x4(a0, b0),
self.max_precise_f32x4(a1, b1),
)
}
#[inline(always)]
fn min_precise_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
self.combine_f32x4(
self.min_precise_f32x4(a0, b0),
self.min_precise_f32x4(a1, b1),
)
}
#[inline(always)]
fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
let (c0, c1) = self.split_f32x8(c);
self.combine_f32x4(
self.mul_add_f32x4(a0, b0, c0),
self.mul_add_f32x4(a1, b1, c1),
)
}
#[inline(always)]
fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
let (c0, c1) = self.split_f32x8(c);
self.combine_f32x4(
self.mul_sub_f32x4(a0, b0, c0),
self.mul_sub_f32x4(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1))
}
#[inline(always)]
fn ceil_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_f32x4(self.ceil_f32x4(a0), self.ceil_f32x4(a1))
}
#[inline(always)]
fn round_ties_even_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_f32x4(
self.round_ties_even_f32x4(a0),
self.round_ties_even_f32x4(a1),
)
}
#[inline(always)]
fn fract_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1))
}
#[inline(always)]
fn trunc_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1))
}
#[inline(always)]
fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_mask32x8(a);
let (b0, b1) = self.split_f32x8(b);
let (c0, c1) = self.split_f32x8(c);
self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1))
}
#[inline(always)]
fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self> {
f32x16 {
val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
simd: self,
}
}
#[inline(always)]
fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>) {
(
f32x4 {
val: crate::support::Aligned128(a.val.0[0]),
simd: self,
},
f32x4 {
val: crate::support::Aligned128(a.val.0[1]),
simd: self,
},
)
}
#[inline(always)]
fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_f64x2(
self.reinterpret_f64_f32x4(a0),
self.reinterpret_f64_f32x4(a1),
)
}
#[inline(always)]
fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_i32x4(
self.reinterpret_i32_f32x4(a0),
self.reinterpret_i32_f32x4(a1),
)
}
#[inline(always)]
fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1))
}
#[inline(always)]
fn reinterpret_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_u32x4(
self.reinterpret_u32_f32x4(a0),
self.reinterpret_u32_f32x4(a1),
)
}
#[inline(always)]
fn cvt_u32_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1))
}
#[inline(always)]
fn cvt_u32_precise_f32x8(self, a: f32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_u32x4(
self.cvt_u32_precise_f32x4(a0),
self.cvt_u32_precise_f32x4(a1),
)
}
#[inline(always)]
fn cvt_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1))
}
#[inline(always)]
fn cvt_i32_precise_f32x8(self, a: f32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_i32x4(
self.cvt_i32_precise_f32x4(a0),
self.cvt_i32_precise_f32x4(a1),
)
}
#[inline(always)]
fn splat_i8x32(self, val: i8) -> i8x32<Self> {
let half = self.splat_i8x16(val);
self.combine_i8x16(half, half)
}
#[inline(always)]
fn load_array_i8x32(self, val: [i8; 32usize]) -> i8x32<Self> {
i8x32 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_i8x32(self, val: &[i8; 32usize]) -> i8x32<Self> {
i8x32 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_i8x32(self, a: i8x32<Self>) -> [i8; 32usize] {
unsafe { core::mem::transmute::<[__m128i; 2usize], [i8; 32usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_i8x32(self, a: &i8x32<Self>) -> &[i8; 32usize] {
unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i8; 32usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_i8x32(self, a: &mut i8x32<Self>) -> &mut [i8; 32usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i8; 32usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_i8x32(self, a: i8x32<Self>, dest: &mut [i8; 32usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i8,
dest.as_mut_ptr(),
32usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_i8x32(self, a: u8x32<Self>) -> i8x32<Self> {
unsafe {
i8x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
unsafe {
u8x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
unsafe {
if SHIFT >= 32usize {
return b;
}
let result = cross_block_alignr_128x2(
self.cvt_to_bytes_i8x32(b).val.0,
self.cvt_to_bytes_i8x32(a).val.0,
SHIFT,
);
self.cvt_from_bytes_i8x32(u8x32 {
val: crate::support::Aligned256(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_i8x32<const SHIFT: usize>(
self,
a: i8x32<Self>,
b: i8x32<Self>,
) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_i8x16(
self.slide_within_blocks_i8x16::<SHIFT>(a0, b0),
self.slide_within_blocks_i8x16::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1))
}
#[inline(always)]
fn sub_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1))
}
#[inline(always)]
fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1))
}
#[inline(always)]
fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1))
}
#[inline(always)]
fn or_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1))
}
#[inline(always)]
fn xor_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1))
}
#[inline(always)]
fn not_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1))
}
#[inline(always)]
fn shl_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
self.combine_i8x16(self.shl_i8x16(a0, shift), self.shl_i8x16(a1, shift))
}
#[inline(always)]
fn shlv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_i8x16(self.shlv_i8x16(a0, b0), self.shlv_i8x16(a1, b1))
}
#[inline(always)]
fn shr_i8x32(self, a: i8x32<Self>, shift: u32) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
self.combine_i8x16(self.shr_i8x16(a0, shift), self.shr_i8x16(a1, shift))
}
#[inline(always)]
fn shrv_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1))
}
#[inline(always)]
fn simd_eq_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1))
}
#[inline(always)]
fn simd_lt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1))
}
#[inline(always)]
fn simd_le_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1))
}
#[inline(always)]
fn simd_ge_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1))
}
#[inline(always)]
fn simd_gt_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> mask8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1))
}
#[inline(always)]
fn zip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
let (a0, _) = self.split_i8x32(a);
let (b0, _) = self.split_i8x32(b);
self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0))
}
#[inline(always)]
fn zip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
let (_, a1) = self.split_i8x32(a);
let (_, b1) = self.split_i8x32(b);
self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1))
}
#[inline(always)]
fn unzip_low_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1))
}
#[inline(always)]
fn unzip_high_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1))
}
#[inline(always)]
fn select_i8x32(self, a: mask8x32<Self>, b: i8x32<Self>, c: i8x32<Self>) -> i8x32<Self> {
let (a0, a1) = self.split_mask8x32(a);
let (b0, b1) = self.split_i8x32(b);
let (c0, c1) = self.split_i8x32(c);
self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1))
}
#[inline(always)]
fn min_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1))
}
#[inline(always)]
fn max_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
let (b0, b1) = self.split_i8x32(b);
self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1))
}
#[inline(always)]
fn combine_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x64<Self> {
i8x64 {
val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
simd: self,
}
}
#[inline(always)]
fn split_i8x32(self, a: i8x32<Self>) -> (i8x16<Self>, i8x16<Self>) {
(
i8x16 {
val: crate::support::Aligned128(a.val.0[0]),
simd: self,
},
i8x16 {
val: crate::support::Aligned128(a.val.0[1]),
simd: self,
},
)
}
#[inline(always)]
fn neg_i8x32(self, a: i8x32<Self>) -> i8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1))
}
#[inline(always)]
fn reinterpret_u8_i8x32(self, a: i8x32<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_i8x32(a);
self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1))
}
#[inline(always)]
fn reinterpret_u32_i8x32(self, a: i8x32<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_i8x32(a);
self.combine_u32x4(
self.reinterpret_u32_i8x16(a0),
self.reinterpret_u32_i8x16(a1),
)
}
#[inline(always)]
fn splat_u8x32(self, val: u8) -> u8x32<Self> {
let half = self.splat_u8x16(val);
self.combine_u8x16(half, half)
}
#[inline(always)]
fn load_array_u8x32(self, val: [u8; 32usize]) -> u8x32<Self> {
u8x32 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_u8x32(self, val: &[u8; 32usize]) -> u8x32<Self> {
u8x32 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_u8x32(self, a: u8x32<Self>) -> [u8; 32usize] {
unsafe { core::mem::transmute::<[__m128i; 2usize], [u8; 32usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_u8x32(self, a: &u8x32<Self>) -> &[u8; 32usize] {
unsafe { core::mem::transmute::<&[__m128i; 2usize], &[u8; 32usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_u8x32(self, a: &mut u8x32<Self>) -> &mut [u8; 32usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [u8; 32usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_u8x32(self, a: u8x32<Self>, dest: &mut [u8; 32usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const u8,
dest.as_mut_ptr(),
32usize,
);
}
}
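    // For u8 vectors the byte conversions are identities; the transmute only
    // moves between identically laid-out wrapper types.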
#[inline(always)]
fn cvt_from_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
unsafe {
u8x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
unsafe {
u8x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
unsafe {
if SHIFT >= 32usize {
return b;
}
let result = cross_block_alignr_128x2(
self.cvt_to_bytes_u8x32(b).val.0,
self.cvt_to_bytes_u8x32(a).val.0,
SHIFT,
);
self.cvt_from_bytes_u8x32(u8x32 {
val: crate::support::Aligned256(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_u8x32<const SHIFT: usize>(
self,
a: u8x32<Self>,
b: u8x32<Self>,
) -> u8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_u8x16(
self.slide_within_blocks_u8x16::<SHIFT>(a0, b0),
self.slide_within_blocks_u8x16::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1))
}
#[inline(always)]
fn sub_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1))
}
#[inline(always)]
fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1))
}
#[inline(always)]
fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1))
}
#[inline(always)]
fn or_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1))
}
#[inline(always)]
fn xor_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1))
}
#[inline(always)]
fn not_u8x32(self, a: u8x32<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1))
}
#[inline(always)]
fn shl_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
self.combine_u8x16(self.shl_u8x16(a0, shift), self.shl_u8x16(a1, shift))
}
#[inline(always)]
fn shlv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_u8x16(self.shlv_u8x16(a0, b0), self.shlv_u8x16(a1, b1))
}
#[inline(always)]
fn shr_u8x32(self, a: u8x32<Self>, shift: u32) -> u8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
self.combine_u8x16(self.shr_u8x16(a0, shift), self.shr_u8x16(a1, shift))
}
#[inline(always)]
fn shrv_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1))
}
#[inline(always)]
fn simd_eq_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1))
}
#[inline(always)]
fn simd_lt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1))
}
#[inline(always)]
fn simd_le_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1))
}
#[inline(always)]
fn simd_ge_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1))
}
#[inline(always)]
fn simd_gt_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> mask8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1))
}
#[inline(always)]
fn zip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
let (a0, _) = self.split_u8x32(a);
let (b0, _) = self.split_u8x32(b);
self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0))
}
#[inline(always)]
fn zip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
let (_, a1) = self.split_u8x32(a);
let (_, b1) = self.split_u8x32(b);
self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1))
}
#[inline(always)]
fn unzip_low_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1))
}
#[inline(always)]
fn unzip_high_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1))
}
#[inline(always)]
fn select_u8x32(self, a: mask8x32<Self>, b: u8x32<Self>, c: u8x32<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_mask8x32(a);
let (b0, b1) = self.split_u8x32(b);
let (c0, c1) = self.split_u8x32(c);
self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1))
}
#[inline(always)]
fn min_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1))
}
#[inline(always)]
fn max_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_u8x32(a);
let (b0, b1) = self.split_u8x32(b);
self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1))
}
#[inline(always)]
fn combine_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x64<Self> {
u8x64 {
val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
simd: self,
}
}
#[inline(always)]
fn split_u8x32(self, a: u8x32<Self>) -> (u8x16<Self>, u8x16<Self>) {
(
u8x16 {
val: crate::support::Aligned128(a.val.0[0]),
simd: self,
},
u8x16 {
val: crate::support::Aligned128(a.val.0[1]),
simd: self,
},
)
}
#[inline(always)]
fn widen_u8x32(self, a: u8x32<Self>) -> u16x32<Self> {
let (a0, a1) = self.split_u8x32(a);
self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1))
}
#[inline(always)]
fn reinterpret_u32_u8x32(self, a: u8x32<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_u8x32(a);
self.combine_u32x4(
self.reinterpret_u32_u8x16(a0),
self.reinterpret_u32_u8x16(a1),
)
}
#[inline(always)]
fn splat_mask8x32(self, val: i8) -> mask8x32<Self> {
let half = self.splat_mask8x16(val);
self.combine_mask8x16(half, half)
}
#[inline(always)]
fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self> {
mask8x32 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_mask8x32(self, val: &[i8; 32usize]) -> mask8x32<Self> {
mask8x32 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize] {
unsafe { core::mem::transmute::<[__m128i; 2usize], [i8; 32usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_mask8x32(self, a: &mask8x32<Self>) -> &[i8; 32usize] {
unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i8; 32usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_mask8x32(self, a: &mut mask8x32<Self>) -> &mut [i8; 32usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i8; 32usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_mask8x32(self, a: mask8x32<Self>, dest: &mut [i8; 32usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i8,
dest.as_mut_ptr(),
32usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_mask8x32(self, a: u8x32<Self>) -> mask8x32<Self> {
unsafe {
mask8x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_mask8x32(self, a: mask8x32<Self>) -> u8x32<Self> {
unsafe {
u8x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_mask8x32<const SHIFT: usize>(
self,
a: mask8x32<Self>,
b: mask8x32<Self>,
) -> mask8x32<Self> {
unsafe {
if SHIFT >= 32usize {
return b;
}
let result = cross_block_alignr_128x2(
self.cvt_to_bytes_mask8x32(b).val.0,
self.cvt_to_bytes_mask8x32(a).val.0,
SHIFT,
);
self.cvt_from_bytes_mask8x32(u8x32 {
val: crate::support::Aligned256(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_mask8x32<const SHIFT: usize>(
self,
a: mask8x32<Self>,
b: mask8x32<Self>,
) -> mask8x32<Self> {
let (a0, a1) = self.split_mask8x32(a);
let (b0, b1) = self.split_mask8x32(b);
self.combine_mask8x16(
self.slide_within_blocks_mask8x16::<SHIFT>(a0, b0),
self.slide_within_blocks_mask8x16::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
let (a0, a1) = self.split_mask8x32(a);
let (b0, b1) = self.split_mask8x32(b);
self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1))
}
#[inline(always)]
fn or_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
let (a0, a1) = self.split_mask8x32(a);
let (b0, b1) = self.split_mask8x32(b);
self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1))
}
#[inline(always)]
fn xor_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
let (a0, a1) = self.split_mask8x32(a);
let (b0, b1) = self.split_mask8x32(b);
self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1))
}
#[inline(always)]
fn not_mask8x32(self, a: mask8x32<Self>) -> mask8x32<Self> {
let (a0, a1) = self.split_mask8x32(a);
self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1))
}
#[inline(always)]
fn select_mask8x32(
self,
a: mask8x32<Self>,
b: mask8x32<Self>,
c: mask8x32<Self>,
) -> mask8x32<Self> {
let (a0, a1) = self.split_mask8x32(a);
let (b0, b1) = self.split_mask8x32(b);
let (c0, c1) = self.split_mask8x32(c);
self.combine_mask8x16(
self.select_mask8x16(a0, b0, c0),
self.select_mask8x16(a1, b1, c1),
)
}
#[inline(always)]
fn simd_eq_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
let (a0, a1) = self.split_mask8x32(a);
let (b0, b1) = self.split_mask8x32(b);
self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1))
}
#[inline(always)]
fn any_true_mask8x32(self, a: mask8x32<Self>) -> bool {
let (a0, a1) = self.split_mask8x32(a);
self.any_true_mask8x16(a0) || self.any_true_mask8x16(a1)
}
#[inline(always)]
fn all_true_mask8x32(self, a: mask8x32<Self>) -> bool {
let (a0, a1) = self.split_mask8x32(a);
self.all_true_mask8x16(a0) && self.all_true_mask8x16(a1)
}
#[inline(always)]
fn any_false_mask8x32(self, a: mask8x32<Self>) -> bool {
let (a0, a1) = self.split_mask8x32(a);
self.any_false_mask8x16(a0) || self.any_false_mask8x16(a1)
}
#[inline(always)]
fn all_false_mask8x32(self, a: mask8x32<Self>) -> bool {
let (a0, a1) = self.split_mask8x32(a);
self.all_false_mask8x16(a0) && self.all_false_mask8x16(a1)
}
#[inline(always)]
fn combine_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x64<Self> {
mask8x64 {
val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
simd: self,
}
}
#[inline(always)]
fn split_mask8x32(self, a: mask8x32<Self>) -> (mask8x16<Self>, mask8x16<Self>) {
(
mask8x16 {
val: crate::support::Aligned128(a.val.0[0]),
simd: self,
},
mask8x16 {
val: crate::support::Aligned128(a.val.0[1]),
simd: self,
},
)
}
#[inline(always)]
fn splat_i16x16(self, val: i16) -> i16x16<Self> {
let half = self.splat_i16x8(val);
self.combine_i16x8(half, half)
}
#[inline(always)]
fn load_array_i16x16(self, val: [i16; 16usize]) -> i16x16<Self> {
i16x16 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_i16x16(self, val: &[i16; 16usize]) -> i16x16<Self> {
i16x16 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_i16x16(self, a: i16x16<Self>) -> [i16; 16usize] {
unsafe { core::mem::transmute::<[__m128i; 2usize], [i16; 16usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_i16x16(self, a: &i16x16<Self>) -> &[i16; 16usize] {
unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i16; 16usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_i16x16(self, a: &mut i16x16<Self>) -> &mut [i16; 16usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i16; 16usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_i16x16(self, a: i16x16<Self>, dest: &mut [i16; 16usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i16,
dest.as_mut_ptr(),
16usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_i16x16(self, a: u8x32<Self>) -> i16x16<Self> {
unsafe {
i16x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
unsafe {
u8x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
unsafe {
if SHIFT >= 16usize {
return b;
}
let result = cross_block_alignr_128x2(
self.cvt_to_bytes_i16x16(b).val.0,
self.cvt_to_bytes_i16x16(a).val.0,
SHIFT * 2usize,
);
self.cvt_from_bytes_i16x16(u8x32 {
val: crate::support::Aligned256(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_i16x16<const SHIFT: usize>(
self,
a: i16x16<Self>,
b: i16x16<Self>,
) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_i16x8(
self.slide_within_blocks_i16x8::<SHIFT>(a0, b0),
self.slide_within_blocks_i16x8::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1))
}
#[inline(always)]
fn sub_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1))
}
#[inline(always)]
fn mul_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1))
}
#[inline(always)]
fn and_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1))
}
#[inline(always)]
fn or_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1))
}
#[inline(always)]
fn xor_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1))
}
#[inline(always)]
fn not_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1))
}
#[inline(always)]
fn shl_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
self.combine_i16x8(self.shl_i16x8(a0, shift), self.shl_i16x8(a1, shift))
}
#[inline(always)]
fn shlv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_i16x8(self.shlv_i16x8(a0, b0), self.shlv_i16x8(a1, b1))
}
#[inline(always)]
fn shr_i16x16(self, a: i16x16<Self>, shift: u32) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
self.combine_i16x8(self.shr_i16x8(a0, shift), self.shr_i16x8(a1, shift))
}
#[inline(always)]
fn shrv_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1))
}
#[inline(always)]
fn simd_eq_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1))
}
#[inline(always)]
fn simd_lt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1))
}
#[inline(always)]
fn simd_le_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1))
}
#[inline(always)]
fn simd_ge_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1))
}
#[inline(always)]
fn simd_gt_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> mask16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1))
}
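    // Zip semantics (a sketch, assuming the usual interleave convention):
    // the low zip reads only the low 8 lanes of each input, e.g.
    //   zip_low([a0..a15], [b0..b15]) = [a0, b0, a1, b1, ..., a7, b7]
    // so the result is zip_low(a_lo, b_lo) in its low block and
    // zip_high(a_lo, b_lo) in its high block; zip_high mirrors this with
    // the upper halves of the inputs.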
#[inline(always)]
fn zip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
let (a0, _) = self.split_i16x16(a);
let (b0, _) = self.split_i16x16(b);
self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0))
}
#[inline(always)]
fn zip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
let (_, a1) = self.split_i16x16(a);
let (_, b1) = self.split_i16x16(b);
self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1))
}
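    // Unzip semantics: unzip_low gathers the even-indexed lanes of the
    // concatenation (a, b) and unzip_high the odd-indexed ones, which
    // decomposes into one 128-bit unzip over the two blocks of `a` for the
    // low half of the result, and the same over `b` for the high half.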
#[inline(always)]
fn unzip_low_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1))
}
#[inline(always)]
fn unzip_high_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1))
}
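    // Lane select (assuming the select(mask, if_true, if_false)
    // convention): mask, true, and false vectors are all split the same
    // way, so the blend happens independently per 128-bit half.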
#[inline(always)]
fn select_i16x16(self, a: mask16x16<Self>, b: i16x16<Self>, c: i16x16<Self>) -> i16x16<Self> {
let (a0, a1) = self.split_mask16x16(a);
let (b0, b1) = self.split_i16x16(b);
let (c0, c1) = self.split_i16x16(c);
self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1))
}
#[inline(always)]
fn min_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1))
}
#[inline(always)]
fn max_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
let (b0, b1) = self.split_i16x16(b);
self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1))
}
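    // Representation note: a 256-bit vector on this level is just a pair
    // of `__m128i` blocks inside an `Aligned256`, so `combine` and `split`
    // below are pure array repacking and emit no shuffle instructions.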
#[inline(always)]
fn combine_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x32<Self> {
i16x32 {
val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
simd: self,
}
}
#[inline(always)]
fn split_i16x16(self, a: i16x16<Self>) -> (i16x8<Self>, i16x8<Self>) {
(
i16x8 {
val: crate::support::Aligned128(a.val.0[0]),
simd: self,
},
i16x8 {
val: crate::support::Aligned128(a.val.0[1]),
simd: self,
},
)
}
#[inline(always)]
fn neg_i16x16(self, a: i16x16<Self>) -> i16x16<Self> {
let (a0, a1) = self.split_i16x16(a);
self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1))
}
#[inline(always)]
fn reinterpret_u8_i16x16(self, a: i16x16<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_i16x16(a);
self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1))
}
#[inline(always)]
fn reinterpret_u32_i16x16(self, a: i16x16<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_i16x16(a);
self.combine_u32x4(
self.reinterpret_u32_i16x8(a0),
self.reinterpret_u32_i16x8(a1),
)
}
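    // Splat builds one 128-bit block and duplicates it into both halves.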
#[inline(always)]
fn splat_u16x16(self, val: u16) -> u16x16<Self> {
let half = self.splat_u16x8(val);
self.combine_u16x8(half, half)
}
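    // The array load/store helpers rely on `[u16; 16]` and the
    // `Aligned256<[__m128i; 2]>` payload having the same size (32 bytes):
    // `transmute_copy` reinterprets the bytes on load, and the raw-pointer
    // `copy_nonoverlapping` in `store_array_u16x16` writes them back out
    // without requiring any particular alignment of `dest`.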
#[inline(always)]
fn load_array_u16x16(self, val: [u16; 16usize]) -> u16x16<Self> {
u16x16 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_u16x16(self, val: &[u16; 16usize]) -> u16x16<Self> {
u16x16 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_u16x16(self, a: u16x16<Self>) -> [u16; 16usize] {
unsafe { core::mem::transmute::<[__m128i; 2usize], [u16; 16usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_u16x16(self, a: &u16x16<Self>) -> &[u16; 16usize] {
unsafe { core::mem::transmute::<&[__m128i; 2usize], &[u16; 16usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_u16x16(self, a: &mut u16x16<Self>) -> &mut [u16; 16usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [u16; 16usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_u16x16(self, a: u16x16<Self>, dest: &mut [u16; 16usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const u16,
dest.as_mut_ptr(),
16usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_u16x16(self, a: u8x32<Self>) -> u16x16<Self> {
unsafe {
u16x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
unsafe {
u8x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
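    // Cross-block slide: `cross_block_alignr_128x2` (a helper defined
    // elsewhere in this file) is assumed to act like a 256-bit `palignr`,
    // concatenating the two byte buffers and shifting right by a byte
    // count. Multiplying SHIFT by the element size (2 bytes here) turns a
    // lane count into that byte count, so lane i of the result is
    // a[i + SHIFT] while i + SHIFT < 16, and b[i + SHIFT - 16] after.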
#[inline(always)]
fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
unsafe {
if SHIFT >= 16usize {
return b;
}
let result = cross_block_alignr_128x2(
self.cvt_to_bytes_u16x16(b).val.0,
self.cvt_to_bytes_u16x16(a).val.0,
SHIFT * 2usize,
);
self.cvt_from_bytes_u16x16(u8x32 {
val: crate::support::Aligned256(result),
simd: self,
})
}
}
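    // The within-blocks variant slides each 128-bit block independently
    // instead of across the block boundary, so it forwards the same SHIFT
    // to the 128-bit implementation for both halves.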
#[inline(always)]
fn slide_within_blocks_u16x16<const SHIFT: usize>(
self,
a: u16x16<Self>,
b: u16x16<Self>,
) -> u16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_u16x8(
self.slide_within_blocks_u16x8::<SHIFT>(a0, b0),
self.slide_within_blocks_u16x8::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1))
}
#[inline(always)]
fn sub_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1))
}
#[inline(always)]
fn mul_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1))
}
#[inline(always)]
fn and_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1))
}
#[inline(always)]
fn or_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1))
}
#[inline(always)]
fn xor_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1))
}
#[inline(always)]
fn not_u16x16(self, a: u16x16<Self>) -> u16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1))
}
#[inline(always)]
fn shl_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
self.combine_u16x8(self.shl_u16x8(a0, shift), self.shl_u16x8(a1, shift))
}
#[inline(always)]
fn shlv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_u16x8(self.shlv_u16x8(a0, b0), self.shlv_u16x8(a1, b1))
}
#[inline(always)]
fn shr_u16x16(self, a: u16x16<Self>, shift: u32) -> u16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
self.combine_u16x8(self.shr_u16x8(a0, shift), self.shr_u16x8(a1, shift))
}
#[inline(always)]
fn shrv_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1))
}
#[inline(always)]
fn simd_eq_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1))
}
#[inline(always)]
fn simd_lt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1))
}
#[inline(always)]
fn simd_le_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1))
}
#[inline(always)]
fn simd_ge_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1))
}
#[inline(always)]
fn simd_gt_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> mask16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1))
}
#[inline(always)]
fn zip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
let (a0, _) = self.split_u16x16(a);
let (b0, _) = self.split_u16x16(b);
self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0))
}
#[inline(always)]
fn zip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
let (_, a1) = self.split_u16x16(a);
let (_, b1) = self.split_u16x16(b);
self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1))
}
#[inline(always)]
fn unzip_low_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1))
}
#[inline(always)]
fn unzip_high_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1))
}
#[inline(always)]
fn select_u16x16(self, a: mask16x16<Self>, b: u16x16<Self>, c: u16x16<Self>) -> u16x16<Self> {
let (a0, a1) = self.split_mask16x16(a);
let (b0, b1) = self.split_u16x16(b);
let (c0, c1) = self.split_u16x16(c);
self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1))
}
#[inline(always)]
fn min_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1))
}
#[inline(always)]
fn max_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
let (a0, a1) = self.split_u16x16(a);
let (b0, b1) = self.split_u16x16(b);
self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1))
}
#[inline(always)]
fn combine_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x32<Self> {
u16x32 {
val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
simd: self,
}
}
#[inline(always)]
fn split_u16x16(self, a: u16x16<Self>) -> (u16x8<Self>, u16x8<Self>) {
(
u16x8 {
val: crate::support::Aligned128(a.val.0[0]),
simd: self,
},
u16x8 {
val: crate::support::Aligned128(a.val.0[1]),
simd: self,
},
)
}
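    // Truncating narrow (u16 -> u8), a direct SSE sequence rather than a
    // split/combine wrapper: masking each lane to its low byte first means
    // the unsigned-saturating pack can never saturate, so the pack reduces
    // to a plain truncation of every lane.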
#[inline(always)]
    fn narrow_u16x16(self, a: u16x16<Self>) -> u8x16<Self> {
        let (lo, hi) = self.split_u16x16(a);
        unsafe {
            // Clear the high byte of every 16-bit lane so the saturating
            // pack below cannot saturate.
            let mask = _mm_set1_epi16(0xFF);
            let lo_masked = _mm_and_si128(lo.into(), mask);
            let hi_masked = _mm_and_si128(hi.into(), mask);
            // Pack the sixteen masked u16 lanes into sixteen u8 lanes.
            let result = _mm_packus_epi16(lo_masked, hi_masked);
            result.simd_into(self)
        }
    }
#[inline(always)]
fn reinterpret_u8_u16x16(self, a: u16x16<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_u16x16(a);
self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1))
}
#[inline(always)]
fn reinterpret_u32_u16x16(self, a: u16x16<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_u16x16(a);
self.combine_u32x4(
self.reinterpret_u32_u16x8(a0),
self.reinterpret_u32_u16x8(a1),
)
}
#[inline(always)]
fn splat_mask16x16(self, val: i16) -> mask16x16<Self> {
let half = self.splat_mask16x8(val);
self.combine_mask16x8(half, half)
}
#[inline(always)]
fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self> {
mask16x16 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_mask16x16(self, val: &[i16; 16usize]) -> mask16x16<Self> {
mask16x16 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize] {
unsafe { core::mem::transmute::<[__m128i; 2usize], [i16; 16usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_mask16x16(self, a: &mask16x16<Self>) -> &[i16; 16usize] {
unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i16; 16usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_mask16x16(self, a: &mut mask16x16<Self>) -> &mut [i16; 16usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i16; 16usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_mask16x16(self, a: mask16x16<Self>, dest: &mut [i16; 16usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i16,
dest.as_mut_ptr(),
16usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_mask16x16(self, a: u8x32<Self>) -> mask16x16<Self> {
unsafe {
mask16x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_mask16x16(self, a: mask16x16<Self>) -> u8x32<Self> {
unsafe {
u8x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_mask16x16<const SHIFT: usize>(
self,
a: mask16x16<Self>,
b: mask16x16<Self>,
) -> mask16x16<Self> {
unsafe {
if SHIFT >= 16usize {
return b;
}
let result = cross_block_alignr_128x2(
self.cvt_to_bytes_mask16x16(b).val.0,
self.cvt_to_bytes_mask16x16(a).val.0,
SHIFT * 2usize,
);
self.cvt_from_bytes_mask16x16(u8x32 {
val: crate::support::Aligned256(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_mask16x16<const SHIFT: usize>(
self,
a: mask16x16<Self>,
b: mask16x16<Self>,
) -> mask16x16<Self> {
let (a0, a1) = self.split_mask16x16(a);
let (b0, b1) = self.split_mask16x16(b);
self.combine_mask16x8(
self.slide_within_blocks_mask16x8::<SHIFT>(a0, b0),
self.slide_within_blocks_mask16x8::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
let (a0, a1) = self.split_mask16x16(a);
let (b0, b1) = self.split_mask16x16(b);
self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1))
}
#[inline(always)]
fn or_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
let (a0, a1) = self.split_mask16x16(a);
let (b0, b1) = self.split_mask16x16(b);
self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1))
}
#[inline(always)]
fn xor_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
let (a0, a1) = self.split_mask16x16(a);
let (b0, b1) = self.split_mask16x16(b);
self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1))
}
#[inline(always)]
fn not_mask16x16(self, a: mask16x16<Self>) -> mask16x16<Self> {
let (a0, a1) = self.split_mask16x16(a);
self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1))
}
#[inline(always)]
fn select_mask16x16(
self,
a: mask16x16<Self>,
b: mask16x16<Self>,
c: mask16x16<Self>,
) -> mask16x16<Self> {
let (a0, a1) = self.split_mask16x16(a);
let (b0, b1) = self.split_mask16x16(b);
let (c0, c1) = self.split_mask16x16(c);
self.combine_mask16x8(
self.select_mask16x8(a0, b0, c0),
self.select_mask16x8(a1, b1, c1),
)
}
#[inline(always)]
fn simd_eq_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
let (a0, a1) = self.split_mask16x16(a);
let (b0, b1) = self.split_mask16x16(b);
self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1))
}
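    // Mask reductions decompose into per-block tests joined with the
    // matching short-circuit operator: `any_*` ORs the halves, `all_*`
    // ANDs them.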
#[inline(always)]
fn any_true_mask16x16(self, a: mask16x16<Self>) -> bool {
let (a0, a1) = self.split_mask16x16(a);
self.any_true_mask16x8(a0) || self.any_true_mask16x8(a1)
}
#[inline(always)]
fn all_true_mask16x16(self, a: mask16x16<Self>) -> bool {
let (a0, a1) = self.split_mask16x16(a);
self.all_true_mask16x8(a0) && self.all_true_mask16x8(a1)
}
#[inline(always)]
fn any_false_mask16x16(self, a: mask16x16<Self>) -> bool {
let (a0, a1) = self.split_mask16x16(a);
self.any_false_mask16x8(a0) || self.any_false_mask16x8(a1)
}
#[inline(always)]
fn all_false_mask16x16(self, a: mask16x16<Self>) -> bool {
let (a0, a1) = self.split_mask16x16(a);
self.all_false_mask16x8(a0) && self.all_false_mask16x8(a1)
}
#[inline(always)]
fn combine_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x32<Self> {
mask16x32 {
val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
simd: self,
}
}
#[inline(always)]
fn split_mask16x16(self, a: mask16x16<Self>) -> (mask16x8<Self>, mask16x8<Self>) {
(
mask16x8 {
val: crate::support::Aligned128(a.val.0[0]),
simd: self,
},
mask16x8 {
val: crate::support::Aligned128(a.val.0[1]),
simd: self,
},
)
}
#[inline(always)]
fn splat_i32x8(self, val: i32) -> i32x8<Self> {
let half = self.splat_i32x4(val);
self.combine_i32x4(half, half)
}
#[inline(always)]
fn load_array_i32x8(self, val: [i32; 8usize]) -> i32x8<Self> {
i32x8 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_i32x8(self, val: &[i32; 8usize]) -> i32x8<Self> {
i32x8 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_i32x8(self, a: i32x8<Self>) -> [i32; 8usize] {
unsafe { core::mem::transmute::<[__m128i; 2usize], [i32; 8usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_i32x8(self, a: &i32x8<Self>) -> &[i32; 8usize] {
unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i32; 8usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_i32x8(self, a: &mut i32x8<Self>) -> &mut [i32; 8usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i32; 8usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_i32x8(self, a: i32x8<Self>, dest: &mut [i32; 8usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i32,
dest.as_mut_ptr(),
8usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_i32x8(self, a: u8x32<Self>) -> i32x8<Self> {
unsafe {
i32x8 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
unsafe {
u8x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
unsafe {
if SHIFT >= 8usize {
return b;
}
let result = cross_block_alignr_128x2(
self.cvt_to_bytes_i32x8(b).val.0,
self.cvt_to_bytes_i32x8(a).val.0,
SHIFT * 4usize,
);
self.cvt_from_bytes_i32x8(u8x32 {
val: crate::support::Aligned256(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_i32x8<const SHIFT: usize>(
self,
a: i32x8<Self>,
b: i32x8<Self>,
) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_i32x4(
self.slide_within_blocks_i32x4::<SHIFT>(a0, b0),
self.slide_within_blocks_i32x4::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1))
}
#[inline(always)]
fn sub_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1))
}
#[inline(always)]
fn mul_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1))
}
#[inline(always)]
fn and_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1))
}
#[inline(always)]
fn or_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1))
}
#[inline(always)]
fn xor_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1))
}
#[inline(always)]
fn not_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1))
}
#[inline(always)]
fn shl_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
self.combine_i32x4(self.shl_i32x4(a0, shift), self.shl_i32x4(a1, shift))
}
#[inline(always)]
fn shlv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_i32x4(self.shlv_i32x4(a0, b0), self.shlv_i32x4(a1, b1))
}
#[inline(always)]
fn shr_i32x8(self, a: i32x8<Self>, shift: u32) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
self.combine_i32x4(self.shr_i32x4(a0, shift), self.shr_i32x4(a1, shift))
}
#[inline(always)]
fn shrv_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1))
}
#[inline(always)]
fn simd_eq_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1))
}
#[inline(always)]
fn simd_lt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1))
}
#[inline(always)]
fn simd_le_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1))
}
#[inline(always)]
fn simd_ge_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1))
}
#[inline(always)]
fn simd_gt_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1))
}
#[inline(always)]
fn zip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
let (a0, _) = self.split_i32x8(a);
let (b0, _) = self.split_i32x8(b);
self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0))
}
#[inline(always)]
fn zip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
let (_, a1) = self.split_i32x8(a);
let (_, b1) = self.split_i32x8(b);
self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1))
}
#[inline(always)]
fn unzip_low_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1))
}
#[inline(always)]
fn unzip_high_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1))
}
#[inline(always)]
fn select_i32x8(self, a: mask32x8<Self>, b: i32x8<Self>, c: i32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_mask32x8(a);
let (b0, b1) = self.split_i32x8(b);
let (c0, c1) = self.split_i32x8(c);
self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1))
}
#[inline(always)]
fn min_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1))
}
#[inline(always)]
fn max_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
let (b0, b1) = self.split_i32x8(b);
self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1))
}
#[inline(always)]
fn combine_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x16<Self> {
i32x16 {
val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
simd: self,
}
}
#[inline(always)]
fn split_i32x8(self, a: i32x8<Self>) -> (i32x4<Self>, i32x4<Self>) {
(
i32x4 {
val: crate::support::Aligned128(a.val.0[0]),
simd: self,
},
i32x4 {
val: crate::support::Aligned128(a.val.0[1]),
simd: self,
},
)
}
#[inline(always)]
fn neg_i32x8(self, a: i32x8<Self>) -> i32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1))
}
#[inline(always)]
fn reinterpret_u8_i32x8(self, a: i32x8<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_i32x8(a);
self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1))
}
#[inline(always)]
fn reinterpret_u32_i32x8(self, a: i32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
self.combine_u32x4(
self.reinterpret_u32_i32x4(a0),
self.reinterpret_u32_i32x4(a1),
)
}
#[inline(always)]
fn cvt_f32_i32x8(self, a: i32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_i32x8(a);
self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1))
}
#[inline(always)]
fn splat_u32x8(self, val: u32) -> u32x8<Self> {
let half = self.splat_u32x4(val);
self.combine_u32x4(half, half)
}
#[inline(always)]
fn load_array_u32x8(self, val: [u32; 8usize]) -> u32x8<Self> {
u32x8 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_u32x8(self, val: &[u32; 8usize]) -> u32x8<Self> {
u32x8 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_u32x8(self, a: u32x8<Self>) -> [u32; 8usize] {
unsafe { core::mem::transmute::<[__m128i; 2usize], [u32; 8usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_u32x8(self, a: &u32x8<Self>) -> &[u32; 8usize] {
unsafe { core::mem::transmute::<&[__m128i; 2usize], &[u32; 8usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_u32x8(self, a: &mut u32x8<Self>) -> &mut [u32; 8usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [u32; 8usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_u32x8(self, a: u32x8<Self>, dest: &mut [u32; 8usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const u32,
dest.as_mut_ptr(),
8usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_u32x8(self, a: u8x32<Self>) -> u32x8<Self> {
unsafe {
u32x8 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
unsafe {
u8x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
unsafe {
if SHIFT >= 8usize {
return b;
}
let result = cross_block_alignr_128x2(
self.cvt_to_bytes_u32x8(b).val.0,
self.cvt_to_bytes_u32x8(a).val.0,
SHIFT * 4usize,
);
self.cvt_from_bytes_u32x8(u8x32 {
val: crate::support::Aligned256(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_u32x8<const SHIFT: usize>(
self,
a: u32x8<Self>,
b: u32x8<Self>,
) -> u32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_u32x4(
self.slide_within_blocks_u32x4::<SHIFT>(a0, b0),
self.slide_within_blocks_u32x4::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1))
}
#[inline(always)]
fn sub_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1))
}
#[inline(always)]
fn mul_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1))
}
#[inline(always)]
fn and_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1))
}
#[inline(always)]
fn or_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1))
}
#[inline(always)]
fn xor_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1))
}
#[inline(always)]
fn not_u32x8(self, a: u32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1))
}
#[inline(always)]
fn shl_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
self.combine_u32x4(self.shl_u32x4(a0, shift), self.shl_u32x4(a1, shift))
}
#[inline(always)]
fn shlv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_u32x4(self.shlv_u32x4(a0, b0), self.shlv_u32x4(a1, b1))
}
#[inline(always)]
fn shr_u32x8(self, a: u32x8<Self>, shift: u32) -> u32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
self.combine_u32x4(self.shr_u32x4(a0, shift), self.shr_u32x4(a1, shift))
}
#[inline(always)]
fn shrv_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1))
}
#[inline(always)]
fn simd_eq_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1))
}
#[inline(always)]
fn simd_lt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1))
}
#[inline(always)]
fn simd_le_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1))
}
#[inline(always)]
fn simd_ge_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1))
}
#[inline(always)]
fn simd_gt_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1))
}
#[inline(always)]
fn zip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
let (a0, _) = self.split_u32x8(a);
let (b0, _) = self.split_u32x8(b);
self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0))
}
#[inline(always)]
fn zip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
let (_, a1) = self.split_u32x8(a);
let (_, b1) = self.split_u32x8(b);
self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1))
}
#[inline(always)]
fn unzip_low_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1))
}
#[inline(always)]
fn unzip_high_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1))
}
#[inline(always)]
fn select_u32x8(self, a: mask32x8<Self>, b: u32x8<Self>, c: u32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_mask32x8(a);
let (b0, b1) = self.split_u32x8(b);
let (c0, c1) = self.split_u32x8(c);
self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1))
}
#[inline(always)]
fn min_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1))
}
#[inline(always)]
fn max_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
let (b0, b1) = self.split_u32x8(b);
self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1))
}
#[inline(always)]
fn combine_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x16<Self> {
u32x16 {
val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
simd: self,
}
}
#[inline(always)]
fn split_u32x8(self, a: u32x8<Self>) -> (u32x4<Self>, u32x4<Self>) {
(
u32x4 {
val: crate::support::Aligned128(a.val.0[0]),
simd: self,
},
u32x4 {
val: crate::support::Aligned128(a.val.0[1]),
simd: self,
},
)
}
#[inline(always)]
fn reinterpret_u8_u32x8(self, a: u32x8<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_u32x8(a);
self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1))
}
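    // u32 -> f32: x86 below AVX-512 has no unsigned 32-bit convert, so
    // `cvt_f32_u32x4` presumably compensates for the sign bit before using
    // the signed `cvtdq2ps`; this wrapper only applies it per half.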
#[inline(always)]
fn cvt_f32_u32x8(self, a: u32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_u32x8(a);
self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1))
}
#[inline(always)]
fn splat_mask32x8(self, val: i32) -> mask32x8<Self> {
let half = self.splat_mask32x4(val);
self.combine_mask32x4(half, half)
}
#[inline(always)]
fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self> {
mask32x8 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_mask32x8(self, val: &[i32; 8usize]) -> mask32x8<Self> {
mask32x8 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize] {
unsafe { core::mem::transmute::<[__m128i; 2usize], [i32; 8usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_mask32x8(self, a: &mask32x8<Self>) -> &[i32; 8usize] {
unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i32; 8usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_mask32x8(self, a: &mut mask32x8<Self>) -> &mut [i32; 8usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i32; 8usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_mask32x8(self, a: mask32x8<Self>, dest: &mut [i32; 8usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i32,
dest.as_mut_ptr(),
8usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_mask32x8(self, a: u8x32<Self>) -> mask32x8<Self> {
unsafe {
mask32x8 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_mask32x8(self, a: mask32x8<Self>) -> u8x32<Self> {
unsafe {
u8x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_mask32x8<const SHIFT: usize>(
self,
a: mask32x8<Self>,
b: mask32x8<Self>,
) -> mask32x8<Self> {
unsafe {
if SHIFT >= 8usize {
return b;
}
let result = cross_block_alignr_128x2(
self.cvt_to_bytes_mask32x8(b).val.0,
self.cvt_to_bytes_mask32x8(a).val.0,
SHIFT * 4usize,
);
self.cvt_from_bytes_mask32x8(u8x32 {
val: crate::support::Aligned256(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_mask32x8<const SHIFT: usize>(
self,
a: mask32x8<Self>,
b: mask32x8<Self>,
) -> mask32x8<Self> {
let (a0, a1) = self.split_mask32x8(a);
let (b0, b1) = self.split_mask32x8(b);
self.combine_mask32x4(
self.slide_within_blocks_mask32x4::<SHIFT>(a0, b0),
self.slide_within_blocks_mask32x4::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_mask32x8(a);
let (b0, b1) = self.split_mask32x8(b);
self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1))
}
#[inline(always)]
fn or_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_mask32x8(a);
let (b0, b1) = self.split_mask32x8(b);
self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1))
}
#[inline(always)]
fn xor_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_mask32x8(a);
let (b0, b1) = self.split_mask32x8(b);
self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1))
}
#[inline(always)]
fn not_mask32x8(self, a: mask32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_mask32x8(a);
self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1))
}
#[inline(always)]
fn select_mask32x8(
self,
a: mask32x8<Self>,
b: mask32x8<Self>,
c: mask32x8<Self>,
) -> mask32x8<Self> {
let (a0, a1) = self.split_mask32x8(a);
let (b0, b1) = self.split_mask32x8(b);
let (c0, c1) = self.split_mask32x8(c);
self.combine_mask32x4(
self.select_mask32x4(a0, b0, c0),
self.select_mask32x4(a1, b1, c1),
)
}
#[inline(always)]
fn simd_eq_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
let (a0, a1) = self.split_mask32x8(a);
let (b0, b1) = self.split_mask32x8(b);
self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1))
}
#[inline(always)]
fn any_true_mask32x8(self, a: mask32x8<Self>) -> bool {
let (a0, a1) = self.split_mask32x8(a);
self.any_true_mask32x4(a0) || self.any_true_mask32x4(a1)
}
#[inline(always)]
fn all_true_mask32x8(self, a: mask32x8<Self>) -> bool {
let (a0, a1) = self.split_mask32x8(a);
self.all_true_mask32x4(a0) && self.all_true_mask32x4(a1)
}
#[inline(always)]
fn any_false_mask32x8(self, a: mask32x8<Self>) -> bool {
let (a0, a1) = self.split_mask32x8(a);
self.any_false_mask32x4(a0) || self.any_false_mask32x4(a1)
}
#[inline(always)]
fn all_false_mask32x8(self, a: mask32x8<Self>) -> bool {
let (a0, a1) = self.split_mask32x8(a);
self.all_false_mask32x4(a0) && self.all_false_mask32x4(a1)
}
#[inline(always)]
fn combine_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x16<Self> {
mask32x16 {
val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
simd: self,
}
}
#[inline(always)]
fn split_mask32x8(self, a: mask32x8<Self>) -> (mask32x4<Self>, mask32x4<Self>) {
(
mask32x4 {
val: crate::support::Aligned128(a.val.0[0]),
simd: self,
},
mask32x4 {
val: crate::support::Aligned128(a.val.0[1]),
simd: self,
},
)
}
#[inline(always)]
fn splat_f64x4(self, val: f64) -> f64x4<Self> {
let half = self.splat_f64x2(val);
self.combine_f64x2(half, half)
}
#[inline(always)]
fn load_array_f64x4(self, val: [f64; 4usize]) -> f64x4<Self> {
f64x4 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_f64x4(self, val: &[f64; 4usize]) -> f64x4<Self> {
f64x4 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_f64x4(self, a: f64x4<Self>) -> [f64; 4usize] {
unsafe { core::mem::transmute::<[__m128d; 2usize], [f64; 4usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_f64x4(self, a: &f64x4<Self>) -> &[f64; 4usize] {
unsafe { core::mem::transmute::<&[__m128d; 2usize], &[f64; 4usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_f64x4(self, a: &mut f64x4<Self>) -> &mut [f64; 4usize] {
unsafe { core::mem::transmute::<&mut [__m128d; 2usize], &mut [f64; 4usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_f64x4(self, a: f64x4<Self>, dest: &mut [f64; 4usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const f64,
dest.as_mut_ptr(),
4usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_f64x4(self, a: u8x32<Self>) -> f64x4<Self> {
unsafe {
f64x4 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_f64x4(self, a: f64x4<Self>) -> u8x32<Self> {
unsafe {
u8x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
unsafe {
if SHIFT >= 4usize {
return b;
}
let result = cross_block_alignr_128x2(
self.cvt_to_bytes_f64x4(b).val.0,
self.cvt_to_bytes_f64x4(a).val.0,
SHIFT * 8usize,
);
self.cvt_from_bytes_f64x4(u8x32 {
val: crate::support::Aligned256(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_f64x4<const SHIFT: usize>(
self,
a: f64x4<Self>,
b: f64x4<Self>,
) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_f64x2(
self.slide_within_blocks_f64x2::<SHIFT>(a0, b0),
self.slide_within_blocks_f64x2::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1))
}
#[inline(always)]
fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1))
}
#[inline(always)]
fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1))
}
#[inline(always)]
fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1))
}
#[inline(always)]
fn sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1))
}
#[inline(always)]
fn mul_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1))
}
#[inline(always)]
fn div_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1))
}
#[inline(always)]
fn copysign_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1))
}
#[inline(always)]
fn simd_eq_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1))
}
#[inline(always)]
fn simd_lt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1))
}
#[inline(always)]
fn simd_le_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1))
}
#[inline(always)]
fn simd_ge_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1))
}
#[inline(always)]
fn simd_gt_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> mask64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1))
}
#[inline(always)]
fn zip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
let (a0, _) = self.split_f64x4(a);
let (b0, _) = self.split_f64x4(b);
self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0))
}
#[inline(always)]
fn zip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
let (_, a1) = self.split_f64x4(a);
let (_, b1) = self.split_f64x4(b);
self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1))
}
#[inline(always)]
fn unzip_low_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1))
}
#[inline(always)]
fn unzip_high_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1))
}
#[inline(always)]
fn max_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1))
}
#[inline(always)]
fn min_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1))
}
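    // The `*_precise` variants are assumed to pin down NaN and signed-zero
    // handling, unlike plain min/max which keep the asymmetric
    // `minpd`/`maxpd` behavior where the second operand wins on a NaN.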
#[inline(always)]
fn max_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_f64x2(
self.max_precise_f64x2(a0, b0),
self.max_precise_f64x2(a1, b1),
)
}
#[inline(always)]
fn min_precise_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
self.combine_f64x2(
self.min_precise_f64x2(a0, b0),
self.min_precise_f64x2(a1, b1),
)
}
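    // SSE4.2 provides no FMA instructions, so `mul_add_f64x2` presumably
    // computes a separate multiply and add (one extra rounding step)
    // rather than a fused operation; the wide forms below inherit whatever
    // the 128-bit implementation does.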
#[inline(always)]
fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
let (c0, c1) = self.split_f64x4(c);
self.combine_f64x2(
self.mul_add_f64x2(a0, b0, c0),
self.mul_add_f64x2(a1, b1, c1),
)
}
#[inline(always)]
fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
let (c0, c1) = self.split_f64x4(c);
self.combine_f64x2(
self.mul_sub_f64x2(a0, b0, c0),
self.mul_sub_f64x2(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
}
#[inline(always)]
fn ceil_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
self.combine_f64x2(self.ceil_f64x2(a0), self.ceil_f64x2(a1))
}
#[inline(always)]
fn round_ties_even_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
self.combine_f64x2(
self.round_ties_even_f64x2(a0),
self.round_ties_even_f64x2(a1),
)
}
#[inline(always)]
fn fract_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1))
}
#[inline(always)]
fn trunc_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1))
}
#[inline(always)]
fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_mask64x4(a);
let (b0, b1) = self.split_f64x4(b);
let (c0, c1) = self.split_f64x4(c);
self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1))
}
#[inline(always)]
fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self> {
f64x8 {
val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
simd: self,
}
}
#[inline(always)]
fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>) {
(
f64x2 {
val: crate::support::Aligned128(a.val.0[0]),
simd: self,
},
f64x2 {
val: crate::support::Aligned128(a.val.0[1]),
simd: self,
},
)
}
#[inline(always)]
fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f64x4(a);
self.combine_f32x4(
self.reinterpret_f32_f64x2(a0),
self.reinterpret_f32_f64x2(a1),
)
}
#[inline(always)]
fn splat_mask64x4(self, val: i64) -> mask64x4<Self> {
let half = self.splat_mask64x2(val);
self.combine_mask64x2(half, half)
}
#[inline(always)]
fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self> {
mask64x4 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_mask64x4(self, val: &[i64; 4usize]) -> mask64x4<Self> {
mask64x4 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize] {
unsafe { core::mem::transmute::<[__m128i; 2usize], [i64; 4usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_mask64x4(self, a: &mask64x4<Self>) -> &[i64; 4usize] {
unsafe { core::mem::transmute::<&[__m128i; 2usize], &[i64; 4usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_mask64x4(self, a: &mut mask64x4<Self>) -> &mut [i64; 4usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 2usize], &mut [i64; 4usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_mask64x4(self, a: mask64x4<Self>, dest: &mut [i64; 4usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i64,
dest.as_mut_ptr(),
4usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_mask64x4(self, a: u8x32<Self>) -> mask64x4<Self> {
unsafe {
mask64x4 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_mask64x4(self, a: mask64x4<Self>) -> u8x32<Self> {
unsafe {
u8x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_mask64x4<const SHIFT: usize>(
self,
a: mask64x4<Self>,
b: mask64x4<Self>,
) -> mask64x4<Self> {
unsafe {
if SHIFT >= 4usize {
return b;
}
let result = cross_block_alignr_128x2(
self.cvt_to_bytes_mask64x4(b).val.0,
self.cvt_to_bytes_mask64x4(a).val.0,
SHIFT * 8usize,
);
self.cvt_from_bytes_mask64x4(u8x32 {
val: crate::support::Aligned256(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_mask64x4<const SHIFT: usize>(
self,
a: mask64x4<Self>,
b: mask64x4<Self>,
) -> mask64x4<Self> {
let (a0, a1) = self.split_mask64x4(a);
let (b0, b1) = self.split_mask64x4(b);
self.combine_mask64x2(
self.slide_within_blocks_mask64x2::<SHIFT>(a0, b0),
self.slide_within_blocks_mask64x2::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
let (a0, a1) = self.split_mask64x4(a);
let (b0, b1) = self.split_mask64x4(b);
self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1))
}
#[inline(always)]
fn or_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
let (a0, a1) = self.split_mask64x4(a);
let (b0, b1) = self.split_mask64x4(b);
self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1))
}
#[inline(always)]
fn xor_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
let (a0, a1) = self.split_mask64x4(a);
let (b0, b1) = self.split_mask64x4(b);
self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1))
}
#[inline(always)]
fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self> {
let (a0, a1) = self.split_mask64x4(a);
self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1))
}
#[inline(always)]
fn select_mask64x4(
self,
a: mask64x4<Self>,
b: mask64x4<Self>,
c: mask64x4<Self>,
) -> mask64x4<Self> {
let (a0, a1) = self.split_mask64x4(a);
let (b0, b1) = self.split_mask64x4(b);
let (c0, c1) = self.split_mask64x4(c);
self.combine_mask64x2(
self.select_mask64x2(a0, b0, c0),
self.select_mask64x2(a1, b1, c1),
)
}
#[inline(always)]
fn simd_eq_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
let (a0, a1) = self.split_mask64x4(a);
let (b0, b1) = self.split_mask64x4(b);
self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1))
}
#[inline(always)]
fn any_true_mask64x4(self, a: mask64x4<Self>) -> bool {
let (a0, a1) = self.split_mask64x4(a);
self.any_true_mask64x2(a0) || self.any_true_mask64x2(a1)
}
#[inline(always)]
fn all_true_mask64x4(self, a: mask64x4<Self>) -> bool {
let (a0, a1) = self.split_mask64x4(a);
self.all_true_mask64x2(a0) && self.all_true_mask64x2(a1)
}
#[inline(always)]
fn any_false_mask64x4(self, a: mask64x4<Self>) -> bool {
let (a0, a1) = self.split_mask64x4(a);
self.any_false_mask64x2(a0) || self.any_false_mask64x2(a1)
}
#[inline(always)]
fn all_false_mask64x4(self, a: mask64x4<Self>) -> bool {
let (a0, a1) = self.split_mask64x4(a);
self.all_false_mask64x2(a0) && self.all_false_mask64x2(a1)
}
#[inline(always)]
fn combine_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x8<Self> {
mask64x8 {
val: crate::support::Aligned512([a.val.0[0], a.val.0[1], b.val.0[0], b.val.0[1]]),
simd: self,
}
}
#[inline(always)]
fn split_mask64x4(self, a: mask64x4<Self>) -> (mask64x2<Self>, mask64x2<Self>) {
(
mask64x2 {
val: crate::support::Aligned128(a.val.0[0]),
simd: self,
},
mask64x2 {
val: crate::support::Aligned128(a.val.0[1]),
simd: self,
},
)
}
#[inline(always)]
fn splat_f32x16(self, val: f32) -> f32x16<Self> {
let half = self.splat_f32x8(val);
self.combine_f32x8(half, half)
}
#[inline(always)]
fn load_array_f32x16(self, val: [f32; 16usize]) -> f32x16<Self> {
f32x16 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_f32x16(self, val: &[f32; 16usize]) -> f32x16<Self> {
f32x16 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_f32x16(self, a: f32x16<Self>) -> [f32; 16usize] {
unsafe { core::mem::transmute::<[__m128; 4usize], [f32; 16usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_f32x16(self, a: &f32x16<Self>) -> &[f32; 16usize] {
unsafe { core::mem::transmute::<&[__m128; 4usize], &[f32; 16usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_f32x16(self, a: &mut f32x16<Self>) -> &mut [f32; 16usize] {
unsafe { core::mem::transmute::<&mut [__m128; 4usize], &mut [f32; 16usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const f32,
dest.as_mut_ptr(),
16usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_f32x16(self, a: u8x64<Self>) -> f32x16<Self> {
unsafe {
f32x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
unsafe {
u8x64 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
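    // 512-bit slides use `cross_block_alignr_128x4`, the four-block
    // analogue of the helper used for the 256-bit slides above, with the
    // same lane-count-to-byte-count scaling.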
#[inline(always)]
fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
unsafe {
if SHIFT >= 16usize {
return b;
}
let result = cross_block_alignr_128x4(
self.cvt_to_bytes_f32x16(b).val.0,
self.cvt_to_bytes_f32x16(a).val.0,
SHIFT * 4usize,
);
self.cvt_from_bytes_f32x16(u8x64 {
val: crate::support::Aligned512(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_f32x16<const SHIFT: usize>(
self,
a: f32x16<Self>,
b: f32x16<Self>,
) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_f32x8(
self.slide_within_blocks_f32x8::<SHIFT>(a0, b0),
self.slide_within_blocks_f32x8::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn abs_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1))
}
#[inline(always)]
fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1))
}
#[inline(always)]
fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1))
}
#[inline(always)]
fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1))
}
#[inline(always)]
fn sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1))
}
#[inline(always)]
fn mul_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1))
}
#[inline(always)]
fn div_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1))
}
#[inline(always)]
fn copysign_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1))
}
#[inline(always)]
fn simd_eq_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1))
}
#[inline(always)]
fn simd_lt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1))
}
#[inline(always)]
fn simd_le_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1))
}
#[inline(always)]
fn simd_ge_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1))
}
#[inline(always)]
fn simd_gt_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1))
}
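// Zipping the low halves of two 16-lane vectors already fills all 16 output
// lanes, so only a0/b0 are consumed here: their 8-lane zip_low and zip_high
// together cover the whole result. zip_high_f32x16 below does the same with
// the high halves a1/b1.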
#[inline(always)]
fn zip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
let (a0, _) = self.split_f32x16(a);
let (b0, _) = self.split_f32x16(b);
self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0))
}
#[inline(always)]
fn zip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
let (_, a1) = self.split_f32x16(a);
let (_, b1) = self.split_f32x16(b);
self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1))
}
#[inline(always)]
fn unzip_low_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1))
}
#[inline(always)]
fn unzip_high_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1))
}
#[inline(always)]
fn max_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1))
}
#[inline(always)]
fn min_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1))
}
#[inline(always)]
fn max_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_f32x8(
self.max_precise_f32x8(a0, b0),
self.max_precise_f32x8(a1, b1),
)
}
#[inline(always)]
fn min_precise_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
self.combine_f32x8(
self.min_precise_f32x8(a0, b0),
self.min_precise_f32x8(a1, b1),
)
}
#[inline(always)]
fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
let (c0, c1) = self.split_f32x16(c);
self.combine_f32x8(
self.mul_add_f32x8(a0, b0, c0),
self.mul_add_f32x8(a1, b1, c1),
)
}
#[inline(always)]
fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
let (c0, c1) = self.split_f32x16(c);
self.combine_f32x8(
self.mul_sub_f32x8(a0, b0, c0),
self.mul_sub_f32x8(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
}
#[inline(always)]
fn ceil_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_f32x8(self.ceil_f32x8(a0), self.ceil_f32x8(a1))
}
#[inline(always)]
fn round_ties_even_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_f32x8(
self.round_ties_even_f32x8(a0),
self.round_ties_even_f32x8(a1),
)
}
#[inline(always)]
fn fract_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1))
}
#[inline(always)]
fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1))
}
#[inline(always)]
fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_mask32x16(a);
let (b0, b1) = self.split_f32x16(b);
let (c0, c1) = self.split_f32x16(c);
self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1))
}
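// The 512-bit value is stored as [__m128; 4], so splitting is pure regrouping
// of the four blocks into two Aligned256 pairs; no shuffle instructions are
// emitted at this SIMD level.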
#[inline(always)]
fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>) {
(
f32x8 {
val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
simd: self,
},
f32x8 {
val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
simd: self,
},
)
}
#[inline(always)]
fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_f64x4(
self.reinterpret_f64_f32x8(a0),
self.reinterpret_f64_f32x8(a1),
)
}
#[inline(always)]
fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_i32x8(
self.reinterpret_i32_f32x8(a0),
self.reinterpret_i32_f32x8(a1),
)
}
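// Interleaved load as a 4x4 f32 transpose: _mm_unpacklo/hi_ps interleaves pairs
// of registers, and the pd-cast 64-bit unpacks finish the job, so output
// register i gathers lane i of each source 128-bit block.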
#[inline(always)]
fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
unsafe {
let v0 = _mm_loadu_ps(src.as_ptr() as *const _);
let v1 = _mm_loadu_ps(src.as_ptr().add(4usize) as *const _);
let v2 = _mm_loadu_ps(src.as_ptr().add(2 * 4usize) as *const _);
let v3 = _mm_loadu_ps(src.as_ptr().add(3 * 4usize) as *const _);
let tmp0 = _mm_unpacklo_ps(v0, v1);
let tmp1 = _mm_unpackhi_ps(v0, v1);
let tmp2 = _mm_unpacklo_ps(v2, v3);
let tmp3 = _mm_unpackhi_ps(v2, v3);
let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
self.combine_f32x8(
self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)),
self.combine_f32x4(out2.simd_into(self), out3.simd_into(self)),
)
}
}
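// Inverse of the interleaved load above: the same unpack network re-transposes
// the four blocks (a 4x4 transpose is its own inverse) before storing.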
#[inline(always)]
fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
let (v01, v23) = self.split_f32x16(a);
let (v0, v1) = self.split_f32x8(v01);
let (v2, v3) = self.split_f32x8(v23);
let v0 = v0.into();
let v1 = v1.into();
let v2 = v2.into();
let v3 = v3.into();
unsafe {
let tmp0 = _mm_unpacklo_ps(v0, v1);
let tmp1 = _mm_unpackhi_ps(v0, v1);
let tmp2 = _mm_unpacklo_ps(v2, v3);
let tmp3 = _mm_unpackhi_ps(v2, v3);
let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
_mm_storeu_ps(dest.as_mut_ptr() as *mut _, out0);
_mm_storeu_ps(dest.as_mut_ptr().add(4usize) as *mut _, out1);
_mm_storeu_ps(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2);
_mm_storeu_ps(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3);
}
}
#[inline(always)]
fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
}
#[inline(always)]
fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_u32x8(
self.reinterpret_u32_f32x8(a0),
self.reinterpret_u32_f32x8(a1),
)
}
#[inline(always)]
fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1))
}
#[inline(always)]
fn cvt_u32_precise_f32x16(self, a: f32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_u32x8(
self.cvt_u32_precise_f32x8(a0),
self.cvt_u32_precise_f32x8(a1),
)
}
#[inline(always)]
fn cvt_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1))
}
#[inline(always)]
fn cvt_i32_precise_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_i32x8(
self.cvt_i32_precise_f32x8(a0),
self.cvt_i32_precise_f32x8(a1),
)
}
#[inline(always)]
fn splat_i8x64(self, val: i8) -> i8x64<Self> {
let half = self.splat_i8x32(val);
self.combine_i8x32(half, half)
}
#[inline(always)]
fn load_array_i8x64(self, val: [i8; 64usize]) -> i8x64<Self> {
i8x64 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_i8x64(self, val: &[i8; 64usize]) -> i8x64<Self> {
i8x64 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_i8x64(self, a: i8x64<Self>) -> [i8; 64usize] {
unsafe { core::mem::transmute::<[__m128i; 4usize], [i8; 64usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_i8x64(self, a: &i8x64<Self>) -> &[i8; 64usize] {
unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i8; 64usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_i8x64(self, a: &mut i8x64<Self>) -> &mut [i8; 64usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i8; 64usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_i8x64(self, a: i8x64<Self>, dest: &mut [i8; 64usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i8,
dest.as_mut_ptr(),
64usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_i8x64(self, a: u8x64<Self>) -> i8x64<Self> {
unsafe {
i8x64 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
unsafe {
u8x64 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
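// For i8 a lane is one byte, so the lane shift maps directly onto the byte
// shift taken by cross_block_alignr_128x4 with no scaling.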
#[inline(always)]
fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
unsafe {
if SHIFT >= 64usize {
return b;
}
let result = cross_block_alignr_128x4(
self.cvt_to_bytes_i8x64(b).val.0,
self.cvt_to_bytes_i8x64(a).val.0,
SHIFT,
);
self.cvt_from_bytes_i8x64(u8x64 {
val: crate::support::Aligned512(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_i8x64<const SHIFT: usize>(
self,
a: i8x64<Self>,
b: i8x64<Self>,
) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_i8x32(
self.slide_within_blocks_i8x32::<SHIFT>(a0, b0),
self.slide_within_blocks_i8x32::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn add_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1))
}
#[inline(always)]
fn sub_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1))
}
#[inline(always)]
fn mul_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1))
}
#[inline(always)]
fn and_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1))
}
#[inline(always)]
fn or_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1))
}
#[inline(always)]
fn xor_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1))
}
#[inline(always)]
fn not_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1))
}
#[inline(always)]
fn shl_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
self.combine_i8x32(self.shl_i8x32(a0, shift), self.shl_i8x32(a1, shift))
}
#[inline(always)]
fn shlv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_i8x32(self.shlv_i8x32(a0, b0), self.shlv_i8x32(a1, b1))
}
#[inline(always)]
fn shr_i8x64(self, a: i8x64<Self>, shift: u32) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
self.combine_i8x32(self.shr_i8x32(a0, shift), self.shr_i8x32(a1, shift))
}
#[inline(always)]
fn shrv_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1))
}
#[inline(always)]
fn simd_eq_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1))
}
#[inline(always)]
fn simd_lt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1))
}
#[inline(always)]
fn simd_le_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1))
}
#[inline(always)]
fn simd_ge_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1))
}
#[inline(always)]
fn simd_gt_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> mask8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1))
}
#[inline(always)]
fn zip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
let (a0, _) = self.split_i8x64(a);
let (b0, _) = self.split_i8x64(b);
self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0))
}
#[inline(always)]
fn zip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
let (_, a1) = self.split_i8x64(a);
let (_, b1) = self.split_i8x64(b);
self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1))
}
#[inline(always)]
fn unzip_low_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1))
}
#[inline(always)]
fn unzip_high_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1))
}
#[inline(always)]
fn select_i8x64(self, a: mask8x64<Self>, b: i8x64<Self>, c: i8x64<Self>) -> i8x64<Self> {
let (a0, a1) = self.split_mask8x64(a);
let (b0, b1) = self.split_i8x64(b);
let (c0, c1) = self.split_i8x64(c);
self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1))
}
#[inline(always)]
fn min_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1))
}
#[inline(always)]
fn max_i8x64(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
let (b0, b1) = self.split_i8x64(b);
self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1))
}
#[inline(always)]
fn split_i8x64(self, a: i8x64<Self>) -> (i8x32<Self>, i8x32<Self>) {
(
i8x32 {
val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
simd: self,
},
i8x32 {
val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
simd: self,
},
)
}
#[inline(always)]
fn neg_i8x64(self, a: i8x64<Self>) -> i8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1))
}
#[inline(always)]
fn reinterpret_u8_i8x64(self, a: i8x64<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_i8x64(a);
self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1))
}
#[inline(always)]
fn reinterpret_u32_i8x64(self, a: i8x64<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_i8x64(a);
self.combine_u32x8(
self.reinterpret_u32_i8x32(a0),
self.reinterpret_u32_i8x32(a1),
)
}
#[inline(always)]
fn splat_u8x64(self, val: u8) -> u8x64<Self> {
let half = self.splat_u8x32(val);
self.combine_u8x32(half, half)
}
#[inline(always)]
fn load_array_u8x64(self, val: [u8; 64usize]) -> u8x64<Self> {
u8x64 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_u8x64(self, val: &[u8; 64usize]) -> u8x64<Self> {
u8x64 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_u8x64(self, a: u8x64<Self>) -> [u8; 64usize] {
unsafe { core::mem::transmute::<[__m128i; 4usize], [u8; 64usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_u8x64(self, a: &u8x64<Self>) -> &[u8; 64usize] {
unsafe { core::mem::transmute::<&[__m128i; 4usize], &[u8; 64usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_u8x64(self, a: &mut u8x64<Self>) -> &mut [u8; 64usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [u8; 64usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const u8,
dest.as_mut_ptr(),
64usize,
);
}
}
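// For u8x64 the byte view is the type itself, so both conversions below are
// identity transmutes, present only to keep the trait surface uniform.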
#[inline(always)]
fn cvt_from_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
unsafe {
u8x64 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
unsafe {
u8x64 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
unsafe {
if SHIFT >= 64usize {
return b;
}
let result = cross_block_alignr_128x4(
self.cvt_to_bytes_u8x64(b).val.0,
self.cvt_to_bytes_u8x64(a).val.0,
SHIFT,
);
self.cvt_from_bytes_u8x64(u8x64 {
val: crate::support::Aligned512(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_u8x64<const SHIFT: usize>(
self,
a: u8x64<Self>,
b: u8x64<Self>,
) -> u8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_u8x32(
self.slide_within_blocks_u8x32::<SHIFT>(a0, b0),
self.slide_within_blocks_u8x32::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn add_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1))
}
#[inline(always)]
fn sub_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1))
}
#[inline(always)]
fn mul_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1))
}
#[inline(always)]
fn and_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1))
}
#[inline(always)]
fn or_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1))
}
#[inline(always)]
fn xor_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1))
}
#[inline(always)]
fn not_u8x64(self, a: u8x64<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1))
}
#[inline(always)]
fn shl_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
self.combine_u8x32(self.shl_u8x32(a0, shift), self.shl_u8x32(a1, shift))
}
#[inline(always)]
fn shlv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_u8x32(self.shlv_u8x32(a0, b0), self.shlv_u8x32(a1, b1))
}
#[inline(always)]
fn shr_u8x64(self, a: u8x64<Self>, shift: u32) -> u8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
self.combine_u8x32(self.shr_u8x32(a0, shift), self.shr_u8x32(a1, shift))
}
#[inline(always)]
fn shrv_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1))
}
#[inline(always)]
fn simd_eq_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1))
}
#[inline(always)]
fn simd_lt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1))
}
#[inline(always)]
fn simd_le_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1))
}
#[inline(always)]
fn simd_ge_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1))
}
#[inline(always)]
fn simd_gt_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> mask8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1))
}
#[inline(always)]
fn zip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
let (a0, _) = self.split_u8x64(a);
let (b0, _) = self.split_u8x64(b);
self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0))
}
#[inline(always)]
fn zip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
let (_, a1) = self.split_u8x64(a);
let (_, b1) = self.split_u8x64(b);
self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1))
}
#[inline(always)]
fn unzip_low_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1))
}
#[inline(always)]
fn unzip_high_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1))
}
#[inline(always)]
fn select_u8x64(self, a: mask8x64<Self>, b: u8x64<Self>, c: u8x64<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_mask8x64(a);
let (b0, b1) = self.split_u8x64(b);
let (c0, c1) = self.split_u8x64(c);
self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1))
}
#[inline(always)]
fn min_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1))
}
#[inline(always)]
fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_u8x64(a);
let (b0, b1) = self.split_u8x64(b);
self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1))
}
#[inline(always)]
fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>) {
(
u8x32 {
val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
simd: self,
},
u8x32 {
val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
simd: self,
},
)
}
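// Interleaved byte load: the pshufb mask transposes each register as a 4x4 byte
// matrix (grouping every 4th byte into a dword), then the 32/64-bit unpacks
// transpose those dwords across registers, so each output register collects one
// residue class mod 4 of the byte indices from all four input blocks.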
#[inline(always)]
fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
unsafe {
let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
let v1 = _mm_loadu_si128(src.as_ptr().add(16usize) as *const _);
let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 16usize) as *const _);
let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 16usize) as *const _);
let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
let v0 = _mm_shuffle_epi8(v0, mask);
let v1 = _mm_shuffle_epi8(v1, mask);
let v2 = _mm_shuffle_epi8(v2, mask);
let v3 = _mm_shuffle_epi8(v3, mask);
let tmp0 = _mm_unpacklo_epi32(v0, v1);
let tmp1 = _mm_unpackhi_epi32(v0, v1);
let tmp2 = _mm_unpacklo_epi32(v2, v3);
let tmp3 = _mm_unpackhi_epi32(v2, v3);
let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
self.combine_u8x32(
self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)),
self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)),
)
}
}
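// Inverse of the interleaved byte load: run the dword/qword unpacks first, then
// apply the same per-register mask (the 4x4 byte transpose is self-inverse)
// before storing.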
#[inline(always)]
fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
let (v01, v23) = self.split_u8x64(a);
let (v0, v1) = self.split_u8x32(v01);
let (v2, v3) = self.split_u8x32(v23);
let v0 = v0.into();
let v1 = v1.into();
let v2 = v2.into();
let v3 = v3.into();
unsafe {
let tmp0 = _mm_unpacklo_epi32(v0, v1);
let tmp1 = _mm_unpackhi_epi32(v0, v1);
let tmp2 = _mm_unpacklo_epi32(v2, v3);
let tmp3 = _mm_unpackhi_epi32(v2, v3);
let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
let out0 = _mm_shuffle_epi8(out0, mask);
let out1 = _mm_shuffle_epi8(out1, mask);
let out2 = _mm_shuffle_epi8(out2, mask);
let out3 = _mm_shuffle_epi8(out3, mask);
_mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
_mm_storeu_si128(dest.as_mut_ptr().add(16usize) as *mut _, out1);
_mm_storeu_si128(dest.as_mut_ptr().add(2 * 16usize) as *mut _, out2);
_mm_storeu_si128(dest.as_mut_ptr().add(3 * 16usize) as *mut _, out3);
}
}
#[inline(always)]
fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_u8x64(a);
self.combine_u32x8(
self.reinterpret_u32_u8x32(a0),
self.reinterpret_u32_u8x32(a1),
)
}
#[inline(always)]
fn splat_mask8x64(self, val: i8) -> mask8x64<Self> {
let half = self.splat_mask8x32(val);
self.combine_mask8x32(half, half)
}
#[inline(always)]
fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self> {
mask8x64 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_mask8x64(self, val: &[i8; 64usize]) -> mask8x64<Self> {
mask8x64 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize] {
unsafe { core::mem::transmute::<[__m128i; 4usize], [i8; 64usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_mask8x64(self, a: &mask8x64<Self>) -> &[i8; 64usize] {
unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i8; 64usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_mask8x64(self, a: &mut mask8x64<Self>) -> &mut [i8; 64usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i8; 64usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_mask8x64(self, a: mask8x64<Self>, dest: &mut [i8; 64usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i8,
dest.as_mut_ptr(),
64usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_mask8x64(self, a: u8x64<Self>) -> mask8x64<Self> {
unsafe {
mask8x64 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_mask8x64(self, a: mask8x64<Self>) -> u8x64<Self> {
unsafe {
u8x64 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_mask8x64<const SHIFT: usize>(
self,
a: mask8x64<Self>,
b: mask8x64<Self>,
) -> mask8x64<Self> {
unsafe {
if SHIFT >= 64usize {
return b;
}
let result = cross_block_alignr_128x4(
self.cvt_to_bytes_mask8x64(b).val.0,
self.cvt_to_bytes_mask8x64(a).val.0,
SHIFT,
);
self.cvt_from_bytes_mask8x64(u8x64 {
val: crate::support::Aligned512(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_mask8x64<const SHIFT: usize>(
self,
a: mask8x64<Self>,
b: mask8x64<Self>,
) -> mask8x64<Self> {
let (a0, a1) = self.split_mask8x64(a);
let (b0, b1) = self.split_mask8x64(b);
self.combine_mask8x32(
self.slide_within_blocks_mask8x32::<SHIFT>(a0, b0),
self.slide_within_blocks_mask8x32::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
let (a0, a1) = self.split_mask8x64(a);
let (b0, b1) = self.split_mask8x64(b);
self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1))
}
#[inline(always)]
fn or_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
let (a0, a1) = self.split_mask8x64(a);
let (b0, b1) = self.split_mask8x64(b);
self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1))
}
#[inline(always)]
fn xor_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
let (a0, a1) = self.split_mask8x64(a);
let (b0, b1) = self.split_mask8x64(b);
self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1))
}
#[inline(always)]
fn not_mask8x64(self, a: mask8x64<Self>) -> mask8x64<Self> {
let (a0, a1) = self.split_mask8x64(a);
self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1))
}
#[inline(always)]
fn select_mask8x64(
self,
a: mask8x64<Self>,
b: mask8x64<Self>,
c: mask8x64<Self>,
) -> mask8x64<Self> {
let (a0, a1) = self.split_mask8x64(a);
let (b0, b1) = self.split_mask8x64(b);
let (c0, c1) = self.split_mask8x64(c);
self.combine_mask8x32(
self.select_mask8x32(a0, b0, c0),
self.select_mask8x32(a1, b1, c1),
)
}
#[inline(always)]
fn simd_eq_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
let (a0, a1) = self.split_mask8x64(a);
let (b0, b1) = self.split_mask8x64(b);
self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1))
}
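// Mask reductions test the two 256-bit halves independently and rely on
// `||`/`&&` short-circuiting, so the second half is only examined when the
// first half is inconclusive.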
#[inline(always)]
fn any_true_mask8x64(self, a: mask8x64<Self>) -> bool {
let (a0, a1) = self.split_mask8x64(a);
self.any_true_mask8x32(a0) || self.any_true_mask8x32(a1)
}
#[inline(always)]
fn all_true_mask8x64(self, a: mask8x64<Self>) -> bool {
let (a0, a1) = self.split_mask8x64(a);
self.all_true_mask8x32(a0) && self.all_true_mask8x32(a1)
}
#[inline(always)]
fn any_false_mask8x64(self, a: mask8x64<Self>) -> bool {
let (a0, a1) = self.split_mask8x64(a);
self.any_false_mask8x32(a0) || self.any_false_mask8x32(a1)
}
#[inline(always)]
fn all_false_mask8x64(self, a: mask8x64<Self>) -> bool {
let (a0, a1) = self.split_mask8x64(a);
self.all_false_mask8x32(a0) && self.all_false_mask8x32(a1)
}
#[inline(always)]
fn split_mask8x64(self, a: mask8x64<Self>) -> (mask8x32<Self>, mask8x32<Self>) {
(
mask8x32 {
val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
simd: self,
},
mask8x32 {
val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
simd: self,
},
)
}
#[inline(always)]
fn splat_i16x32(self, val: i16) -> i16x32<Self> {
let half = self.splat_i16x16(val);
self.combine_i16x16(half, half)
}
#[inline(always)]
fn load_array_i16x32(self, val: [i16; 32usize]) -> i16x32<Self> {
i16x32 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_i16x32(self, val: &[i16; 32usize]) -> i16x32<Self> {
i16x32 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_i16x32(self, a: i16x32<Self>) -> [i16; 32usize] {
unsafe { core::mem::transmute::<[__m128i; 4usize], [i16; 32usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_i16x32(self, a: &i16x32<Self>) -> &[i16; 32usize] {
unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i16; 32usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_i16x32(self, a: &mut i16x32<Self>) -> &mut [i16; 32usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i16; 32usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_i16x32(self, a: i16x32<Self>, dest: &mut [i16; 32usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i16,
dest.as_mut_ptr(),
32usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_i16x32(self, a: u8x64<Self>) -> i16x32<Self> {
unsafe {
i16x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
unsafe {
u8x64 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
unsafe {
if SHIFT >= 32usize {
return b;
}
let result = cross_block_alignr_128x4(
self.cvt_to_bytes_i16x32(b).val.0,
self.cvt_to_bytes_i16x32(a).val.0,
SHIFT * 2usize,
);
self.cvt_from_bytes_i16x32(u8x64 {
val: crate::support::Aligned512(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_i16x32<const SHIFT: usize>(
self,
a: i16x32<Self>,
b: i16x32<Self>,
) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_i16x16(
self.slide_within_blocks_i16x16::<SHIFT>(a0, b0),
self.slide_within_blocks_i16x16::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn add_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1))
}
#[inline(always)]
fn sub_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1))
}
#[inline(always)]
fn mul_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1))
}
#[inline(always)]
fn and_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1))
}
#[inline(always)]
fn or_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1))
}
#[inline(always)]
fn xor_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1))
}
#[inline(always)]
fn not_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1))
}
#[inline(always)]
fn shl_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
self.combine_i16x16(self.shl_i16x16(a0, shift), self.shl_i16x16(a1, shift))
}
#[inline(always)]
fn shlv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_i16x16(self.shlv_i16x16(a0, b0), self.shlv_i16x16(a1, b1))
}
#[inline(always)]
fn shr_i16x32(self, a: i16x32<Self>, shift: u32) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
self.combine_i16x16(self.shr_i16x16(a0, shift), self.shr_i16x16(a1, shift))
}
#[inline(always)]
fn shrv_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1))
}
#[inline(always)]
fn simd_eq_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1))
}
#[inline(always)]
fn simd_lt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1))
}
#[inline(always)]
fn simd_le_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1))
}
#[inline(always)]
fn simd_ge_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1))
}
#[inline(always)]
fn simd_gt_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> mask16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1))
}
#[inline(always)]
fn zip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
let (a0, _) = self.split_i16x32(a);
let (b0, _) = self.split_i16x32(b);
self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0))
}
#[inline(always)]
fn zip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
let (_, a1) = self.split_i16x32(a);
let (_, b1) = self.split_i16x32(b);
self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1))
}
#[inline(always)]
fn unzip_low_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1))
}
#[inline(always)]
fn unzip_high_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_i16x16(
self.unzip_high_i16x16(a0, a1),
self.unzip_high_i16x16(b0, b1),
)
}
#[inline(always)]
fn select_i16x32(self, a: mask16x32<Self>, b: i16x32<Self>, c: i16x32<Self>) -> i16x32<Self> {
let (a0, a1) = self.split_mask16x32(a);
let (b0, b1) = self.split_i16x32(b);
let (c0, c1) = self.split_i16x32(c);
self.combine_i16x16(
self.select_i16x16(a0, b0, c0),
self.select_i16x16(a1, b1, c1),
)
}
#[inline(always)]
fn min_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1))
}
#[inline(always)]
fn max_i16x32(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
let (b0, b1) = self.split_i16x32(b);
self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1))
}
#[inline(always)]
fn split_i16x32(self, a: i16x32<Self>) -> (i16x16<Self>, i16x16<Self>) {
(
i16x16 {
val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
simd: self,
},
i16x16 {
val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
simd: self,
},
)
}
#[inline(always)]
fn neg_i16x32(self, a: i16x32<Self>) -> i16x32<Self> {
let (a0, a1) = self.split_i16x32(a);
self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1))
}
#[inline(always)]
fn reinterpret_u8_i16x32(self, a: i16x32<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_i16x32(a);
self.combine_u8x32(
self.reinterpret_u8_i16x16(a0),
self.reinterpret_u8_i16x16(a1),
)
}
#[inline(always)]
fn reinterpret_u32_i16x32(self, a: i16x32<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_i16x32(a);
self.combine_u32x8(
self.reinterpret_u32_i16x16(a0),
self.reinterpret_u32_i16x16(a1),
)
}
#[inline(always)]
fn splat_u16x32(self, val: u16) -> u16x32<Self> {
let half = self.splat_u16x16(val);
self.combine_u16x16(half, half)
}
#[inline(always)]
fn load_array_u16x32(self, val: [u16; 32usize]) -> u16x32<Self> {
u16x32 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_u16x32(self, val: &[u16; 32usize]) -> u16x32<Self> {
u16x32 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_u16x32(self, a: u16x32<Self>) -> [u16; 32usize] {
unsafe { core::mem::transmute::<[__m128i; 4usize], [u16; 32usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_u16x32(self, a: &u16x32<Self>) -> &[u16; 32usize] {
unsafe { core::mem::transmute::<&[__m128i; 4usize], &[u16; 32usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_u16x32(self, a: &mut u16x32<Self>) -> &mut [u16; 32usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [u16; 32usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const u16,
dest.as_mut_ptr(),
32usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_u16x32(self, a: u8x64<Self>) -> u16x32<Self> {
unsafe {
u16x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
unsafe {
u8x64 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
unsafe {
if SHIFT >= 32usize {
return b;
}
let result = cross_block_alignr_128x4(
self.cvt_to_bytes_u16x32(b).val.0,
self.cvt_to_bytes_u16x32(a).val.0,
SHIFT * 2usize,
);
self.cvt_from_bytes_u16x32(u8x64 {
val: crate::support::Aligned512(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_u16x32<const SHIFT: usize>(
self,
a: u16x32<Self>,
b: u16x32<Self>,
) -> u16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_u16x16(
self.slide_within_blocks_u16x16::<SHIFT>(a0, b0),
self.slide_within_blocks_u16x16::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn add_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1))
}
#[inline(always)]
fn sub_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1))
}
#[inline(always)]
fn mul_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1))
}
#[inline(always)]
fn and_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1))
}
#[inline(always)]
fn or_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1))
}
#[inline(always)]
fn xor_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1))
}
#[inline(always)]
fn not_u16x32(self, a: u16x32<Self>) -> u16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1))
}
#[inline(always)]
fn shl_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
self.combine_u16x16(self.shl_u16x16(a0, shift), self.shl_u16x16(a1, shift))
}
#[inline(always)]
fn shlv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_u16x16(self.shlv_u16x16(a0, b0), self.shlv_u16x16(a1, b1))
}
#[inline(always)]
fn shr_u16x32(self, a: u16x32<Self>, shift: u32) -> u16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
self.combine_u16x16(self.shr_u16x16(a0, shift), self.shr_u16x16(a1, shift))
}
#[inline(always)]
fn shrv_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1))
}
#[inline(always)]
fn simd_eq_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1))
}
#[inline(always)]
fn simd_lt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1))
}
#[inline(always)]
fn simd_le_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1))
}
#[inline(always)]
fn simd_ge_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1))
}
#[inline(always)]
fn simd_gt_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> mask16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1))
}
#[inline(always)]
fn zip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
let (a0, _) = self.split_u16x32(a);
let (b0, _) = self.split_u16x32(b);
self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0))
}
#[inline(always)]
fn zip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
let (_, a1) = self.split_u16x32(a);
let (_, b1) = self.split_u16x32(b);
self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1))
}
#[inline(always)]
fn unzip_low_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1))
}
#[inline(always)]
fn unzip_high_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_u16x16(
self.unzip_high_u16x16(a0, a1),
self.unzip_high_u16x16(b0, b1),
)
}
#[inline(always)]
fn select_u16x32(self, a: mask16x32<Self>, b: u16x32<Self>, c: u16x32<Self>) -> u16x32<Self> {
let (a0, a1) = self.split_mask16x32(a);
let (b0, b1) = self.split_u16x32(b);
let (c0, c1) = self.split_u16x32(c);
self.combine_u16x16(
self.select_u16x16(a0, b0, c0),
self.select_u16x16(a1, b1, c1),
)
}
#[inline(always)]
fn min_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1))
}
#[inline(always)]
fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
let (a0, a1) = self.split_u16x32(a);
let (b0, b1) = self.split_u16x32(b);
self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1))
}
#[inline(always)]
fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>) {
(
u16x16 {
val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
simd: self,
},
u16x16 {
val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
simd: self,
},
)
}
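// Interleaved word load, analogous to the byte version: the pshufb mask moves
// 16-bit units, pairing words 0/4, 1/5, 2/6 and 3/7 within each register so the
// dword/qword unpacks can complete the transpose across the four blocks.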
#[inline(always)]
fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
unsafe {
let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
let v1 = _mm_loadu_si128(src.as_ptr().add(8usize) as *const _);
let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 8usize) as *const _);
let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 8usize) as *const _);
let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
let v0 = _mm_shuffle_epi8(v0, mask);
let v1 = _mm_shuffle_epi8(v1, mask);
let v2 = _mm_shuffle_epi8(v2, mask);
let v3 = _mm_shuffle_epi8(v3, mask);
let tmp0 = _mm_unpacklo_epi32(v0, v1);
let tmp1 = _mm_unpackhi_epi32(v0, v1);
let tmp2 = _mm_unpacklo_epi32(v2, v3);
let tmp3 = _mm_unpackhi_epi32(v2, v3);
let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
self.combine_u16x16(
self.combine_u16x8(out0.simd_into(self), out1.simd_into(self)),
self.combine_u16x8(out2.simd_into(self), out3.simd_into(self)),
)
}
}
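// Inverse of the word load: the post-unpack mask here is the inverse
// permutation of the load mask (a 2x4 versus 4x2 word transpose), restoring
// the original element order before the stores.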
#[inline(always)]
fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
let (v01, v23) = self.split_u16x32(a);
let (v0, v1) = self.split_u16x16(v01);
let (v2, v3) = self.split_u16x16(v23);
let v0 = v0.into();
let v1 = v1.into();
let v2 = v2.into();
let v3 = v3.into();
unsafe {
let tmp0 = _mm_unpacklo_epi32(v0, v1);
let tmp1 = _mm_unpackhi_epi32(v0, v1);
let tmp2 = _mm_unpacklo_epi32(v2, v3);
let tmp3 = _mm_unpackhi_epi32(v2, v3);
let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
let out0 = _mm_shuffle_epi8(out0, mask);
let out1 = _mm_shuffle_epi8(out1, mask);
let out2 = _mm_shuffle_epi8(out2, mask);
let out3 = _mm_shuffle_epi8(out3, mask);
_mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
_mm_storeu_si128(dest.as_mut_ptr().add(8usize) as *mut _, out1);
_mm_storeu_si128(dest.as_mut_ptr().add(2 * 8usize) as *mut _, out2);
_mm_storeu_si128(dest.as_mut_ptr().add(3 * 8usize) as *mut _, out3);
}
}
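// Each 256-bit u16x16 half narrows to a 128-bit u8x16; combining the two gives
// the 256-bit u8x32 result, at half the element width of the input.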
#[inline(always)]
fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
let (a0, a1) = self.split_u16x32(a);
self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1))
}
#[inline(always)]
fn reinterpret_u8_u16x32(self, a: u16x32<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_u16x32(a);
self.combine_u8x32(
self.reinterpret_u8_u16x16(a0),
self.reinterpret_u8_u16x16(a1),
)
}
#[inline(always)]
fn reinterpret_u32_u16x32(self, a: u16x32<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_u16x32(a);
self.combine_u32x8(
self.reinterpret_u32_u16x16(a0),
self.reinterpret_u32_u16x16(a1),
)
}
#[inline(always)]
fn splat_mask16x32(self, val: i16) -> mask16x32<Self> {
let half = self.splat_mask16x16(val);
self.combine_mask16x16(half, half)
}
#[inline(always)]
fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self> {
mask16x32 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_mask16x32(self, val: &[i16; 32usize]) -> mask16x32<Self> {
mask16x32 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize] {
unsafe { core::mem::transmute::<[__m128i; 4usize], [i16; 32usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_mask16x32(self, a: &mask16x32<Self>) -> &[i16; 32usize] {
unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i16; 32usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_mask16x32(self, a: &mut mask16x32<Self>) -> &mut [i16; 32usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i16; 32usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_mask16x32(self, a: mask16x32<Self>, dest: &mut [i16; 32usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i16,
dest.as_mut_ptr(),
32usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_mask16x32(self, a: u8x64<Self>) -> mask16x32<Self> {
unsafe {
mask16x32 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_mask16x32(self, a: mask16x32<Self>) -> u8x64<Self> {
unsafe {
u8x64 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_mask16x32<const SHIFT: usize>(
self,
a: mask16x32<Self>,
b: mask16x32<Self>,
) -> mask16x32<Self> {
unsafe {
if SHIFT >= 32usize {
return b;
}
let result = cross_block_alignr_128x4(
self.cvt_to_bytes_mask16x32(b).val.0,
self.cvt_to_bytes_mask16x32(a).val.0,
SHIFT * 2usize,
);
self.cvt_from_bytes_mask16x32(u8x64 {
val: crate::support::Aligned512(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_mask16x32<const SHIFT: usize>(
self,
a: mask16x32<Self>,
b: mask16x32<Self>,
) -> mask16x32<Self> {
let (a0, a1) = self.split_mask16x32(a);
let (b0, b1) = self.split_mask16x32(b);
self.combine_mask16x16(
self.slide_within_blocks_mask16x16::<SHIFT>(a0, b0),
self.slide_within_blocks_mask16x16::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
let (a0, a1) = self.split_mask16x32(a);
let (b0, b1) = self.split_mask16x32(b);
self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1))
}
#[inline(always)]
fn or_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
let (a0, a1) = self.split_mask16x32(a);
let (b0, b1) = self.split_mask16x32(b);
self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1))
}
#[inline(always)]
fn xor_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
let (a0, a1) = self.split_mask16x32(a);
let (b0, b1) = self.split_mask16x32(b);
self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1))
}
#[inline(always)]
fn not_mask16x32(self, a: mask16x32<Self>) -> mask16x32<Self> {
let (a0, a1) = self.split_mask16x32(a);
self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1))
}
#[inline(always)]
fn select_mask16x32(
self,
a: mask16x32<Self>,
b: mask16x32<Self>,
c: mask16x32<Self>,
) -> mask16x32<Self> {
let (a0, a1) = self.split_mask16x32(a);
let (b0, b1) = self.split_mask16x32(b);
let (c0, c1) = self.split_mask16x32(c);
self.combine_mask16x16(
self.select_mask16x16(a0, b0, c0),
self.select_mask16x16(a1, b1, c1),
)
}
#[inline(always)]
fn simd_eq_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
let (a0, a1) = self.split_mask16x32(a);
let (b0, b1) = self.split_mask16x32(b);
self.combine_mask16x16(
self.simd_eq_mask16x16(a0, b0),
self.simd_eq_mask16x16(a1, b1),
)
}
#[inline(always)]
fn any_true_mask16x32(self, a: mask16x32<Self>) -> bool {
let (a0, a1) = self.split_mask16x32(a);
self.any_true_mask16x16(a0) || self.any_true_mask16x16(a1)
}
#[inline(always)]
fn all_true_mask16x32(self, a: mask16x32<Self>) -> bool {
let (a0, a1) = self.split_mask16x32(a);
self.all_true_mask16x16(a0) && self.all_true_mask16x16(a1)
}
#[inline(always)]
fn any_false_mask16x32(self, a: mask16x32<Self>) -> bool {
let (a0, a1) = self.split_mask16x32(a);
self.any_false_mask16x16(a0) || self.any_false_mask16x16(a1)
}
#[inline(always)]
fn all_false_mask16x32(self, a: mask16x32<Self>) -> bool {
let (a0, a1) = self.split_mask16x32(a);
self.all_false_mask16x16(a0) && self.all_false_mask16x16(a1)
}
#[inline(always)]
fn split_mask16x32(self, a: mask16x32<Self>) -> (mask16x16<Self>, mask16x16<Self>) {
(
mask16x16 {
val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
simd: self,
},
mask16x16 {
val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
simd: self,
},
)
}
#[inline(always)]
fn splat_i32x16(self, val: i32) -> i32x16<Self> {
let half = self.splat_i32x8(val);
self.combine_i32x8(half, half)
}
#[inline(always)]
fn load_array_i32x16(self, val: [i32; 16usize]) -> i32x16<Self> {
i32x16 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_i32x16(self, val: &[i32; 16usize]) -> i32x16<Self> {
i32x16 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_i32x16(self, a: i32x16<Self>) -> [i32; 16usize] {
unsafe { core::mem::transmute::<[__m128i; 4usize], [i32; 16usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_i32x16(self, a: &i32x16<Self>) -> &[i32; 16usize] {
unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i32; 16usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_i32x16(self, a: &mut i32x16<Self>) -> &mut [i32; 16usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i32; 16usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_i32x16(self, a: i32x16<Self>, dest: &mut [i32; 16usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i32,
dest.as_mut_ptr(),
16usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_i32x16(self, a: u8x64<Self>) -> i32x16<Self> {
unsafe {
i32x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
unsafe {
u8x64 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
unsafe {
if SHIFT >= 16usize {
return b;
}
let result = cross_block_alignr_128x4(
self.cvt_to_bytes_i32x16(b).val.0,
self.cvt_to_bytes_i32x16(a).val.0,
SHIFT * 4usize,
);
self.cvt_from_bytes_i32x16(u8x64 {
val: crate::support::Aligned512(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_i32x16<const SHIFT: usize>(
self,
a: i32x16<Self>,
b: i32x16<Self>,
) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_i32x8(
self.slide_within_blocks_i32x8::<SHIFT>(a0, b0),
self.slide_within_blocks_i32x8::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn add_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1))
}
#[inline(always)]
fn sub_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1))
}
#[inline(always)]
fn mul_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1))
}
#[inline(always)]
fn and_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1))
}
#[inline(always)]
fn or_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1))
}
#[inline(always)]
fn xor_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1))
}
#[inline(always)]
fn not_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1))
}
#[inline(always)]
fn shl_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
self.combine_i32x8(self.shl_i32x8(a0, shift), self.shl_i32x8(a1, shift))
}
#[inline(always)]
fn shlv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_i32x8(self.shlv_i32x8(a0, b0), self.shlv_i32x8(a1, b1))
}
#[inline(always)]
fn shr_i32x16(self, a: i32x16<Self>, shift: u32) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
self.combine_i32x8(self.shr_i32x8(a0, shift), self.shr_i32x8(a1, shift))
}
#[inline(always)]
fn shrv_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1))
}
#[inline(always)]
fn simd_eq_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1))
}
#[inline(always)]
fn simd_lt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1))
}
#[inline(always)]
fn simd_le_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1))
}
#[inline(always)]
fn simd_ge_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1))
}
#[inline(always)]
fn simd_gt_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1))
}
#[inline(always)]
fn zip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
let (a0, _) = self.split_i32x16(a);
let (b0, _) = self.split_i32x16(b);
self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0))
}
#[inline(always)]
fn zip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
let (_, a1) = self.split_i32x16(a);
let (_, b1) = self.split_i32x16(b);
self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1))
}
#[inline(always)]
fn unzip_low_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1))
}
#[inline(always)]
fn unzip_high_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1))
}
#[inline(always)]
fn select_i32x16(self, a: mask32x16<Self>, b: i32x16<Self>, c: i32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_mask32x16(a);
let (b0, b1) = self.split_i32x16(b);
let (c0, c1) = self.split_i32x16(c);
self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1))
}
#[inline(always)]
fn min_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1))
}
#[inline(always)]
fn max_i32x16(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
let (b0, b1) = self.split_i32x16(b);
self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1))
}
#[inline(always)]
fn split_i32x16(self, a: i32x16<Self>) -> (i32x8<Self>, i32x8<Self>) {
(
i32x8 {
val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
simd: self,
},
i32x8 {
val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
simd: self,
},
)
}
#[inline(always)]
fn neg_i32x16(self, a: i32x16<Self>) -> i32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1))
}
#[inline(always)]
fn reinterpret_u8_i32x16(self, a: i32x16<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_i32x16(a);
self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1))
}
#[inline(always)]
fn reinterpret_u32_i32x16(self, a: i32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
self.combine_u32x8(
self.reinterpret_u32_i32x8(a0),
self.reinterpret_u32_i32x8(a1),
)
}
#[inline(always)]
fn cvt_f32_i32x16(self, a: i32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_i32x16(a);
self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1))
}
#[inline(always)]
fn splat_u32x16(self, val: u32) -> u32x16<Self> {
let half = self.splat_u32x8(val);
self.combine_u32x8(half, half)
}
#[inline(always)]
fn load_array_u32x16(self, val: [u32; 16usize]) -> u32x16<Self> {
u32x16 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_u32x16(self, val: &[u32; 16usize]) -> u32x16<Self> {
u32x16 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_u32x16(self, a: u32x16<Self>) -> [u32; 16usize] {
unsafe { core::mem::transmute::<[__m128i; 4usize], [u32; 16usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_u32x16(self, a: &u32x16<Self>) -> &[u32; 16usize] {
unsafe { core::mem::transmute::<&[__m128i; 4usize], &[u32; 16usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_u32x16(self, a: &mut u32x16<Self>) -> &mut [u32; 16usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [u32; 16usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const u32,
dest.as_mut_ptr(),
16usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_u32x16(self, a: u8x64<Self>) -> u32x16<Self> {
unsafe {
u32x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
unsafe {
u8x64 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
unsafe {
if SHIFT >= 16usize {
return b;
}
let result = cross_block_alignr_128x4(
self.cvt_to_bytes_u32x16(b).val.0,
self.cvt_to_bytes_u32x16(a).val.0,
SHIFT * 4usize,
);
self.cvt_from_bytes_u32x16(u8x64 {
val: crate::support::Aligned512(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_u32x16<const SHIFT: usize>(
self,
a: u32x16<Self>,
b: u32x16<Self>,
) -> u32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_u32x8(
self.slide_within_blocks_u32x8::<SHIFT>(a0, b0),
self.slide_within_blocks_u32x8::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn add_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1))
}
#[inline(always)]
fn sub_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1))
}
#[inline(always)]
fn mul_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1))
}
#[inline(always)]
fn and_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1))
}
#[inline(always)]
fn or_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1))
}
#[inline(always)]
fn xor_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1))
}
#[inline(always)]
fn not_u32x16(self, a: u32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1))
}
#[inline(always)]
fn shl_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
self.combine_u32x8(self.shl_u32x8(a0, shift), self.shl_u32x8(a1, shift))
}
#[inline(always)]
fn shlv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_u32x8(self.shlv_u32x8(a0, b0), self.shlv_u32x8(a1, b1))
}
#[inline(always)]
fn shr_u32x16(self, a: u32x16<Self>, shift: u32) -> u32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
self.combine_u32x8(self.shr_u32x8(a0, shift), self.shr_u32x8(a1, shift))
}
#[inline(always)]
fn shrv_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1))
}
#[inline(always)]
fn simd_eq_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1))
}
#[inline(always)]
fn simd_lt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1))
}
#[inline(always)]
fn simd_le_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1))
}
#[inline(always)]
fn simd_ge_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1))
}
#[inline(always)]
fn simd_gt_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1))
}
#[inline(always)]
fn zip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
let (a0, _) = self.split_u32x16(a);
let (b0, _) = self.split_u32x16(b);
self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0))
}
#[inline(always)]
fn zip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
let (_, a1) = self.split_u32x16(a);
let (_, b1) = self.split_u32x16(b);
self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1))
}
#[inline(always)]
fn unzip_low_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1))
}
#[inline(always)]
fn unzip_high_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1))
}
#[inline(always)]
fn select_u32x16(self, a: mask32x16<Self>, b: u32x16<Self>, c: u32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_mask32x16(a);
let (b0, b1) = self.split_u32x16(b);
let (c0, c1) = self.split_u32x16(c);
self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1))
}
#[inline(always)]
fn min_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1))
}
#[inline(always)]
fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
let (b0, b1) = self.split_u32x16(b);
self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1))
}
#[inline(always)]
fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>) {
(
u32x8 {
val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
simd: self,
},
u32x8 {
val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
simd: self,
},
)
}
#[inline(always)]
fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
unsafe {
// Load the 16 values as four rows of four 32-bit lanes each.
let v0 = _mm_loadu_si128(src.as_ptr() as *const _);
let v1 = _mm_loadu_si128(src.as_ptr().add(4usize) as *const _);
let v2 = _mm_loadu_si128(src.as_ptr().add(2 * 4usize) as *const _);
let v3 = _mm_loadu_si128(src.as_ptr().add(3 * 4usize) as *const _);
// 4x4 transpose of 32-bit lanes: interleave 32-bit lanes, then 64-bit
// pairs, so that output block `k` gathers lane `k` of every input row.
let tmp0 = _mm_unpacklo_epi32(v0, v1);
let tmp1 = _mm_unpackhi_epi32(v0, v1);
let tmp2 = _mm_unpacklo_epi32(v2, v3);
let tmp3 = _mm_unpackhi_epi32(v2, v3);
let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
self.combine_u32x8(
self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)),
self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)),
)
}
}
#[inline(always)]
fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
let (v01, v23) = self.split_u32x16(a);
let (v0, v1) = self.split_u32x8(v01);
let (v2, v3) = self.split_u32x8(v23);
// Unwrap the four raw `__m128i` blocks from the portable wrappers.
let v0 = v0.into();
let v1 = v1.into();
let v2 = v2.into();
let v3 = v3.into();
unsafe {
// Inverse of `load_interleaved_128_u32x16`: the same 4x4 transpose of
// 32-bit lanes puts the data back into interleaved order for the store.
let tmp0 = _mm_unpacklo_epi32(v0, v1);
let tmp1 = _mm_unpackhi_epi32(v0, v1);
let tmp2 = _mm_unpacklo_epi32(v2, v3);
let tmp3 = _mm_unpackhi_epi32(v2, v3);
let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
_mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
_mm_storeu_si128(dest.as_mut_ptr().add(4usize) as *mut _, out1);
_mm_storeu_si128(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2);
_mm_storeu_si128(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3);
}
}
#[inline(always)]
fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
let (a0, a1) = self.split_u32x16(a);
self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1))
}
#[inline(always)]
fn cvt_f32_u32x16(self, a: u32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_u32x16(a);
self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1))
}
#[inline(always)]
fn splat_mask32x16(self, val: i32) -> mask32x16<Self> {
let half = self.splat_mask32x8(val);
self.combine_mask32x8(half, half)
}
#[inline(always)]
fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self> {
mask32x16 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_mask32x16(self, val: &[i32; 16usize]) -> mask32x16<Self> {
mask32x16 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize] {
unsafe { core::mem::transmute::<[__m128i; 4usize], [i32; 16usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_mask32x16(self, a: &mask32x16<Self>) -> &[i32; 16usize] {
unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i32; 16usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_mask32x16(self, a: &mut mask32x16<Self>) -> &mut [i32; 16usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i32; 16usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_mask32x16(self, a: mask32x16<Self>, dest: &mut [i32; 16usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i32,
dest.as_mut_ptr(),
16usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_mask32x16(self, a: u8x64<Self>) -> mask32x16<Self> {
unsafe {
mask32x16 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_mask32x16(self, a: mask32x16<Self>) -> u8x64<Self> {
unsafe {
u8x64 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_mask32x16<const SHIFT: usize>(
self,
a: mask32x16<Self>,
b: mask32x16<Self>,
) -> mask32x16<Self> {
unsafe {
if SHIFT >= 16usize {
return b;
}
let result = cross_block_alignr_128x4(
self.cvt_to_bytes_mask32x16(b).val.0,
self.cvt_to_bytes_mask32x16(a).val.0,
SHIFT * 4usize,
);
self.cvt_from_bytes_mask32x16(u8x64 {
val: crate::support::Aligned512(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_mask32x16<const SHIFT: usize>(
self,
a: mask32x16<Self>,
b: mask32x16<Self>,
) -> mask32x16<Self> {
let (a0, a1) = self.split_mask32x16(a);
let (b0, b1) = self.split_mask32x16(b);
self.combine_mask32x8(
self.slide_within_blocks_mask32x8::<SHIFT>(a0, b0),
self.slide_within_blocks_mask32x8::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_mask32x16(a);
let (b0, b1) = self.split_mask32x16(b);
self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1))
}
#[inline(always)]
fn or_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_mask32x16(a);
let (b0, b1) = self.split_mask32x16(b);
self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1))
}
#[inline(always)]
fn xor_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_mask32x16(a);
let (b0, b1) = self.split_mask32x16(b);
self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1))
}
#[inline(always)]
fn not_mask32x16(self, a: mask32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_mask32x16(a);
self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1))
}
#[inline(always)]
fn select_mask32x16(
self,
a: mask32x16<Self>,
b: mask32x16<Self>,
c: mask32x16<Self>,
) -> mask32x16<Self> {
let (a0, a1) = self.split_mask32x16(a);
let (b0, b1) = self.split_mask32x16(b);
let (c0, c1) = self.split_mask32x16(c);
self.combine_mask32x8(
self.select_mask32x8(a0, b0, c0),
self.select_mask32x8(a1, b1, c1),
)
}
#[inline(always)]
fn simd_eq_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
let (a0, a1) = self.split_mask32x16(a);
let (b0, b1) = self.split_mask32x16(b);
self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1))
}
#[inline(always)]
fn any_true_mask32x16(self, a: mask32x16<Self>) -> bool {
let (a0, a1) = self.split_mask32x16(a);
self.any_true_mask32x8(a0) || self.any_true_mask32x8(a1)
}
#[inline(always)]
fn all_true_mask32x16(self, a: mask32x16<Self>) -> bool {
let (a0, a1) = self.split_mask32x16(a);
self.all_true_mask32x8(a0) && self.all_true_mask32x8(a1)
}
#[inline(always)]
fn any_false_mask32x16(self, a: mask32x16<Self>) -> bool {
let (a0, a1) = self.split_mask32x16(a);
self.any_false_mask32x8(a0) || self.any_false_mask32x8(a1)
}
#[inline(always)]
fn all_false_mask32x16(self, a: mask32x16<Self>) -> bool {
let (a0, a1) = self.split_mask32x16(a);
self.all_false_mask32x8(a0) && self.all_false_mask32x8(a1)
}
#[inline(always)]
fn split_mask32x16(self, a: mask32x16<Self>) -> (mask32x8<Self>, mask32x8<Self>) {
(
mask32x8 {
val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
simd: self,
},
mask32x8 {
val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
simd: self,
},
)
}
#[inline(always)]
fn splat_f64x8(self, val: f64) -> f64x8<Self> {
let half = self.splat_f64x4(val);
self.combine_f64x4(half, half)
}
#[inline(always)]
fn load_array_f64x8(self, val: [f64; 8usize]) -> f64x8<Self> {
f64x8 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_f64x8(self, val: &[f64; 8usize]) -> f64x8<Self> {
f64x8 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_f64x8(self, a: f64x8<Self>) -> [f64; 8usize] {
unsafe { core::mem::transmute::<[__m128d; 4usize], [f64; 8usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_f64x8(self, a: &f64x8<Self>) -> &[f64; 8usize] {
unsafe { core::mem::transmute::<&[__m128d; 4usize], &[f64; 8usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_f64x8(self, a: &mut f64x8<Self>) -> &mut [f64; 8usize] {
unsafe { core::mem::transmute::<&mut [__m128d; 4usize], &mut [f64; 8usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_f64x8(self, a: f64x8<Self>, dest: &mut [f64; 8usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const f64,
dest.as_mut_ptr(),
8usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_f64x8(self, a: u8x64<Self>) -> f64x8<Self> {
unsafe {
f64x8 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_f64x8(self, a: f64x8<Self>) -> u8x64<Self> {
unsafe {
u8x64 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
unsafe {
if SHIFT >= 8usize {
return b;
}
let result = cross_block_alignr_128x4(
self.cvt_to_bytes_f64x8(b).val.0,
self.cvt_to_bytes_f64x8(a).val.0,
SHIFT * 8usize,
);
self.cvt_from_bytes_f64x8(u8x64 {
val: crate::support::Aligned512(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_f64x8<const SHIFT: usize>(
self,
a: f64x8<Self>,
b: f64x8<Self>,
) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_f64x4(
self.slide_within_blocks_f64x4::<SHIFT>(a0, b0),
self.slide_within_blocks_f64x4::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn abs_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1))
}
#[inline(always)]
fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1))
}
#[inline(always)]
fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1))
}
#[inline(always)]
fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1))
}
#[inline(always)]
fn sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1))
}
#[inline(always)]
fn mul_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1))
}
#[inline(always)]
fn div_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1))
}
#[inline(always)]
fn copysign_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1))
}
#[inline(always)]
fn simd_eq_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1))
}
#[inline(always)]
fn simd_lt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1))
}
#[inline(always)]
fn simd_le_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1))
}
#[inline(always)]
fn simd_ge_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1))
}
#[inline(always)]
fn simd_gt_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> mask64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1))
}
#[inline(always)]
fn zip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
let (a0, _) = self.split_f64x8(a);
let (b0, _) = self.split_f64x8(b);
self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0))
}
#[inline(always)]
fn zip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
let (_, a1) = self.split_f64x8(a);
let (_, b1) = self.split_f64x8(b);
self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1))
}
#[inline(always)]
fn unzip_low_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1))
}
#[inline(always)]
fn unzip_high_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1))
}
#[inline(always)]
fn max_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1))
}
#[inline(always)]
fn min_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1))
}
#[inline(always)]
fn max_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_f64x4(
self.max_precise_f64x4(a0, b0),
self.max_precise_f64x4(a1, b1),
)
}
#[inline(always)]
fn min_precise_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
self.combine_f64x4(
self.min_precise_f64x4(a0, b0),
self.min_precise_f64x4(a1, b1),
)
}
#[inline(always)]
fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
let (c0, c1) = self.split_f64x8(c);
self.combine_f64x4(
self.mul_add_f64x4(a0, b0, c0),
self.mul_add_f64x4(a1, b1, c1),
)
}
#[inline(always)]
fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
let (c0, c1) = self.split_f64x8(c);
self.combine_f64x4(
self.mul_sub_f64x4(a0, b0, c0),
self.mul_sub_f64x4(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
}
#[inline(always)]
fn ceil_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
self.combine_f64x4(self.ceil_f64x4(a0), self.ceil_f64x4(a1))
}
#[inline(always)]
fn round_ties_even_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
self.combine_f64x4(
self.round_ties_even_f64x4(a0),
self.round_ties_even_f64x4(a1),
)
}
#[inline(always)]
fn fract_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1))
}
#[inline(always)]
fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1))
}
#[inline(always)]
fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_mask64x8(a);
let (b0, b1) = self.split_f64x8(b);
let (c0, c1) = self.split_f64x8(c);
self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1))
}
#[inline(always)]
fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>) {
(
f64x4 {
val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
simd: self,
},
f64x4 {
val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
simd: self,
},
)
}
#[inline(always)]
fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f64x8(a);
self.combine_f32x8(
self.reinterpret_f32_f64x4(a0),
self.reinterpret_f32_f64x4(a1),
)
}
#[inline(always)]
fn splat_mask64x8(self, val: i64) -> mask64x8<Self> {
let half = self.splat_mask64x4(val);
self.combine_mask64x4(half, half)
}
#[inline(always)]
fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self> {
mask64x8 {
val: unsafe { core::mem::transmute_copy(&val) },
simd: self,
}
}
#[inline(always)]
fn load_array_ref_mask64x8(self, val: &[i64; 8usize]) -> mask64x8<Self> {
mask64x8 {
val: unsafe { core::mem::transmute_copy(val) },
simd: self,
}
}
#[inline(always)]
fn as_array_mask64x8(self, a: mask64x8<Self>) -> [i64; 8usize] {
unsafe { core::mem::transmute::<[__m128i; 4usize], [i64; 8usize]>(a.val.0) }
}
#[inline(always)]
fn as_array_ref_mask64x8(self, a: &mask64x8<Self>) -> &[i64; 8usize] {
unsafe { core::mem::transmute::<&[__m128i; 4usize], &[i64; 8usize]>(&a.val.0) }
}
#[inline(always)]
fn as_array_mut_mask64x8(self, a: &mut mask64x8<Self>) -> &mut [i64; 8usize] {
unsafe { core::mem::transmute::<&mut [__m128i; 4usize], &mut [i64; 8usize]>(&mut a.val.0) }
}
#[inline(always)]
fn store_array_mask64x8(self, a: mask64x8<Self>, dest: &mut [i64; 8usize]) -> () {
unsafe {
core::ptr::copy_nonoverlapping(
(&raw const a.val.0) as *const i64,
dest.as_mut_ptr(),
8usize,
);
}
}
#[inline(always)]
fn cvt_from_bytes_mask64x8(self, a: u8x64<Self>) -> mask64x8<Self> {
unsafe {
mask64x8 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn cvt_to_bytes_mask64x8(self, a: mask64x8<Self>) -> u8x64<Self> {
unsafe {
u8x64 {
val: core::mem::transmute(a.val),
simd: self,
}
}
}
#[inline(always)]
fn slide_mask64x8<const SHIFT: usize>(
self,
a: mask64x8<Self>,
b: mask64x8<Self>,
) -> mask64x8<Self> {
unsafe {
if SHIFT >= 8usize {
return b;
}
let result = cross_block_alignr_128x4(
self.cvt_to_bytes_mask64x8(b).val.0,
self.cvt_to_bytes_mask64x8(a).val.0,
SHIFT * 8usize,
);
self.cvt_from_bytes_mask64x8(u8x64 {
val: crate::support::Aligned512(result),
simd: self,
})
}
}
#[inline(always)]
fn slide_within_blocks_mask64x8<const SHIFT: usize>(
self,
a: mask64x8<Self>,
b: mask64x8<Self>,
) -> mask64x8<Self> {
let (a0, a1) = self.split_mask64x8(a);
let (b0, b1) = self.split_mask64x8(b);
self.combine_mask64x4(
self.slide_within_blocks_mask64x4::<SHIFT>(a0, b0),
self.slide_within_blocks_mask64x4::<SHIFT>(a1, b1),
)
}
#[inline(always)]
fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
let (a0, a1) = self.split_mask64x8(a);
let (b0, b1) = self.split_mask64x8(b);
self.combine_mask64x4(self.and_mask64x4(a0, b0), self.and_mask64x4(a1, b1))
}
#[inline(always)]
fn or_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
let (a0, a1) = self.split_mask64x8(a);
let (b0, b1) = self.split_mask64x8(b);
self.combine_mask64x4(self.or_mask64x4(a0, b0), self.or_mask64x4(a1, b1))
}
#[inline(always)]
fn xor_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
let (a0, a1) = self.split_mask64x8(a);
let (b0, b1) = self.split_mask64x8(b);
self.combine_mask64x4(self.xor_mask64x4(a0, b0), self.xor_mask64x4(a1, b1))
}
#[inline(always)]
fn not_mask64x8(self, a: mask64x8<Self>) -> mask64x8<Self> {
let (a0, a1) = self.split_mask64x8(a);
self.combine_mask64x4(self.not_mask64x4(a0), self.not_mask64x4(a1))
}
#[inline(always)]
fn select_mask64x8(
self,
a: mask64x8<Self>,
b: mask64x8<Self>,
c: mask64x8<Self>,
) -> mask64x8<Self> {
let (a0, a1) = self.split_mask64x8(a);
let (b0, b1) = self.split_mask64x8(b);
let (c0, c1) = self.split_mask64x8(c);
self.combine_mask64x4(
self.select_mask64x4(a0, b0, c0),
self.select_mask64x4(a1, b1, c1),
)
}
#[inline(always)]
fn simd_eq_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
let (a0, a1) = self.split_mask64x8(a);
let (b0, b1) = self.split_mask64x8(b);
self.combine_mask64x4(self.simd_eq_mask64x4(a0, b0), self.simd_eq_mask64x4(a1, b1))
}
#[inline(always)]
fn any_true_mask64x8(self, a: mask64x8<Self>) -> bool {
let (a0, a1) = self.split_mask64x8(a);
self.any_true_mask64x4(a0) || self.any_true_mask64x4(a1)
}
#[inline(always)]
fn all_true_mask64x8(self, a: mask64x8<Self>) -> bool {
let (a0, a1) = self.split_mask64x8(a);
self.all_true_mask64x4(a0) && self.all_true_mask64x4(a1)
}
#[inline(always)]
fn any_false_mask64x8(self, a: mask64x8<Self>) -> bool {
let (a0, a1) = self.split_mask64x8(a);
self.any_false_mask64x4(a0) || self.any_false_mask64x4(a1)
}
#[inline(always)]
fn all_false_mask64x8(self, a: mask64x8<Self>) -> bool {
let (a0, a1) = self.split_mask64x8(a);
self.all_false_mask64x4(a0) && self.all_false_mask64x4(a1)
}
#[inline(always)]
fn split_mask64x8(self, a: mask64x8<Self>) -> (mask64x4<Self>, mask64x4<Self>) {
(
mask64x4 {
val: crate::support::Aligned256([a.val.0[0], a.val.0[1]]),
simd: self,
},
mask64x4 {
val: crate::support::Aligned256([a.val.0[2], a.val.0[3]]),
simd: self,
},
)
}
}
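// Conversions between the raw x86 register types and the portable wrapper
// types. `simd_from` attaches a SIMD token when wrapping a raw register,
// while the plain `From` impls discard it when unwrapping.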
impl<S: Simd> SimdFrom<__m128, S> for f32x4<S> {
#[inline(always)]
fn simd_from(simd: S, arch: __m128) -> Self {
Self {
val: unsafe { core::mem::transmute_copy(&arch) },
simd,
}
}
}
impl<S: Simd> From<f32x4<S>> for __m128 {
#[inline(always)]
fn from(value: f32x4<S>) -> Self {
unsafe { core::mem::transmute_copy(&value.val) }
}
}
impl<S: Simd> SimdFrom<__m128i, S> for i8x16<S> {
#[inline(always)]
fn simd_from(simd: S, arch: __m128i) -> Self {
Self {
val: unsafe { core::mem::transmute_copy(&arch) },
simd,
}
}
}
impl<S: Simd> From<i8x16<S>> for __m128i {
#[inline(always)]
fn from(value: i8x16<S>) -> Self {
unsafe { core::mem::transmute_copy(&value.val) }
}
}
impl<S: Simd> SimdFrom<__m128i, S> for u8x16<S> {
#[inline(always)]
fn simd_from(simd: S, arch: __m128i) -> Self {
Self {
val: unsafe { core::mem::transmute_copy(&arch) },
simd,
}
}
}
impl<S: Simd> From<u8x16<S>> for __m128i {
#[inline(always)]
fn from(value: u8x16<S>) -> Self {
unsafe { core::mem::transmute_copy(&value.val) }
}
}
impl<S: Simd> SimdFrom<__m128i, S> for mask8x16<S> {
#[inline(always)]
fn simd_from(simd: S, arch: __m128i) -> Self {
Self {
val: unsafe { core::mem::transmute_copy(&arch) },
simd,
}
}
}
impl<S: Simd> From<mask8x16<S>> for __m128i {
#[inline(always)]
fn from(value: mask8x16<S>) -> Self {
unsafe { core::mem::transmute_copy(&value.val) }
}
}
impl<S: Simd> SimdFrom<__m128i, S> for i16x8<S> {
#[inline(always)]
fn simd_from(simd: S, arch: __m128i) -> Self {
Self {
val: unsafe { core::mem::transmute_copy(&arch) },
simd,
}
}
}
impl<S: Simd> From<i16x8<S>> for __m128i {
#[inline(always)]
fn from(value: i16x8<S>) -> Self {
unsafe { core::mem::transmute_copy(&value.val) }
}
}
impl<S: Simd> SimdFrom<__m128i, S> for u16x8<S> {
#[inline(always)]
fn simd_from(simd: S, arch: __m128i) -> Self {
Self {
val: unsafe { core::mem::transmute_copy(&arch) },
simd,
}
}
}
impl<S: Simd> From<u16x8<S>> for __m128i {
#[inline(always)]
fn from(value: u16x8<S>) -> Self {
unsafe { core::mem::transmute_copy(&value.val) }
}
}
impl<S: Simd> SimdFrom<__m128i, S> for mask16x8<S> {
#[inline(always)]
fn simd_from(simd: S, arch: __m128i) -> Self {
Self {
val: unsafe { core::mem::transmute_copy(&arch) },
simd,
}
}
}
impl<S: Simd> From<mask16x8<S>> for __m128i {
#[inline(always)]
fn from(value: mask16x8<S>) -> Self {
unsafe { core::mem::transmute_copy(&value.val) }
}
}
impl<S: Simd> SimdFrom<__m128i, S> for i32x4<S> {
#[inline(always)]
fn simd_from(simd: S, arch: __m128i) -> Self {
Self {
val: unsafe { core::mem::transmute_copy(&arch) },
simd,
}
}
}
impl<S: Simd> From<i32x4<S>> for __m128i {
#[inline(always)]
fn from(value: i32x4<S>) -> Self {
unsafe { core::mem::transmute_copy(&value.val) }
}
}
impl<S: Simd> SimdFrom<__m128i, S> for u32x4<S> {
#[inline(always)]
fn simd_from(simd: S, arch: __m128i) -> Self {
Self {
val: unsafe { core::mem::transmute_copy(&arch) },
simd,
}
}
}
impl<S: Simd> From<u32x4<S>> for __m128i {
#[inline(always)]
fn from(value: u32x4<S>) -> Self {
unsafe { core::mem::transmute_copy(&value.val) }
}
}
impl<S: Simd> SimdFrom<__m128i, S> for mask32x4<S> {
#[inline(always)]
fn simd_from(simd: S, arch: __m128i) -> Self {
Self {
val: unsafe { core::mem::transmute_copy(&arch) },
simd,
}
}
}
impl<S: Simd> From<mask32x4<S>> for __m128i {
#[inline(always)]
fn from(value: mask32x4<S>) -> Self {
unsafe { core::mem::transmute_copy(&value.val) }
}
}
impl<S: Simd> SimdFrom<__m128d, S> for f64x2<S> {
#[inline(always)]
fn simd_from(simd: S, arch: __m128d) -> Self {
Self {
val: unsafe { core::mem::transmute_copy(&arch) },
simd,
}
}
}
impl<S: Simd> From<f64x2<S>> for __m128d {
#[inline(always)]
fn from(value: f64x2<S>) -> Self {
unsafe { core::mem::transmute_copy(&value.val) }
}
}
impl<S: Simd> SimdFrom<__m128i, S> for mask64x2<S> {
#[inline(always)]
fn simd_from(simd: S, arch: __m128i) -> Self {
Self {
val: unsafe { core::mem::transmute_copy(&arch) },
simd,
}
}
}
impl<S: Simd> From<mask64x2<S>> for __m128i {
#[inline(always)]
fn from(value: mask64x2<S>) -> Self {
unsafe { core::mem::transmute_copy(&value.val) }
}
}
#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
#[doc = r" Rust doesn't currently let you do math on const generics."]
#[inline(always)]
unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i {
unsafe {
match shift {
0usize => _mm_alignr_epi8::<0i32>(a, b),
1usize => _mm_alignr_epi8::<1i32>(a, b),
2usize => _mm_alignr_epi8::<2i32>(a, b),
3usize => _mm_alignr_epi8::<3i32>(a, b),
4usize => _mm_alignr_epi8::<4i32>(a, b),
5usize => _mm_alignr_epi8::<5i32>(a, b),
6usize => _mm_alignr_epi8::<6i32>(a, b),
7usize => _mm_alignr_epi8::<7i32>(a, b),
8usize => _mm_alignr_epi8::<8i32>(a, b),
9usize => _mm_alignr_epi8::<9i32>(a, b),
10usize => _mm_alignr_epi8::<10i32>(a, b),
11usize => _mm_alignr_epi8::<11i32>(a, b),
12usize => _mm_alignr_epi8::<12i32>(a, b),
13usize => _mm_alignr_epi8::<13i32>(a, b),
14usize => _mm_alignr_epi8::<14i32>(a, b),
15usize => _mm_alignr_epi8::<15i32>(a, b),
_ => unreachable!(),
}
}
}
#[doc = r" Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."]
#[doc = r" Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics."]
#[inline(always)]
unsafe fn cross_block_alignr_128x2(
a: [__m128i; 2usize],
b: [__m128i; 2usize],
shift_bytes: usize,
) -> [__m128i; 2usize] {
[
{
let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 0usize, shift_bytes);
unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
},
{
let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 1usize, shift_bytes);
unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
},
]
}
#[doc = r" Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."]
#[doc = r" Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics."]
#[inline(always)]
unsafe fn cross_block_alignr_128x4(
a: [__m128i; 4usize],
b: [__m128i; 4usize],
shift_bytes: usize,
) -> [__m128i; 4usize] {
[
{
let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 0usize, shift_bytes);
unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
},
{
let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 1usize, shift_bytes);
unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
},
{
let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 2usize, shift_bytes);
unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
},
{
let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 3usize, shift_bytes);
unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
},
]
}