use super::*;
// Feature-token struct for this module, declared through the `simd_type!` macro
// (defined outside this file). Each field's type is produced by the `f!` macro
// from a CPU feature name string.
// NOTE(review): presumably a `V1` value can only exist when `sse`, `sse2` and
// `fxsr` are all available on the running CPU — confirm against `simd_type!`.
simd_type!({
#[allow(missing_docs)]
pub struct V1 {
// Field through which the SSE intrinsics are called (`self.sse._mm_*`).
pub sse: f!("sse"),
// Field through which the SSE2 intrinsics are called (`self.sse2._mm_*`).
pub sse2: f!("sse2"),
// FXSR feature field; none of the methods in this file call through it.
pub fxsr: f!("fxsr"),
}
});
// `Seal` is declared outside this file (it arrives through `use super::*`).
// NOTE(review): presumably a sealing marker trait that restricts which types
// may implement the crate's SIMD traits — confirm at its definition.
impl Seal for V1 {}
impl V1 {
// Lane-wise binary operations generated by the `binop_128*!` macros (defined
// outside this file). Each invocation gives: the feature field to call through
// (`sse` or `sse2`), the operation name, the doc string for the generated
// methods, an optional method-name stem (e.g. `wrapping_add`), and the element
// types with lane counts (`ty x lanes`, with `=> mask_ty` for comparisons).
// NOTE(review): generated method names appear to follow `<stem>_<ty>x<lanes>`
// (this impl calls e.g. `and_f32x4` and `cmp_lt_i16x8`, which are not written
// by hand) — confirm against the macro definitions.

// Addition: floating point, then wrapping integer addition.
binop_128_nosign!(sse: add, "Computes `a + b` for each lane of `a` and `b`.", f32 x 4);
binop_128_nosign!(sse2: add, "Adds the elements of each lane of `a` and `b`.", f64 x 2);
binop_128_nosign!(sse2: add, "Adds the elements of each lane of `a` and `b`, with wrapping on overflow.", wrapping_add, u8 x 16, i8 x 16, u16 x 8, i16 x 8, u32 x 4, i32 x 4, u64 x 2, i64 x 2);
// Bitwise AND for float, mask and integer vectors.
binop_128!(sse: and, "Returns `a & b` for each bit in `a` and `b`.", f32 x 4);
binop_128!(sse2: and, "Returns `a & b` for each bit in `a` and `b`.", f64 x 2);
binop_128_full!(sse2: and, "Returns `a & b` for each bit in `a` and `b`.", m8 x 16, u8 x 16, i8 x 16, m16 x 8, u16 x 8, i16 x 8, m32 x 4, u32 x 4, i32 x 4, m64 x 2, u64 x 2, i64 x 2);
// Bitwise AND-NOT; note the complement applies to the first operand (`!a & b`).
binop_128!(sse: andnot, "Returns `!a & b` for each bit in `a` and `b`.", f32 x 4);
binop_128!(sse2: andnot, "Returns `!a & b` for each bit in `a` and `b`.", f64 x 2);
binop_128_full!(sse2: andnot, "Returns `!a & b` for each bit in `a` and `b`.", m8 x 16, u8 x 16, i8 x 16, m16 x 8, u16 x 8, i16 x 8, m32 x 4, u32 x 4, i32 x 4, m64 x 2, u64 x 2, i64 x 2);
// Averaging, listed only for unsigned 8-bit and 16-bit lanes.
binop_128!(sse2: avg, "Computes `average(a, b)` for each lane of `a` and `b`.", average, u8 x 16, u16 x 8);
// Equality comparisons; the type after `=>` is the mask element type of the result.
binop_128_nosign!(sse: cmpeq, "Compares the elements in each lane of `a` and `b` for equality.", cmp_eq, f32 x 4 => m32);
binop_128_nosign!(sse2: cmpeq, "Compares the elements in each lane of `a` and `b` for equality.", cmp_eq, m8 x 16 => m8, u8 x 16 => m8, i8 x 16 => m8, m16 x 8 => m16, u16 x 8 => m16, i16 x 8 => m16, m32 x 4 => m32, u32 x 4 => m32, i32 x 4 => m32, f64 x 2 => m64);
// Greater-than-or-equal comparisons, listed here for floating point only; the
// integer `cmp_ge_*` variants are hand-written methods in this impl.
binop_128!(sse: cmpge, "Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.", cmp_ge, f32 x 4 => m32);
binop_128!(sse2: cmpge, "Compares the elements in each lane of `a` and `b` for greater-than-or-equal-to.", cmp_ge, f64 x 2 => m64);
// Greater-than comparisons for `f32`, signed integers and `f64`. The type after
// `=>` is the mask element type of the result. The unsigned `cmp_gt_*` variants
// are hand-written methods in this impl.
binop_128!(sse: cmpgt, "Compares the elements in each lane of `a` and `b` for greater-than.", cmp_gt, f32 x 4 => m32);
binop_128!(sse2: cmpgt, "Compares the elements in each lane of `a` and `b` for greater-than.", cmp_gt, i8 x 16 => m8, i16 x 8 => m16, i32 x 4 => m32, f64 x 2 => m64);
// Less-than comparisons for the same set of element types.
binop_128!(sse: cmplt, "Compares the elements in each lane of `a` and `b` for less-than.", cmp_lt, f32 x 4 => m32);
binop_128!(sse2: cmplt, "Compares the elements in each lane of `a` and `b` for less-than.", cmp_lt, i8 x 16 => m8, i16 x 8 => m16, i32 x 4 => m32, f64 x 2 => m64);
// Floating-point comparison and division operations generated by `binop_128!`
// (defined outside this file). The type after `=>` is the mask element type of
// the comparison result.

// Less-than-or-equal.
binop_128!(sse: cmple, "Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.", cmp_le, f32 x 4 => m32);
binop_128!(sse2: cmple, "Compares the elements in each lane of `a` and `b` for less-than-or-equal-to.", cmp_le, f64 x 2 => m64);
// Inequality.
binop_128!(sse: cmpneq, "Compares the elements in each lane of `a` and `b` for inequality.", cmp_not_eq, f32 x 4 => m32);
binop_128!(sse2: cmpneq, "Compares the elements in each lane of `a` and `b` for inequality.", cmp_not_eq, f64 x 2 => m64);
// Negated ordered comparisons (not-ge, not-gt, not-le, not-lt).
// NOTE(review): for floats these presumably differ from the opposite comparison
// when a lane is NaN — confirm against the intrinsic documentation.
binop_128!(sse: cmpnge, "Compares the elements in each lane of `a` and `b` for not-greater-than-or-equal.", cmp_not_ge, f32 x 4 => m32);
binop_128!(sse2: cmpnge, "Compares the elements in each lane of `a` and `b` for not-greater-than-or-equal.", cmp_not_ge, f64 x 2 => m64);
binop_128!(sse: cmpngt, "Compares the elements in each lane of `a` and `b` for not-greater-than.", cmp_not_gt, f32 x 4 => m32);
binop_128!(sse2: cmpngt, "Compares the elements in each lane of `a` and `b` for not-greater-than.", cmp_not_gt, f64 x 2 => m64);
binop_128!(sse: cmpnle, "Compares the elements in each lane of `a` and `b` for not-less-than-or-equal.", cmp_not_le, f32 x 4 => m32);
binop_128!(sse2: cmpnle, "Compares the elements in each lane of `a` and `b` for not-less-than-or-equal.", cmp_not_le, f64 x 2 => m64);
binop_128!(sse: cmpnlt, "Compares the elements in each lane of `a` and `b` for not-less-than.", cmp_not_lt, f32 x 4 => m32);
binop_128!(sse2: cmpnlt, "Compares the elements in each lane of `a` and `b` for not-less-than.", cmp_not_lt, f64 x 2 => m64);
// Division (floating point only).
binop_128!(sse: div, "Divides the elements of each lane of `a` and `b`.", f32 x 4);
binop_128!(sse2: div, "Divides the elements of each lane of `a` and `b`.", f64 x 2);
// Lane-wise maximum for `f32`, `u8`, `i16` and `f64` lanes.
binop_128!(sse: max, "Computes `max(a, b)` for each lane in `a` and `b`.", f32 x 4);
binop_128!(sse2: max, "Computes `max(a, b)` for each lane in `a` and `b`.", u8 x 16, i16 x 8, f64 x 2);
// Lane-wise minimum for the same set of element types.
binop_128!(sse: min, "Computes `min(a, b)` for each lane in `a` and `b`.", f32 x 4);
binop_128!(sse2: min, "Computes `min(a, b)` for each lane in `a` and `b`.", u8 x 16, i16 x 8, f64 x 2);
// Arithmetic and bitwise operations generated by the `binop_128*!` macros
// (defined outside this file). Where an identifier follows the doc string
// (e.g. `wrapping_mul`), it appears to be the stem of the generated method name
// instead of the operation name — confirm against the macro definitions.

// Multiplication: floating point, then the low 16 bits of 16-bit integer products.
binop_128!(sse: mul, "Computes `a * b` for each lane in `a` and `b`.", f32 x 4);
binop_128!(sse2: mul, "Computes `a * b` for each lane in `a` and `b`.", f64 x 2);
binop_128_nosign!(sse2: mullo, "Computes `a * b` for each lane in `a` and `b`, with wrapping overflow.", wrapping_mul, u16 x 8, i16 x 8);
// Bitwise OR for float, mask and integer vectors.
binop_128!(sse: or, "Returns `a | b` for each bit in `a` and `b`.", f32 x 4);
binop_128!(sse2: or, "Returns `a | b` for each bit in `a` and `b`.", f64 x 2);
binop_128_full!(sse2: or, "Returns `a | b` for each bit in `a` and `b`.", m8 x 16, u8 x 16, i8 x 16, m16 x 8, u16 x 8, i16 x 8, m32 x 4, u32 x 4, i32 x 4, m64 x 2, u64 x 2, i64 x 2);
// Saturating addition and subtraction, listed for 8-bit and 16-bit lanes only.
binop_128!(sse2: adds, "Adds the elements of each lane of `a` and `b`, with saturation.", saturating_add, u8 x 16, i8 x 16, u16 x 8, i16 x 8);
binop_128!(sse2: subs, "Subtracts the elements of each lane of `a` and `b`, with saturation.", saturating_sub, u8 x 16, i8 x 16, u16 x 8, i16 x 8);
// Subtraction: floating point, then wrapping integer subtraction.
binop_128_nosign!(sse: sub, "Subtracts the elements of each lane of `a` and `b`.", f32 x 4);
binop_128_nosign!(sse2: sub, "Subtracts the elements of each lane of `a` and `b`.", f64 x 2);
binop_128_nosign!(sse2: sub, "Subtracts the elements of each lane of `a` and `b`, with wrapping overflow.", wrapping_sub, u8 x 16, i8 x 16, u16 x 8, i16 x 8, u32 x 4, i32 x 4, u64 x 2, i64 x 2);
// Bitwise XOR for float, mask and integer vectors (also the basis of the
// hand-written `not_*` methods in this impl).
binop_128!(sse: xor, "Returns `a ^ b` for each bit in `a` and `b`.", f32 x 4);
binop_128!(sse2: xor, "Returns `a ^ b` for each bit in `a` and `b`.", f64 x 2);
binop_128_full!(sse2: xor, "Returns `a ^ b` for each bit in `a` and `b`.", m8 x 16, u8 x 16, i8 x 16, m16 x 8, u16 x 8, i16 x 8, m32 x 4, u32 x 4, i32 x 4, m64 x 2, u64 x 2, i64 x 2);
/// Returns the absolute value of each lane of `a`.
///
/// Implemented as a bitwise AND with a mask that has every bit set except the
/// most significant (sign) bit of each 32-bit lane.
#[inline(always)]
pub fn abs_f32x4(self, a: f32x4) -> f32x4 {
    let magnitude_bits = self.splat_u32x4(0x7FFF_FFFF);
    self.and_f32x4(a, cast!(magnitude_bits))
}
/// Returns the absolute value of each lane of `a`.
///
/// Implemented as a bitwise AND with a mask that has every bit set except the
/// most significant (sign) bit of each 64-bit lane.
#[inline(always)]
pub fn abs_f64x2(self, a: f64x2) -> f64x2 {
    let magnitude_bits = self.splat_u64x2(0x7FFF_FFFF_FFFF_FFFF);
    self.and_f64x2(a, cast!(magnitude_bits))
}
/// Computes an approximation of `1 / a` for each lane of `a` (`_mm_rcp_ps`).
#[inline(always)]
pub fn approx_reciprocal_f32x4(self, a: f32x4) -> f32x4 {
    let estimate = self.sse._mm_rcp_ps(cast!(a));
    cast!(estimate)
}
/// Computes an approximation of `1 / sqrt(a)` for each lane of `a`
/// (`_mm_rsqrt_ps`).
#[inline(always)]
pub fn approx_reciprocal_sqrt_f32x4(self, a: f32x4) -> f32x4 {
    let estimate = self.sse._mm_rsqrt_ps(cast!(a));
    cast!(estimate)
}
/// Compares the elements in each lane of `a` and `b` for
/// greater-than-or-equal-to, computed as the negation of `a < b`.
#[inline(always)]
pub fn cmp_ge_i16x8(self, a: i16x8, b: i16x8) -> m16x8 {
    let less_than = self.cmp_lt_i16x8(a, b);
    self.not_m16x8(less_than)
}
/// Compares the elements in each lane of `a` and `b` for
/// greater-than-or-equal-to, computed as the negation of `a < b`.
#[inline(always)]
pub fn cmp_ge_i32x4(self, a: i32x4, b: i32x4) -> m32x4 {
    let less_than = self.cmp_lt_i32x4(a, b);
    self.not_m32x4(less_than)
}
/// Compares the elements in each lane of `a` and `b` for
/// greater-than-or-equal-to, computed as the negation of `a < b`.
#[inline(always)]
pub fn cmp_ge_i8x16(self, a: i8x16, b: i8x16) -> m8x16 {
    let less_than = self.cmp_lt_i8x16(a, b);
    self.not_m8x16(less_than)
}
/// Compares the unsigned elements in each lane of `a` and `b` for
/// greater-than-or-equal-to, computed as the negation of `a < b`.
#[inline(always)]
pub fn cmp_ge_u16x8(self, a: u16x8, b: u16x8) -> m16x8 {
    let less_than = self.cmp_lt_u16x8(a, b);
    self.not_m16x8(less_than)
}
/// Compares the unsigned elements in each lane of `a` and `b` for
/// greater-than-or-equal-to, computed as the negation of `a < b`.
#[inline(always)]
pub fn cmp_ge_u32x4(self, a: u32x4, b: u32x4) -> m32x4 {
    let less_than = self.cmp_lt_u32x4(a, b);
    self.not_m32x4(less_than)
}
/// Compares the unsigned elements in each lane of `a` and `b` for
/// greater-than-or-equal-to, computed as the negation of `a < b`.
#[inline(always)]
pub fn cmp_ge_u8x16(self, a: u8x16, b: u8x16) -> m8x16 {
    let less_than = self.cmp_lt_u8x16(a, b);
    self.not_m8x16(less_than)
}
/// Compares the unsigned elements in each lane of `a` and `b` for greater-than.
///
/// Both operands have their top bit flipped so that unsigned ordering maps onto
/// signed ordering, then the signed comparison is used.
#[inline(always)]
pub fn cmp_gt_u16x8(self, a: u16x8, b: u16x8) -> m16x8 {
    let top_bit = self.splat_u16x8(0x8000);
    let lhs = self.xor_u16x8(a, top_bit);
    let rhs = self.xor_u16x8(b, top_bit);
    self.cmp_gt_i16x8(cast!(lhs), cast!(rhs))
}
/// Compares the unsigned elements in each lane of `a` and `b` for greater-than.
///
/// Both operands have their top bit flipped so that unsigned ordering maps onto
/// signed ordering, then the signed comparison is used.
#[inline(always)]
pub fn cmp_gt_u32x4(self, a: u32x4, b: u32x4) -> m32x4 {
    let top_bit = self.splat_u32x4(0x80000000);
    let lhs = self.xor_u32x4(a, top_bit);
    let rhs = self.xor_u32x4(b, top_bit);
    self.cmp_gt_i32x4(cast!(lhs), cast!(rhs))
}
/// Compares the unsigned elements in each lane of `a` and `b` for greater-than.
///
/// Both operands have their top bit flipped so that unsigned ordering maps onto
/// signed ordering, then the signed comparison is used.
#[inline(always)]
pub fn cmp_gt_u8x16(self, a: u8x16, b: u8x16) -> m8x16 {
    let top_bit = self.splat_u8x16(0x80);
    let lhs = self.xor_u8x16(a, top_bit);
    let rhs = self.xor_u8x16(b, top_bit);
    self.cmp_gt_i8x16(cast!(lhs), cast!(rhs))
}
/// Compares the elements in each lane of `a` and `b` for
/// less-than-or-equal-to, computed as the negation of `a > b`.
#[inline(always)]
pub fn cmp_le_i16x8(self, a: i16x8, b: i16x8) -> m16x8 {
    let greater_than = self.cmp_gt_i16x8(a, b);
    self.not_m16x8(greater_than)
}
/// Compares the elements in each lane of `a` and `b` for
/// less-than-or-equal-to, computed as the negation of `a > b`.
#[inline(always)]
pub fn cmp_le_i32x4(self, a: i32x4, b: i32x4) -> m32x4 {
    let greater_than = self.cmp_gt_i32x4(a, b);
    self.not_m32x4(greater_than)
}
/// Compares the elements in each lane of `a` and `b` for
/// less-than-or-equal-to, computed as the negation of `a > b`.
#[inline(always)]
pub fn cmp_le_i8x16(self, a: i8x16, b: i8x16) -> m8x16 {
    let greater_than = self.cmp_gt_i8x16(a, b);
    self.not_m8x16(greater_than)
}
/// Compares the unsigned elements in each lane of `a` and `b` for
/// less-than-or-equal-to, computed as the negation of `a > b`.
#[inline(always)]
pub fn cmp_le_u16x8(self, a: u16x8, b: u16x8) -> m16x8 {
    let greater_than = self.cmp_gt_u16x8(a, b);
    self.not_m16x8(greater_than)
}
/// Compares the unsigned elements in each lane of `a` and `b` for
/// less-than-or-equal-to, computed as the negation of `a > b`.
#[inline(always)]
pub fn cmp_le_u32x4(self, a: u32x4, b: u32x4) -> m32x4 {
    let greater_than = self.cmp_gt_u32x4(a, b);
    self.not_m32x4(greater_than)
}
/// Compares the unsigned elements in each lane of `a` and `b` for
/// less-than-or-equal-to, computed as the negation of `a > b`.
#[inline(always)]
pub fn cmp_le_u8x16(self, a: u8x16, b: u8x16) -> m8x16 {
    let greater_than = self.cmp_gt_u8x16(a, b);
    self.not_m8x16(greater_than)
}
/// Compares the unsigned elements in each lane of `a` and `b` for less-than.
///
/// Both operands have their top bit flipped so that unsigned ordering maps onto
/// signed ordering, then the signed comparison is used.
#[inline(always)]
pub fn cmp_lt_u16x8(self, a: u16x8, b: u16x8) -> m16x8 {
    let top_bit = self.splat_u16x8(0x8000);
    let lhs = self.xor_u16x8(a, top_bit);
    let rhs = self.xor_u16x8(b, top_bit);
    self.cmp_lt_i16x8(cast!(lhs), cast!(rhs))
}
/// Compares the unsigned elements in each lane of `a` and `b` for less-than.
///
/// Both operands have their top bit flipped so that unsigned ordering maps onto
/// signed ordering, then the signed comparison is used.
#[inline(always)]
pub fn cmp_lt_u32x4(self, a: u32x4, b: u32x4) -> m32x4 {
    let top_bit = self.splat_u32x4(0x80000000);
    let lhs = self.xor_u32x4(a, top_bit);
    let rhs = self.xor_u32x4(b, top_bit);
    self.cmp_lt_i32x4(cast!(lhs), cast!(rhs))
}
/// Compares the unsigned elements in each lane of `a` and `b` for less-than.
///
/// Both operands have their top bit flipped so that unsigned ordering maps onto
/// signed ordering, then the signed comparison is used.
#[inline(always)]
pub fn cmp_lt_u8x16(self, a: u8x16, b: u8x16) -> m8x16 {
    let top_bit = self.splat_u8x16(0x80);
    let lhs = self.xor_u8x16(a, top_bit);
    let rhs = self.xor_u8x16(b, top_bit);
    self.cmp_lt_i8x16(cast!(lhs), cast!(rhs))
}
/// Converts `f32` lanes of `a` to `f64` with `_mm_cvtps_pd`. Only two `f64`
/// lanes fit in the result, so the intrinsic converts the two lowest lanes.
#[inline(always)]
pub fn convert_f32x4_to_f64x2(self, a: f32x4) -> f64x2 {
    let widened = self.sse2._mm_cvtps_pd(cast!(a));
    cast!(widened)
}
/// Converts each `f32` lane of `a` to `i32` with `_mm_cvttps_epi32`, the
/// truncating (round-toward-zero) conversion.
#[inline(always)]
pub fn convert_f32x4_to_i32x4(self, a: f32x4) -> i32x4 {
    let truncated = self.sse2._mm_cvttps_epi32(cast!(a));
    cast!(truncated)
}
/// Converts the two `f64` lanes of `a` to `f32` with `_mm_cvtpd_ps`. The
/// converted values occupy the two lowest lanes of the result.
#[inline(always)]
pub fn convert_f64x2_to_f32x4(self, a: f64x2) -> f32x4 {
    let narrowed = self.sse2._mm_cvtpd_ps(cast!(a));
    cast!(narrowed)
}
/// Converts the two `f64` lanes of `a` to `i32` with `_mm_cvttpd_epi32`, the
/// truncating (round-toward-zero) conversion. The converted values occupy the
/// two lowest lanes of the result.
#[inline(always)]
pub fn convert_f64x2_to_i32x4(self, a: f64x2) -> i32x4 {
    let truncated = self.sse2._mm_cvttpd_epi32(cast!(a));
    cast!(truncated)
}
/// Reinterprets `a` as `u16x8` through `cast!`; no intrinsic is involved.
#[inline(always)]
pub fn convert_i16x8_to_u16x8(self, a: i16x8) -> u16x8 {
    let reinterpreted: u16x8 = cast!(a);
    reinterpreted
}
/// Converts each `i32` lane of `a` to `f32` with `_mm_cvtepi32_ps`.
#[inline(always)]
pub fn convert_i32x4_to_f32x4(self, a: i32x4) -> f32x4 {
    let as_float = self.sse2._mm_cvtepi32_ps(cast!(a));
    cast!(as_float)
}
/// Converts `i32` lanes of `a` to `f64` with `_mm_cvtepi32_pd`. Only two `f64`
/// lanes fit in the result, so the intrinsic converts the two lowest lanes.
#[inline(always)]
pub fn convert_i32x4_to_f64x2(self, a: i32x4) -> f64x2 {
    let as_double = self.sse2._mm_cvtepi32_pd(cast!(a));
    cast!(as_double)
}
/// Reinterprets `a` as `u32x4` through `cast!`; no intrinsic is involved.
#[inline(always)]
pub fn convert_i32x4_to_u32x4(self, a: i32x4) -> u32x4 {
    let reinterpreted: u32x4 = cast!(a);
    reinterpreted
}
/// Reinterprets `a` as `u8x16` through `cast!`; no intrinsic is involved.
#[inline(always)]
pub fn convert_i8x16_to_u8x16(self, a: i8x16) -> u8x16 {
    let reinterpreted: u8x16 = cast!(a);
    reinterpreted
}
/// Reinterprets `a` as `i16x8` through `cast!`; no intrinsic is involved.
#[inline(always)]
pub fn convert_u16x8_to_i16x8(self, a: u16x8) -> i16x8 {
    let reinterpreted: i16x8 = cast!(a);
    reinterpreted
}
/// Reinterprets `a` as `i32x4` through `cast!`; no intrinsic is involved.
#[inline(always)]
pub fn convert_u32x4_to_i32x4(self, a: u32x4) -> i32x4 {
    let reinterpreted: i32x4 = cast!(a);
    reinterpreted
}
/// Reinterprets `a` as `i8x16` through `cast!`; no intrinsic is involved.
#[inline(always)]
pub fn convert_u8x16_to_i8x16(self, a: u8x16) -> i8x16 {
    let reinterpreted: i8x16 = cast!(a);
    reinterpreted
}
/// Returns a mask that is set in each lane where `a` is NaN.
///
/// Uses the unordered comparison of `a` with itself (`_mm_cmpunord_ps`), which
/// holds for a lane only when that lane is NaN.
#[inline(always)]
pub fn is_nan_f32x4(self, a: f32x4) -> m32x4 {
    let unordered = self.sse._mm_cmpunord_ps(cast!(a), cast!(a));
    cast!(unordered)
}
/// Returns a mask that is set in each lane where `a` is NaN.
///
/// Uses the unordered comparison of `a` with itself (`_mm_cmpunord_pd`), which
/// holds for a lane only when that lane is NaN.
#[inline(always)]
pub fn is_nan_f64x2(self, a: f64x2) -> m64x2 {
    let unordered = self.sse2._mm_cmpunord_pd(cast!(a), cast!(a));
    cast!(unordered)
}
/// Returns a mask that is set in each lane where `a` is not NaN.
///
/// Uses the ordered comparison of `a` with itself (`_mm_cmpord_ps`), which
/// holds for a lane only when that lane is not NaN.
#[inline(always)]
pub fn is_not_nan_f32x4(self, a: f32x4) -> m32x4 {
    let ordered = self.sse._mm_cmpord_ps(cast!(a), cast!(a));
    cast!(ordered)
}
/// Returns a mask that is set in each lane where `a` is not NaN.
///
/// Uses the ordered comparison of `a` with itself (`_mm_cmpord_pd`), which
/// holds for a lane only when that lane is not NaN.
#[inline(always)]
pub fn is_not_nan_f64x2(self, a: f64x2) -> m64x2 {
    let ordered = self.sse2._mm_cmpord_pd(cast!(a), cast!(a));
    cast!(ordered)
}
/// Multiplies corresponding 16-bit lanes of `a` and `b` into 32-bit products and
/// adds each pair of adjacent products, yielding four `i32` lanes
/// (`_mm_madd_epi16`).
#[inline(always)]
pub fn multiply_wrapping_add_adjacent_i16x8(self, a: i16x8, b: i16x8) -> i32x4 {
    let pair_sums = self.sse2._mm_madd_epi16(cast!(a), cast!(b));
    cast!(pair_sums)
}
/// Returns `!a` for each bit in `a`, computed as XOR with an all-ones vector.
#[inline(always)]
pub fn not_i16x8(self, a: i16x8) -> i16x8 {
    // `-1` has every bit set in two's complement.
    let all_ones = self.splat_i16x8(-1);
    self.xor_i16x8(a, all_ones)
}
/// Returns `!a` for each bit in `a`, computed as XOR with an all-ones vector.
#[inline(always)]
pub fn not_i32x4(self, a: i32x4) -> i32x4 {
    // `-1` has every bit set in two's complement.
    let all_ones = self.splat_i32x4(-1);
    self.xor_i32x4(a, all_ones)
}
/// Returns `!a` for each bit in `a`, computed as XOR with an all-ones vector.
#[inline(always)]
pub fn not_i64x2(self, a: i64x2) -> i64x2 {
    // `-1` has every bit set in two's complement.
    let all_ones = self.splat_i64x2(-1);
    self.xor_i64x2(a, all_ones)
}
/// Returns `!a` for each bit in `a`, computed as XOR with an all-ones vector.
#[inline(always)]
pub fn not_i8x16(self, a: i8x16) -> i8x16 {
    // `-1` has every bit set in two's complement.
    let all_ones = self.splat_i8x16(-1);
    self.xor_i8x16(a, all_ones)
}
/// Inverts every lane of the mask `a`, computed as XOR with an all-`true` mask.
#[inline(always)]
pub fn not_m16x8(self, a: m16x8) -> m16x8 {
    let all_true = self.splat_m16x8(m16::new(true));
    self.xor_m16x8(a, all_true)
}
/// Inverts every lane of the mask `a`, computed as XOR with an all-`true` mask.
#[inline(always)]
pub fn not_m32x4(self, a: m32x4) -> m32x4 {
    let all_true = self.splat_m32x4(m32::new(true));
    self.xor_m32x4(a, all_true)
}
/// Inverts every lane of the mask `a`, computed as XOR with an all-`true` mask.
#[inline(always)]
pub fn not_m64x2(self, a: m64x2) -> m64x2 {
    let all_true = self.splat_m64x2(m64::new(true));
    self.xor_m64x2(a, all_true)
}
/// Inverts every lane of the mask `a`, computed as XOR with an all-`true` mask.
#[inline(always)]
pub fn not_m8x16(self, a: m8x16) -> m8x16 {
    let all_true = self.splat_m8x16(m8::new(true));
    self.xor_m8x16(a, all_true)
}
/// Returns `!a` for each bit in `a`, computed as XOR with an all-ones vector.
#[inline(always)]
pub fn not_u16x8(self, a: u16x8) -> u16x8 {
    let all_ones = self.splat_u16x8(u16::MAX);
    self.xor_u16x8(a, all_ones)
}
/// Returns `!a` for each bit in `a`, computed as XOR with an all-ones vector.
#[inline(always)]
pub fn not_u32x4(self, a: u32x4) -> u32x4 {
    let all_ones = self.splat_u32x4(u32::MAX);
    self.xor_u32x4(a, all_ones)
}
/// Returns `!a` for each bit in `a`, computed as XOR with an all-ones vector.
#[inline(always)]
pub fn not_u64x2(self, a: u64x2) -> u64x2 {
    let all_ones = self.splat_u64x2(u64::MAX);
    self.xor_u64x2(a, all_ones)
}
/// Returns `!a` for each bit in `a`, computed as XOR with an all-ones vector.
#[inline(always)]
pub fn not_u8x16(self, a: u8x16) -> u8x16 {
    let all_ones = self.splat_u8x16(u8::MAX);
    self.xor_u8x16(a, all_ones)
}
/// Narrows the 16-bit lanes of `a` and `b` to 8-bit lanes with signed
/// saturation (`_mm_packs_epi16`); lanes from `a` fill the low half of the
/// result and lanes from `b` the high half.
#[inline(always)]
pub fn pack_with_signed_saturation_i16x8(self, a: i16x8, b: i16x8) -> i8x16 {
    let packed = self.sse2._mm_packs_epi16(cast!(a), cast!(b));
    cast!(packed)
}
/// Narrows the 32-bit lanes of `a` and `b` to 16-bit lanes with signed
/// saturation (`_mm_packs_epi32`); lanes from `a` fill the low half of the
/// result and lanes from `b` the high half.
#[inline(always)]
pub fn pack_with_signed_saturation_i32x4(self, a: i32x4, b: i32x4) -> i16x8 {
    let packed = self.sse2._mm_packs_epi32(cast!(a), cast!(b));
    cast!(packed)
}
/// Narrows the signed 16-bit lanes of `a` and `b` to unsigned 8-bit lanes with
/// unsigned saturation (`_mm_packus_epi16`); lanes from `a` fill the low half
/// of the result and lanes from `b` the high half.
#[inline(always)]
pub fn pack_with_unsigned_saturation_i16x8(self, a: i16x8, b: i16x8) -> u8x16 {
    let packed = self.sse2._mm_packus_epi16(cast!(a), cast!(b));
    cast!(packed)
}
/// Reduces the two 64-bit halves of `a` with a lane-wise `f32` maximum and
/// returns the low 64 bits of the result reinterpreted as a `c32`.
#[inline(always)]
pub fn reduce_max_c32x2(self, a: f32x4) -> c32 {
    let v: __m128 = cast!(a);
    // Bring the upper two lanes down next to the lower two.
    let upper = self.sse._mm_movehl_ps(v, v);
    let combined = self.sse._mm_max_ps(v, upper);
    // Extract the low 64 bits (two `f32` lanes) in a single move.
    let low_bits = self.sse2._mm_cvtsd_f64(cast!(combined));
    cast!(low_bits)
}
/// Returns `a` reinterpreted as a `c64` through `cast!`; no reduction
/// arithmetic is performed.
#[inline(always)]
pub fn reduce_max_c64x1(self, a: f64x2) -> c64 {
    let only: c64 = cast!(a);
    only
}
/// Reduces the four lanes of `a` to a single value with `max`.
///
/// First combines the upper and lower halves lane-wise, then combines the two
/// remaining candidates in lanes 0 and 1.
#[inline(always)]
pub fn reduce_max_f32x4(self, a: f32x4) -> f32 {
    let v: __m128 = cast!(a);
    let upper = self.sse._mm_movehl_ps(v, v);
    let halves = self.sse._mm_max_ps(v, upper);
    // Selector `1` moves lane 1 into lane 0 for the final scalar step.
    let lane1 = self.sse._mm_shuffle_ps::<0b0001>(halves, halves);
    let reduced = self.sse._mm_max_ss(halves, lane1);
    self.sse._mm_cvtss_f32(reduced)
}
#[inline(always)]
pub fn reduce_max_f64x2(self, a: f64x2) -> f64 {
let a: __m128d = cast!(a);
let hi = cast!(self.sse._mm_movehl_ps(cast!(a), cast!(a)));
let r = self.sse2._mm_max_sd(a, hi);
self.sse2._mm_cvtsd_f64(r)
}
/// Reduces the two 64-bit halves of `a` with a lane-wise `f32` minimum and
/// returns the low 64 bits of the result reinterpreted as a `c32`.
#[inline(always)]
pub fn reduce_min_c32x2(self, a: f32x4) -> c32 {
    let v: __m128 = cast!(a);
    // Bring the upper two lanes down next to the lower two.
    let upper = self.sse._mm_movehl_ps(v, v);
    let combined = self.sse._mm_min_ps(v, upper);
    // Extract the low 64 bits (two `f32` lanes) in a single move.
    let low_bits = self.sse2._mm_cvtsd_f64(cast!(combined));
    cast!(low_bits)
}
/// Returns `a` reinterpreted as a `c64` through `cast!`; no reduction
/// arithmetic is performed.
#[inline(always)]
pub fn reduce_min_c64x1(self, a: f64x2) -> c64 {
    let only: c64 = cast!(a);
    only
}
/// Reduces the four lanes of `a` to a single value with `min`.
///
/// First combines the upper and lower halves lane-wise, then combines the two
/// remaining candidates in lanes 0 and 1.
#[inline(always)]
pub fn reduce_min_f32x4(self, a: f32x4) -> f32 {
    let v: __m128 = cast!(a);
    let upper = self.sse._mm_movehl_ps(v, v);
    let halves = self.sse._mm_min_ps(v, upper);
    // Selector `1` moves lane 1 into lane 0 for the final scalar step.
    let lane1 = self.sse._mm_shuffle_ps::<0b0001>(halves, halves);
    let reduced = self.sse._mm_min_ss(halves, lane1);
    self.sse._mm_cvtss_f32(reduced)
}
#[inline(always)]
pub fn reduce_min_f64x2(self, a: f64x2) -> f64 {
let a: __m128d = cast!(a);
let hi = cast!(self.sse._mm_movehl_ps(cast!(a), cast!(a)));
let r = self.sse2._mm_min_sd(a, hi);
self.sse2._mm_cvtsd_f64(r)
}
/// Multiplies the four lanes of `a` together and returns the product.
///
/// The lanes are combined as `(a0 * a2) * (a1 * a3)`.
#[inline(always)]
pub fn reduce_product_f32x4(self, a: f32x4) -> f32 {
    let v: __m128 = cast!(a);
    let upper = self.sse._mm_movehl_ps(v, v);
    let halves = self.sse._mm_mul_ps(v, upper);
    // Selector `1` moves lane 1 into lane 0 for the final scalar step.
    let lane1 = self.sse._mm_shuffle_ps::<0b0001>(halves, halves);
    let reduced = self.sse._mm_mul_ss(halves, lane1);
    self.sse._mm_cvtss_f32(reduced)
}
#[inline(always)]
pub fn reduce_product_f64x2(self, a: f64x2) -> f64 {
let a: __m128d = cast!(a);
let hi = cast!(self.sse._mm_movehl_ps(cast!(a), cast!(a)));
let r = self.sse2._mm_mul_sd(a, hi);
self.sse2._mm_cvtsd_f64(r)
}
/// Adds the two 64-bit halves of `a` lane-wise as `f32` values and returns the
/// low 64 bits of the result reinterpreted as a `c32`.
#[inline(always)]
pub fn reduce_sum_c32x2(self, a: f32x4) -> c32 {
    let v: __m128 = cast!(a);
    // Bring the upper two lanes down next to the lower two.
    let upper = self.sse._mm_movehl_ps(v, v);
    let combined = self.sse._mm_add_ps(v, upper);
    // Extract the low 64 bits (two `f32` lanes) in a single move.
    let low_bits = self.sse2._mm_cvtsd_f64(cast!(combined));
    cast!(low_bits)
}
/// Returns `a` reinterpreted as a `c64` through `cast!`; no reduction
/// arithmetic is performed.
#[inline(always)]
pub fn reduce_sum_c64x1(self, a: f64x2) -> c64 {
    let only: c64 = cast!(a);
    only
}
/// Adds the four lanes of `a` together and returns the sum.
///
/// The lanes are combined as `(a0 + a2) + (a1 + a3)`.
#[inline(always)]
pub fn reduce_sum_f32x4(self, a: f32x4) -> f32 {
    let v: __m128 = cast!(a);
    let upper = self.sse._mm_movehl_ps(v, v);
    let halves = self.sse._mm_add_ps(v, upper);
    // Selector `1` moves lane 1 into lane 0 for the final scalar step.
    let lane1 = self.sse._mm_shuffle_ps::<0b0001>(halves, halves);
    let reduced = self.sse._mm_add_ss(halves, lane1);
    self.sse._mm_cvtss_f32(reduced)
}
#[inline(always)]
pub fn reduce_sum_f64x2(self, a: f64x2) -> f64 {
let a: __m128d = cast!(a);
let hi = cast!(self.sse._mm_movehl_ps(cast!(a), cast!(a)));
let r = self.sse2._mm_add_sd(a, hi);
self.sse2._mm_cvtsd_f64(r)
}
/// Shifts each 16-bit lane of `a` left by the compile-time constant `AMOUNT`,
/// shifting in zeros (`_mm_slli_epi16`).
#[inline(always)]
pub fn shl_const_i16x8<const AMOUNT: i32>(self, a: i16x8) -> i16x8 {
    let shifted = self.sse2._mm_slli_epi16::<AMOUNT>(cast!(a));
    cast!(shifted)
}
/// Shifts each 32-bit lane of `a` left by the compile-time constant `AMOUNT`,
/// shifting in zeros (`_mm_slli_epi32`).
#[inline(always)]
pub fn shl_const_i32x4<const AMOUNT: i32>(self, a: i32x4) -> i32x4 {
    let shifted = self.sse2._mm_slli_epi32::<AMOUNT>(cast!(a));
    cast!(shifted)
}
/// Shifts each 64-bit lane of `a` left by the compile-time constant `AMOUNT`,
/// shifting in zeros (`_mm_slli_epi64`).
#[inline(always)]
pub fn shl_const_i64x2<const AMOUNT: i32>(self, a: i64x2) -> i64x2 {
    let shifted = self.sse2._mm_slli_epi64::<AMOUNT>(cast!(a));
    cast!(shifted)
}
/// Shifts each 16-bit lane of `a` left by the compile-time constant `AMOUNT`,
/// shifting in zeros (`_mm_slli_epi16`).
#[inline(always)]
pub fn shl_const_u16x8<const AMOUNT: i32>(self, a: u16x8) -> u16x8 {
    let shifted = self.sse2._mm_slli_epi16::<AMOUNT>(cast!(a));
    cast!(shifted)
}
/// Shifts each 32-bit lane of `a` left by the compile-time constant `AMOUNT`,
/// shifting in zeros (`_mm_slli_epi32`).
#[inline(always)]
pub fn shl_const_u32x4<const AMOUNT: i32>(self, a: u32x4) -> u32x4 {
    let shifted = self.sse2._mm_slli_epi32::<AMOUNT>(cast!(a));
    cast!(shifted)
}
/// Shifts each 64-bit lane of `a` left by the compile-time constant `AMOUNT`,
/// shifting in zeros (`_mm_slli_epi64`).
#[inline(always)]
pub fn shl_const_u64x2<const AMOUNT: i32>(self, a: u64x2) -> u64x2 {
    let shifted = self.sse2._mm_slli_epi64::<AMOUNT>(cast!(a));
    cast!(shifted)
}
/// Shifts each 16-bit lane of `a` left, shifting in zeros (`_mm_sll_epi16`).
/// Every lane uses the same shift count, taken by the intrinsic from the low
/// 64 bits of `amount`.
#[inline(always)]
pub fn shl_i16x8(self, a: i16x8, amount: u64x2) -> i16x8 {
    let shifted = self.sse2._mm_sll_epi16(cast!(a), cast!(amount));
    cast!(shifted)
}
/// Shifts each 32-bit lane of `a` left, shifting in zeros (`_mm_sll_epi32`).
/// Every lane uses the same shift count, taken by the intrinsic from the low
/// 64 bits of `amount`.
#[inline(always)]
pub fn shl_i32x4(self, a: i32x4, amount: u64x2) -> i32x4 {
    let shifted = self.sse2._mm_sll_epi32(cast!(a), cast!(amount));
    cast!(shifted)
}
/// Shifts each 64-bit lane of `a` left, shifting in zeros (`_mm_sll_epi64`).
/// Every lane uses the same shift count, taken by the intrinsic from the low
/// 64 bits of `amount`.
///
/// NOTE(review): the result type is `u64x2` although the input is `i64x2`; the
/// other `shl_*` methods return their input type. Confirm whether this is
/// intentional before relying on it.
#[inline(always)]
pub fn shl_i64x2(self, a: i64x2, amount: u64x2) -> u64x2 {
    let shifted = self.sse2._mm_sll_epi64(cast!(a), cast!(amount));
    cast!(shifted)
}
/// Shifts each 16-bit lane of `a` left, shifting in zeros (`_mm_sll_epi16`).
/// Every lane uses the same shift count, taken by the intrinsic from the low
/// 64 bits of `amount`.
#[inline(always)]
pub fn shl_u16x8(self, a: u16x8, amount: u64x2) -> u16x8 {
    let shifted = self.sse2._mm_sll_epi16(cast!(a), cast!(amount));
    cast!(shifted)
}
/// Shifts each 32-bit lane of `a` left, shifting in zeros (`_mm_sll_epi32`).
/// Every lane uses the same shift count, taken by the intrinsic from the low
/// 64 bits of `amount`.
#[inline(always)]
pub fn shl_u32x4(self, a: u32x4, amount: u64x2) -> u32x4 {
    let shifted = self.sse2._mm_sll_epi32(cast!(a), cast!(amount));
    cast!(shifted)
}
/// Shifts each 64-bit lane of `a` left, shifting in zeros (`_mm_sll_epi64`).
/// Every lane uses the same shift count, taken by the intrinsic from the low
/// 64 bits of `amount`.
#[inline(always)]
pub fn shl_u64x2(self, a: u64x2, amount: u64x2) -> u64x2 {
    let shifted = self.sse2._mm_sll_epi64(cast!(a), cast!(amount));
    cast!(shifted)
}
/// Shifts each signed 16-bit lane of `a` right by the compile-time constant
/// `AMOUNT`, shifting in copies of the sign bit (`_mm_srai_epi16`).
#[inline(always)]
pub fn shr_const_i16x8<const AMOUNT: i32>(self, a: i16x8) -> i16x8 {
    let shifted = self.sse2._mm_srai_epi16::<AMOUNT>(cast!(a));
    cast!(shifted)
}
/// Shifts each signed 32-bit lane of `a` right by the compile-time constant
/// `AMOUNT`, shifting in copies of the sign bit (`_mm_srai_epi32`).
#[inline(always)]
pub fn shr_const_i32x4<const AMOUNT: i32>(self, a: i32x4) -> i32x4 {
    let shifted = self.sse2._mm_srai_epi32::<AMOUNT>(cast!(a));
    cast!(shifted)
}
/// Shifts each unsigned 16-bit lane of `a` right by the compile-time constant
/// `AMOUNT`, shifting in zeros (`_mm_srli_epi16`).
#[inline(always)]
pub fn shr_const_u16x8<const AMOUNT: i32>(self, a: u16x8) -> u16x8 {
    let shifted = self.sse2._mm_srli_epi16::<AMOUNT>(cast!(a));
    cast!(shifted)
}
/// Shifts each unsigned 32-bit lane of `a` right by the compile-time constant
/// `AMOUNT`, shifting in zeros (`_mm_srli_epi32`).
#[inline(always)]
pub fn shr_const_u32x4<const AMOUNT: i32>(self, a: u32x4) -> u32x4 {
    let shifted = self.sse2._mm_srli_epi32::<AMOUNT>(cast!(a));
    cast!(shifted)
}
/// Shifts each unsigned 64-bit lane of `a` right by the compile-time constant
/// `AMOUNT`, shifting in zeros (`_mm_srli_epi64`).
#[inline(always)]
pub fn shr_const_u64x2<const AMOUNT: i32>(self, a: u64x2) -> u64x2 {
    let shifted = self.sse2._mm_srli_epi64::<AMOUNT>(cast!(a));
    cast!(shifted)
}
/// Shifts each signed 16-bit lane of `a` right, shifting in copies of the sign
/// bit (`_mm_sra_epi16`). Every lane uses the same shift count, taken by the
/// intrinsic from the low 64 bits of `amount`.
#[inline(always)]
pub fn shr_i16x8(self, a: i16x8, amount: u64x2) -> i16x8 {
    let shifted = self.sse2._mm_sra_epi16(cast!(a), cast!(amount));
    cast!(shifted)
}
/// Shifts each signed 32-bit lane of `a` right, shifting in copies of the sign
/// bit (`_mm_sra_epi32`). Every lane uses the same shift count, taken by the
/// intrinsic from the low 64 bits of `amount`.
#[inline(always)]
pub fn shr_i32x4(self, a: i32x4, amount: u64x2) -> i32x4 {
    let shifted = self.sse2._mm_sra_epi32(cast!(a), cast!(amount));
    cast!(shifted)
}
/// Shifts each unsigned 16-bit lane of `a` right, shifting in zeros
/// (`_mm_srl_epi16`). Every lane uses the same shift count, taken by the
/// intrinsic from the low 64 bits of `amount`.
#[inline(always)]
pub fn shr_u16x8(self, a: u16x8, amount: u64x2) -> u16x8 {
    let shifted = self.sse2._mm_srl_epi16(cast!(a), cast!(amount));
    cast!(shifted)
}
/// Shifts each unsigned 32-bit lane of `a` right, shifting in zeros
/// (`_mm_srl_epi32`). Every lane uses the same shift count, taken by the
/// intrinsic from the low 64 bits of `amount`.
#[inline(always)]
pub fn shr_u32x4(self, a: u32x4, amount: u64x2) -> u32x4 {
    let shifted = self.sse2._mm_srl_epi32(cast!(a), cast!(amount));
    cast!(shifted)
}
/// Shifts each unsigned 64-bit lane of `a` right, shifting in zeros
/// (`_mm_srl_epi64`). Every lane uses the same shift count, taken by the
/// intrinsic from the low 64 bits of `amount`.
#[inline(always)]
pub fn shr_u64x2(self, a: u64x2, amount: u64x2) -> u64x2 {
    let shifted = self.sse2._mm_srl_epi64(cast!(a), cast!(amount));
    cast!(shifted)
}
/// Returns a vector with every lane set to `value` (`_mm_set1_ps`).
#[inline(always)]
pub fn splat_f32x4(self, value: f32) -> f32x4 {
    let broadcast = self.sse._mm_set1_ps(value);
    cast!(broadcast)
}
/// Returns a vector with every lane set to `value` (`_mm_set1_pd`).
#[inline(always)]
pub fn splat_f64x2(self, value: f64) -> f64x2 {
    let broadcast = self.sse2._mm_set1_pd(value);
    cast!(broadcast)
}
/// Returns a vector with every lane set to `value` (`_mm_set1_epi16`).
#[inline(always)]
pub fn splat_i16x8(self, value: i16) -> i16x8 {
    let broadcast = self.sse2._mm_set1_epi16(value);
    cast!(broadcast)
}
/// Returns a vector with every lane set to `value` (`_mm_set1_epi32`).
#[inline(always)]
pub fn splat_i32x4(self, value: i32) -> i32x4 {
    let broadcast = self.sse2._mm_set1_epi32(value);
    cast!(broadcast)
}
/// Returns a vector with every lane set to `value` (`_mm_set1_epi64x`).
#[inline(always)]
pub fn splat_i64x2(self, value: i64) -> i64x2 {
    let broadcast = self.sse2._mm_set1_epi64x(value);
    cast!(broadcast)
}
/// Returns a vector with every lane set to `value` (`_mm_set1_epi8`).
#[inline(always)]
pub fn splat_i8x16(self, value: i8) -> i8x16 {
    let broadcast = self.sse2._mm_set1_epi8(value);
    cast!(broadcast)
}
/// Returns a mask vector with every lane set to `value`.
///
/// The mask's inner field (`value.0`) is converted with `as i16` and broadcast
/// with `_mm_set1_epi16`.
#[inline(always)]
pub fn splat_m16x8(self, value: m16) -> m16x8 {
    let raw = value.0 as i16;
    cast!(self.sse2._mm_set1_epi16(raw))
}
/// Returns a mask vector with every lane set to `value`.
///
/// The mask's inner field (`value.0`) is converted with `as i32` and broadcast
/// with `_mm_set1_epi32`.
#[inline(always)]
pub fn splat_m32x4(self, value: m32) -> m32x4 {
    let raw = value.0 as i32;
    cast!(self.sse2._mm_set1_epi32(raw))
}
/// Returns a mask vector with every lane set to `value`.
///
/// The mask's inner field (`value.0`) is converted with `as i64` and broadcast
/// with `_mm_set1_epi64x`.
#[inline(always)]
pub fn splat_m64x2(self, value: m64) -> m64x2 {
    let raw = value.0 as i64;
    cast!(self.sse2._mm_set1_epi64x(raw))
}
/// Returns a mask vector with every lane set to `value`.
///
/// The mask's inner field (`value.0`) is converted with `as i8` and broadcast
/// with `_mm_set1_epi8`.
#[inline(always)]
pub fn splat_m8x16(self, value: m8) -> m8x16 {
    let raw = value.0 as i8;
    cast!(self.sse2._mm_set1_epi8(raw))
}
/// Returns a vector with every lane set to `value`.
///
/// The intrinsic takes a signed argument, so `value` is passed through an
/// `as i16` cast, which keeps the bit pattern.
#[inline(always)]
pub fn splat_u16x8(self, value: u16) -> u16x8 {
    let bits = value as i16;
    cast!(self.sse2._mm_set1_epi16(bits))
}
/// Returns a vector with every lane set to `value`.
///
/// The intrinsic takes a signed argument, so `value` is passed through an
/// `as i32` cast, which keeps the bit pattern.
#[inline(always)]
pub fn splat_u32x4(self, value: u32) -> u32x4 {
    let bits = value as i32;
    cast!(self.sse2._mm_set1_epi32(bits))
}
/// Returns a vector with every lane set to `value`.
///
/// The intrinsic takes a signed argument, so `value` is passed through an
/// `as i64` cast, which keeps the bit pattern.
#[inline(always)]
pub fn splat_u64x2(self, value: u64) -> u64x2 {
    let bits = value as i64;
    cast!(self.sse2._mm_set1_epi64x(bits))
}
/// Returns a vector with every lane set to `value`.
///
/// The intrinsic takes a signed argument, so `value` is passed through an
/// `as i8` cast, which keeps the bit pattern.
#[inline(always)]
pub fn splat_u8x16(self, value: u8) -> u8x16 {
    let bits = value as i8;
    cast!(self.sse2._mm_set1_epi8(bits))
}
/// Computes the square root of each lane of `a` (`_mm_sqrt_ps`).
#[inline(always)]
pub fn sqrt_f32x4(self, a: f32x4) -> f32x4 {
    let roots = self.sse._mm_sqrt_ps(cast!(a));
    cast!(roots)
}
/// Computes the square root of each lane of `a` (`_mm_sqrt_pd`).
#[inline(always)]
pub fn sqrt_f64x2(self, a: f64x2) -> f64x2 {
    let roots = self.sse2._mm_sqrt_pd(cast!(a));
    cast!(roots)
}
/// Computes the absolute differences of the unsigned 8-bit lanes of `a` and
/// `b`, then sums each group of eight consecutive differences into one 64-bit
/// lane of the result (`_mm_sad_epu8`).
#[inline(always)]
pub fn sum_of_absolute_differences_u8x16(self, a: u8x16, b: u8x16) -> u64x2 {
    let sums = self.sse2._mm_sad_epu8(cast!(a), cast!(b));
    cast!(sums)
}
/// Multiplies the signed 16-bit lanes of `a` and `b` and returns the 32-bit
/// products split into halves: `(low 16 bits, high 16 bits)`.
///
/// The low halves come from `_mm_mullo_epi16` and are returned as `u16x8`; the
/// high halves come from the signed `_mm_mulhi_epi16` and are returned as
/// `i16x8`.
#[inline(always)]
pub fn widening_mul_i16x8(self, a: i16x8, b: i16x8) -> (u16x8, i16x8) {
    let low = self.sse2._mm_mullo_epi16(cast!(a), cast!(b));
    let high = self.sse2._mm_mulhi_epi16(cast!(a), cast!(b));
    (cast!(low), cast!(high))
}
/// Multiplies the unsigned 16-bit lanes of `a` and `b` and returns the 32-bit
/// products split into halves: `(low 16 bits, high 16 bits)`.
///
/// The low halves come from `_mm_mullo_epi16` and the high halves from the
/// unsigned `_mm_mulhi_epu16`.
#[inline(always)]
pub fn widening_mul_u16x8(self, a: u16x8, b: u16x8) -> (u16x8, u16x8) {
    let low = self.sse2._mm_mullo_epi16(cast!(a), cast!(b));
    let high = self.sse2._mm_mulhi_epu16(cast!(a), cast!(b));
    (cast!(low), cast!(high))
}
}