Trait simdeez::Simd

pub trait Simd {
    type Vi32: Copy + Debug + Add<Self::Vi32, Output = Self::Vi32> + Sub<Self::Vi32, Output = Self::Vi32> + Mul<Self::Vi32, Output = Self::Vi32> + AddAssign<Self::Vi32> + SubAssign<Self::Vi32> + MulAssign<Self::Vi32> + BitAnd<Self::Vi32, Output = Self::Vi32> + BitOr<Self::Vi32, Output = Self::Vi32> + BitXor<Self::Vi32, Output = Self::Vi32> + BitAndAssign<Self::Vi32> + BitOrAssign<Self::Vi32> + BitXorAssign<Self::Vi32> + Index<usize, Output = i32> + IndexMut<usize>;
    type Vf32: Copy + Debug + Add<Self::Vf32, Output = Self::Vf32> + Sub<Self::Vf32, Output = Self::Vf32> + Mul<Self::Vf32, Output = Self::Vf32> + Div<Self::Vf32, Output = Self::Vf32> + AddAssign<Self::Vf32> + SubAssign<Self::Vf32> + MulAssign<Self::Vf32> + DivAssign<Self::Vf32> + BitAnd<Self::Vf32, Output = Self::Vf32> + BitOr<Self::Vf32, Output = Self::Vf32> + BitXor<Self::Vf32, Output = Self::Vf32> + BitAndAssign<Self::Vf32> + BitOrAssign<Self::Vf32> + BitXorAssign<Self::Vf32> + Index<usize, Output = f32> + IndexMut<usize>;
    type Vf64: Copy + Debug + Index<usize, Output = f64> + IndexMut<usize> + Add<Self::Vf64, Output = Self::Vf64> + Sub<Self::Vf64, Output = Self::Vf64> + Mul<Self::Vf64, Output = Self::Vf64> + Div<Self::Vf64, Output = Self::Vf64> + AddAssign<Self::Vf64> + SubAssign<Self::Vf64> + MulAssign<Self::Vf64> + DivAssign<Self::Vf64> + BitAnd<Self::Vf64, Output = Self::Vf64> + BitOr<Self::Vf64, Output = Self::Vf64> + BitXor<Self::Vf64, Output = Self::Vf64> + BitAndAssign<Self::Vf64> + BitOrAssign<Self::Vf64> + BitXorAssign<Self::Vf64>;
    type Vi64: Copy + Debug + Index<usize, Output = i64> + IndexMut<usize> + Add<Self::Vi64, Output = Self::Vi64> + Sub<Self::Vi64, Output = Self::Vi64> + AddAssign<Self::Vi64> + SubAssign<Self::Vi64> + BitAnd<Self::Vi64, Output = Self::Vi64> + BitOr<Self::Vi64, Output = Self::Vi64> + BitXor<Self::Vi64, Output = Self::Vi64> + BitAndAssign<Self::Vi64> + BitOrAssign<Self::Vi64> + BitXorAssign<Self::Vi64>;

    const VF32_WIDTH: usize;
    const VF64_WIDTH: usize;
    const VI32_WIDTH: usize;
    const VI64_WIDTH: usize;

    unsafe fn div_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn div_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn abs_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn abs_pd(a: Self::Vf64) -> Self::Vf64;
    unsafe fn add_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn add_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn add_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn and_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn and_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn andnot_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn andnot_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn andnot_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn andnot_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn blendv_epi32(a: Self::Vi32, b: Self::Vi32, mask: Self::Vi32) -> Self::Vi32;
    unsafe fn blendv_epi64(a: Self::Vi64, b: Self::Vi64, mask: Self::Vi64) -> Self::Vi64;
    unsafe fn blendv_ps(a: Self::Vf32, b: Self::Vf32, mask: Self::Vf32) -> Self::Vf32;
    unsafe fn blendv_pd(a: Self::Vf64, b: Self::Vf64, mask: Self::Vf64) -> Self::Vf64;
    unsafe fn castps_epi32(a: Self::Vf32) -> Self::Vi32;
    unsafe fn castpd_epi64(a: Self::Vf64) -> Self::Vi64;
    unsafe fn castepi32_ps(a: Self::Vi32) -> Self::Vf32;
    unsafe fn castepi64_pd(a: Self::Vi64) -> Self::Vf64;
    unsafe fn castepi32_epi64(a: Self::Vi32) -> Self::Vi64;
    unsafe fn castepi64_epi32(a: Self::Vi64) -> Self::Vi32;
    unsafe fn castps_pd(a: Self::Vf32) -> Self::Vf64;
    unsafe fn castpd_ps(a: Self::Vf64) -> Self::Vf32;
    unsafe fn ceil_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn ceil_pd(a: Self::Vf64) -> Self::Vf64;
    unsafe fn cmpeq_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn cmpneq_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn cmpge_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn cmpgt_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn cmple_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn cmplt_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn cmpeq_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn cmpneq_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn cmpge_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn cmpgt_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn cmple_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn cmplt_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn cmpeq_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn cmpneq_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn cmpge_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn cmpgt_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn cmple_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn cmplt_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn cmpeq_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn cmpneq_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn cmpge_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn cmpgt_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn cmple_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn cmplt_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn cvtepi32_ps(a: Self::Vi32) -> Self::Vf32;
    unsafe fn cvtps_epi32(a: Self::Vf32) -> Self::Vi32;
    unsafe fn floor_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn floor_pd(a: Self::Vf64) -> Self::Vf64;
    unsafe fn fastfloor_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn fmadd_ps(a: Self::Vf32, b: Self::Vf32, c: Self::Vf32) -> Self::Vf32;
    unsafe fn fnmadd_ps(a: Self::Vf32, b: Self::Vf32, c: Self::Vf32) -> Self::Vf32;
    unsafe fn horizontal_add_ps(a: Self::Vf32) -> f32;
    unsafe fn horizontal_add_pd(a: Self::Vf64) -> f64;
    unsafe fn i32gather_epi32(arr: &[i32], index: Self::Vi32) -> Self::Vi32;
    unsafe fn i32gather_ps(arr: &[f32], index: Self::Vi32) -> Self::Vf32;
    unsafe fn load_ps(a: &f32) -> Self::Vf32;
    unsafe fn load_pd(a: &f64) -> Self::Vf64;
    unsafe fn load_epi32(a: &i32) -> Self::Vi32;
    unsafe fn load_epi64(a: &i64) -> Self::Vi64;
    unsafe fn loadu_ps(a: &f32) -> Self::Vf32;
    unsafe fn loadu_pd(a: &f64) -> Self::Vf64;
    unsafe fn loadu_epi32(a: &i32) -> Self::Vi32;
    unsafe fn loadu_epi64(a: &i64) -> Self::Vi64;
    unsafe fn maskload_epi32(mem_addr: &i32, mask: Self::Vi32) -> Self::Vi32;
    unsafe fn maskload_epi64(mem_addr: &i64, mask: Self::Vi64) -> Self::Vi64;
    unsafe fn maskload_ps(mem_addr: &f32, mask: Self::Vi32) -> Self::Vf32;
    unsafe fn maskload_pd(mem_addr: &f64, mask: Self::Vi64) -> Self::Vf64;
    unsafe fn store_ps(mem_addr: &mut f32, a: Self::Vf32);
    unsafe fn store_pd(mem_addr: &mut f64, a: Self::Vf64);
    unsafe fn store_epi32(mem_addr: &mut i32, a: Self::Vi32);
    unsafe fn store_epi64(mem_addr: &mut i64, a: Self::Vi64);
    unsafe fn storeu_ps(mem_addr: &mut f32, a: Self::Vf32);
    unsafe fn storeu_pd(mem_addr: &mut f64, a: Self::Vf64);
    unsafe fn storeu_epi32(mem_addr: &mut i32, a: Self::Vi32);
    unsafe fn storeu_epi64(mem_addr: &mut i64, a: Self::Vi64);
    unsafe fn maskstore_epi32(mem_addr: &mut i32, mask: Self::Vi32, a: Self::Vi32);
    unsafe fn maskstore_epi64(mem_addr: &mut i64, mask: Self::Vi64, a: Self::Vi64);
    unsafe fn maskstore_ps(mem_addr: &mut f32, mask: Self::Vi32, a: Self::Vf32);
    unsafe fn maskstore_pd(mem_addr: &mut f64, mask: Self::Vi64, a: Self::Vf64);
    unsafe fn max_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn min_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn max_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn min_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn max_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn min_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn mul_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn mul_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn mullo_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn not_epi32(a: Self::Vi32) -> Self::Vi32;
    unsafe fn not_epi64(a: Self::Vi64) -> Self::Vi64;
    unsafe fn or_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn or_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn or_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn or_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn rcp_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn round_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn round_pd(a: Self::Vf64) -> Self::Vf64;
    unsafe fn set1_epi32(a: i32) -> Self::Vi32;
    unsafe fn set1_epi64(a: i64) -> Self::Vi64;
    unsafe fn set1_ps(a: f32) -> Self::Vf32;
    unsafe fn set1_pd(a: f64) -> Self::Vf64;
    unsafe fn setzero_ps() -> Self::Vf32;
    unsafe fn setzero_pd() -> Self::Vf64;
    unsafe fn setzero_epi32() -> Self::Vi32;
    unsafe fn setzero_epi64() -> Self::Vi64;
    unsafe fn srai_epi32(a: Self::Vi32, amt_const: i32) -> Self::Vi32;
    unsafe fn srli_epi32(a: Self::Vi32, amt_const: i32) -> Self::Vi32;
    unsafe fn slli_epi32(a: Self::Vi32, amt_const: i32) -> Self::Vi32;
    unsafe fn sra_epi32(a: Self::Vi32, amt: i32) -> Self::Vi32;
    unsafe fn srl_epi32(a: Self::Vi32, amt: i32) -> Self::Vi32;
    unsafe fn sll_epi32(a: Self::Vi32, amt: i32) -> Self::Vi32;
    unsafe fn sub_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn sub_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn sqrt_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn rsqrt_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn sqrt_pd(a: Self::Vf64) -> Self::Vf64;
    unsafe fn rsqrt_pd(a: Self::Vf64) -> Self::Vf64;
    unsafe fn shuffle_epi32(a: Self::Vi32, imm8: i32) -> Self::Vi32;
    unsafe fn shuffle_ps(a: Self::Vf32, _: Self::Vf32, imm8: i32) -> Self::Vf32;
    unsafe fn unpackhi_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn unpacklo_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn unpackhi_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn unpacklo_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn unpackhi_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn unpacklo_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn xor_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn xor_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn xor_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn xor_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
}

Associated Types

Vi32 stands for Vector of i32s. Corresponds to __m128i when used with the Sse impl, __m256i when used with Avx2, or a single i32 when used with Scalar.

Vf32 stands for Vector of f32s. Corresponds to __m128 when used with the Sse impl, __m256 when used with Avx2, or a single f32 when used with Scalar.

Vf64 stands for Vector of f64s. Corresponds to __m128d when used with the Sse impl, __m256d when used with Avx2, or a single f64 when used with Scalar.

Vi64 stands for Vector of i64s. Corresponds to __m128i when used with the Sse impl, __m256i when used with Avx2, or a single i64 when used with Scalar.
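
For illustration, a minimal sketch (not from the crate's documentation) of code written against these associated types; the function and its parameters are hypothetical, and S can be any backend implementing Simd:

use simdeez::Simd;

// Works unchanged whether S::Vf32 is an __m128, an __m256, or a plain f32.
unsafe fn add_scaled<S: Simd>(a: S::Vf32, b: S::Vf32, scale: f32) -> S::Vf32 {
    let s = S::set1_ps(scale);    // broadcast the scalar into every lane
    S::add_ps(a, S::mul_ps(b, s)) // a + b * scale, lane-wise
}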

Associated Constants

The number of lanes in the corresponding vector type (for example, VF32_WIDTH is 4 with Sse, 8 with Avx2, and 1 with Scalar). Necessary for writing lane-width-agnostic code.
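
A sketch of how these constants are typically used to walk a slice one full vector at a time (illustrative only; the function name and the remainder handling are assumptions, not part of the crate):

use simdeez::Simd;

unsafe fn sum<S: Simd>(data: &[f32]) -> f32 {
    let mut acc = S::setzero_ps();
    let mut i = 0;
    // Process VF32_WIDTH lanes per iteration: 1 for Scalar, 4 for Sse, 8 for Avx2.
    while i + S::VF32_WIDTH <= data.len() {
        acc += S::loadu_ps(&data[i]);
        i += S::VF32_WIDTH;
    }
    let mut total = S::horizontal_add_ps(acc);
    // Finish any leftover elements one at a time.
    while i < data.len() {
        total += data[i];
        i += 1;
    }
    total
}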

Required Methods

Indexing a vector (v[i]) is equivalent to transmuting the SIMD type to an array and accessing it at index i.
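
A small illustrative sketch of lane indexing (the function is hypothetical):

use simdeez::Simd;

unsafe fn first_lane<S: Simd>() -> f32 {
    let v = S::set1_ps(3.25);
    v[0] // reads lane 0; valid indices are 0..S::VF32_WIDTH
}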

blendv_epi32 and blendv_epi64 are provided for convenience; they are implemented with casts and the blendv_ps intrinsic.
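
An illustrative sketch of lane selection with blendv_epi32. The mask here comes from a compare, and the usual blendv convention (all-ones mask lanes select from b) is assumed:

use simdeez::Simd;

// Per-lane max of two i32 vectors built from a compare plus a blend.
unsafe fn max_via_blend<S: Simd>(a: S::Vi32, b: S::Vi32) -> S::Vi32 {
    let mask = S::cmpgt_epi32(b, a); // all ones where b > a, zeros elsewhere
    S::blendv_epi32(a, b, mask)      // take b where the mask is set, else a
}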

When using Sse2, fastfloor_ps uses a faster floor that is only correct for floating point values small enough to fit in an i32. This is important for performance when you don't need a full-range floor.
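
A sketch contrasting the two floors (illustrative; the fast variant is assumed to match floor_ps only while the lane values fit in an i32):

use simdeez::Simd;

unsafe fn floors<S: Simd>(v: S::Vf32) -> (S::Vf32, S::Vf32) {
    let exact = S::floor_ps(v);    // always correct
    let fast = S::fastfloor_ps(v); // faster, but only valid within i32 range
    (exact, fast)
}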

fmadd_ps: actual FMA instructions are used when Avx2 is available; otherwise a multiply and an add are used to replicate it, so you can always write FMA in your code and get the best performance in both cases.

fnmadd_ps: likewise, actual FMA instructions on Avx2, with a multiply and add used to replicate it elsewhere.
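
An illustrative sketch; it assumes fmadd_ps(a, b, c) computes a * b + c per lane, which holds whether or not real FMA hardware is present:

use simdeez::Simd;

// Evaluate a*x^2 + b*x + c per lane with two fused multiply-adds (Horner's rule).
unsafe fn poly2<S: Simd>(x: S::Vf32, a: S::Vf32, b: S::Vf32, c: S::Vf32) -> S::Vf32 {
    let t = S::fmadd_ps(a, x, b); // a*x + b
    S::fmadd_ps(t, x, c)          // (a*x + b)*x + c
}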

horizontal_add_ps adds all lanes of a Vf32 together into a single f32. Distinct from the h_add intrinsics, which add adjacent pairs.

horizontal_add_pd does the same for a Vf64.
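
A sketch of a dot product built from lane-wise multiply-adds plus a final horizontal add (illustrative; for brevity it assumes the slices have equal length that is a multiple of the vector width):

use simdeez::Simd;

unsafe fn dot<S: Simd>(a: &[f32], b: &[f32]) -> f32 {
    let mut acc = S::setzero_ps();
    let mut i = 0;
    while i + S::VF32_WIDTH <= a.len() {
        let va = S::loadu_ps(&a[i]);
        let vb = S::loadu_ps(&b[i]);
        acc = S::fmadd_ps(va, vb, acc); // acc += va * vb, lane-wise
        i += S::VF32_WIDTH;
    }
    S::horizontal_add_ps(acc) // collapse all lanes into a single f32
}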

i32gather_epi32: the Sse2 and Sse41 paths simulate the gather by breaking out and doing scalar array accesses, because hardware gather doesn't exist until Avx2.

i32gather_ps behaves the same way.
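
An illustrative sketch of a gather; the index vector holds per-lane offsets into the slice, and keeping them in bounds is the caller's responsibility:

use simdeez::Simd;

unsafe fn lookup<S: Simd>(table: &[f32], idx: S::Vi32) -> S::Vf32 {
    // On Avx2 this maps to a hardware gather; Sse2/Sse41 fall back to scalar loads.
    S::i32gather_ps(table, idx)
}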

Mullo is implemented for Sse2 by combining other Sse2 operations.

Round is implemented for Sse2 by combining other Sse2 operations.

srai_epi32: amt must be a constant.

srli_epi32: amt must be a constant.

slli_epi32: amt must be a constant.
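
An illustrative sketch using an immediate-count shift (the count must be a compile-time constant, per the note above):

use simdeez::Simd;

unsafe fn scale_down<S: Simd>(v: S::Vi32) -> S::Vi32 {
    // Arithmetic shift right of each lane by a constant 4, preserving the sign bit.
    S::srai_epi32(v, 4)
}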

sra_epi32: amt does not have to be a constant, but may be slower than the srai version.

srl_epi32: amt does not have to be a constant, but may be slower than the srli version.

sll_epi32: amt does not have to be a constant, but may be slower than the slli version.
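
The same operation with a runtime count, using the non-immediate variant (illustrative):

use simdeez::Simd;

unsafe fn scale_down_by<S: Simd>(v: S::Vi32, amt: i32) -> S::Vi32 {
    // amt can be decided at runtime, at some cost relative to srai_epi32.
    S::sra_epi32(v, amt)
}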

Implementors