Trait simdeez::Simd

pub trait Simd: Sync + Send {
    type Vi16: SimdSmallInt<Self::Vi16, i16>;
    type Vi32: SimdSmallInt<Self::Vi32, i32>;
    type Vi64: SimdBase<Self::Vi64, i64> + Not<Output = Self::Vi64>;
    type Vf32: SimdFloat<Self::Vf32, f32>;
    type Vf64: SimdFloat<Self::Vf64, f64>;

    const VF32_WIDTH: usize;
    const VF64_WIDTH: usize;
    const VI16_WIDTH: usize;
    const VI32_WIDTH: usize;
    const VI64_WIDTH: usize;

    // Required methods
    unsafe fn abs_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn abs_pd(a: Self::Vf64) -> Self::Vf64;
    unsafe fn mullo_epi16(a: Self::Vi16, b: Self::Vi16) -> Self::Vi16;
    unsafe fn andnot_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn andnot_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn andnot_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn andnot_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn blendv_epi32(a: Self::Vi32, b: Self::Vi32, mask: Self::Vi32) -> Self::Vi32;
    unsafe fn blendv_epi64(a: Self::Vi64, b: Self::Vi64, mask: Self::Vi64) -> Self::Vi64;
    unsafe fn blendv_ps(a: Self::Vf32, b: Self::Vf32, mask: Self::Vf32) -> Self::Vf32;
    unsafe fn blendv_pd(a: Self::Vf64, b: Self::Vf64, mask: Self::Vf64) -> Self::Vf64;
    unsafe fn castps_epi32(a: Self::Vf32) -> Self::Vi32;
    unsafe fn castpd_epi64(a: Self::Vf64) -> Self::Vi64;
    unsafe fn castepi32_ps(a: Self::Vi32) -> Self::Vf32;
    unsafe fn castepi64_pd(a: Self::Vi64) -> Self::Vf64;
    unsafe fn castps_pd(a: Self::Vf32) -> Self::Vf64;
    unsafe fn castpd_ps(a: Self::Vf64) -> Self::Vf32;
    unsafe fn ceil_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn ceil_pd(a: Self::Vf64) -> Self::Vf64;
    unsafe fn cmpeq_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn cmpneq_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn cmpge_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn cmpgt_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn cmple_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn cmplt_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn cmpeq_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn cmpneq_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn cmpge_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn cmpgt_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn cmple_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn cmplt_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn cmpeq_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn cmpneq_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn cmpge_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn cmpgt_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn cmple_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn cmplt_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn cmpeq_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn cmpneq_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn cmpge_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn cmpgt_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn cmple_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn cmplt_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn cvtepi32_ps(a: Self::Vi32) -> Self::Vf32;
    unsafe fn cvtepi64_pd(a: Self::Vi64) -> Self::Vf64;
    unsafe fn cvtps_epi32(a: Self::Vf32) -> Self::Vi32;
    unsafe fn cvtpd_epi64(a: Self::Vf64) -> Self::Vi64;
    unsafe fn floor_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn floor_pd(a: Self::Vf64) -> Self::Vf64;
    unsafe fn fast_round_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn fast_ceil_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn fast_floor_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn fast_floor_pd(a: Self::Vf64) -> Self::Vf64;
    unsafe fn fmadd_ps(a: Self::Vf32, b: Self::Vf32, c: Self::Vf32) -> Self::Vf32;
    unsafe fn fnmadd_ps(a: Self::Vf32, b: Self::Vf32, c: Self::Vf32) -> Self::Vf32;
    unsafe fn fmadd_pd(a: Self::Vf64, b: Self::Vf64, c: Self::Vf64) -> Self::Vf64;
    unsafe fn fnmadd_pd(a: Self::Vf64, b: Self::Vf64, c: Self::Vf64) -> Self::Vf64;
    unsafe fn fmsub_ps(a: Self::Vf32, b: Self::Vf32, c: Self::Vf32) -> Self::Vf32;
    unsafe fn fnmsub_ps(a: Self::Vf32, b: Self::Vf32, c: Self::Vf32) -> Self::Vf32;
    unsafe fn fmsub_pd(a: Self::Vf64, b: Self::Vf64, c: Self::Vf64) -> Self::Vf64;
    unsafe fn fnmsub_pd(a: Self::Vf64, b: Self::Vf64, c: Self::Vf64) -> Self::Vf64;
    unsafe fn horizontal_add_ps(a: Self::Vf32) -> f32;
    unsafe fn horizontal_add_pd(a: Self::Vf64) -> f64;
    unsafe fn i32gather_epi32(arr: &[i32], index: Self::Vi32) -> Self::Vi32;
    unsafe fn i64gather_epi64(arr: &[i64], index: Self::Vi64) -> Self::Vi64;
    unsafe fn i32gather_ps(arr: &[f32], index: Self::Vi32) -> Self::Vf32;
    unsafe fn load_ps(a: &f32) -> Self::Vf32;
    unsafe fn load_pd(a: &f64) -> Self::Vf64;
    unsafe fn load_epi16(a: &i16) -> Self::Vi16;
    unsafe fn load_epi32(a: &i32) -> Self::Vi32;
    unsafe fn load_epi64(a: &i64) -> Self::Vi64;
    unsafe fn loadu_ps(a: &f32) -> Self::Vf32;
    unsafe fn loadu_pd(a: &f64) -> Self::Vf64;
    unsafe fn loadu_epi32(a: &i32) -> Self::Vi32;
    unsafe fn loadu_epi64(a: &i64) -> Self::Vi64;
    unsafe fn maskload_epi32(mem_addr: &i32, mask: Self::Vi32) -> Self::Vi32;
    unsafe fn maskload_epi64(mem_addr: &i64, mask: Self::Vi64) -> Self::Vi64;
    unsafe fn maskload_ps(mem_addr: &f32, mask: Self::Vi32) -> Self::Vf32;
    unsafe fn maskload_pd(mem_addr: &f64, mask: Self::Vi64) -> Self::Vf64;
    unsafe fn store_ps(mem_addr: &mut f32, a: Self::Vf32);
    unsafe fn store_pd(mem_addr: &mut f64, a: Self::Vf64);
    unsafe fn store_epi32(mem_addr: &mut i32, a: Self::Vi32);
    unsafe fn store_epi64(mem_addr: &mut i64, a: Self::Vi64);
    unsafe fn storeu_ps(mem_addr: &mut f32, a: Self::Vf32);
    unsafe fn storeu_pd(mem_addr: &mut f64, a: Self::Vf64);
    unsafe fn storeu_epi32(mem_addr: &mut i32, a: Self::Vi32);
    unsafe fn storeu_epi64(mem_addr: &mut i64, a: Self::Vi64);
    unsafe fn maskstore_epi32(mem_addr: &mut i32, mask: Self::Vi32, a: Self::Vi32);
    unsafe fn maskstore_epi64(mem_addr: &mut i64, mask: Self::Vi64, a: Self::Vi64);
    unsafe fn maskstore_ps(mem_addr: &mut f32, mask: Self::Vi32, a: Self::Vf32);
    unsafe fn maskstore_pd(mem_addr: &mut f64, mask: Self::Vi64, a: Self::Vf64);
    unsafe fn max_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn min_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn max_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn min_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32;
    unsafe fn max_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn min_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64;
    unsafe fn mullo_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32;
    unsafe fn mullo_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64;
    unsafe fn rcp_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn round_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn round_pd(a: Self::Vf64) -> Self::Vf64;
    unsafe fn set1_epi32(a: i32) -> Self::Vi32;
    unsafe fn set1_epi64(a: i64) -> Self::Vi64;
    unsafe fn set1_ps(a: f32) -> Self::Vf32;
    unsafe fn set1_pd(a: f64) -> Self::Vf64;
    unsafe fn setzero_ps() -> Self::Vf32;
    unsafe fn setzero_pd() -> Self::Vf64;
    unsafe fn setzero_epi32() -> Self::Vi32;
    unsafe fn setzero_epi64() -> Self::Vi64;
    unsafe fn srai_epi64(a: Self::Vi64, amt_const: i32) -> Self::Vi64;
    unsafe fn srli_epi32(a: Self::Vi32, amt_const: i32) -> Self::Vi32;
    unsafe fn sra_epi32(a: Self::Vi32, amt: i32) -> Self::Vi32;
    unsafe fn srl_epi32(a: Self::Vi32, amt: i32) -> Self::Vi32;
    unsafe fn sll_epi32(a: Self::Vi32, amt: i32) -> Self::Vi32;
    unsafe fn sqrt_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn rsqrt_ps(a: Self::Vf32) -> Self::Vf32;
    unsafe fn sqrt_pd(a: Self::Vf64) -> Self::Vf64;
    unsafe fn rsqrt_pd(a: Self::Vf64) -> Self::Vf64;
    unsafe fn shuffle_epi32(a: Self::Vi32, imm8: i32) -> Self::Vi32;

    // Provided methods
    unsafe fn mul_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 { ... }
    unsafe fn mul_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64 { ... }
    unsafe fn not_epi32(a: Self::Vi32) -> Self::Vi32 { ... }
    unsafe fn not_epi64(a: Self::Vi64) -> Self::Vi64 { ... }
    unsafe fn or_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32 { ... }
    unsafe fn or_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64 { ... }
    unsafe fn or_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 { ... }
    unsafe fn or_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64 { ... }
    unsafe fn xor_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32 { ... }
    unsafe fn xor_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64 { ... }
    unsafe fn xor_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 { ... }
    unsafe fn xor_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64 { ... }
    unsafe fn slli_epi32(a: Self::Vi32, amt_const: i32) -> Self::Vi32 { ... }
    unsafe fn srai_epi32(a: Self::Vi32, amt_const: i32) -> Self::Vi32 { ... }
    unsafe fn div_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 { ... }
    unsafe fn div_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64 { ... }
    unsafe fn add_epi16(a: Self::Vi16, b: Self::Vi16) -> Self::Vi16 { ... }
    unsafe fn sub_epi16(a: Self::Vi16, b: Self::Vi16) -> Self::Vi16 { ... }
    unsafe fn add_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32 { ... }
    unsafe fn add_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64 { ... }
    unsafe fn add_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 { ... }
    unsafe fn add_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64 { ... }
    unsafe fn and_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32 { ... }
    unsafe fn and_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64 { ... }
    unsafe fn and_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 { ... }
    unsafe fn and_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64 { ... }
    unsafe fn sub_epi32(a: Self::Vi32, b: Self::Vi32) -> Self::Vi32 { ... }
    unsafe fn sub_epi64(a: Self::Vi64, b: Self::Vi64) -> Self::Vi64 { ... }
    unsafe fn sub_ps(a: Self::Vf32, b: Self::Vf32) -> Self::Vf32 { ... }
    unsafe fn sub_pd(a: Self::Vf64, b: Self::Vf64) -> Self::Vf64 { ... }
}

The abstract SIMD trait, which is implemented by Avx2, Sse41, Sse2, Scalar, etc.
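As a rough sketch (not taken from the crate's documentation) of how the abstraction is used, a function written once against Simd can be instantiated with whichever implementor the CPU supports at runtime. Avx2 and Scalar below are the crate's AVX2 and scalar implementors and are assumed to be imported; only methods and constants from this trait are used.

use simdeez::Simd;

// One kernel, written once against the trait.
unsafe fn splat_and_sum<S: Simd>(x: f32) -> f32 {
    // Fill every lane with `x` and add the lanes together, so the result is
    // x multiplied by the number of f32 lanes of the chosen backend.
    S::horizontal_add_ps(S::set1_ps(x))
}

fn main() {
    // Pick an implementation at runtime; the generic body is reused as-is.
    let lanes = if is_x86_feature_detected!("avx2") {
        unsafe { splat_and_sum::<Avx2>(1.0) }   // 8.0 with Avx2 (eight f32 lanes)
    } else {
        unsafe { splat_and_sum::<Scalar>(1.0) } // 1.0 with the scalar fallback
    };
    println!("f32 lanes: {}", lanes);
}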

Required Associated Types

Vi16: Vector of i16s. Corresponds to __m128i when used with the Sse impl, __m256i when used with Avx2, or a single i16 when used with Scalar.

Vi32: Vector of i32s. Corresponds to __m128i when used with the Sse impl, __m256i when used with Avx2, or a single i32 when used with Scalar.

Vi64: Vector of i64s. Corresponds to __m128i when used with the Sse impl, __m256i when used with Avx2, or a single i64 when used with Scalar.

Vf32: Vector of f32s. Corresponds to __m128 when used with the Sse impl, __m256 when used with Avx2, or a single f32 when used with Scalar.

Vf64: Vector of f64s. Corresponds to __m128d when used with the Sse impl, __m256d when used with Avx2, or a single f64 when used with Scalar.

Required Associated Constants

The number of lanes in the corresponding vector type (VF32_WIDTH for Vf32, VF64_WIDTH for Vf64, and so on). Necessary for writing lane-width-agnostic code.
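For illustration, a minimal lane-width-agnostic loop over a slice might look like the sketch below (not from the crate's docs; it assumes src and dst have the same length and handles the tail with scalar code):

use simdeez::Simd;

// Multiply every element of `src` by `factor`, writing into `dst`.
// The chunk size is S::VF32_WIDTH, so the same code runs on any backend.
unsafe fn scale<S: Simd>(src: &[f32], dst: &mut [f32], factor: f32) {
    let f = S::set1_ps(factor);
    let mut i = 0;
    while i + S::VF32_WIDTH <= src.len() {
        let v = S::loadu_ps(&src[i]);               // unaligned load of one vector
        S::storeu_ps(&mut dst[i], S::mul_ps(v, f)); // unaligned store of the result
        i += S::VF32_WIDTH;
    }
    while i < src.len() {                           // scalar tail
        dst[i] = src[i] * factor;
        i += 1;
    }
}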

Required Methods

blendv_epi32, blendv_epi64, blendv_ps, blendv_pd: Note that SSE2 will select b only when all bits of a mask lane are 1, while SSE41 and AVX2 check only the high bit. To maintain portability, ensure all mask bits are 1 when using blend; the results of the comparison operations adhere to this.
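For example, building the mask with one of the comparison methods keeps a blend portable across SSE2, SSE41, and AVX2. A sketch using only methods from this trait (max_ps exists; this just shows the mask convention):

// Per-lane maximum via a comparison mask and a blend.
unsafe fn max_via_blend<S: Simd>(a: S::Vf32, b: S::Vf32) -> S::Vf32 {
    let mask = S::cmpgt_ps(b, a); // lanes where b > a have every bit set
    S::blendv_ps(a, b, mask)      // selects b where the mask is set, a elsewhere
}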

Currently the Scalar implementation may produce different results in some cases, depending on the current SSE rounding mode.

fast_round_ps: When using Sse2, fast_round_ps uses a faster version of round that only works on floating-point values small enough to fit in an i32. This is a big performance boost if you don't need a complete round.

fast_ceil_ps: When using Sse2, fast_ceil_ps uses a faster version of ceil that only works on floating-point values small enough to fit in an i32. This is a big performance boost if you don't need a complete ceil.

fast_floor_ps, fast_floor_pd: When using Sse2, fast_floor uses a faster version of floor that only works on floating-point values small enough to fit in an i32. This is a big performance boost if you don't need a complete floor.
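A sketch of how the choice might look in practice; the values_fit_in_i32 flag is hypothetical, standing in for whatever guarantee the caller can make about its inputs:

// Fractional part of each lane, taking the cheaper floor when it is safe.
unsafe fn fract<S: Simd>(v: S::Vf32, values_fit_in_i32: bool) -> S::Vf32 {
    let floored = if values_fit_in_i32 {
        S::fast_floor_ps(v) // faster Sse2 path, limited input range
    } else {
        S::floor_ps(v)      // complete floor, always correct
    };
    S::sub_ps(v, floored)
}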

fmadd_ps, fnmadd_ps, fmadd_pd, fnmadd_pd: Actual FMA instructions will be used when Avx2 is in use; otherwise a mul and an add are used to replicate it, allowing you to always use FMA in your code and get the best performance in both cases.

fmsub_ps, fnmsub_ps, fmsub_pd, fnmsub_pd: Actual FMA instructions will be used when Avx2 is in use; otherwise a mul and a sub are used to replicate it, allowing you to always use FMA in your code and get the best performance in both cases.
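As a sketch, a small polynomial evaluated with fmadd_ps reads the same for every backend. The comments assume the usual Intel convention that fmadd_ps(a, b, c) computes a*b + c, which these methods wrap:

// Horner evaluation of c2*x^2 + c1*x + c0 with one fused multiply-add per step.
unsafe fn poly2<S: Simd>(x: S::Vf32, c2: f32, c1: f32, c0: f32) -> S::Vf32 {
    let mut acc = S::set1_ps(c2);
    acc = S::fmadd_ps(acc, x, S::set1_ps(c1)); // acc * x + c1
    acc = S::fmadd_ps(acc, x, S::set1_ps(c0)); // acc * x + c0
    acc
}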

horizontal_add_ps, horizontal_add_pd: Adds all lanes together into a single scalar. Distinct from h_add, which adds pairs.
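A typical use is collapsing an accumulator once at the end of a loop, as in this dot-product sketch (it assumes a and b have the same length):

// Dot product: accumulate lane-wise products, collapse the lanes once at the end.
unsafe fn dot<S: Simd>(a: &[f32], b: &[f32]) -> f32 {
    let mut acc = S::setzero_ps();
    let mut i = 0;
    while i + S::VF32_WIDTH <= a.len() {
        acc = S::fmadd_ps(S::loadu_ps(&a[i]), S::loadu_ps(&b[i]), acc);
        i += S::VF32_WIDTH;
    }
    let mut sum = S::horizontal_add_ps(acc); // add all lanes together
    while i < a.len() {                      // scalar tail
        sum += a[i] * b[i];
        i += 1;
    }
    sum
}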

For the gather operations, the Sse2 and Sse41 paths simulate a gather by breaking out and doing scalar array accesses, because gather doesn't exist until Avx2.
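A table-lookup sketch; it assumes idx holds at least S::VI32_WIDTH indices and that every index is in bounds for table:

// Fetch table[idx[0..width]] as one vector.
unsafe fn lookup<S: Simd>(table: &[f32], idx: &[i32]) -> S::Vf32 {
    let indices = S::loadu_epi32(&idx[0]); // one vector of indices
    S::i32gather_ps(table, indices)        // real gather on Avx2, scalar accesses on Sse
}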

maskload_epi32, maskload_epi64, maskload_ps, maskload_pd: Note that SSE2 and SSE4 will load when mask[i] is nonzero, whereas AVX2 will load only when the high bit is set. To ensure portability, ensure that the high bit of the mask is set.

maskstore_epi32, maskstore_epi64, maskstore_ps, maskstore_pd: Note that SSE2 and SSE4 will store when mask[i] is nonzero, whereas AVX2 will store only when the high bit is set. To ensure portability, ensure that the high bit of the mask is set.
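A sketch of the usual remainder-handling pattern: the mask comes from a comparison, so every selected lane has all bits (including the high bit) set. The lane_ids argument is a hypothetical helper slice holding 0, 1, 2, ... with at least S::VI32_WIDTH entries, and dst is assumed to be at least as long as src.

// Scale the final, partial chunk of `src` into `dst` with masked load/store.
unsafe fn scale_tail<S: Simd>(src: &[f32], dst: &mut [f32], start: usize, lane_ids: &[i32]) {
    let remaining = (src.len() - start) as i32;
    // Lanes whose index is below `remaining` get every mask bit set.
    let mask = S::cmplt_epi32(S::loadu_epi32(&lane_ids[0]), S::set1_epi32(remaining));
    let v = S::maskload_ps(&src[start], mask);                            // load only selected lanes
    S::maskstore_ps(&mut dst[start], mask, S::mul_ps(v, S::set1_ps(2.0))); // store only selected lanes
}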

Mullo is implemented for Sse2 by combining other Sse2 operations.

Round is implemented for Sse2 by combining other Sse2 operations.

srai_epi64, srli_epi32: amt_const must be a constant.

sra_epi32: amt does not have to be a constant, but may be slower than the srai version.

srl_epi32: amt does not have to be a constant, but may be slower than the srli version.

sll_epi32: amt does not have to be a constant, but may be slower than the slli version.
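A small sketch contrasting the two forms:

// Immediate vs. runtime shift counts.
unsafe fn shifts<S: Simd>(v: S::Vi32, n: i32) -> (S::Vi32, S::Vi32) {
    let by_const = S::srli_epi32(v, 3); // shift count is a literal constant
    let by_var = S::srl_epi32(v, n);    // runtime count; may be slower
    (by_const, by_var)
}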

Provided Methods

slli_epi32, srai_epi32: amt_const must be a constant.

Implementors