use core::ptr;
use super::*;
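/// Zero-sized proof token for AVX support; the only way to obtain one is
/// through `Feature::get_support`, which consults runtime feature detection.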
pub struct AVX(());
impl Feature<FeatureGroup> for AVX {
fn get_support(runtime: &RuntimeSupport) -> Option<Self> {
runtime.avx().then_some(Self(()))
}
}
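// 256-bit integer operations: lane-0 set/get, 128-bit to 256-bit
// zero-extension and truncation, and unaligned/aligned loads and stores.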
impl<FS> Use<FS>
where FS: HasFeature<FeatureGroup, AVX> {
defn_simd_shared!("avx", { simd_set_first(value) }, {
#[intrinsic_for("vmovd")]
#[intel_equivalents("_mm_cvtsi32_si128")]
pub fn set_first_u32x8(value: u32) -> u32x8;
#[intrinsic_for("vmovd")]
#[intel_equivalents("_mm_cvtsi32_si128")]
pub fn set_first_s32x8(value: i32) -> s32x8;
#[intrinsic_for("vmovq")]
#[intel_equivalents("_mm_cvtsi64_si128", "_mm_cvtsi64x_si128")]
#[cfg(target_arch = "x86_64")]
pub fn set_first_u64x4(value: u64) -> u64x4;
#[intrinsic_for("vmovq")]
#[intel_equivalents("_mm_cvtsi64_si128", "_mm_cvtsi64x_si128")]
#[cfg(target_arch = "x86_64")]
pub fn set_first_s64x4(value: i64) -> s64x4;
});
defn_simd_shared!("avx", fn(T) -> R {
simd_shuffle(value, T::splat(0), const {
simd_slice_indices::<R>(0)
})
}, {
#[intel_equivalents("_mm256_castsi128_si256", "_mm256_zextsi128_si256")]
pub fn set_first_u8x16x2(value: u8x16) -> u8x32;
#[intel_equivalents("_mm256_castsi128_si256", "_mm256_zextsi128_si256")]
pub fn set_first_u16x8x2(value: u16x8) -> u16x16;
#[intel_equivalents("_mm256_castsi128_si256", "_mm256_zextsi128_si256")]
pub fn set_first_u32x4x2(value: u32x4) -> u32x8;
#[intel_equivalents("_mm256_castsi128_si256", "_mm256_zextsi128_si256")]
pub fn set_first_u64x2x2(value: u64x2) -> u64x4;
#[intel_equivalents("_mm256_castsi128_si256", "_mm256_zextsi128_si256")]
pub fn set_first_s8x16x2(value: s8x16) -> s8x32;
#[intel_equivalents("_mm256_castsi128_si256", "_mm256_zextsi128_si256")]
pub fn set_first_s16x8x2(value: s16x8) -> s16x16;
#[intel_equivalents("_mm256_castsi128_si256", "_mm256_zextsi128_si256")]
pub fn set_first_s32x4x2(value: s32x4) -> s32x8;
#[intel_equivalents("_mm256_castsi128_si256", "_mm256_zextsi128_si256")]
pub fn set_first_s64x2x2(value: s64x2) -> s64x4;
});
defn_simd_shared!("avx", { simd_extract(x, 0) }, {
#[intrinsic_for("vmovd")]
#[intel_equivalents("_mm256_cvtsi256_si32", "_mm_cvtsi128_si32")]
pub fn get_first_u32x8(x: u32x8) -> u32;
#[intrinsic_for("vmovq")]
#[intel_equivalents("_mm_cvtsi128_si64", "_mm_cvtsi128_si64x")]
#[cfg(target_arch = "x86_64")]
pub fn get_first_u64x4(x: u64x4) -> u64;
#[intrinsic_for("vmovd")]
#[intel_equivalents("_mm256_cvtsi256_si32", "_mm_cvtsi128_si32")]
pub fn get_first_s32x8(x: s32x8) -> i32;
#[intrinsic_for("vmovq")]
#[intel_equivalents("_mm_cvtsi128_si64", "_mm_cvtsi128_si64x")]
#[cfg(target_arch = "x86_64")]
pub fn get_first_s64x4(x: s64x4) -> i64;
});
defn_simd_shared!("avx", fn(T) -> R {
simd_shuffle(value, T::splat(0), const {
simd_slice_indices::<R>(0)
})
}, {
#[intel_equivalents("_mm256_castsi256_si128")]
pub fn get_first_u8x16x2(value: u8x32) -> u8x16;
#[intel_equivalents("_mm256_castsi256_si128")]
pub fn get_first_u16x8x2(value: u16x16) -> u16x8;
#[intel_equivalents("_mm256_castsi256_si128")]
pub fn get_first_u32x4x2(value: u32x8) -> u32x4;
#[intel_equivalents("_mm256_castsi256_si128")]
pub fn get_first_u64x2x2(value: u64x4) -> u64x2;
#[intel_equivalents("_mm256_castsi256_si128")]
pub fn get_first_s8x16x2(value: s8x32) -> s8x16;
#[intel_equivalents("_mm256_castsi256_si128")]
pub fn get_first_s16x8x2(value: s16x16) -> s16x8;
#[intel_equivalents("_mm256_castsi256_si128")]
pub fn get_first_s32x4x2(value: s32x8) -> s32x4;
#[intel_equivalents("_mm256_castsi256_si128")]
pub fn get_first_s64x2x2(value: s64x4) -> s64x2;
});
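// Unaligned loads and stores go through a plain array reference using
// `read_unaligned`/`write_unaligned`; the `_aligned` variants below go through
// a reference to the vector type itself and therefore require its alignment.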
defn_simd_shared!("avx", {
ptr::read_unaligned(ptr as *const _ as *const _)
}, {
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_loadu_si256")]
pub fn load_u8x32(ptr: &[u8; 32]) -> u8x32;
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_loadu_si256")]
pub fn load_u16x16(ptr: &[u16; 16]) -> u16x16;
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_loadu_si256")]
pub fn load_u32x8(ptr: &[u32; 8]) -> u32x8;
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_loadu_si256")]
pub fn load_u64x4(ptr: &[u64; 4]) -> u64x4;
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_loadu_si256")]
pub fn load_s8x32(ptr: &[i8; 32]) -> s8x32;
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_loadu_si256")]
pub fn load_s16x16(ptr: &[i16; 16]) -> s16x16;
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_loadu_si256")]
pub fn load_s32x8(ptr: &[i32; 8]) -> s32x8;
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_loadu_si256")]
pub fn load_s64x4(ptr: &[i64; 4]) -> s64x4;
});
defn_simd_shared!("avx", { *ptr }, {
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm256_load_si256")]
pub fn load_aligned_u8x32(ptr: &u8x32) -> u8x32;
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm256_load_si256")]
pub fn load_aligned_u16x16(ptr: &u16x16) -> u16x16;
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm256_load_si256")]
pub fn load_aligned_u32x8(ptr: &u32x8) -> u32x8;
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm256_load_si256")]
pub fn load_aligned_u64x4(ptr: &u64x4) -> u64x4;
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm256_load_si256")]
pub fn load_aligned_s8x32(ptr: &s8x32) -> s8x32;
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm256_load_si256")]
pub fn load_aligned_s16x16(ptr: &s16x16) -> s16x16;
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm256_load_si256")]
pub fn load_aligned_s32x8(ptr: &s32x8) -> s32x8;
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm256_load_si256")]
pub fn load_aligned_s64x4(ptr: &s64x4) -> s64x4;
});
defn_simd_shared!("avx", {
ptr::write_unaligned(ptr as *mut _ as *mut _, x)
}, {
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_storeu_si256")]
pub fn store_u8x32(x: u8x32, ptr: &mut [u8; 32]);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_storeu_si256")]
pub fn store_u16x16(x: u16x16, ptr: &mut [u16; 16]);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_storeu_si256")]
pub fn store_u32x8(x: u32x8, ptr: &mut [u32; 8]);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_storeu_si256")]
pub fn store_u64x4(x: u64x4, ptr: &mut [u64; 4]);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_storeu_si256")]
pub fn store_s8x32(x: s8x32, ptr: &mut [i8; 32]);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_storeu_si256")]
pub fn store_s16x16(x: s16x16, ptr: &mut [i16; 16]);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_storeu_si256")]
pub fn store_s32x8(x: s32x8, ptr: &mut [i32; 8]);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_storeu_si256")]
pub fn store_s64x4(x: s64x4, ptr: &mut [i64; 4]);
});
defn_simd_shared!("avx", { *ptr = x }, {
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_storeu_si256")]
pub fn store_aligned_u8x32(x: u8x32, ptr: &mut u8x32);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_storeu_si256")]
pub fn store_aligned_u16x16(x: u16x16, ptr: &mut u16x16);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_storeu_si256")]
pub fn store_aligned_u32x8(x: u32x8, ptr: &mut u32x8);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_storeu_si256")]
pub fn store_aligned_u64x4(x: u64x4, ptr: &mut u64x4);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_storeu_si256")]
pub fn store_aligned_s8x32(x: s8x32, ptr: &mut s8x32);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_storeu_si256")]
pub fn store_aligned_s16x16(x: s16x16, ptr: &mut s16x16);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_storeu_si256")]
pub fn store_aligned_s32x8(x: s32x8, ptr: &mut s32x8);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm256_storeu_si256")]
pub fn store_aligned_s64x4(x: s64x4, ptr: &mut s64x4);
});
}
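// 128-bit integer operations: the VEX-encoded forms of the SSE2-era
// instructions, all available whenever AVX is supported.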
impl<FS> Use<FS>
where FS: HasFeature<FeatureGroup, AVX> {
defn_simd_shared!("avx", { simd_set_first(value) }, {
#[intrinsic_for("vmovd")]
#[intel_equivalents("_mm_cvtsi32_si128")]
pub fn set_first_u32x4(value: u32) -> u32x4;
#[intrinsic_for("vmovd")]
#[intel_equivalents("_mm_cvtsi32_si128")]
pub fn set_first_s32x4(value: i32) -> s32x4;
#[intrinsic_for("vmovq")]
#[intel_equivalents("_mm_cvtsi64_si128", "_mm_cvtsi64x_si128")]
#[cfg(target_arch = "x86_64")]
pub fn set_first_u64x2(value: u64) -> u64x2;
#[intrinsic_for("vmovq")]
#[intel_equivalents("_mm_cvtsi64_si128", "_mm_cvtsi64x_si128")]
#[cfg(target_arch = "x86_64")]
pub fn set_first_s64x2(value: i64) -> s64x2;
});
defn_simd_shared!("avx", { simd_extract(x, 0) }, {
#[intrinsic_for("vmovd")]
#[intel_equivalents("_mm_cvtsi128_si32")]
pub fn get_first_u32x4(x: u32x4) -> u32;
#[intrinsic_for("vmovq")]
#[intel_equivalents("_mm_cvtsi128_si64", "_mm_cvtsi128_si64x")]
pub fn get_first_u64x2(x: u64x2) -> u64;
#[intrinsic_for("vmovd")]
#[intel_equivalents("_mm_cvtsi128_si32")]
pub fn get_first_s32x4(x: s32x4) -> i32;
#[intrinsic_for("vmovq")]
#[intel_equivalents("_mm_cvtsi128_si64", "_mm_cvtsi128_si64x")]
pub fn get_first_s64x2(x: s64x2) -> i64;
});
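// Lane insert/extract at a const-generic index; the index is bounds-checked
// against the lane count at compile time via `const_assert!`.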
defn_simd_shared!("avx", fn(T, E) -> R {
const_assert!(INDEX < T::LEN as u8);
simd_insert(x, const { INDEX as u32 }, e)
}, {
#[intrinsic_for("vpinsrw")]
#[intel_equivalents("_mm_insert_epi16")]
pub fn put_u16x8<INDEX: u8>(x: u16x8, e: u16) -> u16x8;
#[intrinsic_for("vpinsrw")]
#[intel_equivalents("_mm_insert_epi16")]
pub fn put_s16x8<INDEX: u8>(x: s16x8, e: i16) -> s16x8;
});
defn_simd_shared!("avx", fn(T) -> E {
const_assert!(INDEX < T::LEN as u8);
simd_extract(x, const { INDEX as u32 })
}, {
#[intrinsic_for("vpextrw")]
#[intel_equivalents("_mm_extract_epi16")]
pub fn get_u16x8<INDEX: u8>(x: u16x8) -> u16;
#[intrinsic_for("vpextrw")]
#[intel_equivalents("_mm_extract_epi16")]
pub fn get_s16x8<INDEX: u8>(x: s16x8) -> i16;
});
defn_simd_shared!("avx", {
ptr::read_unaligned(ptr as *const _ as *const _)
}, {
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm_loadu_si128")]
pub fn load_u8x16(ptr: &[u8; 16]) -> u8x16;
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm_loadu_si128")]
pub fn load_u16x8(ptr: &[u16; 8]) -> u16x8;
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm_loadu_si128")]
pub fn load_u32x4(ptr: &[u32; 4]) -> u32x4;
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm_loadu_si128")]
pub fn load_u64x2(ptr: &[u64; 2]) -> u64x2;
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm_loadu_si128")]
pub fn load_s8x16(ptr: &[i8; 16]) -> s8x16;
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm_loadu_si128")]
pub fn load_s16x8(ptr: &[i16; 8]) -> s16x8;
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm_loadu_si128")]
pub fn load_s32x4(ptr: &[i32; 4]) -> s32x4;
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm_loadu_si128")]
pub fn load_s64x2(ptr: &[i64; 2]) -> s64x2;
});
defn_simd_shared!("avx", { *ptr }, {
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm_load_si128")]
pub fn load_aligned_u8x16(ptr: &u8x16) -> u8x16;
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm_load_si128")]
pub fn load_aligned_u16x8(ptr: &u16x8) -> u16x8;
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm_load_si128")]
pub fn load_aligned_u32x4(ptr: &u32x4) -> u32x4;
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm_load_si128")]
pub fn load_aligned_u64x2(ptr: &u64x2) -> u64x2;
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm_load_si128")]
pub fn load_aligned_s8x16(ptr: &s8x16) -> s8x16;
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm_load_si128")]
pub fn load_aligned_s16x8(ptr: &s16x8) -> s16x8;
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm_load_si128")]
pub fn load_aligned_s32x4(ptr: &s32x4) -> s32x4;
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm_load_si128")]
pub fn load_aligned_s64x2(ptr: &s64x2) -> s64x2;
});
defn_simd_shared!("avx", {
ptr::write_unaligned(ptr as *mut _ as *mut _, x)
}, {
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm_storeu_si128")]
pub fn store_u8x16(x: u8x16, ptr: &mut [u8; 16]);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm_storeu_si128")]
pub fn store_u16x8(x: u16x8, ptr: &mut [u16; 8]);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm_storeu_si128")]
pub fn store_u32x4(x: u32x4, ptr: &mut [u32; 4]);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm_storeu_si128")]
pub fn store_u64x2(x: u64x2, ptr: &mut [u64; 2]);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm_storeu_si128")]
pub fn store_s8x16(x: s8x16, ptr: &mut [i8; 16]);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm_storeu_si128")]
pub fn store_s16x8(x: s16x8, ptr: &mut [i16; 8]);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm_storeu_si128")]
pub fn store_s32x4(x: s32x4, ptr: &mut [i32; 4]);
#[intrinsic_for("vmovdqu")]
#[intel_equivalents("_mm_storeu_si128")]
pub fn store_s64x2(x: s64x2, ptr: &mut [i64; 2]);
});
defn_simd_shared!("avx", { *ptr = x }, {
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm_store_si128")]
pub fn store_aligned_u8x16(x: u8x16, ptr: &mut u8x16);
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm_store_si128")]
pub fn store_aligned_u16x8(x: u16x8, ptr: &mut u16x8);
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm_store_si128")]
pub fn store_aligned_u32x4(x: u32x4, ptr: &mut u32x4);
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm_store_si128")]
pub fn store_aligned_u64x2(x: u64x2, ptr: &mut u64x2);
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm_store_si128")]
pub fn store_aligned_s8x16(x: s8x16, ptr: &mut s8x16);
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm_store_si128")]
pub fn store_aligned_s16x8(x: s16x8, ptr: &mut s16x8);
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm_store_si128")]
pub fn store_aligned_s32x4(x: s32x4, ptr: &mut s32x4);
#[intrinsic_for("vmovdqa")]
#[intel_equivalents("_mm_store_si128")]
pub fn store_aligned_s64x2(x: s64x2, ptr: &mut s64x2);
});
defn_simd_shared!("avx", { simd_add(a, b) }, {
#[intrinsic_for("vpaddb")]
#[intel_equivalents("_mm_add_epi8")]
pub fn add_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("vpaddw")]
#[intel_equivalents("_mm_add_epi16")]
pub fn add_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("vpaddd")]
#[intel_equivalents("_mm_add_epi32")]
pub fn add_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("vpaddq")]
#[intel_equivalents("_mm_add_epi64")]
pub fn add_u64x2(a: u64x2, b: u64x2) -> u64x2;
#[intrinsic_for("vpaddb")]
#[intel_equivalents("_mm_add_epi8")]
pub fn add_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("vpaddw")]
#[intel_equivalents("_mm_add_epi16")]
pub fn add_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("vpaddd")]
#[intel_equivalents("_mm_add_epi32")]
pub fn add_s32x4(a: s32x4, b: s32x4) -> s32x4;
#[intrinsic_for("vpaddq")]
#[intel_equivalents("_mm_add_epi64")]
pub fn add_s64x2(a: s64x2, b: s64x2) -> s64x2;
});
defn_simd_shared!("avx", { simd_sub(a, b) }, {
#[intrinsic_for("vpsubb")]
#[intel_equivalents("_mm_sub_epi8")]
pub fn sub_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("vpsubw")]
#[intel_equivalents("_mm_sub_epi16")]
pub fn sub_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("vpsubd")]
#[intel_equivalents("_mm_sub_epi32")]
pub fn sub_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("vpsubq")]
#[intel_equivalents("_mm_sub_epi64")]
pub fn sub_u64x2(a: u64x2, b: u64x2) -> u64x2;
#[intrinsic_for("vpsubb")]
#[intel_equivalents("_mm_sub_epi8")]
pub fn sub_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("vpsubw")]
#[intel_equivalents("_mm_sub_epi16")]
pub fn sub_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("vpsubd")]
#[intel_equivalents("_mm_sub_epi32")]
pub fn sub_s32x4(a: s32x4, b: s32x4) -> s32x4;
#[intrinsic_for("vpsubq")]
#[intel_equivalents("_mm_sub_epi64")]
pub fn sub_s64x2(a: s64x2, b: s64x2) -> s64x2;
});
defn_simd_shared!("avx", { simd_saturating_add(a, b) }, {
#[intrinsic_for("vpaddsb")]
#[intel_equivalents("_mm_adds_epi8")]
pub fn saturating_add_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("vpaddsw")]
#[intel_equivalents("_mm_adds_epi16")]
pub fn saturating_add_s16x8(a: s16x8, b: s16x8) -> s16x8;
});
defn_simd_shared!("avx", { simd_saturating_sub(a, b) }, {
#[intrinsic_for("vpsubsb")]
#[intel_equivalents("_mm_subs_epi8")]
pub fn saturating_sub_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("vpsubsw")]
#[intel_equivalents("_mm_subs_epi16")]
pub fn saturating_sub_s16x8(a: s16x8, b: s16x8) -> s16x8;
});
defn_simd_shared!("avx", { simd_mul(a, b) }, {
#[intrinsic_for("vpmullw")]
#[intel_equivalents("_mm_mullo_epi16")]
pub fn mul_lo_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("vpmullw")]
#[intel_equivalents("_mm_mullo_epi16")]
pub fn mul_lo_s16x8(a: s16x8, b: s16x8) -> s16x8;
});
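// High 16-bit multiply: widen both operands to 32 bits, multiply, shift each
// product right by 16, then narrow back to 16-bit lanes.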
defn_simd_manual!("avx", {
#[intrinsic_for("vpmulhuw")]
#[intel_equivalents("_mm_mulhi_epu16")]
pub fn mul_hi_u16x8(a: u16x8, b: u16x8) -> u16x8 {
let prod = simd_mul(simd_cast(a), simd_cast(b));
simd_cast(simd_shr(prod, u32x8::splat(16)))
}
#[intrinsic_for("vpmulhw")]
#[intel_equivalents("_mm_mulhi_epi16")]
pub fn mul_hi_s16x8(a: s16x8, b: s16x8) -> s16x8 {
let prod = simd_mul(simd_cast(a), simd_cast(b));
simd_cast(simd_shr(prod, s32x8::splat(16)))
}
});
defn_simd_manual!("avx", {
#[intrinsic_for("vpmuludq")]
#[intel_equivalents("_mm_mul_epu32")]
pub fn mul_u32_u64x2(a: u64x2, b: u64x2) -> u64x2 {
let [a, b]: [u32x2; 2] = [simd_cast(a), simd_cast(b)];
simd_mul(simd_cast(a), simd_cast(b))
}
});
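// vpmaddwd: multiply adjacent pairs of 16-bit lanes and sum each pair into a
// signed 32-bit lane.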
defn_simd_llvm!("avx", {
#[intrinsic_for("vpmaddwd")]
#[intel_equivalents("_mm_madd_epi16")]
pub fn sum_of_prod_s16x2x4
(x: s16x8, y: s16x8) -> s32x4
= "llvm.x86.sse2.pmadd.wd";
});
defn_simd_shared!("avx", { simd_eq(a, b) }, {
#[intrinsic_for("vpcmpeqb")]
#[intel_equivalents("_mm_cmpeq_epi8")]
pub fn cmp_eq_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("vpcmpeqw")]
#[intel_equivalents("_mm_cmpeq_epi16")]
pub fn cmp_eq_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("vpcmpeqd")]
#[intel_equivalents("_mm_cmpeq_epi32")]
pub fn cmp_eq_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("vpcmpeqb")]
#[intel_equivalents("_mm_cmpeq_epi8")]
pub fn cmp_eq_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("vpcmpeqw")]
#[intel_equivalents("_mm_cmpeq_epi16")]
pub fn cmp_eq_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("vpcmpeqd")]
#[intel_equivalents("_mm_cmpeq_epi32")]
pub fn cmp_eq_s32x4(a: s32x4, b: s32x4) -> s32x4;
});
defn_simd_shared!("avx", { simd_gt(a, b) }, {
#[intrinsic_for("vpcmpgtb")]
#[intel_equivalents("_mm_cmpgt_epi8", "_mm_cmplt_epi8")]
pub fn cmp_gt_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("vpcmpgtw")]
#[intel_equivalents("_mm_cmpgt_epi16", "_mm_cmplt_epi16")]
pub fn cmp_gt_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("vpcmpgtd")]
#[intel_equivalents("_mm_cmpgt_epi32", "_mm_cmplt_epi32")]
pub fn cmp_gt_s32x4(a: s32x4, b: s32x4) -> s32x4;
});
defn_simd_shared!("avx", { simd_and(a, b) }, {
#[intrinsic_for("vpand")]
#[intel_equivalents("_mm_and_si128")]
pub fn and_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("vpand")]
#[intel_equivalents("_mm_and_si128")]
pub fn and_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("vpand")]
#[intel_equivalents("_mm_and_si128")]
pub fn and_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("vpand")]
#[intel_equivalents("_mm_and_si128")]
pub fn and_u64x2(a: u64x2, b: u64x2) -> u64x2;
#[intrinsic_for("vpand")]
#[intel_equivalents("_mm_and_si128")]
pub fn and_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("vpand")]
#[intel_equivalents("_mm_and_si128")]
pub fn and_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("vpand")]
#[intel_equivalents("_mm_and_si128")]
pub fn and_s32x4(a: s32x4, b: s32x4) -> s32x4;
#[intrinsic_for("vpand")]
#[intel_equivalents("_mm_and_si128")]
pub fn and_s64x2(a: s64x2, b: s64x2) -> s64x2;
});
defn_simd_shared!("avx", { simd_or(a, b) }, {
#[intrinsic_for("vpor")]
#[intel_equivalents("_mm_or_si128")]
pub fn ior_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("vpor")]
#[intel_equivalents("_mm_or_si128")]
pub fn ior_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("vpor")]
#[intel_equivalents("_mm_or_si128")]
pub fn ior_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("vpor")]
#[intel_equivalents("_mm_or_si128")]
pub fn ior_u64x2(a: u64x2, b: u64x2) -> u64x2;
#[intrinsic_for("vpor")]
#[intel_equivalents("_mm_or_si128")]
pub fn ior_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("vpor")]
#[intel_equivalents("_mm_or_si128")]
pub fn ior_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("vpor")]
#[intel_equivalents("_mm_or_si128")]
pub fn ior_s32x4(a: s32x4, b: s32x4) -> s32x4;
#[intrinsic_for("vpor")]
#[intel_equivalents("_mm_or_si128")]
pub fn ior_s64x2(a: s64x2, b: s64x2) -> s64x2;
});
defn_simd_shared!("avx", { simd_xor(a, b) }, {
#[intrinsic_for("vpxor")]
#[intel_equivalents("_mm_xor_si128")]
pub fn xor_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("vpxor")]
#[intel_equivalents("_mm_xor_si128")]
pub fn xor_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("vpxor")]
#[intel_equivalents("_mm_xor_si128")]
pub fn xor_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("vpxor")]
#[intel_equivalents("_mm_xor_si128")]
pub fn xor_u64x2(a: u64x2, b: u64x2) -> u64x2;
#[intrinsic_for("vpxor")]
#[intel_equivalents("_mm_xor_si128")]
pub fn xor_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("vpxor")]
#[intel_equivalents("_mm_xor_si128")]
pub fn xor_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("vpxor")]
#[intel_equivalents("_mm_xor_si128")]
pub fn xor_s32x4(a: s32x4, b: s32x4) -> s32x4;
#[intrinsic_for("vpxor")]
#[intel_equivalents("_mm_xor_si128")]
pub fn xor_s64x2(a: s64x2, b: s64x2) -> s64x2;
});
defn_simd_shared!("avx", { simd_andnot(a, b) }, {
#[intrinsic_for("vpandn")]
#[intel_equivalents("_mm_andnot_si128")]
pub fn and_not_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("vpandn")]
#[intel_equivalents("_mm_andnot_si128")]
pub fn and_not_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("vpandn")]
#[intel_equivalents("_mm_andnot_si128")]
pub fn and_not_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("vpandn")]
#[intel_equivalents("_mm_andnot_si128")]
pub fn and_not_u64x2(a: u64x2, b: u64x2) -> u64x2;
#[intrinsic_for("vpandn")]
#[intel_equivalents("_mm_andnot_si128")]
pub fn and_not_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("vpandn")]
#[intel_equivalents("_mm_andnot_si128")]
pub fn and_not_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("vpandn")]
#[intel_equivalents("_mm_andnot_si128")]
pub fn and_not_s32x4(a: s32x4, b: s32x4) -> s32x4;
#[intrinsic_for("vpandn")]
#[intel_equivalents("_mm_andnot_si128")]
pub fn and_not_s64x2(a: s64x2, b: s64x2) -> s64x2;
});
defn_simd_manual!("avx", {
#[intrinsic_for("vpavgb")]
#[intel_equivalents("_mm_avg_epu8")]
pub fn avg_u8x16(a: u8x16, b: u8x16) -> u8x16
= simd_avg::<u8x16, u16x16>;
#[intrinsic_for("vpavgw")]
#[intel_equivalents("_mm_avg_epu16")]
pub fn avg_u16x8(a: u16x8, b: u16x8) -> u16x8
= simd_avg::<u16x8, u32x8>;
});
defn_simd_shared!("avx", { simd_max(a, b) }, {
#[intrinsic_for("vpmaxub")]
#[intel_equivalents("_mm_max_epu8")]
pub fn max_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("vpmaxsw")]
#[intel_equivalents("_mm_max_epi16")]
pub fn max_s16x8(a: s16x8, b: s16x8) -> s16x8;
});
defn_simd_shared!("avx", { simd_min(a, b) }, {
#[intrinsic_for("vpminub")]
#[intel_equivalents("_mm_min_epu8")]
pub fn min_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("vpminsw")]
#[intel_equivalents("_mm_min_epi16")]
pub fn min_s16x8(a: s16x8, b: s16x8) -> s16x8;
});
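// Lane-wise shifts. `shl_all_*`/`shr_all_*` shift every lane by the count held
// in the low 64 bits of `s`; the `*_by` variants take the count as a const
// generic and correspond to the immediate-count encodings. Right shifts of
// signed lanes are arithmetic (vpsraw/vpsrad).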
defn_simd_llvm!("avx", {
#[intrinsic_for("vpsllw")]
#[intel_equivalents("_mm_sll_epi16")]
pub fn shl_all_u16x8(x: u16x8, s: u64x2) -> u16x8
= "llvm.x86.sse2.psll.w";
#[intrinsic_for("vpslld")]
#[intel_equivalents("_mm_sll_epi32")]
pub fn shl_all_u32x4(x: u32x4, s: u64x2) -> u32x4
= "llvm.x86.sse2.psll.d";
#[intrinsic_for("vpsllq")]
#[intel_equivalents("_mm_sll_epi64")]
pub fn shl_all_u64x2(x: u64x2, s: u64x2) -> u64x2
= "llvm.x86.sse2.psll.q";
#[intrinsic_for("vpsllw")]
#[intel_equivalents("_mm_sll_epi16")]
pub fn shl_all_s16x8(x: s16x8, s: u64x2) -> s16x8
= "llvm.x86.sse2.psll.w";
#[intrinsic_for("vpslld")]
#[intel_equivalents("_mm_sll_epi32")]
pub fn shl_all_s32x4(x: s32x4, s: u64x2) -> s32x4
= "llvm.x86.sse2.psll.d";
#[intrinsic_for("vpsllq")]
#[intel_equivalents("_mm_sll_epi64")]
pub fn shl_all_s64x2(x: s64x2, s: u64x2) -> s64x2
= "llvm.x86.sse2.psll.q";
});
defn_simd_shared!("avx", { simd_shl_all::<_, BITS>(x) }, {
#[intrinsic_for("vpsllw")]
#[intel_equivalents("_mm_slli_epi16")]
pub fn shl_all_by_u16x8<BITS: u8>(x: u16x8) -> u16x8;
#[intrinsic_for("vpslld")]
#[intel_equivalents("_mm_slli_epi32")]
pub fn shl_all_by_u32x4<BITS: u8>(x: u32x4) -> u32x4;
#[intrinsic_for("vpsllq")]
#[intel_equivalents("_mm_slli_epi64")]
pub fn shl_all_by_u64x2<BITS: u8>(x: u64x2) -> u64x2;
#[intrinsic_for("vpsllw")]
#[intel_equivalents("_mm_slli_epi16")]
pub fn shl_all_by_s16x8<BITS: u8>(x: s16x8) -> s16x8;
#[intrinsic_for("vpslld")]
#[intel_equivalents("_mm_slli_epi32")]
pub fn shl_all_by_s32x4<BITS: u8>(x: s32x4) -> s32x4;
#[intrinsic_for("vpsllq")]
#[intel_equivalents("_mm_slli_epi64")]
pub fn shl_all_by_s64x2<BITS: u8>(x: s64x2) -> s64x2;
});
defn_simd_llvm!("avx", {
#[intrinsic_for("vpsrlw")]
#[intel_equivalents("_mm_srl_epi16")]
pub fn shr_all_u16x8(x: u16x8, s: u64x2) -> u16x8
= "llvm.x86.sse2.psrl.w";
#[intrinsic_for("vpsrld")]
#[intel_equivalents("_mm_srl_epi32")]
pub fn shr_all_u32x4(x: u32x4, s: u64x2) -> u32x4
= "llvm.x86.sse2.psrl.d";
#[intrinsic_for("vpsrlq")]
#[intel_equivalents("_mm_srl_epi64")]
pub fn shr_all_u64x2(x: u64x2, s: u64x2) -> u64x2
= "llvm.x86.sse2.psrl.q";
#[intrinsic_for("vpsraw")]
#[intel_equivalents("_mm_sra_epi16")]
pub fn shr_all_s16x8(x: s16x8, s: u64x2) -> s16x8
= "llvm.x86.sse2.psra.w";
#[intrinsic_for("vpsrad")]
#[intel_equivalents("_mm_sra_epi32")]
pub fn shr_all_s32x4(x: s32x4, s: u64x2) -> s32x4
= "llvm.x86.sse2.psra.d";
});
defn_simd_shared!("avx", { simd_shr_all::<_, BITS>(x) }, {
#[intrinsic_for("vpsrlw")]
#[intel_equivalents("_mm_srli_epi16")]
pub fn shr_all_by_u16x8<BITS: u8>(x: u16x8) -> u16x8;
#[intrinsic_for("vpsrld")]
#[intel_equivalents("_mm_srli_epi32")]
pub fn shr_all_by_u32x4<BITS: u8>(x: u32x4) -> u32x4;
#[intrinsic_for("vpsrlq")]
#[intel_equivalents("_mm_srli_epi64")]
pub fn shr_all_by_u64x2<BITS: u8>(x: u64x2) -> u64x2;
#[intrinsic_for("vpsraw")]
#[intel_equivalents("_mm_srai_epi16")]
pub fn shr_all_by_s16x8<BITS: u8>(x: s16x8) -> s16x8;
#[intrinsic_for("vpsrad")]
#[intel_equivalents("_mm_srai_epi32")]
pub fn shr_all_by_s32x4<BITS: u8>(x: s32x4) -> s32x4;
});
defn_simd_llvm!("avx", {
#[intrinsic_for("vpackuswb")]
#[intel_equivalents("_mm_packus_epi16")]
pub fn concat_and_saturate_u8_s16x8(a: s16x8, b: s16x8) -> u8x16
= "llvm.x86.sse2.packuswb.128";
#[intrinsic_for("vpacksswb")]
#[intel_equivalents("_mm_packs_epi16")]
pub fn concat_and_saturate_s8_s16x8(a: s16x8, b: s16x8) -> s8x16
= "llvm.x86.sse2.packsswb.128";
#[intrinsic_for("vpackssdw")]
#[intel_equivalents("_mm_packs_epi32")]
pub fn concat_and_saturate_s16_s32x4(a: s32x4, b: s32x4) -> s16x8
= "llvm.x86.sse2.packssdw.128";
});
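// Interleave (unpack) lanes from the low or high halves of `a` and `b`,
// alternating between the two inputs.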
defn_simd_shared!("avx", fn(T, U) -> R {
simd_shuffle(a, b, const { simd_unpack_indices::<R>(0) })
}, {
#[intrinsic_for("vpunpcklbw")]
#[intel_equivalents("_mm_unpacklo_epi8")]
pub fn interleave_lo_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("vpunpcklwd")]
#[intel_equivalents("_mm_unpacklo_epi16")]
pub fn interleave_lo_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("vpunpckldq")]
#[intel_equivalents("_mm_unpacklo_epi32")]
pub fn interleave_lo_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("vpunpcklqdq")]
#[intel_equivalents("_mm_unpacklo_epi64")]
pub fn interleave_lo_u64x2(a: u64x2, b: u64x2) -> u64x2;
#[intrinsic_for("vpunpcklbw")]
#[intel_equivalents("_mm_unpacklo_epi8")]
pub fn interleave_lo_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("vpunpcklwd")]
#[intel_equivalents("_mm_unpacklo_epi16")]
pub fn interleave_lo_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("vpunpckldq")]
#[intel_equivalents("_mm_unpacklo_epi32")]
pub fn interleave_lo_s32x4(a: s32x4, b: s32x4) -> s32x4;
#[intrinsic_for("vpunpcklqdq")]
#[intel_equivalents("_mm_unpacklo_epi64")]
pub fn interleave_lo_s64x2(a: s64x2, b: s64x2) -> s64x2;
});
defn_simd_shared!("avx", fn(T, U) -> R {
simd_shuffle(a, b, const { simd_unpack_indices::<R>(T::LEN / 2) })
}, {
#[intrinsic_for("vpunpckhbw")]
#[intel_equivalents("_mm_unpackhi_epi8")]
pub fn interleave_hi_u8x16(a: u8x16, b: u8x16) -> u8x16;
#[intrinsic_for("vpunpckhwd")]
#[intel_equivalents("_mm_unpackhi_epi16")]
pub fn interleave_hi_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("vpunpckhdq")]
#[intel_equivalents("_mm_unpackhi_epi32")]
pub fn interleave_hi_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("vpunpckhqdq")]
#[intel_equivalents("_mm_unpackhi_epi64")]
pub fn interleave_hi_u64x2(a: u64x2, b: u64x2) -> u64x2;
#[intrinsic_for("vpunpckhbw")]
#[intel_equivalents("_mm_unpackhi_epi8")]
pub fn interleave_hi_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("vpunpckhwd")]
#[intel_equivalents("_mm_unpackhi_epi16")]
pub fn interleave_hi_s16x8(a: s16x8, b: s16x8) -> s16x8;
#[intrinsic_for("vpunpckhdq")]
#[intel_equivalents("_mm_unpackhi_epi32")]
pub fn interleave_hi_s32x4(a: s32x4, b: s32x4) -> s32x4;
#[intrinsic_for("vpunpckhqdq")]
#[intel_equivalents("_mm_unpackhi_epi64")]
pub fn interleave_hi_s64x2(a: s64x2, b: s64x2) -> s64x2;
});
defn_simd_llvm!("avx", {
#[intrinsic_for("vpsadbw")]
#[intel_equivalents("_mm_sad_epu8")]
pub fn sum_of_abs_diff_u8x16(a: u8x16, b: u8x16) -> u64x2
= "llvm.x86.sse2.psad.bw";
});
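// Whole-register byte shifts (vpslldq/vpsrldq), expressed as a shuffle of the
// input with a zero vector: `move_l_by` moves elements toward higher lane
// indices, `move_r_by` toward lower, filling the vacated lanes with zero.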
defn_simd_shared!("avx", fn(T) -> R {
const_assert!(ELEMS < 16);
simd_shuffle(T::splat(0), x, const {
simd_slice_indices::<T>(T::LEN - (ELEMS as usize))
})
}, {
#[intrinsic_for("vpslldq")]
#[intel_equivalents("_mm_bslli_si128", "_mm_slli_si128")]
pub fn move_l_by_u8x16<ELEMS: u8>(x: u8x16) -> u8x16;
#[intrinsic_for("vpslldq")]
#[intel_equivalents("_mm_bslli_si128", "_mm_slli_si128")]
pub fn move_l_by_s8x16<ELEMS: u8>(x: s8x16) -> s8x16;
});
defn_simd_shared!("avx", fn(T) -> R {
const_assert!(ELEMS < 16);
simd_shuffle(x, T::splat(0), const {
simd_slice_indices::<T>(ELEMS as usize)
})
}, {
#[intrinsic_for("vpsrldq")]
#[intel_equivalents("_mm_bsrli_si128", "_mm_srli_si128")]
pub fn move_r_by_u8x16<ELEMS: u8>(x: u8x16) -> u8x16;
#[intrinsic_for("vpsrldq")]
#[intel_equivalents("_mm_bsrli_si128", "_mm_srli_si128")]
pub fn move_r_by_s8x16<ELEMS: u8>(x: s8x16) -> s8x16;
});
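// vpmovmskb: collect the most significant bit of every byte lane into a 16-bit
// scalar mask.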
defn_simd_manual!("avx", {
#[intrinsic_for("vpmovmskb")]
#[intel_equivalents("_mm_movemask_epi8")]
pub fn bitmask_u8x16(x: u8x16) -> u16 {
simd_bitmask(simd_ge::<_, u8x16>(x, u8x16::splat(0x80)))
}
#[intrinsic_for("vpmovmskb")]
#[intel_equivalents("_mm_movemask_epi8")]
pub fn bitmask_s8x16(x: s8x16) -> u16 {
simd_bitmask(simd_lt::<_, s8x16>(x, s8x16::splat(0)))
}
});
}
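// VEX-encoded forms of the SSSE3 instructions: absolute value, horizontal
// add/sub, byte multiply-add, sign application, alignment, and byte shuffle.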
impl<FS> Use<FS>
where FS: HasFeature<FeatureGroup, AVX> {
defn_simd_shared!("avx", { simd_abs(x) }, {
#[intrinsic_for("vpabsb")]
#[intel_equivalents("_mm_abs_epi8")]
pub fn abs_s8x16(x: s8x16) -> s8x16;
#[intrinsic_for("vpabsw")]
#[intel_equivalents("_mm_abs_epi16")]
pub fn abs_s16x8(x: s16x8) -> s16x8;
#[intrinsic_for("vpabsd")]
#[intel_equivalents("_mm_abs_epi32")]
pub fn abs_s32x4(x: s32x4) -> s32x4;
});
defn_simd_llvm!("avx", {
#[intrinsic_for("vphaddw")]
#[intel_equivalents("_mm_hadd_epi16")]
pub fn concat_and_reduce_add_u16x2x4(x: u16x8, y: u16x8) -> u16x8
= "llvm.x86.ssse3.phadd.w.128";
#[intrinsic_for("vphaddd")]
#[intel_equivalents("_mm_hadd_epi32")]
pub fn concat_and_reduce_add_u32x2x2(x: u32x4, y: u32x4) -> u32x4
= "llvm.x86.ssse3.phadd.d.128";
#[intrinsic_for("vphaddw")]
#[intel_equivalents("_mm_hadd_epi16")]
pub fn concat_and_reduce_add_s16x2x4(x: s16x8, y: s16x8) -> s16x8
= "llvm.x86.ssse3.phadd.w.128";
#[intrinsic_for("vphaddd")]
#[intel_equivalents("_mm_hadd_epi32")]
pub fn concat_and_reduce_add_s32x2x2(x: s32x4, y: s32x4) -> s32x4
= "llvm.x86.ssse3.phadd.d.128";
});
defn_simd_llvm!("avx", {
#[intrinsic_for("vphaddsw")]
#[intel_equivalents("_mm_hadds_epi16")]
pub fn concat_and_reduce_saturating_add_s16x2x4
(x: s16x8, y: s16x8) -> s16x8
= "llvm.x86.ssse3.phadd.sw.128";
});
defn_simd_llvm!("avx", {
#[intrinsic_for("vphsubw")]
#[intel_equivalents("_mm_hsub_epi16")]
pub fn concat_and_reduce_sub_u16x2x4(x: u16x8, y: u16x8) -> u16x8
= "llvm.x86.ssse3.phsub.w.128";
#[intrinsic_for("vphsubd")]
#[intel_equivalents("_mm_hsub_epi32")]
pub fn concat_and_reduce_sub_u32x2x2(x: u32x4, y: u32x4) -> u32x4
= "llvm.x86.ssse3.phsub.d.128";
#[intrinsic_for("vphsubw")]
#[intel_equivalents("_mm_hsub_epi16")]
pub fn concat_and_reduce_sub_s16x2x4(x: s16x8, y: s16x8) -> s16x8
= "llvm.x86.ssse3.phsub.w.128";
#[intrinsic_for("vphsubd")]
#[intel_equivalents("_mm_hsub_epi32")]
pub fn concat_and_reduce_sub_s32x2x2(x: s32x4, y: s32x4) -> s32x4
= "llvm.x86.ssse3.phsub.d.128";
});
defn_simd_llvm!("avx", {
#[intrinsic_for("vphsubsw")]
#[intel_equivalents("_mm_hsubs_epi16")]
pub fn concat_and_reduce_saturating_sub_s16x2x4
(x: s16x8, y: s16x8) -> s16x8
= "llvm.x86.ssse3.phsub.sw.128";
});
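// vpmaddubsw: multiply unsigned bytes from `x` by signed bytes from `y`, then
// add adjacent pairs of products with signed saturation into 16-bit lanes.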
defn_simd_llvm!("avx", {
#[intrinsic_for("vpmaddubsw")]
#[intel_equivalents("_mm_maddubs_epi16")]
pub fn sum_of_prod_u8x2x8
(x: u8x16, y: s8x16) -> s16x8
= "llvm.x86.ssse3.pmadd.ub.sw.128";
});
defn_simd_llvm!("avx", {
#[intrinsic_for("vpmulhrsw")]
#[intel_equivalents("_mm_mulhrs_epi16")]
pub fn mul_and_halve_s16x8
(x: s16x8, y: s16x8) -> s16x8
= "llvm.x86.ssse3.pmul.hr.sw.128";
});
defn_simd_llvm!("avx", {
#[intrinsic_for("vpsignb")]
#[intel_equivalents("_mm_sign_epi8")]
pub fn mul_sign_s8x16(x: s8x16, y: s8x16) -> s8x16
= "llvm.x86.ssse3.psign.b.128";
#[intrinsic_for("vpsignw")]
#[intel_equivalents("_mm_sign_epi16")]
pub fn mul_sign_s16x8(x: s16x8, y: s16x8) -> s16x8
= "llvm.x86.ssse3.psign.w.128";
#[intrinsic_for("vpsignd")]
#[intel_equivalents("_mm_sign_epi32")]
pub fn mul_sign_s32x4(x: s32x4, y: s32x4) -> s32x4
= "llvm.x86.ssse3.psign.d.128";
});
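// vpalignr: concatenate the two inputs and extract a 16-byte window starting
// at a const-generic byte offset.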
defn_simd_shared!("avx", fn(T, U) -> R {
const_assert!(SHIFT < 16);
simd_shuffle(x, y, const {
simd_slice_indices::<R>(SHIFT as usize)
})
}, {
#[intrinsic_for("vpalignr")]
#[intel_equivalents("_mm_alignr_epi8")]
pub fn align_elems_by_u8x16<SHIFT: u8>(x: u8x16, y: u8x16) -> u8x16;
#[intrinsic_for("vpalignr")]
#[intel_equivalents("_mm_alignr_epi8")]
pub fn align_elems_by_s8x16<SHIFT: u8>(x: s8x16, y: s8x16) -> s8x16;
});
defn_simd_llvm!("avx", {
#[intrinsic_for("vpshufb")]
#[intel_equivalents("_mm_shuffle_epi8")]
pub fn shuffle_u8x16(x: u8x16, idxs: s8x16) -> u8x16
= "llvm.x86.ssse3.pshuf.b.128";
#[intrinsic_for("vpshufb")]
#[intel_equivalents("_mm_shuffle_epi8")]
pub fn shuffle_s8x16(x: s8x16, idxs: s8x16) -> s8x16
= "llvm.x86.ssse3.pshuf.b.128";
});
}
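// VEX-encoded forms of the SSE4.1 instructions: blends, 32-bit multiplies,
// 64-bit equality compares, wider min/max coverage, zero/sign extension,
// packing, byte/dword/qword insert/extract, and horizontal minimum.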
impl<FS> Use<FS>
where FS: HasFeature<FeatureGroup, AVX> {
defn_simd_shared!("avx", {
simd_select_bitmask(MASK, y, x)
}, {
#[intrinsic_for("vpblendw")]
#[intel_equivalents("_mm_blend_epi16")]
pub fn blend_by_u16x8<MASK: u8>(x: u16x8, y: u16x8) -> u16x8;
#[intrinsic_for("vpblendw")]
#[intel_equivalents("_mm_blend_epi16")]
pub fn blend_by_s16x8<MASK: u8>(x: s16x8, y: s16x8) -> s16x8;
});
defn_simd_shared!("avx", fn(T, U, M) -> R {
let mask: M = simd_lt(mask, M::splat(0));
simd_select(mask, y, x)
}, {
#[intrinsic_for("vpblendvb")]
#[intel_equivalents("_mm_blendv_epi8")]
pub fn blend_u8x16(x: u8x16, y: u8x16, mask: s8x16) -> u8x16;
#[intrinsic_for("vpblendvb")]
#[intel_equivalents("_mm_blendv_epi8")]
pub fn blend_s8x16(x: s8x16, y: s8x16, mask: s8x16) -> s8x16;
});
defn_simd_manual!("avx", {
#[intrinsic_for("vpmuldq")]
#[intel_equivalents("_mm_mul_epi32")]
pub fn mul_s32_s64x2(a: s64x2, b: s64x2) -> s64x2 {
let [a, b]: [s32x2; 2] = [simd_cast(a), simd_cast(b)];
simd_mul(simd_cast(a), simd_cast(b))
}
});
defn_simd_manual!("avx", {
#[intrinsic_for("vpmulld")]
#[intel_equivalents("_mm_mullo_epi32")]
pub fn mul_lo_s32x4(a: s32x4, b: s32x4) -> s32x4 {
simd_mul(a, b)
}
});
defn_simd_manual!("avx", {
#[intrinsic_for("vpcmpeqq")]
#[intel_equivalents("_mm_cmpeq_epi64")]
pub fn cmp_eq_u64x2(a: u64x2, b: u64x2) -> u64x2 = simd_eq;
#[intrinsic_for("vpcmpeqq")]
#[intel_equivalents("_mm_cmpeq_epi64")]
pub fn cmp_eq_s64x2(a: s64x2, b: s64x2) -> s64x2 = simd_eq;
});
defn_simd_shared!("avx", { simd_max(a, b) }, {
#[intrinsic_for("vpmaxuw")]
#[intel_equivalents("_mm_max_epu16")]
pub fn max_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("vpmaxud")]
#[intel_equivalents("_mm_max_epu32")]
pub fn max_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("vpmaxsb")]
#[intel_equivalents("_mm_max_epi8")]
pub fn max_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("vpmaxsd")]
#[intel_equivalents("_mm_max_epi32")]
pub fn max_s32x4(a: s32x4, b: s32x4) -> s32x4;
});
defn_simd_shared!("avx", { simd_min(a, b) }, {
#[intrinsic_for("vpminuw")]
#[intel_equivalents("_mm_min_epu16")]
pub fn min_u16x8(a: u16x8, b: u16x8) -> u16x8;
#[intrinsic_for("vpminud")]
#[intel_equivalents("_mm_min_epu32")]
pub fn min_u32x4(a: u32x4, b: u32x4) -> u32x4;
#[intrinsic_for("vpminsb")]
#[intel_equivalents("_mm_min_epi8")]
pub fn min_s8x16(a: s8x16, b: s8x16) -> s8x16;
#[intrinsic_for("vpminsd")]
#[intel_equivalents("_mm_min_epi32")]
pub fn min_s32x4(a: s32x4, b: s32x4) -> s32x4;
});
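// Zero/sign extension (vpmovzx*/vpmovsx*): widen the low lanes of the input to
// the destination element width.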
defn_simd_manual!("avx", {
#[intrinsic_for("vpmovzxbw")]
#[intel_equivalents("_mm_cvtepu8_epi16")]
pub fn expand_u8x16_u16x8(x: u8x16) -> u16x8
= simd_expand::<_, u8x8, _>;
#[intrinsic_for("vpmovzxbw")]
#[intel_equivalents("_mm_cvtepu8_epi16")]
pub fn expand_u8x16_s16x8(x: u8x16) -> s16x8
= simd_expand::<_, u8x8, _>;
#[intrinsic_for("vpmovzxbd")]
#[intel_equivalents("_mm_cvtepu8_epi32")]
pub fn expand_u8x16_u32x4(x: u8x16) -> u32x4
= simd_expand::<_, u8x4, _>;
#[intrinsic_for("vpmovzxbd")]
#[intel_equivalents("_mm_cvtepu8_epi32")]
pub fn expand_u8x16_s32x4(x: u8x16) -> s32x4
= simd_expand::<_, u8x4, _>;
#[intrinsic_for("vpmovzxbq")]
#[intel_equivalents("_mm_cvtepu8_epi64")]
pub fn expand_u8x16_u64x2(x: u8x16) -> u64x2
= simd_expand::<_, u8x2, _>;
#[intrinsic_for("vpmovzxbq")]
#[intel_equivalents("_mm_cvtepu8_epi64")]
pub fn expand_u8x16_s64x2(x: u8x16) -> s64x2
= simd_expand::<_, u8x2, _>;
#[intrinsic_for("vpmovzxwd")]
#[intel_equivalents("_mm_cvtepu16_epi32")]
pub fn expand_u16x8_u32x4(x: u16x8) -> u32x4
= simd_expand::<_, u16x4, _>;
#[intrinsic_for("vpmovzxwd")]
#[intel_equivalents("_mm_cvtepu16_epi32")]
pub fn expand_u16x8_s32x4(x: u16x8) -> s32x4
= simd_expand::<_, u16x4, _>;
#[intrinsic_for("vpmovzxwq")]
#[intel_equivalents("_mm_cvtepu16_epi64")]
pub fn expand_u16x8_u64x2(x: u16x8) -> u64x2
= simd_expand::<_, u16x2, _>;
#[intrinsic_for("vpmovzxwq")]
#[intel_equivalents("_mm_cvtepu16_epi64")]
pub fn expand_u16x8_s64x2(x: u16x8) -> s64x2
= simd_expand::<_, u16x2, _>;
#[intrinsic_for("vpmovzxdq")]
#[intel_equivalents("_mm_cvtepu32_epi64")]
pub fn expand_u32x4_u64x2(x: u32x4) -> u64x2
= simd_expand::<_, u32x2, _>;
#[intrinsic_for("vpmovzxdq")]
#[intel_equivalents("_mm_cvtepu32_epi64")]
pub fn expand_u32x4_s64x2(x: u32x4) -> s64x2
= simd_expand::<_, u32x2, _>;
#[intrinsic_for("vpmovsxbw")]
#[intel_equivalents("_mm_cvtepi8_epi16")]
pub fn expand_s8x16_s16x8(x: s8x16) -> s16x8
= simd_expand::<_, s8x8, _>;
#[intrinsic_for("vpmovsxbd")]
#[intel_equivalents("_mm_cvtepi8_epi32")]
pub fn expand_s8x16_s32x4(x: s8x16) -> s32x4
= simd_expand::<_, s8x4, _>;
#[intrinsic_for("vpmovsxbq")]
#[intel_equivalents("_mm_cvtepi8_epi64")]
pub fn expand_s8x16_s64x2(x: s8x16) -> s64x2
= simd_expand::<_, s8x2, _>;
#[intrinsic_for("vpmovsxwd")]
#[intel_equivalents("_mm_cvtepi16_epi32")]
pub fn expand_s16x8_s32x4(x: s16x8) -> s32x4
= simd_expand::<_, s16x4, _>;
#[intrinsic_for("vpmovsxwq")]
#[intel_equivalents("_mm_cvtepi16_epi64")]
pub fn expand_s16x8_s64x2(x: s16x8) -> s64x2
= simd_expand::<_, s16x2, _>;
#[intrinsic_for("vpmovsxdq")]
#[intel_equivalents("_mm_cvtepi32_epi64")]
pub fn expand_s32x4_s64x2(x: s32x4) -> s64x2
= simd_expand::<_, s32x2, _>;
});
defn_simd_llvm!("avx", {
#[intrinsic_for("vpackusdw")]
#[intel_equivalents("_mm_packus_epi32")]
pub fn concat_and_saturate_u16_s32x4(a: s32x4, b: s32x4) -> u16x8
= "llvm.x86.sse41.packusdw.128";
});
defn_simd_shared!("avx", fn(T) -> R {
const_assert!(INDEX < T::LEN as u8);
simd_extract(x, const { INDEX as u32 })
}, {
#[intrinsic_for("vpextrb")]
#[intel_equivalents("_mm_extract_epi8")]
pub fn get_u8x16<INDEX: u8>(x: u8x16) -> u8;
#[intrinsic_for("vpextrd")]
#[intel_equivalents("_mm_extract_epi32")]
pub fn get_u32x4<INDEX: u8>(x: u32x4) -> u32;
#[intrinsic_for("vpextrq")]
#[intel_equivalents("_mm_extract_epi64")]
pub fn get_u64x2<INDEX: u8>(x: u64x2) -> u64;
#[intrinsic_for("vpextrb")]
#[intel_equivalents("_mm_extract_epi8")]
pub fn get_s8x16<INDEX: u8>(x: s8x16) -> i8;
#[intrinsic_for("vpextrd")]
#[intel_equivalents("_mm_extract_epi32")]
pub fn get_s32x4<INDEX: u8>(x: s32x4) -> i32;
#[intrinsic_for("vpextrq")]
#[intel_equivalents("_mm_extract_epi64")]
pub fn get_s64x2<INDEX: u8>(x: s64x2) -> i64;
});
defn_simd_shared!("avx", fn(T, E) -> R {
const_assert!(INDEX < T::LEN as u8);
simd_insert(x, const { INDEX as u32 }, e)
}, {
#[intrinsic_for("vpinsrb")]
#[intel_equivalents("_mm_insert_epi8")]
pub fn put_u8x16<INDEX: u8>(x: u8x16, e: u8) -> u8x16;
#[intrinsic_for("vpinsrd")]
#[intel_equivalents("_mm_insert_epi32")]
pub fn put_u32x4<INDEX: u8>(x: u32x4, e: u32) -> u32x4;
#[intrinsic_for("vpinsrq")]
#[intel_equivalents("_mm_insert_epi64")]
pub fn put_u64x2<INDEX: u8>(x: u64x2, e: u64) -> u64x2;
#[intrinsic_for("vpinsrb")]
#[intel_equivalents("_mm_insert_epi8")]
pub fn put_s8x16<INDEX: u8>(x: s8x16, e: i8) -> s8x16;
#[intrinsic_for("vpinsrd")]
#[intel_equivalents("_mm_insert_epi8")]
pub fn put_s32x4<INDEX: u8>(x: s32x4, e: i32) -> s32x4;
#[intrinsic_for("vpinsrq")]
#[intel_equivalents("_mm_insert_epi64")]
pub fn put_s64x2<INDEX: u8>(x: s64x2, e: i64) -> s64x2;
});
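// vphminposuw: return the minimum unsigned 16-bit lane in lane 0 and its index
// in lane 1, with the remaining lanes zeroed.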
defn_simd_llvm!("avx", {
#[intrinsic_for("vphminposw")]
#[intel_equivalents("_mm_minpos_epu16")]
pub fn min_pos_u16x8(x: u16x8) -> u16x8
= "llvm.x86.sse41.phminposw";
});
}