//! Generic SIMD primitives dispatched on instruction set (`S`), lane type (`T`) and vector type (`V`).
#![allow(clippy::collapsible_if, clippy::too_many_lines)]

use crate::isa::InstructionSet;
use crate::pod::POD;
use crate::tools::transmute_copy as tc;
use crate::vector::{V128, V256};

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use crate::isa::{AVX2, SSE2, SSE41};

#[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
use crate::isa::NEON;

#[cfg(target_arch = "wasm32")]
use crate::isa::WASM128;

#[cfg(target_arch = "x86")]
use core::arch::x86::*;

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

#[cfg(all(feature = "unstable", target_arch = "arm"))]
use core::arch::arm::*;

#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;

#[cfg(target_arch = "wasm32")]
use core::arch::wasm32::*;

#[inline(always)]
pub fn splat<S: InstructionSet, T: POD, V: POD>(s: S, x: T) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, u8 | i8) {
                return unsafe { tc(&_mm256_set1_epi8(tc(&x))) };
            }
        }
        {
            let c = splat::<S, T, V128>(s, x).x2();
            return unsafe { tc(&c) };
        }
    }
    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, u8 | i8) {
                return unsafe { tc(&_mm_set1_epi8(tc(&x))) };
            }
            if is_pod_type!(T, u16 | i16) {
                return unsafe { tc(&_mm_set1_epi16(tc(&x))) };
            }
            if is_pod_type!(T, u32 | i32) {
                return unsafe { tc(&_mm_set1_epi32(tc(&x))) };
            }
            if is_pod_type!(T, u64 | i64) {
                return unsafe { tc(&_mm_set1_epi64x(tc(&x))) };
            }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, u8 | i8) {
                return unsafe { tc(&vld1q_dup_u8(&tc(&x))) };
            }
            if is_pod_type!(T, u16 | i16) {
                return unsafe { tc(&vld1q_dup_u16(&tc(&x))) };
            }
            if is_pod_type!(T, u32 | i32) {
                return unsafe { tc(&vld1q_dup_u32(&tc(&x))) };
            }
            if is_pod_type!(T, u64 | i64) {
                return unsafe { tc(&vld1q_dup_u64(&tc(&x))) };
            }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, u8 | i8) {
                return unsafe { tc(&u8x16_splat(tc(&x))) };
            }
            if is_pod_type!(T, u16 | i16) {
                return unsafe { tc(&u16x8_splat(tc(&x))) };
            }
            if is_pod_type!(T, u32 | i32) {
                return unsafe { tc(&u32x4_splat(tc(&x))) };
            }
            if is_pod_type!(T, u64 | i64) {
                return unsafe { tc(&u64x2_splat(tc(&x))) };
            }
        }
    }
    {
        let _ = (s, x);
        unreachable!()
    }
}

/// Lane-wise integer addition of `a` and `b` (the non-saturating counterpart of `add_sat`).
///
/// `S` is the instruction-set marker, `T` the lane type (8/16/32/64-bit integers)
/// and `V` the vector type (`V128` or `V256`). A `V256` operation without a native
/// AVX2 path is carried out on the two 128-bit halves.
///
/// # Panics
///
/// Hits `unreachable!()` when no branch matches the `(S, T, V)` combination.
#[inline(always)]
pub fn add<S: InstructionSet, T: POD, V: POD>(s: S, a: V, b: V) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, u8 | i8) {
                return unsafe { tc(&_mm256_add_epi8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16 | i16) {
                return unsafe { tc(&_mm256_add_epi16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32 | i32) {
                return unsafe { tc(&_mm256_add_epi32(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u64 | i64) {
                return unsafe { tc(&_mm256_add_epi64(tc(&a), tc(&b))) };
            }
        }
        // Fallback: split both operands into 128-bit halves, recurse, and reassemble.
        let wide_a: V256 = unsafe { tc(&a) };
        let wide_b: V256 = unsafe { tc(&b) };
        let halves_a = wide_a.to_v128x2();
        let halves_b = wide_b.to_v128x2();
        let lo = add::<S, T, V128>(s, halves_a.0, halves_b.0);
        let hi = add::<S, T, V128>(s, halves_a.1, halves_b.1);
        return unsafe { tc(&V256::from_v128x2((lo, hi))) };
    }
    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, u8 | i8) {
                return unsafe { tc(&_mm_add_epi8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16 | i16) {
                return unsafe { tc(&_mm_add_epi16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32 | i32) {
                return unsafe { tc(&_mm_add_epi32(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u64 | i64) {
                return unsafe { tc(&_mm_add_epi64(tc(&a), tc(&b))) };
            }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, u8 | i8) {
                return unsafe { tc(&vaddq_u8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16 | i16) {
                return unsafe { tc(&vaddq_u16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32 | i32) {
                return unsafe { tc(&vaddq_u32(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u64 | i64) {
                return unsafe { tc(&vaddq_u64(tc(&a), tc(&b))) };
            }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, u8 | i8) {
                return unsafe { tc(&u8x16_add(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16 | i16) {
                return unsafe { tc(&u16x8_add(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32 | i32) {
                return unsafe { tc(&u32x4_add(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u64 | i64) {
                return unsafe { tc(&u64x2_add(tc(&a), tc(&b))) };
            }
        }
    }
    // Consume the arguments so targets where no branch is compiled in do not warn.
    let _ = (s, a, b);
    unreachable!()
}

/// Lane-wise integer subtraction `a - b` (the non-saturating counterpart of `sub_sat`).
///
/// `S` is the instruction-set marker, `T` the lane type (8/16/32/64-bit integers)
/// and `V` the vector type (`V128` or `V256`). A `V256` operation without a native
/// AVX2 path is carried out on the two 128-bit halves.
///
/// # Panics
///
/// Hits `unreachable!()` when no branch matches the `(S, T, V)` combination.
#[inline(always)]
pub fn sub<S: InstructionSet, T: POD, V: POD>(s: S, a: V, b: V) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, u8 | i8) {
                return unsafe { tc(&_mm256_sub_epi8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16 | i16) {
                return unsafe { tc(&_mm256_sub_epi16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32 | i32) {
                return unsafe { tc(&_mm256_sub_epi32(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u64 | i64) {
                return unsafe { tc(&_mm256_sub_epi64(tc(&a), tc(&b))) };
            }
        }
        // Fallback: split both operands into 128-bit halves, recurse, and reassemble.
        let wide_a: V256 = unsafe { tc(&a) };
        let wide_b: V256 = unsafe { tc(&b) };
        let halves_a = wide_a.to_v128x2();
        let halves_b = wide_b.to_v128x2();
        let lo = sub::<S, T, V128>(s, halves_a.0, halves_b.0);
        let hi = sub::<S, T, V128>(s, halves_a.1, halves_b.1);
        return unsafe { tc(&V256::from_v128x2((lo, hi))) };
    }
    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, u8 | i8) {
                return unsafe { tc(&_mm_sub_epi8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16 | i16) {
                return unsafe { tc(&_mm_sub_epi16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32 | i32) {
                return unsafe { tc(&_mm_sub_epi32(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u64 | i64) {
                return unsafe { tc(&_mm_sub_epi64(tc(&a), tc(&b))) };
            }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, u8 | i8) {
                return unsafe { tc(&vsubq_u8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16 | i16) {
                return unsafe { tc(&vsubq_u16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32 | i32) {
                return unsafe { tc(&vsubq_u32(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u64 | i64) {
                return unsafe { tc(&vsubq_u64(tc(&a), tc(&b))) };
            }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, u8 | i8) {
                return unsafe { tc(&u8x16_sub(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16 | i16) {
                return unsafe { tc(&u16x8_sub(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32 | i32) {
                return unsafe { tc(&u32x4_sub(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u64 | i64) {
                return unsafe { tc(&u64x2_sub(tc(&a), tc(&b))) };
            }
        }
    }
    // Consume the arguments so targets where no branch is compiled in do not warn.
    let _ = (s, a, b);
    unreachable!()
}

/// Lane-wise equality comparison of `a` and `b`.
///
/// The result is whatever the selected compare intrinsic yields, reinterpreted as `V`
/// (a per-lane mask; see the intrinsic documentation for the exact bit pattern).
///
/// `S` is the instruction-set marker, `T` the lane type and `V` the vector type
/// (`V128` or `V256`). Lane coverage is not uniform across targets:
/// * AVX2 and WASM128 handle 8/16/32/64-bit lanes.
/// * SSE2 handles 8/16/32-bit lanes only.
/// * NEON handles 8/16/32-bit lanes, plus 64-bit lanes on `aarch64` only.
///
/// # Panics
///
/// Hits `unreachable!()` when no branch matches the `(S, T, V)` combination.
#[inline(always)]
pub fn eq<S: InstructionSet, T: POD, V: POD>(s: S, a: V, b: V) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, u8 | i8) {
                return unsafe { tc(&_mm256_cmpeq_epi8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16 | i16) {
                return unsafe { tc(&_mm256_cmpeq_epi16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32 | i32) {
                return unsafe { tc(&_mm256_cmpeq_epi32(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u64 | i64) {
                return unsafe { tc(&_mm256_cmpeq_epi64(tc(&a), tc(&b))) };
            }
        }
        // Fallback: split both operands into 128-bit halves, recurse, and reassemble.
        let wide_a: V256 = unsafe { tc(&a) };
        let wide_b: V256 = unsafe { tc(&b) };
        let halves_a = wide_a.to_v128x2();
        let halves_b = wide_b.to_v128x2();
        let lo = eq::<S, T, V128>(s, halves_a.0, halves_b.0);
        let hi = eq::<S, T, V128>(s, halves_a.1, halves_b.1);
        return unsafe { tc(&V256::from_v128x2((lo, hi))) };
    }
    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, u8 | i8) {
                return unsafe { tc(&_mm_cmpeq_epi8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16 | i16) {
                return unsafe { tc(&_mm_cmpeq_epi16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32 | i32) {
                return unsafe { tc(&_mm_cmpeq_epi32(tc(&a), tc(&b))) };
            }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, u8 | i8) {
                return unsafe { tc(&vceqq_u8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16 | i16) {
                return unsafe { tc(&vceqq_u16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32 | i32) {
                return unsafe { tc(&vceqq_u32(tc(&a), tc(&b))) };
            }
            // The 64-bit compare is only compiled in for aarch64.
            #[cfg(target_arch = "aarch64")]
            if is_pod_type!(T, u64 | i64) {
                return unsafe { tc(&vceqq_u64(tc(&a), tc(&b))) };
            }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, u8 | i8) {
                return unsafe { tc(&u8x16_eq(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16 | i16) {
                return unsafe { tc(&u16x8_eq(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32 | i32) {
                return unsafe { tc(&u32x4_eq(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u64 | i64) {
                return unsafe { tc(&u64x2_eq(tc(&a), tc(&b))) };
            }
        }
    }
    // Consume the arguments so targets where no branch is compiled in do not warn.
    let _ = (s, a, b);
    unreachable!()
}

/// Lane-wise "less than" comparison `a < b`, honouring the signedness of `T`.
///
/// The result is whatever the selected compare intrinsic yields, reinterpreted as `V`
/// (a per-lane mask; see the intrinsic documentation for the exact bit pattern).
///
/// `S` is the instruction-set marker, `T` the lane type and `V` the vector type
/// (`V128` or `V256`). Lane coverage differs per target:
/// * AVX2: `i8`/`i16`/`i32`/`i64` and `u8`/`u16`/`u32`.
/// * SSE2: `i8`/`i16`/`i32` and `u8`/`u16`/`u32`.
/// * NEON: 8/16/32-bit signed and unsigned lanes, plus 64-bit lanes on `aarch64` only.
/// * WASM128: `i8`/`i16`/`i32`/`i64` and `u8`/`u16`/`u32`.
///
/// # Panics
///
/// Hits `unreachable!()` when no branch matches the `(S, T, V)` combination.
#[inline(always)]
pub fn lt<S: InstructionSet, T: POD, V: POD>(s: S, a: V, b: V) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            // Signed lanes: `a < b` is evaluated as `b > a` (operands swapped).
            if is_pod_type!(T, i8) {
                return unsafe { tc(&_mm256_cmpgt_epi8(tc(&b), tc(&a))) };
            }
            if is_pod_type!(T, i16) {
                return unsafe { tc(&_mm256_cmpgt_epi16(tc(&b), tc(&a))) };
            }
            if is_pod_type!(T, i32) {
                return unsafe { tc(&_mm256_cmpgt_epi32(tc(&b), tc(&a))) };
            }
            if is_pod_type!(T, i64) {
                return unsafe { tc(&_mm256_cmpgt_epi64(tc(&b), tc(&a))) };
            }
            // Unsigned lanes: `a == max(a, b)` holds exactly when `a >= b`; XOR-ing with
            // the `a == a` mask inverts that result, giving `a < b`.
            if is_pod_type!(T, u8) {
                return unsafe {
                    let (a, b) = (tc(&a), tc(&b));
                    let c = _mm256_cmpeq_epi8(a, _mm256_max_epu8(a, b));
                    tc(&_mm256_xor_si256(c, _mm256_cmpeq_epi8(a, a)))
                };
            }
            if is_pod_type!(T, u16) {
                return unsafe {
                    let (a, b) = (tc(&a), tc(&b));
                    let c = _mm256_cmpeq_epi16(a, _mm256_max_epu16(a, b));
                    tc(&_mm256_xor_si256(c, _mm256_cmpeq_epi16(a, a)))
                };
            }
            if is_pod_type!(T, u32) {
                return unsafe {
                    let (a, b) = (tc(&a), tc(&b));
                    let c = _mm256_cmpeq_epi32(a, _mm256_max_epu32(a, b));
                    tc(&_mm256_xor_si256(c, _mm256_cmpeq_epi32(a, a)))
                };
            }
        }
        {
            // Fallback: split both operands into 128-bit halves, recurse, and reassemble.
            let (a, b): (V256, V256) = unsafe { (tc(&a), tc(&b)) };
            let (a, b) = (a.to_v128x2(), b.to_v128x2());
            let c0 = lt::<S, T, V128>(s, a.0, b.0);
            let c1 = lt::<S, T, V128>(s, a.1, b.1);
            return unsafe { tc(&V256::from_v128x2((c0, c1))) };
        }
    }
    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&_mm_cmplt_epi8(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, i16) {
                return unsafe { tc(&_mm_cmplt_epi16(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, i32) {
                return unsafe { tc(&_mm_cmplt_epi32(tc(&a), tc(&b))) };
            }
            // Unsigned 8-bit lanes: same `max`/`cmpeq`/`xor` inversion as the AVX2 path.
            if is_pod_type!(T, u8) {
                return unsafe {
                    let (a, b) = (tc(&a), tc(&b));
                    let c = _mm_cmpeq_epi8(a, _mm_max_epu8(a, b));
                    tc(&_mm_xor_si128(c, _mm_cmpeq_epi8(a, a)))
                };
            }
            // Unsigned 16/32-bit lanes: flip the sign bit of both operands so that the
            // signed comparison orders them as unsigned values.
            if is_pod_type!(T, u16) {
                return unsafe {
                    let m = _mm_set1_epi16(i16::MIN);
                    let a = _mm_xor_si128(tc(&a), m);
                    let b = _mm_xor_si128(tc(&b), m);
                    tc(&_mm_cmplt_epi16(a, b))
                };
            }
            if is_pod_type!(T, u32) {
                return unsafe {
                    let m = _mm_set1_epi32(i32::MIN);
                    let a = _mm_xor_si128(tc(&a), m);
                    let b = _mm_xor_si128(tc(&b), m);
                    tc(&_mm_cmplt_epi32(a, b))
                };
            }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&vcltq_s8(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, i16) {
                return unsafe { tc(&vcltq_s16(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, i32) {
                return unsafe { tc(&vcltq_s32(tc(&a), tc(&b))) };
            }
            // 64-bit lane comparisons are only compiled in for aarch64.
            #[cfg(target_arch = "aarch64")]
            if is_pod_type!(T, i64) {
                return unsafe { tc(&vcltq_s64(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u8) {
                return unsafe { tc(&vcltq_u8(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u16) {
                return unsafe { tc(&vcltq_u16(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u32) {
                return unsafe { tc(&vcltq_u32(tc(&a), tc(&b))) };
            }
            #[cfg(target_arch = "aarch64")]
            if is_pod_type!(T, u64) {
                return unsafe { tc(&vcltq_u64(tc(&a), tc(&b))) };
            }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&i8x16_lt(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, i16) {
                return unsafe { tc(&i16x8_lt(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, i32) {
                return unsafe { tc(&i32x4_lt(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, i64) {
                return unsafe { tc(&i64x2_lt(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u8) {
                return unsafe { tc(&u8x16_lt(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u16) {
                return unsafe { tc(&u16x8_lt(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u32) {
                return unsafe { tc(&u32x4_lt(tc(&a), tc(&b))) };
            }
            // NOTE: there is intentionally no `u64` branch here. A `u64x2_lt` intrinsic
            // does not appear to be provided by `core::arch::wasm32` (only the signed
            // `i64x2_lt`), so `(WASM128, u64)` falls through to `unreachable!()`.
        }
    }
    {
        // Consume the arguments so targets where no branch is compiled in do not warn.
        let _ = (s, a, b);
        unreachable!()
    }
}

/// Lane-wise saturating addition of `a` and `b`, honouring the signedness of `T`.
///
/// `S` is the instruction-set marker, `T` the lane type and `V` the vector type
/// (`V128` or `V256`). Lane coverage differs per target:
/// * AVX2, SSE2 and WASM128: `i8`, `i16`, `u8`, `u16`.
/// * NEON: `i8`, `i16`, `i32`, `u8`, `u16`, `u32`.
///
/// # Panics
///
/// Hits `unreachable!()` when no branch matches the `(S, T, V)` combination.
#[inline(always)]
pub fn add_sat<S: InstructionSet, T: POD, V: POD>(s: S, a: V, b: V) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&_mm256_adds_epi8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i16) {
                return unsafe { tc(&_mm256_adds_epi16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u8) {
                return unsafe { tc(&_mm256_adds_epu8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16) {
                return unsafe { tc(&_mm256_adds_epu16(tc(&a), tc(&b))) };
            }
        }
        // Fallback: split both operands into 128-bit halves, recurse, and reassemble.
        let wide_a: V256 = unsafe { tc(&a) };
        let wide_b: V256 = unsafe { tc(&b) };
        let halves_a = wide_a.to_v128x2();
        let halves_b = wide_b.to_v128x2();
        let lo = add_sat::<S, T, V128>(s, halves_a.0, halves_b.0);
        let hi = add_sat::<S, T, V128>(s, halves_a.1, halves_b.1);
        return unsafe { tc(&V256::from_v128x2((lo, hi))) };
    }
    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&_mm_adds_epi8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i16) {
                return unsafe { tc(&_mm_adds_epi16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u8) {
                return unsafe { tc(&_mm_adds_epu8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16) {
                return unsafe { tc(&_mm_adds_epu16(tc(&a), tc(&b))) };
            }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&vqaddq_s8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i16) {
                return unsafe { tc(&vqaddq_s16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i32) {
                return unsafe { tc(&vqaddq_s32(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u8) {
                return unsafe { tc(&vqaddq_u8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16) {
                return unsafe { tc(&vqaddq_u16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32) {
                return unsafe { tc(&vqaddq_u32(tc(&a), tc(&b))) };
            }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&i8x16_add_sat(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i16) {
                return unsafe { tc(&i16x8_add_sat(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u8) {
                return unsafe { tc(&u8x16_add_sat(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16) {
                return unsafe { tc(&u16x8_add_sat(tc(&a), tc(&b))) };
            }
        }
    }
    // Consume the arguments so targets where no branch is compiled in do not warn.
    let _ = (s, a, b);
    unreachable!()
}

/// Lane-wise saturating subtraction `a - b`, honouring the signedness of `T`.
///
/// `S` is the instruction-set marker, `T` the lane type and `V` the vector type
/// (`V128` or `V256`). Lane coverage differs per target:
/// * AVX2, SSE2 and WASM128: `i8`, `i16`, `u8`, `u16`.
/// * NEON: `i8`, `i16`, `i32`, `u8`, `u16`, `u32`.
///
/// # Panics
///
/// Hits `unreachable!()` when no branch matches the `(S, T, V)` combination.
#[inline(always)]
pub fn sub_sat<S: InstructionSet, T: POD, V: POD>(s: S, a: V, b: V) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&_mm256_subs_epi8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i16) {
                return unsafe { tc(&_mm256_subs_epi16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u8) {
                return unsafe { tc(&_mm256_subs_epu8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16) {
                return unsafe { tc(&_mm256_subs_epu16(tc(&a), tc(&b))) };
            }
        }
        // Fallback: split both operands into 128-bit halves, recurse, and reassemble.
        let wide_a: V256 = unsafe { tc(&a) };
        let wide_b: V256 = unsafe { tc(&b) };
        let halves_a = wide_a.to_v128x2();
        let halves_b = wide_b.to_v128x2();
        let lo = sub_sat::<S, T, V128>(s, halves_a.0, halves_b.0);
        let hi = sub_sat::<S, T, V128>(s, halves_a.1, halves_b.1);
        return unsafe { tc(&V256::from_v128x2((lo, hi))) };
    }
    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&_mm_subs_epi8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i16) {
                return unsafe { tc(&_mm_subs_epi16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u8) {
                return unsafe { tc(&_mm_subs_epu8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16) {
                return unsafe { tc(&_mm_subs_epu16(tc(&a), tc(&b))) };
            }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&vqsubq_s8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i16) {
                return unsafe { tc(&vqsubq_s16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i32) {
                return unsafe { tc(&vqsubq_s32(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u8) {
                return unsafe { tc(&vqsubq_u8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16) {
                return unsafe { tc(&vqsubq_u16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32) {
                return unsafe { tc(&vqsubq_u32(tc(&a), tc(&b))) };
            }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&i8x16_sub_sat(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i16) {
                return unsafe { tc(&i16x8_sub_sat(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u8) {
                return unsafe { tc(&u8x16_sub_sat(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16) {
                return unsafe { tc(&u16x8_sub_sat(tc(&a), tc(&b))) };
            }
        }
    }
    // Consume the arguments so targets where no branch is compiled in do not warn.
    let _ = (s, a, b);
    unreachable!()
}

/// Lane-wise maximum of `a` and `b`, honouring the signedness of `T`.
///
/// `S` is the instruction-set marker, `T` the lane type and `V` the vector type
/// (`V128` or `V256`). Lane coverage differs per target:
/// * AVX2: `i8`/`i16`/`i32`, `u8`/`u16`/`u32`, `f32`, `f64`.
/// * SSE4.1: `i8`, `i32`, `u16`, `u32`; SSE2: `i16`, `u8`, `f32`, `f64`.
/// * NEON and WASM128: `i8`/`i16`/`i32`, `u8`/`u16`/`u32`, `f32`.
/// * Under Miri, 128-bit `u8` lanes go through `crate::simulation::u8x16_max`.
///
/// NOTE(review): for float lanes the NaN / signed-zero behaviour is that of the
/// selected platform intrinsic and may differ between targets - confirm before
/// relying on it.
///
/// # Panics
///
/// Hits `unreachable!()` when no branch matches the `(S, T, V)` combination.
#[inline(always)]
pub fn max<S: InstructionSet, T: POD, V: POD>(s: S, a: V, b: V) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&_mm256_max_epi8(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, i16) {
                return unsafe { tc(&_mm256_max_epi16(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, i32) {
                return unsafe { tc(&_mm256_max_epi32(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u8) {
                return unsafe { tc(&_mm256_max_epu8(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u16) {
                return unsafe { tc(&_mm256_max_epu16(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u32) {
                return unsafe { tc(&_mm256_max_epu32(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, f32) {
                return unsafe { tc(&_mm256_max_ps(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, f64) {
                return unsafe { tc(&_mm256_max_pd(tc(&a), tc(&b))) };
            }
        }
        {
            // Fallback: split both operands into 128-bit halves, recurse, and reassemble.
            let (a, b): (V256, V256) = unsafe { (tc(&a), tc(&b)) };
            let (a, b) = (a.to_v128x2(), b.to_v128x2());
            let c0 = max::<S, T, V128>(s, a.0, b.0);
            let c1 = max::<S, T, V128>(s, a.1, b.1);
            return unsafe { tc(&V256::from_v128x2((c0, c1))) };
        }
    }
    if is_pod_type!(V, V128) {
        // Under Miri the `u8` case is routed to the crate's simulation module
        // before any platform intrinsic is considered.
        #[cfg(miri)]
        {
            if is_pod_type!(T, u8) {
                return unsafe { tc(&crate::simulation::u8x16_max(tc(&a), tc(&b))) };
            }
        }
        // Lane types whose 128-bit `max` intrinsic is only used when `S` matches SSE4.1.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE41) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&_mm_max_epi8(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, i32) {
                return unsafe { tc(&_mm_max_epi32(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u16) {
                return unsafe { tc(&_mm_max_epu16(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u32) {
                return unsafe { tc(&_mm_max_epu32(tc(&a), tc(&b))) };
            }
        }
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, i16) {
                return unsafe { tc(&_mm_max_epi16(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u8) {
                return unsafe { tc(&_mm_max_epu8(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, f32) {
                return unsafe { tc(&_mm_max_ps(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, f64) {
                return unsafe { tc(&_mm_max_pd(tc(&a), tc(&b))) };
            }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&vmaxq_s8(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, i16) {
                return unsafe { tc(&vmaxq_s16(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, i32) {
                return unsafe { tc(&vmaxq_s32(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u8) {
                return unsafe { tc(&vmaxq_u8(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u16) {
                return unsafe { tc(&vmaxq_u16(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u32) {
                return unsafe { tc(&vmaxq_u32(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, f32) {
                return unsafe { tc(&vmaxq_f32(tc(&a), tc(&b))) };
            }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&i8x16_max(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, i16) {
                return unsafe { tc(&i16x8_max(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, i32) {
                return unsafe { tc(&i32x4_max(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u8) {
                return unsafe { tc(&u8x16_max(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u16) {
                return unsafe { tc(&u16x8_max(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, u32) {
                return unsafe { tc(&u32x4_max(tc(&a), tc(&b))) };
            }
            if is_pod_type!(T, f32) {
                return unsafe { tc(&f32x4_max(tc(&a), tc(&b))) };
            }
        }
    }
    {
        // Consume the arguments so targets where no branch is compiled in do not warn.
        let _ = (s, a, b);
        unreachable!()
    }
}

/// Lane-wise minimum of `a` and `b`, honouring the signedness of `T`.
///
/// `S` is the instruction-set marker, `T` the lane type and `V` the vector type
/// (`V128` or `V256`). Lane coverage differs per target:
/// * AVX2: `i8`/`i16`/`i32`, `u8`/`u16`/`u32`, `f32`, `f64`.
/// * SSE4.1: `i8`, `i32`, `u16`, `u32`; SSE2: `i16`, `u8`, `f32`, `f64`.
/// * NEON and WASM128: `i8`/`i16`/`i32`, `u8`/`u16`/`u32`, `f32`.
/// * Under Miri, 128-bit `u8` lanes go through `crate::simulation::u8x16_min`.
///
/// NOTE(review): for float lanes the NaN / signed-zero behaviour is that of the
/// selected platform intrinsic and may differ between targets - confirm before
/// relying on it.
///
/// # Panics
///
/// Hits `unreachable!()` when no branch matches the `(S, T, V)` combination.
#[inline(always)]
pub fn min<S: InstructionSet, T: POD, V: POD>(s: S, a: V, b: V) -> V {
    if is_pod_type!(V, V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&_mm256_min_epi8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i16) {
                return unsafe { tc(&_mm256_min_epi16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i32) {
                return unsafe { tc(&_mm256_min_epi32(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u8) {
                return unsafe { tc(&_mm256_min_epu8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16) {
                return unsafe { tc(&_mm256_min_epu16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32) {
                return unsafe { tc(&_mm256_min_epu32(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, f32) {
                return unsafe { tc(&_mm256_min_ps(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, f64) {
                return unsafe { tc(&_mm256_min_pd(tc(&a), tc(&b))) };
            }
        }
        // Fallback: split both operands into 128-bit halves, recurse, and reassemble.
        let wide_a: V256 = unsafe { tc(&a) };
        let wide_b: V256 = unsafe { tc(&b) };
        let halves_a = wide_a.to_v128x2();
        let halves_b = wide_b.to_v128x2();
        let lo = min::<S, T, V128>(s, halves_a.0, halves_b.0);
        let hi = min::<S, T, V128>(s, halves_a.1, halves_b.1);
        return unsafe { tc(&V256::from_v128x2((lo, hi))) };
    }
    if is_pod_type!(V, V128) {
        // Under Miri the `u8` case is routed to the crate's simulation module
        // before any platform intrinsic is considered.
        #[cfg(miri)]
        if is_pod_type!(T, u8) {
            return unsafe { tc(&crate::simulation::u8x16_min(tc(&a), tc(&b))) };
        }
        // Lane types whose 128-bit `min` intrinsic is only used when `S` matches SSE4.1.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE41) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&_mm_min_epi8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i32) {
                return unsafe { tc(&_mm_min_epi32(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16) {
                return unsafe { tc(&_mm_min_epu16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32) {
                return unsafe { tc(&_mm_min_epu32(tc(&a), tc(&b))) };
            }
        }
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            if is_pod_type!(T, i16) {
                return unsafe { tc(&_mm_min_epi16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u8) {
                return unsafe { tc(&_mm_min_epu8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, f32) {
                return unsafe { tc(&_mm_min_ps(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, f64) {
                return unsafe { tc(&_mm_min_pd(tc(&a), tc(&b))) };
            }
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&vminq_s8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i16) {
                return unsafe { tc(&vminq_s16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i32) {
                return unsafe { tc(&vminq_s32(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u8) {
                return unsafe { tc(&vminq_u8(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16) {
                return unsafe { tc(&vminq_u16(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32) {
                return unsafe { tc(&vminq_u32(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, f32) {
                return unsafe { tc(&vminq_f32(tc(&a), tc(&b))) };
            }
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            if is_pod_type!(T, i8) {
                return unsafe { tc(&i8x16_min(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i16) {
                return unsafe { tc(&i16x8_min(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, i32) {
                return unsafe { tc(&i32x4_min(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u8) {
                return unsafe { tc(&u8x16_min(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u16) {
                return unsafe { tc(&u16x8_min(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, u32) {
                return unsafe { tc(&u32x4_min(tc(&a), tc(&b))) };
            } else if is_pod_type!(T, f32) {
                return unsafe { tc(&f32x4_min(tc(&a), tc(&b))) };
            }
        }
    }
    // Consume the arguments so targets where no branch is compiled in do not warn.
    let _ = (s, a, b);
    unreachable!()
}

/// Returns the bitwise AND of `a` and `b`.
///
/// The vector type `V` and the instruction set `S` pick the implementation:
///
/// * `V256` with `AVX2` (x86/x86_64 builds) uses `_mm256_and_si256`.
/// * Any other `V256` call is split into two `V128` halves, each of which is
///   handled by a recursive call with the same `S`.
/// * `V128` uses `_mm_and_si128` (`SSE2`), `vandq_u8` (`NEON`) or `v128_and`
///   (`WASM128`), depending on which backends are compiled for the target.
///
/// # Panics
///
/// Reaches `unreachable!()` when no branch returned, i.e. `V` is neither
/// `V256` nor `V128`, or `V` is `V128` and `S` matches none of the backends
/// compiled for the current target.
#[inline(always)]
pub fn and<S, V>(s: S, a: V, b: V) -> V
where
    S: InstructionSet,
    V: POD,
{
    if is_pod_type!(V, V256) {
        // Native 256-bit path.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            let wide = unsafe { _mm256_and_si256(tc(&a), tc(&b)) };
            return unsafe { tc(&wide) };
        }

        // No 256-bit instruction available: process the two 128-bit halves
        // independently and glue the results back together.
        let lhs: V256 = unsafe { tc(&a) };
        let rhs: V256 = unsafe { tc(&b) };
        let (lhs, rhs) = (lhs.to_v128x2(), rhs.to_v128x2());
        let lo = and::<S, V128>(s, lhs.0, rhs.0);
        let hi = and::<S, V128>(s, lhs.1, rhs.1);
        return unsafe { tc(&V256::from_v128x2((lo, hi))) };
    }

    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            let out = unsafe { _mm_and_si128(tc(&a), tc(&b)) };
            return unsafe { tc(&out) };
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            let out = unsafe { vandq_u8(tc(&a), tc(&b)) };
            return unsafe { tc(&out) };
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            let out = unsafe { v128_and(tc(&a), tc(&b)) };
            return unsafe { tc(&out) };
        }
    }

    // Touch every argument so that builds with no backend compiled in do not
    // report them as unused, then flag the unsupported combination.
    let _ = (s, a, b);
    unreachable!()
}

/// Returns the bitwise OR of `a` and `b`.
///
/// The vector type `V` and the instruction set `S` pick the implementation:
///
/// * `V256` with `AVX2` (x86/x86_64 builds) uses `_mm256_or_si256`.
/// * Any other `V256` call is split into two `V128` halves, each of which is
///   handled by a recursive call with the same `S`.
/// * `V128` uses `_mm_or_si128` (`SSE2`), `vorrq_u8` (`NEON`) or `v128_or`
///   (`WASM128`), depending on which backends are compiled for the target.
///
/// # Panics
///
/// Reaches `unreachable!()` when no branch returned, i.e. `V` is neither
/// `V256` nor `V128`, or `V` is `V128` and `S` matches none of the backends
/// compiled for the current target.
#[inline(always)]
pub fn or<S, V>(s: S, a: V, b: V) -> V
where
    S: InstructionSet,
    V: POD,
{
    if is_pod_type!(V, V256) {
        // Native 256-bit path.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            let wide = unsafe { _mm256_or_si256(tc(&a), tc(&b)) };
            return unsafe { tc(&wide) };
        }

        // No 256-bit instruction available: process the two 128-bit halves
        // independently and glue the results back together.
        let lhs: V256 = unsafe { tc(&a) };
        let rhs: V256 = unsafe { tc(&b) };
        let (lhs, rhs) = (lhs.to_v128x2(), rhs.to_v128x2());
        let lo = or::<S, V128>(s, lhs.0, rhs.0);
        let hi = or::<S, V128>(s, lhs.1, rhs.1);
        return unsafe { tc(&V256::from_v128x2((lo, hi))) };
    }

    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            let out = unsafe { _mm_or_si128(tc(&a), tc(&b)) };
            return unsafe { tc(&out) };
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            let out = unsafe { vorrq_u8(tc(&a), tc(&b)) };
            return unsafe { tc(&out) };
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            let out = unsafe { v128_or(tc(&a), tc(&b)) };
            return unsafe { tc(&out) };
        }
    }

    // Touch every argument so that builds with no backend compiled in do not
    // report them as unused, then flag the unsupported combination.
    let _ = (s, a, b);
    unreachable!()
}

/// Returns the bitwise XOR of `a` and `b`.
///
/// The vector type `V` and the instruction set `S` pick the implementation:
///
/// * `V256` with `AVX2` (x86/x86_64 builds) uses `_mm256_xor_si256`.
/// * Any other `V256` call is split into two `V128` halves, each of which is
///   handled by a recursive call with the same `S`.
/// * `V128` uses `_mm_xor_si128` (`SSE2`), `veorq_u8` (`NEON`) or `v128_xor`
///   (`WASM128`), depending on which backends are compiled for the target.
///
/// # Panics
///
/// Reaches `unreachable!()` when no branch returned, i.e. `V` is neither
/// `V256` nor `V128`, or `V` is `V128` and `S` matches none of the backends
/// compiled for the current target.
#[inline(always)]
pub fn xor<S, V>(s: S, a: V, b: V) -> V
where
    S: InstructionSet,
    V: POD,
{
    if is_pod_type!(V, V256) {
        // Native 256-bit path.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            let wide = unsafe { _mm256_xor_si256(tc(&a), tc(&b)) };
            return unsafe { tc(&wide) };
        }

        // No 256-bit instruction available: process the two 128-bit halves
        // independently and glue the results back together.
        let lhs: V256 = unsafe { tc(&a) };
        let rhs: V256 = unsafe { tc(&b) };
        let (lhs, rhs) = (lhs.to_v128x2(), rhs.to_v128x2());
        let lo = xor::<S, V128>(s, lhs.0, rhs.0);
        let hi = xor::<S, V128>(s, lhs.1, rhs.1);
        return unsafe { tc(&V256::from_v128x2((lo, hi))) };
    }

    if is_pod_type!(V, V128) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            let out = unsafe { _mm_xor_si128(tc(&a), tc(&b)) };
            return unsafe { tc(&out) };
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            let out = unsafe { veorq_u8(tc(&a), tc(&b)) };
            return unsafe { tc(&out) };
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            let out = unsafe { v128_xor(tc(&a), tc(&b)) };
            return unsafe { tc(&out) };
        }
    }

    // Touch every argument so that builds with no backend compiled in do not
    // report them as unused, then flag the unsupported combination.
    let _ = (s, a, b);
    unreachable!()
}

/// Returns `a & !b`: the bits of `a` that are not set in `b`.
///
/// The vector type `V` and the instruction set `S` pick the implementation:
///
/// * `V256` with `AVX2` (x86/x86_64 builds) uses `_mm256_andnot_si256`.
/// * Any other `V256` call is split into two `V128` halves, each of which is
///   handled by a recursive call with the same `S`.
/// * `V128` uses `_mm_andnot_si128` (`SSE2`), `vbicq_u8` (`NEON`) or
///   `v128_andnot` (`WASM128`), depending on which backends are compiled for
///   the target.
///
/// The x86 intrinsics negate their *first* operand, so they receive the
/// arguments in reverse order (`b`, then `a`). The NEON and WASM intrinsics
/// are called with `a`, then `b`.
///
/// # Panics
///
/// Reaches `unreachable!()` when no branch returned, i.e. `V` is neither
/// `V256` nor `V128`, or `V` is `V128` and `S` matches none of the backends
/// compiled for the current target.
#[inline(always)]
pub fn andnot<S, V>(s: S, a: V, b: V) -> V
where
    S: InstructionSet,
    V: POD,
{
    if is_pod_type!(V, V256) {
        // Native 256-bit path; operands reversed because x86 computes `!x & y`.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, AVX2) {
            let wide = unsafe { _mm256_andnot_si256(tc(&b), tc(&a)) };
            return unsafe { tc(&wide) };
        }

        // No 256-bit instruction available: process the two 128-bit halves
        // independently and glue the results back together.
        let lhs: V256 = unsafe { tc(&a) };
        let rhs: V256 = unsafe { tc(&b) };
        let (lhs, rhs) = (lhs.to_v128x2(), rhs.to_v128x2());
        let lo = andnot::<S, V128>(s, lhs.0, rhs.0);
        let hi = andnot::<S, V128>(s, lhs.1, rhs.1);
        return unsafe { tc(&V256::from_v128x2((lo, hi))) };
    }

    if is_pod_type!(V, V128) {
        // Operands reversed because x86 computes `!x & y`.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(S, SSE2) {
            let out = unsafe { _mm_andnot_si128(tc(&b), tc(&a)) };
            return unsafe { tc(&out) };
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(S, NEON) {
            let out = unsafe { vbicq_u8(tc(&a), tc(&b)) };
            return unsafe { tc(&out) };
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(S, WASM128) {
            let out = unsafe { v128_andnot(tc(&a), tc(&b)) };
            return unsafe { tc(&out) };
        }
    }

    // Touch every argument so that builds with no backend compiled in do not
    // report them as unused, then flag the unsupported combination.
    let _ = (s, a, b);
    unreachable!()
}