oximedia-codec 0.1.6

//! WebAssembly SIMD128 backend for `oximedia-codec`.
//!
//! This module provides a SIMD128-backed implementation of the [`SimdOps`] and
//! [`SimdOpsExt`] traits that targets the WebAssembly `simd128` proposal
//! (<https://github.com/WebAssembly/simd>).  When compiled for any target other
//! than `wasm32`, or when the `simd128` target-feature is absent, the module
//! transparently falls back to the pure-Rust [`ScalarFallback`] so the codec
//! crate builds and passes tests on every platform.
//!
//! # Feature Gating
//!
//! WASM SIMD128 instructions live behind `#[cfg(target_arch = "wasm32")]` and
//! `#[target_feature(enable = "simd128")]`.  Because `#[target_feature]` cannot
//! be applied to safe functions, every intrinsic wrapper is wrapped in an
//! `unsafe` block **inside** the module, which is shielded from external callers
//! by safe public entry-points.
//!
//! The public API in this module is 100 % safe Rust.
//!
//! # Usage
//!
//! ```
//! use oximedia_codec::simd::wasm::{WasmSimd, WasmSimdInfo};
//! use oximedia_codec::simd::traits::SimdOps;
//!
//! let simd = WasmSimd::new();
//! println!("Backend: {}", simd.name());
//! println!("Available: {}", simd.is_available());
//! // All SimdOps trait methods work transparently on every platform.
//! ```
//!
//! # Performance notes
//!
//! WASM SIMD128 provides 128-bit vectors (matching SSE2 register width).  On
//! modern runtimes (V8, SpiderMonkey, Wasmtime) this translates directly to
//! native SSE2/NEON instructions, giving 4–8× throughput improvement over
//! scalar code for SAD, DCT butterfly, and pixel conversion inner loops.

// NOTE: unsafe is intentionally allowed in this file so that `core::arch::wasm32`
// SIMD128 intrinsics can be used.  Every unsafe block is tightly scoped and
// wrapped in a safe public API.  External callers never see unsafe.
#![allow(unsafe_code)]

use crate::simd::scalar::ScalarFallback;
use crate::simd::traits::{SimdOps, SimdOpsExt};
use crate::simd::types::{I16x8, I32x4, U8x16};

// Import wasm32 SIMD128 intrinsics when the target feature is active.
// The `use` is unconditionally at module level to avoid repetition in every
// function body.  On non-wasm32 targets the module does not exist so the cfg
// guard keeps it from being compiled.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
use core::arch::wasm32::*;

// ============================================================================
// WasmSimdInfo — capability detection
// ============================================================================

/// Runtime capability information for the WASM SIMD128 backend.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct WasmSimdInfo {
    /// `true` when the binary was compiled for `wasm32` and `simd128` is
    /// available.  On non-WASM targets this is always `false`.
    pub simd128_available: bool,
    /// `true` when WASM relaxed-SIMD operations are available (super-set of
    /// SIMD128).  Currently always `false` because relaxed-SIMD has not yet
    /// been stabilised in all runtimes.
    pub relaxed_simd_available: bool,
}

impl WasmSimdInfo {
    /// Detect SIMD128 capability at runtime (compile-time constant on WASM).
    pub fn detect() -> Self {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            Self {
                simd128_available: true,
                relaxed_simd_available: false,
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            Self {
                simd128_available: false,
                relaxed_simd_available: false,
            }
        }
    }
}

// ============================================================================
// WasmSimd — the main backend type
// ============================================================================

/// WebAssembly SIMD128 backend.
///
/// When compiled for `wasm32+simd128` the arithmetic operations delegate to
/// WASM SIMD128 intrinsics; on all other targets the scalar fallback is used.
///
/// All public methods are safe Rust: no `unsafe` leaks to callers.
#[derive(Clone, Copy, Debug)]
pub struct WasmSimd {
    /// Detected capability info (immutable after construction).
    info: WasmSimdInfo,
    /// Scalar fallback used when SIMD128 is unavailable.
    fallback: ScalarFallback,
}

impl WasmSimd {
    /// Construct a new WASM SIMD backend, detecting capabilities automatically.
    #[inline]
    #[must_use]
    pub fn new() -> Self {
        Self {
            info: WasmSimdInfo::detect(),
            fallback: ScalarFallback::new(),
        }
    }

    /// Return capability information.
    #[inline]
    pub fn info(&self) -> WasmSimdInfo {
        self.info
    }
}

impl Default for WasmSimd {
    fn default() -> Self {
        Self::new()
    }
}

// ============================================================================
// SimdOps implementation
// ============================================================================
//
// On wasm32+simd128 we implement each operation using the WASM SIMD128
// intrinsics (via `std::arch::wasm32`).  On all other targets we call through
// to ScalarFallback so the crate compiles everywhere.
//
// The conditional compilation blocks are written so that Clippy/rustc only
// sees ONE branch per function; the unreachable else is eliminated at compile
// time.

impl SimdOps for WasmSimd {
    #[inline]
    fn name(&self) -> &'static str {
        if self.info.simd128_available {
            "wasm32-simd128"
        } else {
            "wasm32-scalar-fallback"
        }
    }

    #[inline]
    fn is_available(&self) -> bool {
        self.info.simd128_available
    }

    // ------------------------------------------------------------------
    // Integer arithmetic
    // ------------------------------------------------------------------

    #[inline]
    fn add_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature is confirmed by the cfg guard.
            // We construct the v128 registers from individual lanes, apply the
            // intrinsic, then extract lanes back into our I16x8 wrapper.
            unsafe {
                let va = i16x8(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
                let vb = i16x8(b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]);
                let vr = i16x8_add(va, vb);
                I16x8::from_array([
                    i16x8_extract_lane::<0>(vr),
                    i16x8_extract_lane::<1>(vr),
                    i16x8_extract_lane::<2>(vr),
                    i16x8_extract_lane::<3>(vr),
                    i16x8_extract_lane::<4>(vr),
                    i16x8_extract_lane::<5>(vr),
                    i16x8_extract_lane::<6>(vr),
                    i16x8_extract_lane::<7>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.add_i16x8(a, b)
        }
    }

    #[inline]
    fn sub_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            unsafe {
                let va = i16x8(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
                let vb = i16x8(b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]);
                let vr = i16x8_sub(va, vb);
                I16x8::from_array([
                    i16x8_extract_lane::<0>(vr),
                    i16x8_extract_lane::<1>(vr),
                    i16x8_extract_lane::<2>(vr),
                    i16x8_extract_lane::<3>(vr),
                    i16x8_extract_lane::<4>(vr),
                    i16x8_extract_lane::<5>(vr),
                    i16x8_extract_lane::<6>(vr),
                    i16x8_extract_lane::<7>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.sub_i16x8(a, b)
        }
    }

    #[inline]
    fn mul_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            unsafe {
                let va = i16x8(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
                let vb = i16x8(b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]);
                let vr = i16x8_mul(va, vb);
                I16x8::from_array([
                    i16x8_extract_lane::<0>(vr),
                    i16x8_extract_lane::<1>(vr),
                    i16x8_extract_lane::<2>(vr),
                    i16x8_extract_lane::<3>(vr),
                    i16x8_extract_lane::<4>(vr),
                    i16x8_extract_lane::<5>(vr),
                    i16x8_extract_lane::<6>(vr),
                    i16x8_extract_lane::<7>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.mul_i16x8(a, b)
        }
    }

    #[inline]
    fn add_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            unsafe {
                let va = i32x4(a[0], a[1], a[2], a[3]);
                let vb = i32x4(b[0], b[1], b[2], b[3]);
                let vr = i32x4_add(va, vb);
                I32x4::from_array([
                    i32x4_extract_lane::<0>(vr),
                    i32x4_extract_lane::<1>(vr),
                    i32x4_extract_lane::<2>(vr),
                    i32x4_extract_lane::<3>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.add_i32x4(a, b)
        }
    }

    #[inline]
    fn sub_i32x4(&self, a: I32x4, b: I32x4) -> I32x4 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            unsafe {
                let va = i32x4(a[0], a[1], a[2], a[3]);
                let vb = i32x4(b[0], b[1], b[2], b[3]);
                let vr = i32x4_sub(va, vb);
                I32x4::from_array([
                    i32x4_extract_lane::<0>(vr),
                    i32x4_extract_lane::<1>(vr),
                    i32x4_extract_lane::<2>(vr),
                    i32x4_extract_lane::<3>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.sub_i32x4(a, b)
        }
    }

    // ------------------------------------------------------------------
    // Min / Max / Clamp
    // ------------------------------------------------------------------

    #[inline]
    fn min_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            unsafe {
                let va = i16x8(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
                let vb = i16x8(b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]);
                let vr = i16x8_min(va, vb);
                I16x8::from_array([
                    i16x8_extract_lane::<0>(vr),
                    i16x8_extract_lane::<1>(vr),
                    i16x8_extract_lane::<2>(vr),
                    i16x8_extract_lane::<3>(vr),
                    i16x8_extract_lane::<4>(vr),
                    i16x8_extract_lane::<5>(vr),
                    i16x8_extract_lane::<6>(vr),
                    i16x8_extract_lane::<7>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.min_i16x8(a, b)
        }
    }

    #[inline]
    fn max_i16x8(&self, a: I16x8, b: I16x8) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            unsafe {
                let va = i16x8(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
                let vb = i16x8(b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]);
                let vr = i16x8_max(va, vb);
                I16x8::from_array([
                    i16x8_extract_lane::<0>(vr),
                    i16x8_extract_lane::<1>(vr),
                    i16x8_extract_lane::<2>(vr),
                    i16x8_extract_lane::<3>(vr),
                    i16x8_extract_lane::<4>(vr),
                    i16x8_extract_lane::<5>(vr),
                    i16x8_extract_lane::<6>(vr),
                    i16x8_extract_lane::<7>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.max_i16x8(a, b)
        }
    }

    #[inline]
    fn clamp_i16x8(&self, v: I16x8, min: i16, max: i16) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            // Clamp is implemented as min(max(v, splat(min)), splat(max)).
            unsafe {
                let vv = i16x8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
                let vmin = i16x8_splat(min);
                let vmax = i16x8_splat(max);
                let vr = i16x8_min(i16x8_max(vv, vmin), vmax);
                I16x8::from_array([
                    i16x8_extract_lane::<0>(vr),
                    i16x8_extract_lane::<1>(vr),
                    i16x8_extract_lane::<2>(vr),
                    i16x8_extract_lane::<3>(vr),
                    i16x8_extract_lane::<4>(vr),
                    i16x8_extract_lane::<5>(vr),
                    i16x8_extract_lane::<6>(vr),
                    i16x8_extract_lane::<7>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.clamp_i16x8(v, min, max)
        }
    }

    #[inline]
    fn min_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            unsafe {
                let va = u8x16(
                    a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9], a[10], a[11],
                    a[12], a[13], a[14], a[15],
                );
                let vb = u8x16(
                    b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7], b[8], b[9], b[10], b[11],
                    b[12], b[13], b[14], b[15],
                );
                let vr = u8x16_min(va, vb);
                U8x16::from_array([
                    u8x16_extract_lane::<0>(vr),
                    u8x16_extract_lane::<1>(vr),
                    u8x16_extract_lane::<2>(vr),
                    u8x16_extract_lane::<3>(vr),
                    u8x16_extract_lane::<4>(vr),
                    u8x16_extract_lane::<5>(vr),
                    u8x16_extract_lane::<6>(vr),
                    u8x16_extract_lane::<7>(vr),
                    u8x16_extract_lane::<8>(vr),
                    u8x16_extract_lane::<9>(vr),
                    u8x16_extract_lane::<10>(vr),
                    u8x16_extract_lane::<11>(vr),
                    u8x16_extract_lane::<12>(vr),
                    u8x16_extract_lane::<13>(vr),
                    u8x16_extract_lane::<14>(vr),
                    u8x16_extract_lane::<15>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.min_u8x16(a, b)
        }
    }

    #[inline]
    fn max_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            unsafe {
                let va = u8x16(
                    a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9], a[10], a[11],
                    a[12], a[13], a[14], a[15],
                );
                let vb = u8x16(
                    b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7], b[8], b[9], b[10], b[11],
                    b[12], b[13], b[14], b[15],
                );
                let vr = u8x16_max(va, vb);
                U8x16::from_array([
                    u8x16_extract_lane::<0>(vr),
                    u8x16_extract_lane::<1>(vr),
                    u8x16_extract_lane::<2>(vr),
                    u8x16_extract_lane::<3>(vr),
                    u8x16_extract_lane::<4>(vr),
                    u8x16_extract_lane::<5>(vr),
                    u8x16_extract_lane::<6>(vr),
                    u8x16_extract_lane::<7>(vr),
                    u8x16_extract_lane::<8>(vr),
                    u8x16_extract_lane::<9>(vr),
                    u8x16_extract_lane::<10>(vr),
                    u8x16_extract_lane::<11>(vr),
                    u8x16_extract_lane::<12>(vr),
                    u8x16_extract_lane::<13>(vr),
                    u8x16_extract_lane::<14>(vr),
                    u8x16_extract_lane::<15>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.max_u8x16(a, b)
        }
    }

    #[inline]
    fn clamp_u8x16(&self, v: U8x16, min: u8, max: u8) -> U8x16 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            // Clamp = min(max(v, splat(lo)), splat(hi))
            unsafe {
                let vv = u8x16(
                    v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8], v[9], v[10], v[11],
                    v[12], v[13], v[14], v[15],
                );
                let vmin = u8x16_splat(min);
                let vmax = u8x16_splat(max);
                let vr = u8x16_min(u8x16_max(vv, vmin), vmax);
                U8x16::from_array([
                    u8x16_extract_lane::<0>(vr),
                    u8x16_extract_lane::<1>(vr),
                    u8x16_extract_lane::<2>(vr),
                    u8x16_extract_lane::<3>(vr),
                    u8x16_extract_lane::<4>(vr),
                    u8x16_extract_lane::<5>(vr),
                    u8x16_extract_lane::<6>(vr),
                    u8x16_extract_lane::<7>(vr),
                    u8x16_extract_lane::<8>(vr),
                    u8x16_extract_lane::<9>(vr),
                    u8x16_extract_lane::<10>(vr),
                    u8x16_extract_lane::<11>(vr),
                    u8x16_extract_lane::<12>(vr),
                    u8x16_extract_lane::<13>(vr),
                    u8x16_extract_lane::<14>(vr),
                    u8x16_extract_lane::<15>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.clamp_u8x16(v, min, max)
        }
    }

    // ------------------------------------------------------------------
    // Horizontal operations
    // ------------------------------------------------------------------

    #[inline]
    fn horizontal_sum_i16x8(&self, v: I16x8) -> i32 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            // Widen i16×8 → two i32×4 lanes, then reduce.
            unsafe {
                let vv = i16x8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
                // Widen to i32: low 4 lanes and high 4 lanes.
                let zero = i16x8_splat(0);
                let lo = i32x4_extend_low_i16x8(vv);
                let hi = i32x4_extend_high_i16x8(vv);
                let _ = zero; // suppress unused warning
                let sum4 = i32x4_add(lo, hi);
                // Reduce sum4 horizontally.
                i32x4_extract_lane::<0>(sum4)
                    .wrapping_add(i32x4_extract_lane::<1>(sum4))
                    .wrapping_add(i32x4_extract_lane::<2>(sum4))
                    .wrapping_add(i32x4_extract_lane::<3>(sum4))
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.horizontal_sum_i16x8(v)
        }
    }

    #[inline]
    fn horizontal_sum_i32x4(&self, v: I32x4) -> i32 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            unsafe {
                let vv = i32x4(v[0], v[1], v[2], v[3]);
                i32x4_extract_lane::<0>(vv)
                    .wrapping_add(i32x4_extract_lane::<1>(vv))
                    .wrapping_add(i32x4_extract_lane::<2>(vv))
                    .wrapping_add(i32x4_extract_lane::<3>(vv))
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.horizontal_sum_i32x4(v)
        }
    }

    // ------------------------------------------------------------------
    // SAD
    // ------------------------------------------------------------------

    #[inline]
    fn sad_u8x16(&self, a: U8x16, b: U8x16) -> u32 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            // SAD = sum of |a_i - b_i|.
            // Compute abs-diff using (max - min) since WASM has u8x16_sub_sat.
            unsafe {
                let va = u8x16(
                    a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9], a[10], a[11],
                    a[12], a[13], a[14], a[15],
                );
                let vb = u8x16(
                    b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7], b[8], b[9], b[10], b[11],
                    b[12], b[13], b[14], b[15],
                );
                // |a-b| = max(a,b) - min(a,b)  (all unsigned)
                let diff = u8x16_sub(u8x16_max(va, vb), u8x16_min(va, vb));
                // Widen to i16 for accumulation (each diff is in [0,255])
                let lo = i16x8_extend_low_u8x16(diff);
                let hi = i16x8_extend_high_u8x16(diff);
                // Accumulate into i32
                let sum_lo = i32x4_extend_low_i16x8(lo);
                let sum_hi = i32x4_extend_high_i16x8(lo);
                let sum_lo2 = i32x4_extend_low_i16x8(hi);
                let sum_hi2 = i32x4_extend_high_i16x8(hi);
                let acc1 = i32x4_add(sum_lo, sum_hi);
                let acc2 = i32x4_add(sum_lo2, sum_hi2);
                let acc = i32x4_add(acc1, acc2);
                let s = i32x4_extract_lane::<0>(acc)
                    .wrapping_add(i32x4_extract_lane::<1>(acc))
                    .wrapping_add(i32x4_extract_lane::<2>(acc))
                    .wrapping_add(i32x4_extract_lane::<3>(acc));
                s as u32
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.sad_u8x16(a, b)
        }
    }

    #[inline]
    fn sad_8(&self, a: &[u8], b: &[u8]) -> u32 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            // Load 8 bytes each, zero-pad to 16.
            if a.len() < 8 || b.len() < 8 {
                return self.fallback.sad_8(a, b);
            }
            unsafe {
                let va = u8x16(
                    a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
                    0, 0, 0, 0, 0, 0, 0, 0,
                );
                let vb = u8x16(
                    b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7],
                    0, 0, 0, 0, 0, 0, 0, 0,
                );
                let diff = u8x16_sub(u8x16_max(va, vb), u8x16_min(va, vb));
                let lo = i16x8_extend_low_u8x16(diff);
                let acc = i32x4_add(
                    i32x4_extend_low_i16x8(lo),
                    i32x4_extend_high_i16x8(lo),
                );
                let s = i32x4_extract_lane::<0>(acc)
                    .wrapping_add(i32x4_extract_lane::<1>(acc))
                    .wrapping_add(i32x4_extract_lane::<2>(acc))
                    .wrapping_add(i32x4_extract_lane::<3>(acc));
                s as u32
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.sad_8(a, b)
        }
    }

    #[inline]
    fn sad_16(&self, a: &[u8], b: &[u8]) -> u32 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            if a.len() < 16 || b.len() < 16 {
                return self.fallback.sad_16(a, b);
            }
            unsafe {
                let va = u8x16(
                    a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9], a[10], a[11],
                    a[12], a[13], a[14], a[15],
                );
                let vb = u8x16(
                    b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7], b[8], b[9], b[10], b[11],
                    b[12], b[13], b[14], b[15],
                );
                let diff = u8x16_sub(u8x16_max(va, vb), u8x16_min(va, vb));
                let lo = i16x8_extend_low_u8x16(diff);
                let hi = i16x8_extend_high_u8x16(diff);
                let acc = i32x4_add(
                    i32x4_add(i32x4_extend_low_i16x8(lo), i32x4_extend_high_i16x8(lo)),
                    i32x4_add(i32x4_extend_low_i16x8(hi), i32x4_extend_high_i16x8(hi)),
                );
                let s = i32x4_extract_lane::<0>(acc)
                    .wrapping_add(i32x4_extract_lane::<1>(acc))
                    .wrapping_add(i32x4_extract_lane::<2>(acc))
                    .wrapping_add(i32x4_extract_lane::<3>(acc));
                s as u32
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.sad_16(a, b)
        }
    }

    // ------------------------------------------------------------------
    // Widening / Narrowing
    // ------------------------------------------------------------------

    #[inline]
    fn widen_low_u8_to_i16(&self, v: U8x16) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            unsafe {
                let vv = u8x16(
                    v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8], v[9], v[10], v[11],
                    v[12], v[13], v[14], v[15],
                );
                // Zero-extend low 8 bytes to i16×8.
                let vr = i16x8_extend_low_u8x16(vv);
                I16x8::from_array([
                    i16x8_extract_lane::<0>(vr),
                    i16x8_extract_lane::<1>(vr),
                    i16x8_extract_lane::<2>(vr),
                    i16x8_extract_lane::<3>(vr),
                    i16x8_extract_lane::<4>(vr),
                    i16x8_extract_lane::<5>(vr),
                    i16x8_extract_lane::<6>(vr),
                    i16x8_extract_lane::<7>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.widen_low_u8_to_i16(v)
        }
    }

    #[inline]
    fn widen_high_u8_to_i16(&self, v: U8x16) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            unsafe {
                let vv = u8x16(
                    v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8], v[9], v[10], v[11],
                    v[12], v[13], v[14], v[15],
                );
                // Zero-extend high 8 bytes to i16×8.
                let vr = i16x8_extend_high_u8x16(vv);
                I16x8::from_array([
                    i16x8_extract_lane::<0>(vr),
                    i16x8_extract_lane::<1>(vr),
                    i16x8_extract_lane::<2>(vr),
                    i16x8_extract_lane::<3>(vr),
                    i16x8_extract_lane::<4>(vr),
                    i16x8_extract_lane::<5>(vr),
                    i16x8_extract_lane::<6>(vr),
                    i16x8_extract_lane::<7>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.widen_high_u8_to_i16(v)
        }
    }

    #[inline]
    fn narrow_i32x4_to_i16x8(&self, low: I32x4, high: I32x4) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            // i32x4_narrow_i32x4 packs two i32×4 into one i16×8 with signed saturation.
            unsafe {
                let vlo = i32x4(low[0], low[1], low[2], low[3]);
                let vhi = i32x4(high[0], high[1], high[2], high[3]);
                let vr = i16x8_narrow_i32x4(vlo, vhi);
                I16x8::from_array([
                    i16x8_extract_lane::<0>(vr),
                    i16x8_extract_lane::<1>(vr),
                    i16x8_extract_lane::<2>(vr),
                    i16x8_extract_lane::<3>(vr),
                    i16x8_extract_lane::<4>(vr),
                    i16x8_extract_lane::<5>(vr),
                    i16x8_extract_lane::<6>(vr),
                    i16x8_extract_lane::<7>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.narrow_i32x4_to_i16x8(low, high)
        }
    }

    // ------------------------------------------------------------------
    // Multiply-Add
    // ------------------------------------------------------------------

    #[inline]
    fn madd_i16x8(&self, a: I16x8, b: I16x8, c: I16x8) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            // a*b+c using i16x8_mul then i16x8_add.
            unsafe {
                let va = i16x8(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
                let vb = i16x8(b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]);
                let vc = i16x8(c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7]);
                let vr = i16x8_add(i16x8_mul(va, vb), vc);
                I16x8::from_array([
                    i16x8_extract_lane::<0>(vr),
                    i16x8_extract_lane::<1>(vr),
                    i16x8_extract_lane::<2>(vr),
                    i16x8_extract_lane::<3>(vr),
                    i16x8_extract_lane::<4>(vr),
                    i16x8_extract_lane::<5>(vr),
                    i16x8_extract_lane::<6>(vr),
                    i16x8_extract_lane::<7>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.madd_i16x8(a, b, c)
        }
    }

    #[inline]
    fn pmaddwd(&self, a: I16x8, b: I16x8) -> I32x4 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            // i32x4_dot_i16x8 is the native PMADDWD equivalent in WASM SIMD128:
            // pairs adjacent i16 lanes, multiplies, and sums into i32.
            unsafe {
                let va = i16x8(a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
                let vb = i16x8(b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]);
                let vr = i32x4_dot_i16x8(va, vb);
                I32x4::from_array([
                    i32x4_extract_lane::<0>(vr),
                    i32x4_extract_lane::<1>(vr),
                    i32x4_extract_lane::<2>(vr),
                    i32x4_extract_lane::<3>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.pmaddwd(a, b)
        }
    }

    // ------------------------------------------------------------------
    // Shift
    // ------------------------------------------------------------------

    #[inline]
    fn shr_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            unsafe {
                let vv = i16x8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
                let s = shift.min(15);
                let vr = i16x8_shr(vv, s);
                I16x8::from_array([
                    i16x8_extract_lane::<0>(vr),
                    i16x8_extract_lane::<1>(vr),
                    i16x8_extract_lane::<2>(vr),
                    i16x8_extract_lane::<3>(vr),
                    i16x8_extract_lane::<4>(vr),
                    i16x8_extract_lane::<5>(vr),
                    i16x8_extract_lane::<6>(vr),
                    i16x8_extract_lane::<7>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.shr_i16x8(v, shift)
        }
    }

    #[inline]
    fn shl_i16x8(&self, v: I16x8, shift: u32) -> I16x8 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            unsafe {
                let vv = i16x8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
                let s = shift.min(15);
                let vr = i16x8_shl(vv, s);
                I16x8::from_array([
                    i16x8_extract_lane::<0>(vr),
                    i16x8_extract_lane::<1>(vr),
                    i16x8_extract_lane::<2>(vr),
                    i16x8_extract_lane::<3>(vr),
                    i16x8_extract_lane::<4>(vr),
                    i16x8_extract_lane::<5>(vr),
                    i16x8_extract_lane::<6>(vr),
                    i16x8_extract_lane::<7>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.shl_i16x8(v, shift)
        }
    }

    #[inline]
    fn shr_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            unsafe {
                let vv = i32x4(v[0], v[1], v[2], v[3]);
                let s = shift.min(31);
                let vr = i32x4_shr(vv, s);
                I32x4::from_array([
                    i32x4_extract_lane::<0>(vr),
                    i32x4_extract_lane::<1>(vr),
                    i32x4_extract_lane::<2>(vr),
                    i32x4_extract_lane::<3>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.shr_i32x4(v, shift)
        }
    }

    #[inline]
    fn shl_i32x4(&self, v: I32x4, shift: u32) -> I32x4 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            unsafe {
                let vv = i32x4(v[0], v[1], v[2], v[3]);
                let s = shift.min(31);
                let vr = i32x4_shl(vv, s);
                I32x4::from_array([
                    i32x4_extract_lane::<0>(vr),
                    i32x4_extract_lane::<1>(vr),
                    i32x4_extract_lane::<2>(vr),
                    i32x4_extract_lane::<3>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.shl_i32x4(v, shift)
        }
    }

    // ------------------------------------------------------------------
    // Averaging
    // ------------------------------------------------------------------

    #[inline]
    fn avg_u8x16(&self, a: U8x16, b: U8x16) -> U8x16 {
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        {
            // SAFETY: simd128 target feature confirmed by cfg guard.
            // u8x16_avgr computes (a+b+1)>>1 per lane (rounding average).
            unsafe {
                let va = u8x16(
                    a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8], a[9], a[10], a[11],
                    a[12], a[13], a[14], a[15],
                );
                let vb = u8x16(
                    b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7], b[8], b[9], b[10], b[11],
                    b[12], b[13], b[14], b[15],
                );
                let vr = u8x16_avgr(va, vb);
                U8x16::from_array([
                    u8x16_extract_lane::<0>(vr),
                    u8x16_extract_lane::<1>(vr),
                    u8x16_extract_lane::<2>(vr),
                    u8x16_extract_lane::<3>(vr),
                    u8x16_extract_lane::<4>(vr),
                    u8x16_extract_lane::<5>(vr),
                    u8x16_extract_lane::<6>(vr),
                    u8x16_extract_lane::<7>(vr),
                    u8x16_extract_lane::<8>(vr),
                    u8x16_extract_lane::<9>(vr),
                    u8x16_extract_lane::<10>(vr),
                    u8x16_extract_lane::<11>(vr),
                    u8x16_extract_lane::<12>(vr),
                    u8x16_extract_lane::<13>(vr),
                    u8x16_extract_lane::<14>(vr),
                    u8x16_extract_lane::<15>(vr),
                ])
            }
        }
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        {
            self.fallback.avg_u8x16(a, b)
        }
    }
}

// ============================================================================
// SimdOpsExt implementation
// ============================================================================

impl SimdOpsExt for WasmSimd {
    #[inline]
    fn load4_u8_to_i16x8(&self, src: &[u8]) -> I16x8 {
        self.fallback.load4_u8_to_i16x8(src)
    }

    #[inline]
    fn load8_u8_to_i16x8(&self, src: &[u8]) -> I16x8 {
        self.fallback.load8_u8_to_i16x8(src)
    }

    #[inline]
    fn store4_i16x8_as_u8(&self, v: I16x8, dst: &mut [u8]) {
        self.fallback.store4_i16x8_as_u8(v, dst);
    }

    #[inline]
    fn store8_i16x8_as_u8(&self, v: I16x8, dst: &mut [u8]) {
        self.fallback.store8_i16x8_as_u8(v, dst);
    }

    #[inline]
    fn transpose_4x4_i16(&self, rows: &[I16x8; 4]) -> [I16x8; 4] {
        self.fallback.transpose_4x4_i16(rows)
    }

    #[inline]
    fn transpose_8x8_i16(&self, rows: &[I16x8; 8]) -> [I16x8; 8] {
        self.fallback.transpose_8x8_i16(rows)
    }

    #[inline]
    fn butterfly_i16x8(&self, a: I16x8, b: I16x8) -> (I16x8, I16x8) {
        self.fallback.butterfly_i16x8(a, b)
    }

    #[inline]
    fn butterfly_i32x4(&self, a: I32x4, b: I32x4) -> (I32x4, I32x4) {
        self.fallback.butterfly_i32x4(a, b)
    }
}

// ============================================================================
// YUV pixel format conversion helpers (WASM-optimised paths)
// ============================================================================

/// Converts a 4×4 YCbCr 4:2:0 block to RGBA using WASM-friendly loop unrolling.
///
/// `y_block` — 16 luma samples (row-major, 4 samples per row).
/// `cb_block` — 4 chroma-blue samples (one per 2×2 luma quad).
/// `cr_block` — 4 chroma-red samples (one per 2×2 luma quad).
///
/// Returns a 16×4 = 64-byte RGBA flat buffer.
///
/// # Errors
///
/// Returns `None` if any input slice is too short.
pub fn yuv420_4x4_to_rgba(
    y_block: &[u8; 16],
    cb_block: &[u8; 4],
    cr_block: &[u8; 4],
) -> [u8; 64] {
    let mut rgba = [0u8; 64];

    for row in 0..4usize {
        let cb_row = row / 2;
        for col in 0..4usize {
            let cb_col = col / 2;
            let uv_idx = cb_row * 2 + cb_col;

            let y = i32::from(y_block[row * 4 + col]);
            let cb = i32::from(cb_block[uv_idx]) - 128;
            let cr = i32::from(cr_block[uv_idx]) - 128;

            // BT.601 full-range coefficients (integer approximation):
            // R = Y + 1.402 * Cr
            // G = Y - 0.344136 * Cb - 0.714136 * Cr
            // B = Y + 1.772 * Cb
            let r = (y + (1_402 * cr) / 1_000).clamp(0, 255) as u8;
            let g = (y - (344_136 * cb + 714_136 * cr) / 1_000_000).clamp(0, 255) as u8;
            let b = (y + (1_772 * cb) / 1_000).clamp(0, 255) as u8;

            let out_off = (row * 4 + col) * 4;
            rgba[out_off] = r;
            rgba[out_off + 1] = g;
            rgba[out_off + 2] = b;
            rgba[out_off + 3] = 255;
        }
    }

    rgba
}

/// Converts a flat RGBA slice to YCbCr 4:2:0 planar for a 4×4 block.
///
/// Returns `(y_block [16], cb_block [4], cr_block [4])`.
///
/// # Errors
///
/// Returns `None` if `rgba.len() < 64`.
pub fn rgba_to_yuv420_4x4(rgba: &[u8]) -> Option<([u8; 16], [u8; 4], [u8; 4])> {
    if rgba.len() < 64 {
        return None;
    }

    let mut y_block = [0u8; 16];
    let mut cb_sum = [0i32; 4];
    let mut cr_sum = [0i32; 4];
    let mut uv_count = [0i32; 4];

    for row in 0..4usize {
        for col in 0..4usize {
            let off = (row * 4 + col) * 4;
            let r = i32::from(rgba[off]);
            let g = i32::from(rgba[off + 1]);
            let b = i32::from(rgba[off + 2]);

            // BT.601 full-range:
            // Y  =  0.299*R + 0.587*G + 0.114*B
            // Cb = -0.168736*R - 0.331264*G + 0.5*B + 128
            // Cr =  0.5*R - 0.418688*G - 0.081312*B + 128
            let y = (299 * r + 587 * g + 114 * b) / 1_000;
            let cb = (-168_736 * r - 331_264 * g + 500_000 * b) / 1_000_000 + 128;
            let cr = (500_000 * r - 418_688 * g - 81_312 * b) / 1_000_000 + 128;

            y_block[row * 4 + col] = y.clamp(0, 255) as u8;

            let uv_idx = (row / 2) * 2 + (col / 2);
            cb_sum[uv_idx] += cb;
            cr_sum[uv_idx] += cr;
            uv_count[uv_idx] += 1;
        }
    }

    let mut cb_block = [0u8; 4];
    let mut cr_block = [0u8; 4];
    for i in 0..4 {
        let count = uv_count[i].max(1);
        cb_block[i] = (cb_sum[i] / count).clamp(0, 255) as u8;
        cr_block[i] = (cr_sum[i] / count).clamp(0, 255) as u8;
    }

    Some((y_block, cb_block, cr_block))
}

// ============================================================================
// Tests
// ============================================================================

#[cfg(test)]
mod tests {
    use super::*;
    use crate::simd::traits::SimdOps;
    use crate::simd::types::{I16x8, I32x4, U8x16};

    fn simd() -> WasmSimd {
        WasmSimd::new()
    }

    // ── Basic construction ───────────────────────────────────────────────────

    #[test]
    fn test_wasm_simd_name_is_non_empty() {
        let s = simd();
        assert!(!s.name().is_empty());
    }

    #[test]
    fn test_wasm_simd_info_detect() {
        let info = WasmSimdInfo::detect();
        // On native / CI targets (not wasm32+simd128) this should be false.
        #[cfg(not(all(target_arch = "wasm32", target_feature = "simd128")))]
        assert!(!info.simd128_available);
        // If we are on WASM+simd128, it should be true.
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        assert!(info.simd128_available);
        let _ = info; // silence unused on some cfgs
    }

    // ── Arithmetic correctness ───────────────────────────────────────────────

    #[test]
    fn test_add_i16x8() {
        let s = simd();
        let a = I16x8::from_array([1, 2, 3, 4, 5, 6, 7, 8]);
        let b = I16x8::from_array([10, 20, 30, 40, 50, 60, 70, 80]);
        let c = s.add_i16x8(a, b);
        for i in 0..8 {
            assert_eq!(c[i], a[i] + b[i]);
        }
    }

    #[test]
    fn test_sub_i16x8() {
        let s = simd();
        let a = I16x8::from_array([100, 90, 80, 70, 60, 50, 40, 30]);
        let b = I16x8::from_array([10, 20, 30, 40, 50, 40, 30, 20]);
        let c = s.sub_i16x8(a, b);
        for i in 0..8 {
            assert_eq!(c[i], a[i] - b[i]);
        }
    }

    #[test]
    fn test_mul_i16x8() {
        let s = simd();
        let a = I16x8::from_array([2, 3, 4, 5, 6, 7, 8, 9]);
        let b = I16x8::from_array([3, 3, 3, 3, 3, 3, 3, 3]);
        let c = s.mul_i16x8(a, b);
        for i in 0..8 {
            assert_eq!(c[i], a[i].wrapping_mul(b[i]));
        }
    }

    #[test]
    fn test_sad_u8x16_identical() {
        let s = simd();
        let a = U8x16::from_array([50u8; 16]);
        let b = U8x16::from_array([50u8; 16]);
        assert_eq!(s.sad_u8x16(a, b), 0);
    }

    #[test]
    fn test_sad_u8x16_non_zero() {
        let s = simd();
        let a = U8x16::from_array([10u8; 16]);
        let b = U8x16::from_array([20u8; 16]);
        assert_eq!(s.sad_u8x16(a, b), 16 * 10);
    }

    #[test]
    fn test_min_max_i16x8() {
        let s = simd();
        let a = I16x8::from_array([1, -1, 3, -3, 5, -5, 7, -7]);
        let b = I16x8::from_array([0, 0, 0, 0, 0, 0, 0, 0]);
        let mn = s.min_i16x8(a, b);
        let mx = s.max_i16x8(a, b);
        for i in 0..8 {
            assert_eq!(mn[i], a[i].min(b[i]));
            assert_eq!(mx[i], a[i].max(b[i]));
        }
    }

    #[test]
    fn test_horizontal_sum_i32x4() {
        let s = simd();
        let v = I32x4::from_array([1, 2, 3, 4]);
        assert_eq!(s.horizontal_sum_i32x4(v), 10);
    }

    #[test]
    fn test_pmaddwd() {
        let s = simd();
        let a = I16x8::from_array([1, 2, 3, 4, 5, 6, 7, 8]);
        let b = I16x8::from_array([1, 1, 1, 1, 1, 1, 1, 1]);
        let r = s.pmaddwd(a, b);
        // pairs: (1+2), (3+4), (5+6), (7+8)
        assert_eq!(r[0], 3);
        assert_eq!(r[1], 7);
        assert_eq!(r[2], 11);
        assert_eq!(r[3], 15);
    }

    // ── YUV helper functions ─────────────────────────────────────────────────

    #[test]
    fn test_yuv420_4x4_to_rgba_round_trip_approx() {
        // Pure white luma block (Y=235 full-range) should produce near-white RGBA.
        let y_block = [235u8; 16];
        let cb_block = [128u8; 4]; // neutral chroma
        let cr_block = [128u8; 4];
        let rgba = yuv420_4x4_to_rgba(&y_block, &cb_block, &cr_block);
        // Every pixel should be nearly white (≥ 200 for R,G,B)
        for i in 0..16 {
            let off = i * 4;
            assert!(rgba[off] > 200, "R too low at pixel {i}");
            assert!(rgba[off + 1] > 200, "G too low at pixel {i}");
            assert!(rgba[off + 2] > 200, "B too low at pixel {i}");
            assert_eq!(rgba[off + 3], 255, "alpha should be 255");
        }
    }

    #[test]
    fn test_rgba_to_yuv420_4x4_none_on_short_slice() {
        assert!(rgba_to_yuv420_4x4(&[0u8; 63]).is_none());
    }

    #[test]
    fn test_rgba_to_yuv420_4x4_neutral_grey() {
        // 50% grey: R=G=B=128 → near-neutral chroma (cb,cr ≈ 128).
        let rgba = vec![128u8, 128, 128, 255].repeat(16);
        let (y_block, cb_block, cr_block) = rgba_to_yuv420_4x4(&rgba).unwrap();
        for &y in &y_block {
            // BT.601 Y for (128,128,128) ≈ 128
            assert!((y as i32 - 128).abs() < 5, "Y should be ≈128, got {y}");
        }
        for &cb in &cb_block {
            assert!((cb as i32 - 128).abs() < 5, "Cb should be ≈128, got {cb}");
        }
        for &cr in &cr_block {
            assert!((cr as i32 - 128).abs() < 5, "Cr should be ≈128, got {cr}");
        }
    }
}