colconv 0.1.0 - Docs.rs

//! Shared helpers for the x86_64 SIMD backends.
//!
//! Items here use SSE2 + SSSE3 + SSE4.1 intrinsics (e.g. `_mm_blendv_ps`,
//! `_mm_packus_epi32`), so they're safe to call from any x86 backend at
//! SSE4.1 or above (currently SSE4.1, AVX2, and AVX‑512).
//! `#[inline(always)]` guarantees they inline into the caller,
//! inheriting its `#[target_feature]` context.

use core::arch::x86_64::*;

/// Per-u32-lane byte-swap shuffle mask, shared by the BE-aware 10-bit
/// packed-RGB loaders below. SSSE3's `_mm_shuffle_epi8` selects byte
/// `mask[i]` of the input for each output byte, so this mask reverses
/// the four bytes inside every 32-bit lane.
#[cfg(feature = "rgb")]
const BSWAP_U32X4_MASK: __m128i =
  unsafe { core::mem::transmute([3u8, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12]) };

/// Endian-aware 16-byte load (4 x u32) for X2RGB10 / X2BGR10 input.
///
/// On every modern x86 host (LE), `BE = false` is a plain
/// `_mm_loadu_si128`, and `BE = true` adds a single `_mm_shuffle_epi8`
/// per 16-byte chunk to byte-swap each u32 lane into host-native order
/// for the subsequent shift-and-mask field extraction. The const-bool
/// monomorphization eliminates the branch at compile time.
///
/// # Safety
///
/// - `ptr` must point to at least 16 readable bytes; alignment is
///   unrestricted (this is `_mm_loadu_si128`, not `_mm_load_si128`).
/// - **SSE2** must be available in the caller's `target_feature`
///   context for `_mm_loadu_si128`.
/// - **SSSE3** must be available in the caller's `target_feature`
///   context when `BE = true` for `_mm_shuffle_epi8`. SSE4.1 is the
///   typical caller floor here; in Rust's target-feature model
///   `sse4.1` enables `ssse3` (LLVM's SSE feature dependency chain),
///   so callers gated `#[target_feature(enable = "sse4.1")]` (or any
///   superset such as `avx2` / `avx512bw`) satisfy this requirement
///   without an extra `enable = "ssse3"`. The `BE = false` branch
///   issues no SSSE3 instruction, so SSE2 alone is sufficient there;
///   the merged const-generic body is still gated by the strictest of
///   the two paths, the SSSE3 requirement.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn x2_load_endian_u32x4<const BE: bool>(ptr: *const u8) -> __m128i {
  unsafe {
    let v = _mm_loadu_si128(ptr.cast());
    if BE {
      _mm_shuffle_epi8(v, BSWAP_U32X4_MASK)
    } else {
      v
    }
  }
}

/// Writes 16 pixels of packed RGB (48 bytes) from three u8x16 channel
/// vectors.
///
/// Three output blocks of 16 bytes each interleave R, G, B triples.
/// Each channel contributes specific bytes to each block; the shuffle
/// masks below assign those bytes (with `-1` = 0x80 = "zero the lane,
/// to be OR'd in by another channel's contribution").
///
/// Conceptually, block 0 (bytes 0..16) takes:
/// `R0, G0, B0, R1, G1, B1, R2, G2, B2, R3, G3, B3, R4, G4, B4, R5`.
/// Block 1 (bytes 16..32):
/// `G5, B5, R6, G6, B6, R7, G7, B7, R8, G8, B8, R9, G9, B9, R10, G10`.
/// Block 2 (bytes 32..48):
/// `B10, R11, G11, B11, ..., R15, G15, B15`.
///
/// Each of the three 16‑byte stores is the OR of three shuffles of
/// the R, G, B inputs. This is the well‑known SSSE3 3‑way interleave
/// pattern from libyuv / OpenCV.
///
/// # Safety
///
/// - `ptr` must point to at least 48 writable, properly aligned (or
///   unaligned‑tolerated via the `storeu` variant) bytes.
/// - The calling function must have SSSE3 available (either through
///   `#[target_feature(enable = "ssse3")]` / a superset feature like
///   `"sse4.1"` or `"avx2"`, or via the target's default feature set).
#[cfg(any(
  feature = "yuv-444-packed",
  feature = "rgb-legacy",
  feature = "mono",
  feature = "rgb",
  feature = "yuv-packed",
  feature = "gbr",
  feature = "yuv-semi-planar",
  feature = "yuv-planar",
  feature = "y2xx",
  feature = "xyz",
))]
#[inline(always)]
pub(super) unsafe fn write_rgb_16(r: __m128i, g: __m128i, b: __m128i, ptr: *mut u8) {
  unsafe {
    // Shuffle masks for block 0 (first 16 output bytes).
    //   dst byte i gets source byte mask[i] from the corresponding
    //   input channel (R for r_mask, G for g_mask, B for b_mask).
    //   0x80 (`-1` as i8) zeroes that output lane.
    let r0 = _mm_setr_epi8(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5);
    let g0 = _mm_setr_epi8(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1);
    let b0 = _mm_setr_epi8(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1);
    let out0 = _mm_or_si128(
      _mm_or_si128(_mm_shuffle_epi8(r, r0), _mm_shuffle_epi8(g, g0)),
      _mm_shuffle_epi8(b, b0),
    );

    // Block 1 (bytes 16..32).
    let r1 = _mm_setr_epi8(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1);
    let g1 = _mm_setr_epi8(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10);
    let b1 = _mm_setr_epi8(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1);
    let out1 = _mm_or_si128(
      _mm_or_si128(_mm_shuffle_epi8(r, r1), _mm_shuffle_epi8(g, g1)),
      _mm_shuffle_epi8(b, b1),
    );

    // Block 2 (bytes 32..48).
    let r2 = _mm_setr_epi8(
      -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1,
    );
    let g2 = _mm_setr_epi8(
      -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1,
    );
    let b2 = _mm_setr_epi8(
      10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15,
    );
    let out2 = _mm_or_si128(
      _mm_or_si128(_mm_shuffle_epi8(r, r2), _mm_shuffle_epi8(g, g2)),
      _mm_shuffle_epi8(b, b2),
    );

    _mm_storeu_si128(ptr.cast(), out0);
    _mm_storeu_si128(ptr.add(16).cast(), out1);
    _mm_storeu_si128(ptr.add(32).cast(), out2);
  }
}

/// Writes 16 pixels of packed RGBA (64 bytes) from four u8x16 channel
/// vectors. Mirrors [`write_rgb_16`] for the 4-channel output path.
///
/// The 4-byte stride aligns cleanly with the 16-byte register width:
/// each output block holds exactly 4 RGBA quads (16 bytes), with R,
/// G, B, A interleaved at positions `(0, 1, 2, 3)`, `(4, 5, 6, 7)`,
/// etc. The shuffle masks are simpler than the 3-channel pattern
/// because a single source byte goes to a single output byte (no
/// channel "split across blocks" boundary).
///
/// Conceptually:
/// - Block 0 (bytes 0..16): R0,G0,B0,A0, R1,G1,B1,A1, R2,G2,B2,A2,
///   R3,G3,B3,A3
/// - Block 1 (bytes 16..32): pixels 4..7
/// - Block 2 (bytes 32..48): pixels 8..11
/// - Block 3 (bytes 48..64): pixels 12..15
///
/// Each block is the OR of four `_mm_shuffle_epi8` gathers — one per
/// channel — with `0x80` (`-1` as i8) zeroing lanes that another
/// channel's shuffle will fill.
///
/// # Safety
///
/// - `ptr` must point to at least 64 writable bytes.
/// - The calling function must have SSSE3 available (via
///   `#[target_feature(enable = "ssse3")]` or a superset).
#[cfg(any(
  feature = "yuv-444-packed",
  feature = "rgb-legacy",
  feature = "mono",
  feature = "rgb",
  feature = "yuv-packed",
  feature = "gbr",
  feature = "yuv-semi-planar",
  feature = "yuv-planar",
  feature = "y2xx",
  feature = "xyz",
))]
#[inline(always)]
pub(super) unsafe fn write_rgba_16(r: __m128i, g: __m128i, b: __m128i, a: __m128i, ptr: *mut u8) {
  unsafe {
    // Block 0 (bytes 0..16): pixels 0..3, source bytes 0..3 from
    // each channel placed at output positions
    // (0, 1, 2, 3) for pixel 0, (4, 5, 6, 7) for pixel 1, etc.
    let r0 = _mm_setr_epi8(0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1);
    let g0 = _mm_setr_epi8(-1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1);
    let b0 = _mm_setr_epi8(-1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1);
    let a0 = _mm_setr_epi8(-1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3);
    let out0 = _mm_or_si128(
      _mm_or_si128(_mm_shuffle_epi8(r, r0), _mm_shuffle_epi8(g, g0)),
      _mm_or_si128(_mm_shuffle_epi8(b, b0), _mm_shuffle_epi8(a, a0)),
    );

    // Block 1 (bytes 16..32): pixels 4..7, source bytes 4..7.
    let r1 = _mm_setr_epi8(4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1);
    let g1 = _mm_setr_epi8(-1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7, -1, -1);
    let b1 = _mm_setr_epi8(-1, -1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7, -1);
    let a1 = _mm_setr_epi8(-1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7);
    let out1 = _mm_or_si128(
      _mm_or_si128(_mm_shuffle_epi8(r, r1), _mm_shuffle_epi8(g, g1)),
      _mm_or_si128(_mm_shuffle_epi8(b, b1), _mm_shuffle_epi8(a, a1)),
    );

    // Block 2 (bytes 32..48): pixels 8..11, source bytes 8..11.
    let r2 = _mm_setr_epi8(8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11, -1, -1, -1);
    let g2 = _mm_setr_epi8(-1, 8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11, -1, -1);
    let b2 = _mm_setr_epi8(-1, -1, 8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11, -1);
    let a2 = _mm_setr_epi8(-1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11);
    let out2 = _mm_or_si128(
      _mm_or_si128(_mm_shuffle_epi8(r, r2), _mm_shuffle_epi8(g, g2)),
      _mm_or_si128(_mm_shuffle_epi8(b, b2), _mm_shuffle_epi8(a, a2)),
    );

    // Block 3 (bytes 48..64): pixels 12..15, source bytes 12..15.
    let r3 = _mm_setr_epi8(
      12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1, -1,
    );
    let g3 = _mm_setr_epi8(
      -1, 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1,
    );
    let b3 = _mm_setr_epi8(
      -1, -1, 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1,
    );
    let a3 = _mm_setr_epi8(
      -1, -1, -1, 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15,
    );
    let out3 = _mm_or_si128(
      _mm_or_si128(_mm_shuffle_epi8(r, r3), _mm_shuffle_epi8(g, g3)),
      _mm_or_si128(_mm_shuffle_epi8(b, b3), _mm_shuffle_epi8(a, a3)),
    );

    _mm_storeu_si128(ptr.cast(), out0);
    _mm_storeu_si128(ptr.add(16).cast(), out1);
    _mm_storeu_si128(ptr.add(32).cast(), out2);
    _mm_storeu_si128(ptr.add(48).cast(), out3);
  }
}

/// Writes 8 pixels of packed `u8` RGB (24 bytes) from three `__m128i`
/// channel vectors whose **low 8 bytes** carry the 8 valid samples
/// (high 8 bytes are don't-care, e.g. zero from a `_mm_packus_epi16`
/// against a zero register).
///
/// Two output blocks: block 0 stores 16 bytes
/// `[R0,G0,B0, R1,G1,B1, R2,G2,B2, R3,G3,B3, R4,G4,B4, R5]` via
/// `storeu_si128`, block 1 stores the remaining 8 bytes
/// `[G5,B5, R6,G6,B6, R7,G7,B7]` via `storel_epi64`. Each block is the
/// OR of three `_mm_shuffle_epi8` gathers (one per channel), reusing
/// the same byte-mask shape as the 16-pixel
/// [`write_rgb_16`](super::x86_common::write_rgb_16) helper but only
/// emitting blocks 0 and 1's first half (which together touch source
/// bytes 0..7 of each channel — exactly the valid range for 8-pixel
/// inputs).
///
/// This avoids round-tripping each channel through a 16-byte stack
/// array and a per-pixel scalar scatter loop, which would limit SIMD's
/// benefit on the output stage.
///
/// # Safety
///
/// - `ptr` must point to at least 24 writable bytes.
/// - The calling function must have SSSE3 available (via
///   `#[target_feature(enable = "sse4.1")]` or any AVX2 / AVX-512
///   superset).
//
// Gated on `std` / `alloc` because the only callers
// (`x86_sse41::xyz12`, `x86_avx2::xyz12`) are gated the same way.
// Without this gate `cargo build --no-default-features` fires a
// dead-code warning that becomes a hard error under `RUSTFLAGS=
// -Dwarnings` on Windows CI.
#[cfg(any(feature = "std", feature = "alloc"))]
#[cfg(feature = "xyz")]
#[inline(always)]
pub(super) unsafe fn write_rgb_u8_8(r: __m128i, g: __m128i, b: __m128i, ptr: *mut u8) {
  unsafe {
    // Block 0 = bytes 0..15 = [R0,G0,B0, R1,G1,B1, R2,G2,B2, R3,G3,B3,
    // R4,G4,B4, R5]. Same masks as `write_rgb_16` block 0.
    let r0 = _mm_setr_epi8(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5);
    let g0 = _mm_setr_epi8(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1);
    let b0 = _mm_setr_epi8(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1);
    let out0 = _mm_or_si128(
      _mm_or_si128(_mm_shuffle_epi8(r, r0), _mm_shuffle_epi8(g, g0)),
      _mm_shuffle_epi8(b, b0),
    );

    // Block 1 first half = bytes 16..23 = [G5,B5, R6,G6,B6, R7,G7,B7].
    // Same masks as `write_rgb_16` block 1 — source bytes 5..7 of each
    // channel are inside the 0..7 valid range for 8-pixel inputs.
    let r1 = _mm_setr_epi8(-1, -1, 6, -1, -1, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    let g1 = _mm_setr_epi8(5, -1, -1, 6, -1, -1, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    let b1 = _mm_setr_epi8(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, -1, -1, -1, -1, -1, -1);
    let out1 = _mm_or_si128(
      _mm_or_si128(_mm_shuffle_epi8(r, r1), _mm_shuffle_epi8(g, g1)),
      _mm_shuffle_epi8(b, b1),
    );

    _mm_storeu_si128(ptr.cast(), out0);
    // `storel_epi64` writes the low 8 bytes of `out1` (= block 1's
    // first half) to `ptr + 16`, leaving any subsequent bytes
    // untouched — important for callers writing into a tightly-bound
    // 24-byte slot.
    _mm_storel_epi64(ptr.add(16).cast(), out1);
  }
}

/// Writes 8 pixels of packed `u8` RGBA (32 bytes) from four `__m128i`
/// channel vectors whose **low 8 bytes** carry the 8 valid samples.
///
/// Pure unpack ladder — no shuffle masks needed because the 4-byte
/// RGBA stride aligns with `__m128i` lanes:
/// 1. `rg = unpacklo_epi8(R, G)` interleaves bytes:
///    `[R0,G0, R1,G1, ..., R7,G7]` (16 bytes, 8 R/G pairs).
/// 2. `ba = unpacklo_epi8(B, A)` likewise: `[B0,A0, ..., B7,A7]`.
/// 3. `rgba_lo = unpacklo_epi16(rg, ba)` zips into RGBA quads:
///    `[R0,G0,B0,A0, R1,G1,B1,A1, R2,G2,B2,A2, R3,G3,B3,A3]`.
/// 4. `rgba_hi = unpackhi_epi16(rg, ba)` produces pixels 4..7.
///
/// Two `_mm_storeu_si128` writes emit the 32-byte output. This avoids
/// a 4-channel scalar scatter that would limit SIMD's benefit on the
/// output stage.
///
/// # Safety
///
/// - `ptr` must point to at least 32 writable bytes.
/// - The calling function must have SSE2 available (any SSE4.1+
///   superset satisfies this).
//
// Same `std` / `alloc` gate as [`write_rgb_u8_8`] — only the xyz12
// SIMD kernels (gated on those features) consume this helper.
#[cfg(any(feature = "std", feature = "alloc"))]
#[cfg(feature = "xyz")]
#[inline(always)]
pub(super) unsafe fn write_rgba_u8_8(r: __m128i, g: __m128i, b: __m128i, a: __m128i, ptr: *mut u8) {
  unsafe {
    let rg = _mm_unpacklo_epi8(r, g);
    let ba = _mm_unpacklo_epi8(b, a);
    let rgba_lo = _mm_unpacklo_epi16(rg, ba);
    let rgba_hi = _mm_unpackhi_epi16(rg, ba);
    _mm_storeu_si128(ptr.cast(), rgba_lo);
    _mm_storeu_si128(ptr.add(16).cast(), rgba_hi);
  }
}

/// Writes 8 pixels of packed **`u16`** RGB (48 bytes = 24 `u16`)
/// from three `u16x8` channel vectors. Drives the SSE4.1 / AVX2 /
/// AVX‑512 high‑bit‑depth kernels' u16 output path.
///
/// Three output blocks of 16 bytes (8 `u16`) each hold:
/// - Block 0: `R0, G0, B0, R1, G1, B1, R2, G2` (u16 indices 0..7)
/// - Block 1: `B2, R3, G3, B3, R4, G4, B4, R5`
/// - Block 2: `G5, B5, R6, G6, B6, R7, G7, B7`
///
/// Each block is the OR of three `_mm_shuffle_epi8` gathers — one
/// from each of R, G, B — with the byte mask picking a pair of
/// adjacent bytes (lo, hi) for every `u16` sourced from that
/// channel. 0x80 (`-1` as i8) zeros the lane, to be OR'd in by
/// another channel's contribution.
///
/// # Safety
///
/// - `ptr` must point to at least 48 writable bytes (aligned or
///   unaligned — we use `storeu`).
/// - The calling function must have SSSE3 available (via SSE4.1 or
///   a superset like AVX2 / AVX‑512BW).
#[cfg(any(
  feature = "yuv-444-packed",
  feature = "rgb-legacy",
  feature = "mono",
  feature = "rgb",
  feature = "gbr",
  feature = "yuv-planar",
  feature = "yuv-semi-planar",
  feature = "y2xx",
))]
#[inline(always)]
pub(super) unsafe fn write_rgb_u16_8(r: __m128i, g: __m128i, b: __m128i, ptr: *mut u16) {
  unsafe {
    // Block 0 = [R0 G0 B0 R1 G1 B1 R2 G2] — 8 `u16` = 16 bytes.
    // R contributes pairs (0,1), (6,7), (12,13); G pairs (2,3), (8,9),
    // (14,15); B pairs (4,5), (10,11).
    let r0 = _mm_setr_epi8(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1);
    let g0 = _mm_setr_epi8(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5);
    let b0 = _mm_setr_epi8(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1);
    let out0 = _mm_or_si128(
      _mm_or_si128(_mm_shuffle_epi8(r, r0), _mm_shuffle_epi8(g, g0)),
      _mm_shuffle_epi8(b, b0),
    );

    // Block 1 = [B2 R3 G3 B3 R4 G4 B4 R5]. R pairs (6,7), (8,9),
    // (10,11); G pairs (6,7), (8,9); B pairs (4,5), (6,7), (8,9).
    let r1 = _mm_setr_epi8(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11);
    let g1 = _mm_setr_epi8(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1);
    let b1 = _mm_setr_epi8(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1);
    let out1 = _mm_or_si128(
      _mm_or_si128(_mm_shuffle_epi8(r, r1), _mm_shuffle_epi8(g, g1)),
      _mm_shuffle_epi8(b, b1),
    );

    // Block 2 = [G5 B5 R6 G6 B6 R7 G7 B7]. R pairs (12,13), (14,15);
    // G pairs (10,11), (12,13), (14,15); B pairs (10,11), (12,13),
    // (14,15).
    let r2 = _mm_setr_epi8(
      -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1,
    );
    let g2 = _mm_setr_epi8(
      10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1,
    );
    let b2 = _mm_setr_epi8(
      -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15,
    );
    let out2 = _mm_or_si128(
      _mm_or_si128(_mm_shuffle_epi8(r, r2), _mm_shuffle_epi8(g, g2)),
      _mm_shuffle_epi8(b, b2),
    );

    _mm_storeu_si128(ptr.cast(), out0);
    _mm_storeu_si128(ptr.add(8).cast(), out1);
    _mm_storeu_si128(ptr.add(16).cast(), out2);
  }
}

/// Interleaves 8 R/G/B/A `u16` samples into packed RGBA quads (32
/// `u16` = 64 bytes). Two 16-bit unpack stages followed by two 32-bit
/// unpack stages produce four 16-byte chunks of `[R, G, B, A]` quads,
/// stored back-to-back via `_mm_storeu_si128`.
///
/// # Safety
///
/// - `ptr` must point to at least 64 writable bytes (aligned or
///   unaligned — we use `storeu`).
/// - The calling function must have SSE2 available (the unpack +
///   `storeu_si128` intrinsics; SSE4.1 / AVX2 / AVX-512 supersets all
///   satisfy this).
#[cfg(any(
  feature = "yuv-444-packed",
  feature = "rgb-legacy",
  feature = "mono",
  feature = "rgb",
  feature = "gbr",
  feature = "yuv-planar",
  feature = "yuv-semi-planar",
  feature = "y2xx",
))]
#[inline(always)]
pub(super) unsafe fn write_rgba_u16_8(
  r: __m128i,
  g: __m128i,
  b: __m128i,
  a: __m128i,
  ptr: *mut u16,
) {
  unsafe {
    let rg_lo = _mm_unpacklo_epi16(r, g);
    let rg_hi = _mm_unpackhi_epi16(r, g);
    let ba_lo = _mm_unpacklo_epi16(b, a);
    let ba_hi = _mm_unpackhi_epi16(b, a);
    let q0 = _mm_unpacklo_epi32(rg_lo, ba_lo);
    let q1 = _mm_unpackhi_epi32(rg_lo, ba_lo);
    let q2 = _mm_unpacklo_epi32(rg_hi, ba_hi);
    let q3 = _mm_unpackhi_epi32(rg_hi, ba_hi);
    _mm_storeu_si128(ptr.cast(), q0);
    _mm_storeu_si128(ptr.add(8).cast(), q1);
    _mm_storeu_si128(ptr.add(16).cast(), q2);
    _mm_storeu_si128(ptr.add(24).cast(), q3);
  }
}

/// Swaps the outer two channels of 16 packed 3‑byte pixels (48 bytes
/// in, 48 bytes out). Drives both BGR→RGB and RGB→BGR conversions
/// since the transformation is self‑inverse.
///
/// Uses the SSSE3 `_mm_shuffle_epi8` 3‑way gather pattern: each 16‑byte
/// output chunk is built from shuffles of the three adjacent input
/// chunks, combined with `_mm_or_si128`. 7 shuffles + 4 ORs per 16
/// pixels. Mask values verified byte‑by‑byte against the scalar
/// reference (see the equivalence tests in `neon`/x86 backends).
///
/// # Safety
///
/// - `input_ptr` must point to at least 48 readable bytes.
/// - `output_ptr` must point to at least 48 writable bytes.
/// - `input_ptr` / `output_ptr` ranges must not alias.
/// - The calling function must have SSSE3 available (either through
///   `#[target_feature(enable = "ssse3")]` / a superset feature like
///   `"sse4.1"` / `"avx2"` / `"avx512bw"`, or the target's defaults).
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn swap_rb_16_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
  unsafe {
    let in0 = _mm_loadu_si128(input_ptr.cast());
    let in1 = _mm_loadu_si128(input_ptr.add(16).cast());
    let in2 = _mm_loadu_si128(input_ptr.add(32).cast());

    // Output chunk 0 (abs bytes 0..16): 15 bytes from chunk 0, byte 15
    // (= R5) pulled from chunk 1 local position 1.
    let m00 = _mm_setr_epi8(2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, -1);
    let m01 = _mm_setr_epi8(
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1,
    );
    let out0 = _mm_or_si128(_mm_shuffle_epi8(in0, m00), _mm_shuffle_epi8(in1, m01));

    // Output chunk 1 (abs bytes 16..32): most from chunk 1, byte 17
    // (= B5) from chunk 0, byte 30 (= R10) from chunk 2.
    let m10 = _mm_setr_epi8(
      -1, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    );
    let m11 = _mm_setr_epi8(0, -1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, -1, 15);
    let m12 = _mm_setr_epi8(
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1,
    );
    let out1 = _mm_or_si128(
      _mm_or_si128(_mm_shuffle_epi8(in0, m10), _mm_shuffle_epi8(in1, m11)),
      _mm_shuffle_epi8(in2, m12),
    );

    // Output chunk 2 (abs bytes 32..48): 15 bytes from chunk 2, byte
    // 32 (= B10) pulled from chunk 1 local position 14.
    let m20 = _mm_setr_epi8(
      14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    );
    let m21 = _mm_setr_epi8(-1, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13);
    let out2 = _mm_or_si128(_mm_shuffle_epi8(in1, m20), _mm_shuffle_epi8(in2, m21));

    _mm_storeu_si128(output_ptr.cast(), out0);
    _mm_storeu_si128(output_ptr.add(16).cast(), out1);
    _mm_storeu_si128(output_ptr.add(32).cast(), out2);
  }
}

/// Swaps R↔B in 4 packed BGRA pixels (16 bytes), preserving alpha.
/// One `_mm_shuffle_epi8` per 16-byte vector — within each 4-byte
/// pixel, byte 0 ↔ byte 2 (alpha at byte 3 unchanged). Self-inverse,
/// so the same helper can be used for `RGBA→BGRA` (not currently
/// wired, but the semantics are identical).
///
/// # Safety
///
/// - `input_ptr` must point to at least 16 readable bytes.
/// - `output_ptr` must point to at least 16 writable bytes.
/// - `input_ptr` / `output_ptr` ranges must not alias.
/// - The calling function must have SSSE3 available (via
///   `#[target_feature(enable = "ssse3")]` or a superset feature
///   like `"sse4.1"` / `"avx2"` / `"avx512bw"`, or the target's
///   defaults).
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn swap_rb_alpha_4_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
  unsafe {
    let mask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
    let in_v = _mm_loadu_si128(input_ptr.cast());
    let out_v = _mm_shuffle_epi8(in_v, mask);
    _mm_storeu_si128(output_ptr.cast(), out_v);
  }
}

/// Drops the alpha byte from 16 packed RGBA pixels (64 input bytes,
/// 48 output bytes). Four input vectors of 4 pixels each compact into
/// three output vectors. Each output block ORs two shuffles drawing
/// from neighbouring input vectors — the same "lane straddle" pattern
/// as [`swap_rb_16_pixels`] for the 3-channel target.
///
/// # Safety
///
/// - `input_ptr` must point to at least 64 readable bytes.
/// - `output_ptr` must point to at least 48 writable bytes.
/// - `input_ptr` / `output_ptr` ranges must not alias.
/// - SSSE3 must be available in the caller's `target_feature` context.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn drop_alpha_16_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
  unsafe {
    let in0 = _mm_loadu_si128(input_ptr.cast());
    let in1 = _mm_loadu_si128(input_ptr.add(16).cast());
    let in2 = _mm_loadu_si128(input_ptr.add(32).cast());
    let in3 = _mm_loadu_si128(input_ptr.add(48).cast());

    // out0 (bytes 0..16): pixels 0..3 from in0, pixel 4 + R5 from in1.
    let m00 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1);
    let m01 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 4);
    let out0 = _mm_or_si128(_mm_shuffle_epi8(in0, m00), _mm_shuffle_epi8(in1, m01));

    // out1 (bytes 16..32): G5 B5 R6..R9 G9 B9 R10 G10 from in1+in2.
    let m11 = _mm_setr_epi8(5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1);
    let m12 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 4, 5, 6, 8, 9);
    let out1 = _mm_or_si128(_mm_shuffle_epi8(in1, m11), _mm_shuffle_epi8(in2, m12));

    // out2 (bytes 32..48): B10 + pixel 11 from in2, pixels 12..15 from in3.
    let m22 = _mm_setr_epi8(
      10, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    );
    let m23 = _mm_setr_epi8(-1, -1, -1, -1, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14);
    let out2 = _mm_or_si128(_mm_shuffle_epi8(in2, m22), _mm_shuffle_epi8(in3, m23));

    _mm_storeu_si128(output_ptr.cast(), out0);
    _mm_storeu_si128(output_ptr.add(16).cast(), out1);
    _mm_storeu_si128(output_ptr.add(32).cast(), out2);
  }
}

/// Drops the leading alpha byte from 16 packed ARGB pixels (64
/// input bytes → 48 output bytes). Same compaction shape as
/// [`drop_alpha_16_pixels`], but each pixel triple is read at
/// offsets `(+1, +2, +3)` (R, G, B) since alpha is at the **leading**
/// position. Used by the Ship 9c [`Argb`](crate::source::Argb)
/// `with_rgb` / `with_luma` / `with_hsv` paths.
///
/// # Safety
///
/// - `input_ptr` must point to at least 64 readable bytes.
/// - `output_ptr` must point to at least 48 writable bytes.
/// - `input_ptr` / `output_ptr` ranges must not alias.
/// - SSSE3 must be available in the caller's `target_feature` context.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn argb_to_rgb_16_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
  unsafe {
    let in0 = _mm_loadu_si128(input_ptr.cast());
    let in1 = _mm_loadu_si128(input_ptr.add(16).cast());
    let in2 = _mm_loadu_si128(input_ptr.add(32).cast());
    let in3 = _mm_loadu_si128(input_ptr.add(48).cast());

    let m00 = _mm_setr_epi8(1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, -1, -1, -1, -1);
    let m01 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 2, 3, 5);
    let out0 = _mm_or_si128(_mm_shuffle_epi8(in0, m00), _mm_shuffle_epi8(in1, m01));

    let m11 = _mm_setr_epi8(6, 7, 9, 10, 11, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1);
    let m12 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 1, 2, 3, 5, 6, 7, 9, 10);
    let out1 = _mm_or_si128(_mm_shuffle_epi8(in1, m11), _mm_shuffle_epi8(in2, m12));

    let m22 = _mm_setr_epi8(
      11, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    );
    let m23 = _mm_setr_epi8(-1, -1, -1, -1, 1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15);
    let out2 = _mm_or_si128(_mm_shuffle_epi8(in2, m22), _mm_shuffle_epi8(in3, m23));

    _mm_storeu_si128(output_ptr.cast(), out0);
    _mm_storeu_si128(output_ptr.add(16).cast(), out1);
    _mm_storeu_si128(output_ptr.add(32).cast(), out2);
  }
}

/// Drops leading alpha and reverses the inner three bytes for 16
/// packed ABGR pixels (64 input bytes → 48 output bytes). Each
/// pixel triple is read at offsets `(+3, +2, +1)` (R, G, B from
/// `A, B, G, R` input). Used by the Ship 9c
/// [`Abgr`](crate::source::Abgr) `with_rgb` / `with_luma` / `with_hsv`
/// paths.
///
/// # Safety
///
/// - `input_ptr` must point to at least 64 readable bytes.
/// - `output_ptr` must point to at least 48 writable bytes.
/// - `input_ptr` / `output_ptr` ranges must not alias.
/// - SSSE3 must be available in the caller's `target_feature` context.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn abgr_to_rgb_16_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
  unsafe {
    let in0 = _mm_loadu_si128(input_ptr.cast());
    let in1 = _mm_loadu_si128(input_ptr.add(16).cast());
    let in2 = _mm_loadu_si128(input_ptr.add(32).cast());
    let in3 = _mm_loadu_si128(input_ptr.add(48).cast());

    let m00 = _mm_setr_epi8(3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, -1, -1, -1, -1);
    let m01 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 2, 1, 7);
    let out0 = _mm_or_si128(_mm_shuffle_epi8(in0, m00), _mm_shuffle_epi8(in1, m01));

    let m11 = _mm_setr_epi8(6, 5, 11, 10, 9, 15, 14, 13, -1, -1, -1, -1, -1, -1, -1, -1);
    let m12 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 3, 2, 1, 7, 6, 5, 11, 10);
    let out1 = _mm_or_si128(_mm_shuffle_epi8(in1, m11), _mm_shuffle_epi8(in2, m12));

    let m22 = _mm_setr_epi8(
      9, 15, 14, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    );
    let m23 = _mm_setr_epi8(-1, -1, -1, -1, 3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13);
    let out2 = _mm_or_si128(_mm_shuffle_epi8(in2, m22), _mm_shuffle_epi8(in3, m23));

    _mm_storeu_si128(output_ptr.cast(), out0);
    _mm_storeu_si128(output_ptr.add(16).cast(), out1);
    _mm_storeu_si128(output_ptr.add(32).cast(), out2);
  }
}

/// Rotates leading alpha to trailing position in 4 packed ARGB
/// pixels (16 bytes). One `_mm_shuffle_epi8` per 16-byte vector —
/// within each 4-byte pixel: `(A, R, G, B) → (R, G, B, A)`.
///
/// # Safety
///
/// - `input_ptr` must point to at least 16 readable bytes.
/// - `output_ptr` must point to at least 16 writable bytes.
/// - `input_ptr` / `output_ptr` ranges must not alias.
/// - SSSE3 must be available in the caller's `target_feature` context.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn argb_to_rgba_4_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
  unsafe {
    let mask = _mm_setr_epi8(1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12);
    let in_v = _mm_loadu_si128(input_ptr.cast());
    let out_v = _mm_shuffle_epi8(in_v, mask);
    _mm_storeu_si128(output_ptr.cast(), out_v);
  }
}

/// Reverses byte order in 4 packed ABGR pixels (16 bytes). One
/// `_mm_shuffle_epi8` per 16-byte vector — within each 4-byte
/// pixel: `(A, B, G, R) → (R, G, B, A)`. Self-inverse.
///
/// # Safety
///
/// - `input_ptr` must point to at least 16 readable bytes.
/// - `output_ptr` must point to at least 16 writable bytes.
/// - `input_ptr` / `output_ptr` ranges must not alias.
/// - SSSE3 must be available in the caller's `target_feature` context.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn abgr_to_rgba_4_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
  unsafe {
    let mask = _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
    let in_v = _mm_loadu_si128(input_ptr.cast());
    let out_v = _mm_shuffle_epi8(in_v, mask);
    _mm_storeu_si128(output_ptr.cast(), out_v);
  }
}

// ===== Padding-byte helpers (Ship 9d) ====================================
//
// Used by the [`Xrgb`](crate::source::Xrgb) / [`Rgbx`](crate::source::Rgbx) /
// [`Xbgr`](crate::source::Xbgr) / [`Bgrx`](crate::source::Bgrx) source-side
// `with_rgba` paths. Each helper takes 4 input pixels (16 bytes), pulls
// R/G/B from the appropriate byte offsets, and forces the alpha lane
// to `0xFF` via OR with a constant. The padding byte's value is
// discarded — the shuffle mask uses `-1` (0x80) on the alpha lane to
// zero it before the OR.

/// Drops the **leading** padding byte from 4 packed XRGB pixels (16
/// bytes), producing 4 packed RGBA pixels with `A = 0xFF`. One
/// `_mm_shuffle_epi8` + one `_mm_or_si128` per 16-byte vector.
///
/// # Safety
///
/// - `input_ptr` must point to at least 16 readable bytes.
/// - `output_ptr` must point to at least 16 writable bytes.
/// - `input_ptr` / `output_ptr` ranges must not alias.
/// - SSSE3 must be available in the caller's `target_feature` context.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn xrgb_to_rgba_4_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
  unsafe {
    // Pull bytes 1,2,3 of each input pixel into the first three output
    // lanes; -1 zeros the alpha lane (filled in by the OR below).
    let mask = _mm_setr_epi8(1, 2, 3, -1, 5, 6, 7, -1, 9, 10, 11, -1, 13, 14, 15, -1);
    // 0xFF in every alpha lane; 0 elsewhere.
    let alpha = _mm_set1_epi32(0xFF00_0000_u32 as i32);
    let in_v = _mm_loadu_si128(input_ptr.cast());
    let shuffled = _mm_shuffle_epi8(in_v, mask);
    let out_v = _mm_or_si128(shuffled, alpha);
    _mm_storeu_si128(output_ptr.cast(), out_v);
  }
}

/// Drops the **trailing** padding byte from 4 packed RGBX pixels (16
/// bytes), producing 4 packed RGBA pixels with `A = 0xFF`. One
/// `_mm_shuffle_epi8` + one `_mm_or_si128` per 16-byte vector.
///
/// # Safety
///
/// - `input_ptr` must point to at least 16 readable bytes.
/// - `output_ptr` must point to at least 16 writable bytes.
/// - `input_ptr` / `output_ptr` ranges must not alias.
/// - SSSE3 must be available in the caller's `target_feature` context.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn rgbx_to_rgba_4_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
  unsafe {
    let mask = _mm_setr_epi8(0, 1, 2, -1, 4, 5, 6, -1, 8, 9, 10, -1, 12, 13, 14, -1);
    let alpha = _mm_set1_epi32(0xFF00_0000_u32 as i32);
    let in_v = _mm_loadu_si128(input_ptr.cast());
    let shuffled = _mm_shuffle_epi8(in_v, mask);
    let out_v = _mm_or_si128(shuffled, alpha);
    _mm_storeu_si128(output_ptr.cast(), out_v);
  }
}

/// Reverses RGB and drops **leading** padding from 4 packed XBGR
/// pixels (16 bytes), producing 4 packed RGBA pixels with `A = 0xFF`.
///
/// # Safety
///
/// - `input_ptr` must point to at least 16 readable bytes.
/// - `output_ptr` must point to at least 16 writable bytes.
/// - `input_ptr` / `output_ptr` ranges must not alias.
/// - SSSE3 must be available in the caller's `target_feature` context.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn xbgr_to_rgba_4_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
  unsafe {
    let mask = _mm_setr_epi8(3, 2, 1, -1, 7, 6, 5, -1, 11, 10, 9, -1, 15, 14, 13, -1);
    let alpha = _mm_set1_epi32(0xFF00_0000_u32 as i32);
    let in_v = _mm_loadu_si128(input_ptr.cast());
    let shuffled = _mm_shuffle_epi8(in_v, mask);
    let out_v = _mm_or_si128(shuffled, alpha);
    _mm_storeu_si128(output_ptr.cast(), out_v);
  }
}

/// Reverses RGB and drops **trailing** padding from 4 packed BGRX
/// pixels (16 bytes), producing 4 packed RGBA pixels with `A = 0xFF`.
///
/// # Safety
///
/// - `input_ptr` must point to at least 16 readable bytes.
/// - `output_ptr` must point to at least 16 writable bytes.
/// - `input_ptr` / `output_ptr` ranges must not alias.
/// - SSSE3 must be available in the caller's `target_feature` context.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn bgrx_to_rgba_4_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
  unsafe {
    let mask = _mm_setr_epi8(2, 1, 0, -1, 6, 5, 4, -1, 10, 9, 8, -1, 14, 13, 12, -1);
    let alpha = _mm_set1_epi32(0xFF00_0000_u32 as i32);
    let in_v = _mm_loadu_si128(input_ptr.cast());
    let shuffled = _mm_shuffle_epi8(in_v, mask);
    let out_v = _mm_or_si128(shuffled, alpha);
    _mm_storeu_si128(output_ptr.cast(), out_v);
  }
}

/// Swaps R↔B and drops alpha for 16 packed BGRA pixels (64 input
/// bytes → 48 output bytes). Same shape as [`drop_alpha_16_pixels`]
/// but each pixel triple is read with the channel order reversed
/// (`B, G, R, A` → `R, G, B`). Within each 4-byte input pixel:
/// R is at offset +2, G at +1, B at +0; A at +3 is discarded.
///
/// # Safety
///
/// - `input_ptr` must point to at least 64 readable bytes.
/// - `output_ptr` must point to at least 48 writable bytes.
/// - `input_ptr` / `output_ptr` ranges must not alias.
/// - SSSE3 must be available in the caller's `target_feature` context.
#[cfg(feature = "rgb")]
#[inline(always)]
pub(super) unsafe fn bgra_to_rgb_16_pixels(input_ptr: *const u8, output_ptr: *mut u8) {
  unsafe {
    let in0 = _mm_loadu_si128(input_ptr.cast());
    let in1 = _mm_loadu_si128(input_ptr.add(16).cast());
    let in2 = _mm_loadu_si128(input_ptr.add(32).cast());
    let in3 = _mm_loadu_si128(input_ptr.add(48).cast());

    let m00 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
    let m01 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 1, 0, 6);
    let out0 = _mm_or_si128(_mm_shuffle_epi8(in0, m00), _mm_shuffle_epi8(in1, m01));

    let m11 = _mm_setr_epi8(5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, -1, -1, -1, -1);
    let m12 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 2, 1, 0, 6, 5, 4, 10, 9);
    let out1 = _mm_or_si128(_mm_shuffle_epi8(in1, m11), _mm_shuffle_epi8(in2, m12));

    let m22 = _mm_setr_epi8(
      8, 14, 13, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    );
    let m23 = _mm_setr_epi8(-1, -1, -1, -1, 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12);
    let out2 = _mm_or_si128(_mm_shuffle_epi8(in2, m22), _mm_shuffle_epi8(in3, m23));

    _mm_storeu_si128(output_ptr.cast(), out0);
    _mm_storeu_si128(output_ptr.add(16).cast(), out1);
    _mm_storeu_si128(output_ptr.add(32).cast(), out2);
  }
}

// ===== 10-bit packed RGB (Ship 9e) =======================================
//
// Each 4-byte input pixel is a `u32` LE word with packing
// `(MSB) 2X | 10c2 | 10c1 | 10c0 (LSB)` — c2/c1/c0 are R/G/B for
// X2RGB10 and B/G/R for X2BGR10. The helpers below extract the three
// 10-bit channels via `_mm_srli_epi32` + `_mm_and_si128`, narrow the
// `u32` lanes to `u8` (or keep `u16`), and call `write_rgb_16` /
// `write_rgba_16` / `write_rgb_u16_8` to interleave into the packed
// output.
//
// Down-shift to `u8` uses an extra `>> 2` on the channel bits — i.e.
// `(pix >> 22) & 0xFF` extracts the top 8 of the 10-bit R10 channel
// at bits 20..29 directly, skipping a separate masking step.

/// Extracts a 10-bit channel as a `u8` (top 8 bits) from each of 4
/// `u32` pixels in a `__m128i`. The returned vector holds the `u8`
/// value in the low byte of each `u32` lane (high 24 bits zero).
///
/// `SHIFT` selects which channel: 22 for the bits at 20..29, 12 for
/// 10..19, 2 for 0..9.
///
/// # Safety
///
/// Caller's `target_feature` must include SSE2 or higher; SSE4.1 and
/// up include this.
#[inline(always)]
#[cfg(feature = "rgb")]
unsafe fn extract_10bit_to_u8_lane<const SHIFT: i32>(pix: __m128i) -> __m128i {
  unsafe {
    let mask_ff = _mm_set1_epi32(0xFF);
    _mm_and_si128(_mm_srli_epi32::<SHIFT>(pix), mask_ff)
  }
}

/// Extracts a 10-bit channel as a `u16` from each of 4 `u32`
/// pixels. Returned vector holds the 10-bit value (range `[0, 1023]`)
/// in the low `u16` half of each `u32` lane.
#[inline(always)]
#[cfg(feature = "rgb")]
unsafe fn extract_10bit_to_u16_lane<const SHIFT: i32>(pix: __m128i) -> __m128i {
  unsafe {
    let mask_3ff = _mm_set1_epi32(0x3FF);
    _mm_and_si128(_mm_srli_epi32::<SHIFT>(pix), mask_3ff)
  }
}

/// Packs 4x `u32x4` channel vectors (16 `u8` values laid out one per
/// `u32` lane) into a single contiguous `u8x16` channel vector.
/// Two-stage saturating narrow: `_mm_packus_epi32` for u32→u16,
/// then `_mm_packus_epi16` for u16→u8.
///
/// Values are bounded to `[0, 255]` upstream, so saturation never
/// triggers.
#[inline(always)]
#[cfg(feature = "rgb")]
unsafe fn pack_u32x4_quad_to_u8x16(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i {
  unsafe {
    let lo = _mm_packus_epi32(v0, v1);
    let hi = _mm_packus_epi32(v2, v3);
    _mm_packus_epi16(lo, hi)
  }
}

/// Drops the 2-bit padding and down-shifts each 10-bit channel to
/// 8 bits, producing 16 packed RGB pixels (48 output bytes) from 16
/// X2RGB10 LE pixels (64 input bytes).
///
/// # Safety
///
/// - `input_ptr` must point to at least 64 readable bytes.
/// - `output_ptr` must point to at least 48 writable bytes.
/// - `input_ptr` / `output_ptr` ranges must not alias.
/// - **SSE4.1** must be available (caller's `target_feature`; or a
///   superset such as `avx2` / `avx512bw`). The two-stage narrow
///   inside [`pack_u32x4_quad_to_u8x16`] uses `_mm_packus_epi32`,
///   which is SSE4.1 — SSSE3 alone is not enough.
#[inline(always)]
#[cfg(feature = "rgb")]
pub(super) unsafe fn x2rgb10_to_rgb_16_pixels<const BE: bool>(
  input_ptr: *const u8,
  output_ptr: *mut u8,
) {
  unsafe {
    let p0 = x2_load_endian_u32x4::<BE>(input_ptr);
    let p1 = x2_load_endian_u32x4::<BE>(input_ptr.add(16));
    let p2 = x2_load_endian_u32x4::<BE>(input_ptr.add(32));
    let p3 = x2_load_endian_u32x4::<BE>(input_ptr.add(48));

    // X2RGB10: R at bits 20..29, G at 10..19, B at 0..9.
    // Down-shift by 2 → u8 channel value lives in bits 22..29 / 12..19 / 2..9.
    let r = pack_u32x4_quad_to_u8x16(
      extract_10bit_to_u8_lane::<22>(p0),
      extract_10bit_to_u8_lane::<22>(p1),
      extract_10bit_to_u8_lane::<22>(p2),
      extract_10bit_to_u8_lane::<22>(p3),
    );
    let g = pack_u32x4_quad_to_u8x16(
      extract_10bit_to_u8_lane::<12>(p0),
      extract_10bit_to_u8_lane::<12>(p1),
      extract_10bit_to_u8_lane::<12>(p2),
      extract_10bit_to_u8_lane::<12>(p3),
    );
    let b = pack_u32x4_quad_to_u8x16(
      extract_10bit_to_u8_lane::<2>(p0),
      extract_10bit_to_u8_lane::<2>(p1),
      extract_10bit_to_u8_lane::<2>(p2),
      extract_10bit_to_u8_lane::<2>(p3),
    );

    write_rgb_16(r, g, b, output_ptr);
  }
}

/// Drops the 2-bit padding, down-shifts each 10-bit channel to 8
/// bits, and forces alpha to `0xFF`. 16 input pixels → 16 output
/// RGBA pixels (64 output bytes).
///
/// # Safety
///
/// - `input_ptr` must point to at least 64 readable bytes.
/// - `output_ptr` must point to at least 64 writable bytes.
/// - `input_ptr` / `output_ptr` ranges must not alias.
/// - **SSE4.1** must be available (caller's `target_feature`; or a
///   superset such as `avx2` / `avx512bw`). See
///   [`x2rgb10_to_rgb_16_pixels`] for the rationale —
///   `_mm_packus_epi32` inside the shared narrow helper is SSE4.1.
#[inline(always)]
#[cfg(feature = "rgb")]
pub(super) unsafe fn x2rgb10_to_rgba_16_pixels<const BE: bool>(
  input_ptr: *const u8,
  output_ptr: *mut u8,
) {
  unsafe {
    let p0 = x2_load_endian_u32x4::<BE>(input_ptr);
    let p1 = x2_load_endian_u32x4::<BE>(input_ptr.add(16));
    let p2 = x2_load_endian_u32x4::<BE>(input_ptr.add(32));
    let p3 = x2_load_endian_u32x4::<BE>(input_ptr.add(48));

    let r = pack_u32x4_quad_to_u8x16(
      extract_10bit_to_u8_lane::<22>(p0),
      extract_10bit_to_u8_lane::<22>(p1),
      extract_10bit_to_u8_lane::<22>(p2),
      extract_10bit_to_u8_lane::<22>(p3),
    );
    let g = pack_u32x4_quad_to_u8x16(
      extract_10bit_to_u8_lane::<12>(p0),
      extract_10bit_to_u8_lane::<12>(p1),
      extract_10bit_to_u8_lane::<12>(p2),
      extract_10bit_to_u8_lane::<12>(p3),
    );
    let b = pack_u32x4_quad_to_u8x16(
      extract_10bit_to_u8_lane::<2>(p0),
      extract_10bit_to_u8_lane::<2>(p1),
      extract_10bit_to_u8_lane::<2>(p2),
      extract_10bit_to_u8_lane::<2>(p3),
    );
    let alpha = _mm_set1_epi8(-1i8);
    write_rgba_16(r, g, b, alpha, output_ptr);
  }
}

/// Extracts each 10-bit channel into native-depth `u16` (low-bit
/// aligned, max value `1023`), producing 8 packed RGB pixels (24
/// `u16` elements = 48 output bytes) from 8 X2RGB10 LE pixels (32
/// input bytes).
///
/// # Safety
///
/// - `input_ptr` must point to at least 32 readable bytes.
/// - `output_ptr` must point to at least 48 writable bytes.
/// - `input_ptr` / `output_ptr` ranges must not alias.
/// - **SSE4.1** must be available — `_mm_packus_epi32` is SSE4.1,
///   not SSSE3.
#[inline(always)]
#[cfg(feature = "rgb")]
pub(super) unsafe fn x2rgb10_to_rgb_u16_8_pixels<const BE: bool>(
  input_ptr: *const u8,
  output_ptr: *mut u8,
) {
  unsafe {
    let p0 = x2_load_endian_u32x4::<BE>(input_ptr);
    let p1 = x2_load_endian_u32x4::<BE>(input_ptr.add(16));

    // Two-stage narrow u32x4x2 → u16x8 (no second narrow needed —
    // u16 is already the destination element type).
    let r = _mm_packus_epi32(
      extract_10bit_to_u16_lane::<20>(p0),
      extract_10bit_to_u16_lane::<20>(p1),
    );
    let g = _mm_packus_epi32(
      extract_10bit_to_u16_lane::<10>(p0),
      extract_10bit_to_u16_lane::<10>(p1),
    );
    let b = _mm_packus_epi32(
      extract_10bit_to_u16_lane::<0>(p0),
      extract_10bit_to_u16_lane::<0>(p1),
    );
    write_rgb_u16_8(r, g, b, output_ptr.cast::<u16>());
  }
}

/// X2BGR10 LE counterpart of [`x2rgb10_to_rgb_16_pixels`]. Channel
/// shift positions are swapped: R at bits 0..9, B at 20..29.
#[inline(always)]
#[cfg(feature = "rgb")]
pub(super) unsafe fn x2bgr10_to_rgb_16_pixels<const BE: bool>(
  input_ptr: *const u8,
  output_ptr: *mut u8,
) {
  unsafe {
    let p0 = x2_load_endian_u32x4::<BE>(input_ptr);
    let p1 = x2_load_endian_u32x4::<BE>(input_ptr.add(16));
    let p2 = x2_load_endian_u32x4::<BE>(input_ptr.add(32));
    let p3 = x2_load_endian_u32x4::<BE>(input_ptr.add(48));

    let r = pack_u32x4_quad_to_u8x16(
      extract_10bit_to_u8_lane::<2>(p0),
      extract_10bit_to_u8_lane::<2>(p1),
      extract_10bit_to_u8_lane::<2>(p2),
      extract_10bit_to_u8_lane::<2>(p3),
    );
    let g = pack_u32x4_quad_to_u8x16(
      extract_10bit_to_u8_lane::<12>(p0),
      extract_10bit_to_u8_lane::<12>(p1),
      extract_10bit_to_u8_lane::<12>(p2),
      extract_10bit_to_u8_lane::<12>(p3),
    );
    let b = pack_u32x4_quad_to_u8x16(
      extract_10bit_to_u8_lane::<22>(p0),
      extract_10bit_to_u8_lane::<22>(p1),
      extract_10bit_to_u8_lane::<22>(p2),
      extract_10bit_to_u8_lane::<22>(p3),
    );
    write_rgb_16(r, g, b, output_ptr);
  }
}

/// X2BGR10 LE counterpart of [`x2rgb10_to_rgba_16_pixels`].
#[inline(always)]
#[cfg(feature = "rgb")]
pub(super) unsafe fn x2bgr10_to_rgba_16_pixels<const BE: bool>(
  input_ptr: *const u8,
  output_ptr: *mut u8,
) {
  unsafe {
    let p0 = x2_load_endian_u32x4::<BE>(input_ptr);
    let p1 = x2_load_endian_u32x4::<BE>(input_ptr.add(16));
    let p2 = x2_load_endian_u32x4::<BE>(input_ptr.add(32));
    let p3 = x2_load_endian_u32x4::<BE>(input_ptr.add(48));

    let r = pack_u32x4_quad_to_u8x16(
      extract_10bit_to_u8_lane::<2>(p0),
      extract_10bit_to_u8_lane::<2>(p1),
      extract_10bit_to_u8_lane::<2>(p2),
      extract_10bit_to_u8_lane::<2>(p3),
    );
    let g = pack_u32x4_quad_to_u8x16(
      extract_10bit_to_u8_lane::<12>(p0),
      extract_10bit_to_u8_lane::<12>(p1),
      extract_10bit_to_u8_lane::<12>(p2),
      extract_10bit_to_u8_lane::<12>(p3),
    );
    let b = pack_u32x4_quad_to_u8x16(
      extract_10bit_to_u8_lane::<22>(p0),
      extract_10bit_to_u8_lane::<22>(p1),
      extract_10bit_to_u8_lane::<22>(p2),
      extract_10bit_to_u8_lane::<22>(p3),
    );
    let alpha = _mm_set1_epi8(-1i8);
    write_rgba_16(r, g, b, alpha, output_ptr);
  }
}

/// X2BGR10 LE counterpart of [`x2rgb10_to_rgb_u16_8_pixels`].
#[inline(always)]
#[cfg(feature = "rgb")]
pub(super) unsafe fn x2bgr10_to_rgb_u16_8_pixels<const BE: bool>(
  input_ptr: *const u8,
  output_ptr: *mut u8,
) {
  unsafe {
    let p0 = x2_load_endian_u32x4::<BE>(input_ptr);
    let p1 = x2_load_endian_u32x4::<BE>(input_ptr.add(16));

    let r = _mm_packus_epi32(
      extract_10bit_to_u16_lane::<0>(p0),
      extract_10bit_to_u16_lane::<0>(p1),
    );
    let g = _mm_packus_epi32(
      extract_10bit_to_u16_lane::<10>(p0),
      extract_10bit_to_u16_lane::<10>(p1),
    );
    let b = _mm_packus_epi32(
      extract_10bit_to_u16_lane::<20>(p0),
      extract_10bit_to_u16_lane::<20>(p1),
    );
    write_rgb_u16_8(r, g, b, output_ptr.cast::<u16>());
  }
}

// ---- RGB → HSV support --------------------------------------------------
//
// Matches the scalar `rgb_to_hsv_row` within ±1 LSB. Every op mirrors
// the scalar: f32 max/min preserves the same channel selection, and the
// branch cascade uses `_mm_blendv_ps` in the same
// `delta == 0 → v == r → v == g → v == b` priority as the scalar.
// For division we use `_mm_rcp_ps` followed by one Newton‑Raphson
// refinement step (`rcp * (2 - v * rcp)`) — ~3x faster than true
// `_mm_div_ps` at the cost of ±1 LSB in S/H. `#[inline(always)]`
// guarantees each helper inlines into its caller, so the
// SSSE3+SSE4.1 intrinsics execute in whatever `target_feature` context
// (sse4.1 / avx2 / avx512) the outer kernel declares.

/// Deinterleaves 48 bytes of packed RGB into three u8x16 channel
/// vectors (R, G, B). 9 shuffles + 6 ORs — mirror of the swap pattern.
///
/// # Safety
///
/// `input_ptr` must point to at least 48 readable bytes. Caller's
/// `target_feature` must include SSSE3 (via sse4.1 or higher).
#[inline(always)]
pub(super) unsafe fn deinterleave_rgb_16(input_ptr: *const u8) -> (__m128i, __m128i, __m128i) {
  unsafe {
    let in0 = _mm_loadu_si128(input_ptr.cast());
    let in1 = _mm_loadu_si128(input_ptr.add(16).cast());
    let in2 = _mm_loadu_si128(input_ptr.add(32).cast());

    // R bytes live at absolute positions 3k for k=0..15; in chunk 0
    // that's local [0,3,6,9,12,15] (6 values), chunk 1 [2,5,8,11,14]
    // (5 values), chunk 2 [1,4,7,10,13] (5 values).
    let mr0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    let mr1 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1);
    let mr2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 4, 7, 10, 13);
    let r = _mm_or_si128(
      _mm_or_si128(_mm_shuffle_epi8(in0, mr0), _mm_shuffle_epi8(in1, mr1)),
      _mm_shuffle_epi8(in2, mr2),
    );

    // G bytes at positions 3k+1: chunk 0 [1,4,7,10,13], chunk 1
    // [0,3,6,9,12,15], chunk 2 [2,5,8,11,14].
    let mg0 = _mm_setr_epi8(1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    let mg1 = _mm_setr_epi8(-1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1);
    let mg2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14);
    let g = _mm_or_si128(
      _mm_or_si128(_mm_shuffle_epi8(in0, mg0), _mm_shuffle_epi8(in1, mg1)),
      _mm_shuffle_epi8(in2, mg2),
    );

    // B bytes at positions 3k+2: chunk 0 [2,5,8,11,14], chunk 1
    // [1,4,7,10,13], chunk 2 [0,3,6,9,12,15].
    let mb0 = _mm_setr_epi8(2, 5, 8, 11, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    let mb1 = _mm_setr_epi8(-1, -1, -1, -1, -1, 1, 4, 7, 10, 13, -1, -1, -1, -1, -1, -1);
    let mb2 = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 3, 6, 9, 12, 15);
    let b = _mm_or_si128(
      _mm_or_si128(_mm_shuffle_epi8(in0, mb0), _mm_shuffle_epi8(in1, mb1)),
      _mm_shuffle_epi8(in2, mb2),
    );

    (r, g, b)
  }
}

/// Widens a u8x16 to four f32x4 groups (lanes 0..3, 4..7, 8..11,
/// 12..15). Zero‑extends via `_mm_cvtepu8_epi32` (SSE4.1) then converts
/// to f32.
#[inline(always)]
fn u8x16_to_f32x4_quad(v: __m128i) -> (__m128, __m128, __m128, __m128) {
  unsafe {
    let i0 = _mm_cvtepu8_epi32(v);
    let i1 = _mm_cvtepu8_epi32(_mm_srli_si128::<4>(v));
    let i2 = _mm_cvtepu8_epi32(_mm_srli_si128::<8>(v));
    let i3 = _mm_cvtepu8_epi32(_mm_srli_si128::<12>(v));
    (
      _mm_cvtepi32_ps(i0),
      _mm_cvtepi32_ps(i1),
      _mm_cvtepi32_ps(i2),
      _mm_cvtepi32_ps(i3),
    )
  }
}

/// Packs four f32x4 vectors (16 values in [0, 255]) to one u8x16.
/// Truncates f32 → i32 via `_mm_cvttps_epi32`, matches scalar `as u8`
/// (values are pre‑clamped so saturation on the narrowing steps is
/// a no‑op).
#[inline(always)]
fn f32x4_quad_to_u8x16(a: __m128, b: __m128, c: __m128, d: __m128) -> __m128i {
  unsafe {
    let ai = _mm_cvttps_epi32(a);
    let bi = _mm_cvttps_epi32(b);
    let ci = _mm_cvttps_epi32(c);
    let di = _mm_cvttps_epi32(d);
    let ab = _mm_packus_epi32(ai, bi); // i32x4 x 2 → u16x8
    let cd = _mm_packus_epi32(ci, di);
    _mm_packus_epi16(ab, cd) // u16x8 x 2 → u8x16
  }
}

/// Computes HSV for 4 pixels. Mirrors the scalar
/// `rgb_to_hsv_pixel` op‑for‑op. Returns `(h_quant, s_quant, v_quant)`
/// as f32x4 — already clamped to the scalar output ranges, still f32
/// awaiting the truncating cast in the caller.
#[inline(always)]
fn hsv_group(r: __m128, g: __m128, b: __m128) -> (__m128, __m128, __m128) {
  unsafe {
    let zero = _mm_setzero_ps();
    let half = _mm_set1_ps(0.5);
    let sixty = _mm_set1_ps(60.0);
    let one_twenty = _mm_set1_ps(120.0);
    let two_forty = _mm_set1_ps(240.0);
    let three_sixty = _mm_set1_ps(360.0);
    let one_seventy_nine = _mm_set1_ps(179.0);
    let two_fifty_five = _mm_set1_ps(255.0);

    let two = _mm_set1_ps(2.0);

    // V = max(r, g, b); min = min(r, g, b); delta = V - min.
    let v = _mm_max_ps(_mm_max_ps(r, g), b);
    let min_rgb = _mm_min_ps(_mm_min_ps(r, g), b);
    let delta = _mm_sub_ps(v, min_rgb);

    // Replace `_mm_div_ps` with 11‑bit reciprocal + one Newton‑Raphson
    // refinement step. On Skylake+/Zen4 `_mm_rcp_ps` is ~4 cycles vs
    // `_mm_div_ps` at ~13, and the refinement (`rcp * (2 - v * rcp)`)
    // adds ~7 cycles but brings precision to ~23 bits — more than
    // enough for u8 HSV output. Net ~20% throughput improvement on
    // x86 vs the f32 divide path. Output remains within ±1 LSB of the
    // scalar LUT reference.
    //
    // v = 0 / delta = 0 inputs would produce NaN through the Newton
    // step but are masked to 0 / 0 in the cascade below, so the NaNs
    // are always discarded before quantization.
    let v_rcp0 = _mm_rcp_ps(v);
    let v_rcp = _mm_mul_ps(v_rcp0, _mm_sub_ps(two, _mm_mul_ps(v, v_rcp0)));
    let delta_rcp0 = _mm_rcp_ps(delta);
    let delta_rcp = _mm_mul_ps(delta_rcp0, _mm_sub_ps(two, _mm_mul_ps(delta, delta_rcp0)));

    // S = if v == 0 { 0 } else { 255 * delta * rcp(v) }.
    let mask_v_zero = _mm_cmpeq_ps(v, zero);
    let s_nonzero = _mm_mul_ps(_mm_mul_ps(two_fifty_five, delta), v_rcp);
    let s = _mm_blendv_ps(s_nonzero, zero, mask_v_zero);

    // Hue branches.
    let mask_delta_zero = _mm_cmpeq_ps(delta, zero);
    let mask_v_is_r = _mm_cmpeq_ps(v, r);
    let mask_v_is_g = _mm_cmpeq_ps(v, g);

    // h_r = 60 * (g - b) * rcp(delta); wrap negatives by +360.
    let h_r_raw = _mm_mul_ps(_mm_mul_ps(sixty, _mm_sub_ps(g, b)), delta_rcp);
    let mask_neg = _mm_cmplt_ps(h_r_raw, zero);
    let h_r = _mm_blendv_ps(h_r_raw, _mm_add_ps(h_r_raw, three_sixty), mask_neg);

    // h_g = 60 * (b - r) * rcp(delta) + 120.
    let h_g = _mm_add_ps(
      _mm_mul_ps(_mm_mul_ps(sixty, _mm_sub_ps(b, r)), delta_rcp),
      one_twenty,
    );
    // h_b = 60 * (r - g) * rcp(delta) + 240.
    let h_b = _mm_add_ps(
      _mm_mul_ps(_mm_mul_ps(sixty, _mm_sub_ps(r, g)), delta_rcp),
      two_forty,
    );

    // Cascade priority: delta == 0 → 0; v == r → h_r; v == g → h_g;
    // else → h_b. Same as scalar's `else if` chain.
    let h_g_or_b = _mm_blendv_ps(h_b, h_g, mask_v_is_g);
    let h_nonzero = _mm_blendv_ps(h_g_or_b, h_r, mask_v_is_r);
    let hue = _mm_blendv_ps(h_nonzero, zero, mask_delta_zero);

    // Quantize to scalar output ranges.
    //   h = clamp(hue * 0.5 + 0.5, 0, 179)
    //   s = clamp(s + 0.5, 0, 255)
    //   v = clamp(v + 0.5, 0, 255)
    let h_quant = _mm_min_ps(
      _mm_max_ps(_mm_add_ps(_mm_mul_ps(hue, half), half), zero),
      one_seventy_nine,
    );
    let s_quant = _mm_min_ps(_mm_max_ps(_mm_add_ps(s, half), zero), two_fifty_five);
    let v_quant = _mm_min_ps(_mm_max_ps(_mm_add_ps(v, half), zero), two_fifty_five);

    (h_quant, s_quant, v_quant)
  }
}

/// Converts 16 RGB pixels to planar HSV (OpenCV 8‑bit encoding).
/// Reads 48 bytes from `input_ptr`, writes 16 bytes each to `h_ptr`,
/// `s_ptr`, `v_ptr`.
///
/// # Safety
///
/// - `input_ptr` must point to at least 48 readable bytes.
/// - Each of `h_ptr`, `s_ptr`, `v_ptr` must point to at least 16
///   writable bytes.
/// - No aliasing between input and output.
/// - Caller's `target_feature` must include SSE4.1 (or a superset:
///   avx2, avx512bw).
#[inline(always)]
pub(super) unsafe fn rgb_to_hsv_16_pixels(
  input_ptr: *const u8,
  h_ptr: *mut u8,
  s_ptr: *mut u8,
  v_ptr: *mut u8,
) {
  unsafe {
    let (r_u8, g_u8, b_u8) = deinterleave_rgb_16(input_ptr);

    // Widen each channel to 4 x f32x4 groups (16 pixels → 4 groups of
    // 4 lanes each).
    let (r0, r1, r2, r3) = u8x16_to_f32x4_quad(r_u8);
    let (g0, g1, g2, g3) = u8x16_to_f32x4_quad(g_u8);
    let (b0, b1, b2, b3) = u8x16_to_f32x4_quad(b_u8);

    // HSV compute per group.
    let (h0, s0, v0) = hsv_group(r0, g0, b0);
    let (h1, s1, v1) = hsv_group(r1, g1, b1);
    let (h2, s2, v2) = hsv_group(r2, g2, b2);
    let (h3, s3, v3) = hsv_group(r3, g3, b3);

    // Pack each planar f32 quad back to u8x16 and store.
    _mm_storeu_si128(h_ptr.cast(), f32x4_quad_to_u8x16(h0, h1, h2, h3));
    _mm_storeu_si128(s_ptr.cast(), f32x4_quad_to_u8x16(s0, s1, s2, s3));
    _mm_storeu_si128(v_ptr.cast(), f32x4_quad_to_u8x16(v0, v1, v2, v3));
  }
}

// ---- RGB → luma (Y') support --------------------------------------------
//
// Q15 weighted sum across all 5 ColorMatrix variants. Coefficients are
// hoisted by the caller into pre‑broadcast i32x4 vectors; the per‑pixel
// math is `(k_r·R + k_g·G + k_b·B + RND) >> 15`. Limited‑range output
// applies a second Q15 multiply (`28142 / 32768`) plus the +16 offset.
//
// Bit‑identical to scalar `rgb_to_luma_row`. Shared by SSE4.1 / AVX2 /
// AVX‑512 backends — each calls [`rgb_to_luma_16_pixels`] 1x / 2x / 4x
// per iteration.

/// Computes Y' for 16 RGB pixels (48 input bytes, 16 output bytes).
///
/// `kr_v` / `kg_v` / `kb_v` are pre‑broadcast i32x4 coefficient vectors
/// (one matrix value splatted to all 4 lanes); `rnd_v` is
/// `_mm_set1_epi32(1 << 14)`. `full_range` selects between full‑range
/// `[0, 255]` and limited‑range `[16, 235]` Y' output.
///
/// # Safety
///
/// - `input_ptr` must point to at least 48 readable bytes.
/// - `output_ptr` must point to at least 16 writable bytes.
/// - The two ranges must not alias.
/// - SSE4.1 must be available (caller's `target_feature` includes it
///   or a superset like AVX2 / AVX‑512BW).
#[inline(always)]
#[allow(clippy::too_many_arguments)]
pub(super) unsafe fn rgb_to_luma_16_pixels(
  input_ptr: *const u8,
  output_ptr: *mut u8,
  kr_v: __m128i,
  kg_v: __m128i,
  kb_v: __m128i,
  rnd_v: __m128i,
  full_range: bool,
) {
  unsafe {
    // Deinterleave 16 RGB pixels → 3 x u8x16 channel vectors.
    let (r_u8, g_u8, b_u8) = deinterleave_rgb_16(input_ptr);

    // Y_full per i32x4 quarter: (k_r·R + k_g·G + k_b·B + RND) >> 15.
    // Widen each u8 channel to i32 in 4 quarters via `_mm_cvtepu8_epi32`,
    // then `_mm_mullo_epi32` for the Q15 multiply (full 32‑bit product
    // since `255 * 32768 < 2^23` ≪ i32::MAX).
    let y0 = q15_luma_quarter::<0>(r_u8, g_u8, b_u8, kr_v, kg_v, kb_v, rnd_v);
    let y1 = q15_luma_quarter::<4>(r_u8, g_u8, b_u8, kr_v, kg_v, kb_v, rnd_v);
    let y2 = q15_luma_quarter::<8>(r_u8, g_u8, b_u8, kr_v, kg_v, kb_v, rnd_v);
    let y3 = q15_luma_quarter::<12>(r_u8, g_u8, b_u8, kr_v, kg_v, kb_v, rnd_v);

    // Pack i32x4x2 → i16x8 (saturating); two i16x8 then pack → u8x16
    // (saturating). The i32 → i16 step preserves the [0, 255] window
    // (negatives clamp to 0, > 32767 clamp to 32767). The i16 → u8
    // step performs the final [0, 255] clamp.
    let y_lo_i16 = _mm_packs_epi32(y0, y1);
    let y_hi_i16 = _mm_packs_epi32(y2, y3);

    let y_u8 = if full_range {
      _mm_packus_epi16(y_lo_i16, y_hi_i16)
    } else {
      // Limited range: (Y_full_clamped * 28142 + RND) >> 15 + 16.
      // Clamp to [0, 255] first (matches scalar `y_full_clamped`),
      // then apply the 28142/32768 Q15 multiply and add 16.
      let y_clamp_u8 = _mm_packus_epi16(y_lo_i16, y_hi_i16);
      limited_range_scale_16(y_clamp_u8, rnd_v)
    };

    _mm_storeu_si128(output_ptr.cast(), y_u8);
  }
}

/// Q15 weighted sum for 4 pixels. `LANE` ∈ {0, 4, 8, 12} selects the
/// 32‑bit slot in each u8x16 channel vector to widen → i32x4. Returns
/// `(k_r·R + k_g·G + k_b·B + RND) >> 15` as i32x4.
///
/// The const generic encodes the lane offset so `_mm_srli_si128` (which
/// requires a literal const) works without runtime branches.
#[inline(always)]
#[allow(clippy::too_many_arguments)]
fn q15_luma_quarter<const LANE: i32>(
  r_u8: __m128i,
  g_u8: __m128i,
  b_u8: __m128i,
  kr_v: __m128i,
  kg_v: __m128i,
  kb_v: __m128i,
  rnd_v: __m128i,
) -> __m128i {
  unsafe {
    // Shift the desired 4‑byte lane down to position 0; cvtepu8_epi32
    // then zero‑extends 4 u8 → 4 i32 (reads only the low 4 bytes).
    let r_shifted = _mm_srli_si128::<LANE>(r_u8);
    let g_shifted = _mm_srli_si128::<LANE>(g_u8);
    let b_shifted = _mm_srli_si128::<LANE>(b_u8);
    let r = _mm_cvtepu8_epi32(r_shifted);
    let g = _mm_cvtepu8_epi32(g_shifted);
    let b = _mm_cvtepu8_epi32(b_shifted);
    let acc = _mm_mullo_epi32(r, kr_v);
    let acc = _mm_add_epi32(acc, _mm_mullo_epi32(g, kg_v));
    let acc = _mm_add_epi32(acc, _mm_mullo_epi32(b, kb_v));
    let acc = _mm_add_epi32(acc, rnd_v);
    _mm_srai_epi32::<15>(acc)
  }
}

/// Limited‑range post‑scale: `16 + ((y_clamped * 28142 + RND) >> 15)`.
/// Input is u8x16 already clamped to `[0, 255]` (the saturating pack
/// from `_mm_packus_epi16`); output is u8x16 in `[16, 235]`.
#[inline(always)]
fn limited_range_scale_16(y_clamp_u8: __m128i, rnd_v: __m128i) -> __m128i {
  unsafe {
    let scale = _mm_set1_epi32(28142);
    let off = _mm_set1_epi16(16);
    // Widen u8x16 → i16x8x2 (zero‑extend; samples are in [0, 255]).
    let y_lo_i16 = _mm_cvtepu8_epi16(y_clamp_u8);
    let y_hi_i16 = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(y_clamp_u8));
    let y_lim_lo = limited_range_quarter(y_lo_i16, scale, rnd_v);
    let y_lim_hi = limited_range_quarter(y_hi_i16, scale, rnd_v);
    // i16 add 16 (offset), then saturating pack to u8.
    let y_lim_lo = _mm_add_epi16(y_lim_lo, off);
    let y_lim_hi = _mm_add_epi16(y_lim_hi, off);
    _mm_packus_epi16(y_lim_lo, y_lim_hi)
  }
}

/// Q15 multiply of one i16x8 (8 Y' samples in `[0, 255]`) by 28142,
/// returned as i16x8. `(y * 28142 + RND) >> 15`. `y * 28142` reaches
/// at most `255 * 28142 ≈ 7.18M`, well inside i32.
#[inline(always)]
fn limited_range_quarter(y_i16: __m128i, scale_i32: __m128i, rnd_v: __m128i) -> __m128i {
  unsafe {
    let lo_i32 = _mm_cvtepi16_epi32(y_i16);
    let hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(y_i16));
    let lo = _mm_srai_epi32::<15>(_mm_add_epi32(_mm_mullo_epi32(lo_i32, scale_i32), rnd_v));
    let hi = _mm_srai_epi32::<15>(_mm_add_epi32(_mm_mullo_epi32(hi_i32, scale_i32), rnd_v));
    _mm_packs_epi32(lo, hi)
  }
}