colconv 0.1.0

SIMD-dispatched color-conversion kernels covering the FFmpeg AVPixelFormat space, with a Sink-based API so consumers pick which derived outputs (RGB / Luma / HSV / custom) they want without paying for the ones they don't.
Documentation
//! wasm-simd128 `y_plane_to_luma_u16_row` — zero-extends a u8 Y plane
//! to u16.
//!
//! Processes 16 pixels per iteration: loads 16 u8 via `v128_load`, then
//! uses `u16x8_extend_low_u8x16` and `u16x8_extend_high_u8x16` to produce
//! two v128 vectors of 8 u16 each, stored via two `v128_store` calls.
//! Scalar tail delegates to the reference implementation.

#![cfg_attr(not(feature = "std"), allow(dead_code))]

use core::arch::wasm32::*;

use crate::row::scalar::y_plane_to_luma_u16 as scalar;

/// Widens one row of 8-bit luma samples to 16 bits with wasm-simd128:
/// `out[i] = plane[i] as u16` for every `i` in `0..width`.
///
/// The vector body consumes 16 pixels per step. Any remainder (fewer than
/// 16 pixels) is forwarded to the scalar reference kernel.
///
/// # Safety
///
/// The `simd128` target feature must be enabled at compile time, and both
/// `plane` and `out` must contain at least `width` elements. The length
/// requirements are only checked in debug builds.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn y_plane_to_luma_u16_row(plane: &[u8], out: &mut [u16], width: usize) {
  debug_assert!(plane.len() >= width, "plane too short");
  debug_assert!(out.len() >= width, "out too short");

  // Pixels handled per vector step: one v128 of u8 in, two v128 of u16 out.
  const LANES: usize = 16;

  // Largest multiple of `LANES` that does not exceed `width`.
  let vector_end = width - width % LANES;
  let src = plane.as_ptr();
  let dst = out.as_mut_ptr();

  let mut done = 0usize;
  while done < vector_end {
    // SAFETY: `done` is a multiple of 16 and `done < vector_end <= width`,
    // so `done + 16 <= width`. Given the caller's guarantee that both
    // slices hold at least `width` elements, the 16-byte load covers
    // `plane[done..done + 16]` and the two 16-byte stores cover
    // `out[done..done + 8]` and `out[done + 8..done + 16]`. `v128_load` and
    // `v128_store` impose no alignment requirement on the pointer.
    unsafe {
      let bytes = v128_load(src.add(done).cast());
      v128_store(dst.add(done).cast(), u16x8_extend_low_u8x16(bytes));
      v128_store(dst.add(done + 8).cast(), u16x8_extend_high_u8x16(bytes));
    }
    done += LANES;
  }

  // Scalar tail: at most 15 trailing pixels.
  if done < width {
    scalar::y_plane_to_luma_u16_row(&plane[done..width], &mut out[done..width], width - done);
  }
}

#[cfg(all(test, feature = "std"))]
mod tests {
  use crate::row::scalar::y_plane_to_luma_u16 as scalar;

  /// Row widths under test: values below, at, and just past multiples of
  /// the 16-pixel vector step, so both the SIMD body and the scalar tail
  /// are exercised.
  const WIDTHS: &[usize] = &[1, 7, 8, 15, 16, 17, 31, 32, 33, 63, 64, 65, 128, 130];

  /// Fills `out` with deterministic pseudo-random bytes produced by a
  /// 32-bit linear congruential generator seeded with `seed`; each byte is
  /// bits 16..24 of the successive generator states.
  fn pseudo_random_u8(out: &mut [u8], seed: u32) {
    let mut lcg = seed;
    out.iter_mut().for_each(|slot| {
      lcg = lcg.wrapping_mul(1664525).wrapping_add(1013904223);
      *slot = (lcg >> 16) as u8;
    });
  }

  /// The SIMD kernel must produce exactly the scalar reference output for
  /// every width in `WIDTHS`.
  #[test]
  #[cfg_attr(
    miri,
    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
  )]
  fn wasm_y_plane_to_luma_u16_matches_scalar_widths() {
    for w in WIDTHS.iter().copied() {
      let mut plane = std::vec![0u8; w];
      pseudo_random_u8(&mut plane, 0xC0FFEE);

      let mut out_scalar = std::vec![0u16; w];
      scalar::y_plane_to_luma_u16_row(&plane, &mut out_scalar, w);

      let mut out_simd = std::vec![0u16; w];
      // SAFETY: `plane` and `out_simd` both have exactly `w` elements, and
      // this module is only built for targets where the kernel compiles.
      unsafe { super::y_plane_to_luma_u16_row(&plane, &mut out_simd, w) };

      assert_eq!(out_simd, out_scalar, "width={w}");
    }
  }
}