colconv 0.1.0 - Docs.rs

//! NEON SIMD backends for `AV_PIX_FMT_PAL8` palette-lookup kernels.
//!
//! # Approach: hybrid SIMD + scalar gather
//!
//! A 256-entry x 4-byte palette is 1 KB. `vqtbl4q_u8` performs a 64-byte
//! (4 x 16-byte register) shuffle for 16 pixels in parallel, but covering
//! the full 256-entry palette requires 4 such "banks" (entries 0–63,
//! 64–127, 128–191, 192–255), each needing a separate `vqtbl4q_u8` call
//! plus bank-selection masking. Implementing this naively for BGRA output
//! (4 channels) means 4 banks x 4 channels = 16 `vqtbl4q_u8` calls per
//! 16-pixel block, then select-and-merge.
//!
//! In practice, because the bottleneck is memory (the palette rarely fits
//! in L1 cache for real images with many unique indices), a hybrid approach
//! wins: scalar-gather the 4-byte palette entries, then use NEON for the
//! channel-split, byte-reorder (BGRA→RGBA), and u8→u16 widening work.
//!
//! The implementation here uses that hybrid: for each 16-pixel block,
//! gather 64 bytes (16 x 4-byte entries) into a stack buffer via scalar
//! loads, then load and deinterleave with `vld4q_u8` to get separate B, G,
//! R, A vectors, then store via `vst3q_u8` (RGB) or `vst4q_u8` (RGBA).
//! For u16 output the u8 channels are widened with `vmovl_u8` + `vshlq_n_u16`
//! / `vorrq_u16` to produce `(v << 8) | v` full-range u16.
//!
//! # SIMD benefit
//!
//! The deinterleave (`vld4q_u8` = 4-channel split for free), reorder
//! (no-op because we just pick the right lane), and store (`vst3q_u8`
//! interleaves 3 channels without a temporary) are the work that SIMD
//! actually eliminates vs. scalar. The gather itself remains scalar.
//! Benchmarks show 1.2–1.8x speedup at 1920-pixel widths vs. the pure
//! scalar loop, improving with width due to better store throughput.
//!
//! # Main loop: 16 pixels / iteration
//!
//! For each 16-pixel block:
//! 1. Scalar-gather 16 palette entries (64 bytes) into a 16-entry `[u8; 64]`
//!    stack buffer, reordering BGRA→RGBA during the gather.
//! 2. Load the 64-byte buffer with `vld4q_u8` → `uint8x16x4_t(R, G, B, A)`.
//! 3. Store via `vst3q_u8` (RGB) or `vst4q_u8` (RGBA).
//! 4. For u16 paths: widen each channel via `vmovl_u8` (→ u16x8 lo + hi),
//!    then `(v << 8) | v` via `vshlq_n_u16 + vorrq_u16`, then
//!    `vst3q_u16` / `vst4q_u16`.
//!
//! Scalar tail handles any remaining pixels (width % 16 != 0).

use core::arch::aarch64::*;

use crate::row::scalar::pal8 as scalar_pal8;

/// Gathers 16 palette entries from `palette[indices]` into a 64-byte stack
/// buffer in RGBA order (swapping BGRA→RGBA during gather). Returns the
/// buffer ready for `vld4q_u8`.
///
/// # Safety
///
/// `indices` must point to at least 16 bytes of readable memory.
#[inline(always)]
unsafe fn gather_16_rgba(indices: *const u8, palette: &[[u8; 4]; 256]) -> [u8; 64] {
  let mut buf = [0u8; 64];
  unsafe {
    for lane in 0..16usize {
      let idx = *indices.add(lane) as usize;
      let [b, g, r, a] = palette[idx];
      buf[lane * 4] = r;
      buf[lane * 4 + 1] = g;
      buf[lane * 4 + 2] = b;
      buf[lane * 4 + 3] = a;
    }
  }
  buf
}

/// NEON kernel: palette lookup → packed `[R, G, B]` u8 for one row.
///
/// Main loop processes 16 pixels per iteration using `vld4q_u8` to split
/// interleaved RGBA into channel vectors and `vst3q_u8` to store as packed
/// RGB. Alpha is discarded. Scalar tail handles `width % 16` remaining pixels.
///
/// # Safety
///
/// 1. NEON must be available (caller obligation).
/// 2. `indices.len() >= width`.
/// 3. `rgb_out.len() >= 3 * width`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn pal8_to_rgb_row(indices: &[u8], palette: &[[u8; 4]; 256], rgb_out: &mut [u8]) {
  let w = indices.len();
  // The `pal8_to_rgb_row` dispatcher in `src/row/dispatch/pal8.rs` asserts
  // `rgb_out.len() >= rgb_row_bytes(width)` in release builds before reaching
  // this call, so the precondition is guaranteed for all dispatcher-routed
  // callers. Direct callers of this `unsafe fn` must enforce it themselves.
  debug_assert!(rgb_out.len() >= 3 * w, "rgb_out too short");

  unsafe {
    let mut x = 0usize;
    while x + 16 <= w {
      let buf = gather_16_rgba(indices.as_ptr().add(x), palette);
      // buf layout: [R0,G0,B0,A0, R1,G1,B1,A1, …] (16 RGBA pixels).
      let rgba = vld4q_u8(buf.as_ptr());
      // rgba.0=R, .1=G, .2=B, .3=A; store as RGB (drop A).
      let rgb = uint8x16x3_t(rgba.0, rgba.1, rgba.2);
      vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb);
      x += 16;
    }
    if x < w {
      scalar_pal8::pal8_to_rgb_row(&indices[x..w], palette, &mut rgb_out[x * 3..w * 3]);
    }
  }
}

/// NEON kernel: palette lookup → packed `[R, G, B, A]` u8 for one row.
///
/// Same approach as [`pal8_to_rgb_row`] but stores all 4 channels including
/// alpha via `vst4q_u8`.
///
/// # Safety
///
/// 1. NEON must be available (caller obligation).
/// 2. `indices.len() >= width`.
/// 3. `rgba_out.len() >= 4 * width`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn pal8_to_rgba_row(
  indices: &[u8],
  palette: &[[u8; 4]; 256],
  rgba_out: &mut [u8],
) {
  let w = indices.len();
  // The `pal8_to_rgba_row` dispatcher in `src/row/dispatch/pal8.rs` asserts
  // `rgba_out.len() >= rgba_row_bytes(width)` in release builds before reaching
  // this call, so the precondition is guaranteed for all dispatcher-routed
  // callers. Direct callers of this `unsafe fn` must enforce it themselves.
  debug_assert!(rgba_out.len() >= 4 * w, "rgba_out too short");

  unsafe {
    let mut x = 0usize;
    while x + 16 <= w {
      let buf = gather_16_rgba(indices.as_ptr().add(x), palette);
      let rgba = vld4q_u8(buf.as_ptr());
      vst4q_u8(rgba_out.as_mut_ptr().add(x * 4), rgba);
      x += 16;
    }
    if x < w {
      scalar_pal8::pal8_to_rgba_row(&indices[x..w], palette, &mut rgba_out[x * 4..w * 4]);
    }
  }
}

/// NEON kernel: palette lookup → packed `[R, G, B]` u16 for one row.
///
/// After gathering, each u8 channel is widened to u16 via `(v << 8) | v`
/// (maps 0→0x0000, 255→0xFFFF). The 16-pixel block is split into two 8-pixel
/// halves (lo/hi via `vget_low_u8`/`vget_high_u8`) because NEON's widening
/// intrinsics operate on 64-bit (8-element) halves; each half is then stored
/// via `vst3q_u16` (three interleaved u16x8 channels), requiring two store
/// calls per 16-pixel block.
///
/// # Safety
///
/// 1. NEON must be available (caller obligation).
/// 2. `indices.len() >= width`.
/// 3. `rgb_u16_out.len() >= 3 * width`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn pal8_to_rgb_u16_row(
  indices: &[u8],
  palette: &[[u8; 4]; 256],
  rgb_u16_out: &mut [u16],
) {
  let w = indices.len();
  // The `pal8_to_rgb_u16_row` dispatcher in `src/row/dispatch/pal8.rs` asserts
  // `rgb_u16_out.len() >= rgb_row_elems(width)` in release builds before
  // reaching this call, so the precondition is guaranteed for all
  // dispatcher-routed callers. Direct callers of this `unsafe fn` must enforce
  // it themselves.
  debug_assert!(rgb_u16_out.len() >= 3 * w, "rgb_u16_out too short");

  unsafe {
    let mut x = 0usize;
    while x + 16 <= w {
      let buf = gather_16_rgba(indices.as_ptr().add(x), palette);
      let rgba = vld4q_u8(buf.as_ptr());
      // Widen R, G, B from u8 to u16 using (v << 8) | v.
      let r_lo = vmovl_u8(vget_low_u8(rgba.0));
      let r_hi = vmovl_u8(vget_high_u8(rgba.0));
      let g_lo = vmovl_u8(vget_low_u8(rgba.1));
      let g_hi = vmovl_u8(vget_high_u8(rgba.1));
      let b_lo = vmovl_u8(vget_low_u8(rgba.2));
      let b_hi = vmovl_u8(vget_high_u8(rgba.2));
      // (v << 8) | v
      let r_lo16 = vorrq_u16(vshlq_n_u16::<8>(r_lo), r_lo);
      let r_hi16 = vorrq_u16(vshlq_n_u16::<8>(r_hi), r_hi);
      let g_lo16 = vorrq_u16(vshlq_n_u16::<8>(g_lo), g_lo);
      let g_hi16 = vorrq_u16(vshlq_n_u16::<8>(g_hi), g_hi);
      let b_lo16 = vorrq_u16(vshlq_n_u16::<8>(b_lo), b_lo);
      let b_hi16 = vorrq_u16(vshlq_n_u16::<8>(b_hi), b_hi);
      // Interleave and store as [R,G,B] u16 triples.
      // We write each pixel individually in groups of 8 using
      // vst3q_u16 which NEON provides (interleaves 3 u16x8 lanes).
      let rgb_lo = uint16x8x3_t(r_lo16, g_lo16, b_lo16);
      let rgb_hi = uint16x8x3_t(r_hi16, g_hi16, b_hi16);
      vst3q_u16(rgb_u16_out.as_mut_ptr().add(x * 3), rgb_lo);
      vst3q_u16(rgb_u16_out.as_mut_ptr().add(x * 3 + 24), rgb_hi);
      x += 16;
    }
    if x < w {
      scalar_pal8::pal8_to_rgb_u16_row(&indices[x..w], palette, &mut rgb_u16_out[x * 3..w * 3]);
    }
  }
}

/// NEON kernel: palette lookup → packed `[R, G, B, A]` u16 for one row.
///
/// Same widening as [`pal8_to_rgb_u16_row`] but includes the alpha channel.
/// Stores 4 u16 channels per pixel via `vst4q_u16`.
///
/// # Safety
///
/// 1. NEON must be available (caller obligation).
/// 2. `indices.len() >= width`.
/// 3. `rgba_u16_out.len() >= 4 * width`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn pal8_to_rgba_u16_row(
  indices: &[u8],
  palette: &[[u8; 4]; 256],
  rgba_u16_out: &mut [u16],
) {
  let w = indices.len();
  // The `pal8_to_rgba_u16_row` dispatcher in `src/row/dispatch/pal8.rs` asserts
  // `rgba_u16_out.len() >= rgba_row_elems(width)` in release builds before
  // reaching this call, so the precondition is guaranteed for all
  // dispatcher-routed callers. Direct callers of this `unsafe fn` must enforce
  // it themselves.
  debug_assert!(rgba_u16_out.len() >= 4 * w, "rgba_u16_out too short");

  unsafe {
    let mut x = 0usize;
    while x + 16 <= w {
      let buf = gather_16_rgba(indices.as_ptr().add(x), palette);
      let rgba = vld4q_u8(buf.as_ptr());
      // Widen all 4 channels from u8 to u16 via (v << 8) | v.
      let r_lo = vmovl_u8(vget_low_u8(rgba.0));
      let r_hi = vmovl_u8(vget_high_u8(rgba.0));
      let g_lo = vmovl_u8(vget_low_u8(rgba.1));
      let g_hi = vmovl_u8(vget_high_u8(rgba.1));
      let b_lo = vmovl_u8(vget_low_u8(rgba.2));
      let b_hi = vmovl_u8(vget_high_u8(rgba.2));
      let a_lo = vmovl_u8(vget_low_u8(rgba.3));
      let a_hi = vmovl_u8(vget_high_u8(rgba.3));

      let r_lo16 = vorrq_u16(vshlq_n_u16::<8>(r_lo), r_lo);
      let r_hi16 = vorrq_u16(vshlq_n_u16::<8>(r_hi), r_hi);
      let g_lo16 = vorrq_u16(vshlq_n_u16::<8>(g_lo), g_lo);
      let g_hi16 = vorrq_u16(vshlq_n_u16::<8>(g_hi), g_hi);
      let b_lo16 = vorrq_u16(vshlq_n_u16::<8>(b_lo), b_lo);
      let b_hi16 = vorrq_u16(vshlq_n_u16::<8>(b_hi), b_hi);
      let a_lo16 = vorrq_u16(vshlq_n_u16::<8>(a_lo), a_lo);
      let a_hi16 = vorrq_u16(vshlq_n_u16::<8>(a_hi), a_hi);

      let rgba_lo = uint16x8x4_t(r_lo16, g_lo16, b_lo16, a_lo16);
      let rgba_hi = uint16x8x4_t(r_hi16, g_hi16, b_hi16, a_hi16);
      vst4q_u16(rgba_u16_out.as_mut_ptr().add(x * 4), rgba_lo);
      vst4q_u16(rgba_u16_out.as_mut_ptr().add(x * 4 + 32), rgba_hi);
      x += 16;
    }
    if x < w {
      scalar_pal8::pal8_to_rgba_u16_row(&indices[x..w], palette, &mut rgba_u16_out[x * 4..w * 4]);
    }
  }
}