colconv 0.1.0 - Docs.rs

//! f16-native lossless I/O kernels for planar GBR f16 sources.
//!
//! **Scope:** This file handles `half::f16` source planes for lossless
//! interleave and α-scatter output only. For f16-source → integer, luma, or
//! HSV outputs the dispatch layer widens each f16 plane to f32 in a scratch
//! buffer at row entry, then calls the corresponding `gbrpf32_to_*` kernel
//! from [`super::planar_gbr_float`]. No separate f16-source kernels are needed
//! for those paths.
//!
//! ## Endian support
//!
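//! All `<const BE: bool>` kernels treat the source planes as opaque `u16`
//! bit-patterns (which they already are for the lossless f16 paths).
//! `BE` names the byte order of the *encoded* source: each u16 element is
//! normalised with `u16::from_be` (`BE = true`) or `u16::from_le`
//! (`BE = false`) before being written to the interleaved output buffer, so
//! the output always carries host-native f16. The normalisation is a
//! byte-swap only when the source byte order differs from the host's.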
//!
//! ## Kernels in this file
//!
//! | Kernel | In | Out | Notes |
//! |---|---|---|---|
//! | `gbrpf16_to_rgb_f16_row` | G, B, R f16 planes | `R, G, B` f16 | pure interleave, lossless |
//! | `gbrpf16_to_rgba_f16_row` | G, B, R f16 planes | `R, G, B, A` f16 | α = f16(1.0) |
//! | `gbrapf16_to_rgba_f16_row` | G, B, R, A f16 planes | `R, G, B, A` f16 | source α pass-through |
//! | `copy_alpha_plane_f16` | α f16 plane | slot 3 of `R,G,B,A` f16 buf | lossless α scatter |
//!
//! Output order is **R, G, B** per pixel (FFmpeg `AV_PIX_FMT_RGBA64` / packed
//! RGB convention). No arithmetic is performed — these are pure gather-scatter
//! kernels over opaque `u16` bit-patterns.

// Kernels are not yet consumed by any sinker (Task 8 wires MixedSinker impls).
#![cfg_attr(not(test), allow(dead_code))]

// ---- shared BE helper -------------------------------------------------------

/// Reads element `i` of `plane` and returns it as a host-native `half::f16`.
///
/// `plane` is the encoded byte stream viewed as `&[half::f16]`, so the bits
/// of each element arrive in whatever order the host assigns to those two
/// bytes. `BE` names the byte order the stream was *encoded* in, and the raw
/// bits are normalised to match:
///
/// - `BE = true` → `u16::from_be`: identity on big-endian hosts, byte-swap
///   on little-endian hosts.
/// - `BE = false` → `u16::from_le`: identity on little-endian hosts,
///   byte-swap on big-endian hosts.
///
/// Neither branch is an unconditional `swap_bytes`: that would corrupt rows
/// on big-endian hosts (e.g. s390x), where LE-encoded data needs the swap
/// and BE-encoded data does not.
///
/// Panics if `i` is out of bounds for `plane`.
#[inline(always)]
fn load_f16<const BE: bool>(plane: &[half::f16], i: usize) -> half::f16 {
  let encoded = plane[i].to_bits();
  let native = if BE {
    u16::from_be(encoded)
  } else {
    u16::from_le(encoded)
  };
  half::f16::from_bits(native)
}

/// Widen `n` `half::f16` values from `src[offset..offset + n]` into
/// `dst[..n]` (f32 elements), normalizing source f16 bits **before** the
/// f16 → f32 conversion so the resulting f32 is host-native regardless of
/// the source `BE`.
///
/// `BE = true`: bytes on disk are big-endian → `u16::from_be` is a no-op on
/// BE hosts and a byte-swap on LE hosts. `BE = false`: bytes on disk are
/// little-endian → `u16::from_le` is a no-op on LE hosts and a byte-swap on
/// BE hosts. Both branches go through `from_be` / `from_le` so the BE-source-
/// on-LE-host and LE-source-on-BE-host cases are handled correctly.
///
/// After this widening the scratch is host-native f32; downstream callers
/// (e.g. `gbrpf32_to_*` row kernels) must route the chain with the
/// `cfg!(target_endian = "big")` value (named `HOST_NATIVE_BE` at each call
/// site) — **not** the source `BE` — to avoid double-byte-swapping.
///
/// This is the shared helper used by both the dispatch f16-widen fallback
/// (see `dispatch::planar_gbr_float`) and the per-backend SIMD scalar tails
/// (see `arch::*::planar_gbr_float`). Per-backend tails widening 4 elements
/// into a stack scratch use the same bit-normalize-first contract.
#[cfg_attr(not(feature = "std"), allow(dead_code))]
#[inline(always)]
pub(crate) fn widen_f16_be_to_host_f32<const BE: bool>(
  src: &[half::f16],
  offset: usize,
  dst: &mut [f32],
  n: usize,
) {
  for i in 0..n {
    let raw = src[offset + i].to_bits();
    let host_bits = if BE {
      u16::from_be(raw)
    } else {
      u16::from_le(raw)
    };
    dst[i] = half::f16::from_bits(host_bits).to_f32();
  }
}

// ---- Gbrpf16 → f16 RGB (lossless interleave) --------------------------------

/// Interleaves planar G/B/R `half::f16` rows into packed `R, G, B`
/// **`half::f16`**.
///
/// Pure gather-scatter — no arithmetic conversion. HDR values, NaN, and Inf
/// are carried through as raw bit patterns. Output order is **R, G, B** per
/// pixel.
///
/// `BE` names the byte order of the encoded source planes (`true` =
/// big-endian, `false` = little-endian). Every element is normalised to
/// host-native order through `load_f16` before it is written, which is a
/// byte-swap only when the source order differs from the host's.
///
/// # Panics
///
/// Panics if any of `g`, `b`, `r` holds fewer than `width` elements or
/// `rgb_out` holds fewer than `width * 3`. Debug builds report this through
/// the named assertions; release builds fail on the up-front slicing, before
/// any output element is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn gbrpf16_to_rgb_f16_row<const BE: bool>(
  g: &[half::f16],
  b: &[half::f16],
  r: &[half::f16],
  rgb_out: &mut [half::f16],
  width: usize,
) {
  debug_assert!(g.len() >= width, "g row too short");
  debug_assert!(b.len() >= width, "b row too short");
  debug_assert!(r.len() >= width, "r row too short");
  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
  // Trim every row to the processed extent once, so the loop below carries
  // no per-element bounds checks that can fail.
  let (g, b, r) = (&g[..width], &b[..width], &r[..width]);
  let rgb_out = &mut rgb_out[..width * 3];
  for (x, px) in rgb_out.chunks_exact_mut(3).enumerate() {
    px[0] = load_f16::<BE>(r, x);
    px[1] = load_f16::<BE>(g, x);
    px[2] = load_f16::<BE>(b, x);
  }
}

// ---- Gbrpf16 → f16 RGBA (opaque α = f16(1.0)) ------------------------------

/// Interleaves planar G/B/R `half::f16` rows into packed `R, G, B, A`
/// **`half::f16`** with constant opaque α = `half::f16::from_f32(1.0)`.
///
/// Used for `Gbrpf16` sources (no α plane) when `with_rgba_f16` is requested.
///
/// `BE` names the byte order of the encoded source planes (`true` =
/// big-endian, `false` = little-endian). Every colour element is normalised
/// to host-native order through `load_f16` before it is written, which is a
/// byte-swap only when the source order differs from the host's. α is always
/// host-native f16(1.0) regardless of `BE`.
///
/// # Panics
///
/// Panics if any of `g`, `b`, `r` holds fewer than `width` elements or
/// `rgba_out` holds fewer than `width * 4`. Debug builds report this through
/// the named assertions; release builds fail on the up-front slicing, before
/// any output element is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn gbrpf16_to_rgba_f16_row<const BE: bool>(
  g: &[half::f16],
  b: &[half::f16],
  r: &[half::f16],
  rgba_out: &mut [half::f16],
  width: usize,
) {
  debug_assert!(g.len() >= width, "g row too short");
  debug_assert!(b.len() >= width, "b row too short");
  debug_assert!(r.len() >= width, "r row too short");
  debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
  // Trim every row to the processed extent once, so the loop below carries
  // no per-element bounds checks that can fail.
  let (g, b, r) = (&g[..width], &b[..width], &r[..width]);
  let rgba_out = &mut rgba_out[..width * 4];
  // The opaque α value is loop-invariant; build it once.
  let opaque = half::f16::from_f32(1.0);
  for (x, px) in rgba_out.chunks_exact_mut(4).enumerate() {
    px[0] = load_f16::<BE>(r, x);
    px[1] = load_f16::<BE>(g, x);
    px[2] = load_f16::<BE>(b, x);
    px[3] = opaque;
  }
}

// ---- Gbrapf16 → f16 RGBA (source α pass-through) ----------------------------

/// Interleaves planar G/B/R/A `half::f16` rows into packed `R, G, B, A`
/// **`half::f16`** with source α.
///
/// Pure gather-scatter. All four channels including α are copied without
/// arithmetic — HDR, NaN, and Inf are carried through as raw bit patterns.
///
/// `BE` names the byte order of the encoded source planes (`true` =
/// big-endian, `false` = little-endian). Every element, α included, is
/// normalised to host-native order through `load_f16` before it is written,
/// which is a byte-swap only when the source order differs from the host's.
///
/// # Panics
///
/// Panics if any of `g`, `b`, `r`, `a` holds fewer than `width` elements or
/// `rgba_out` holds fewer than `width * 4`. Debug builds report this through
/// the named assertions; release builds fail on the up-front slicing, before
/// any output element is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn gbrapf16_to_rgba_f16_row<const BE: bool>(
  g: &[half::f16],
  b: &[half::f16],
  r: &[half::f16],
  a: &[half::f16],
  rgba_out: &mut [half::f16],
  width: usize,
) {
  debug_assert!(g.len() >= width, "g row too short");
  debug_assert!(b.len() >= width, "b row too short");
  debug_assert!(r.len() >= width, "r row too short");
  debug_assert!(a.len() >= width, "a row too short");
  debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
  // Trim every row to the processed extent once, so the loop below carries
  // no per-element bounds checks that can fail.
  let (g, b, r, a) = (&g[..width], &b[..width], &r[..width], &a[..width]);
  let rgba_out = &mut rgba_out[..width * 4];
  for (x, px) in rgba_out.chunks_exact_mut(4).enumerate() {
    px[0] = load_f16::<BE>(r, x);
    px[1] = load_f16::<BE>(g, x);
    px[2] = load_f16::<BE>(b, x);
    px[3] = load_f16::<BE>(a, x);
  }
}

// ---- copy_alpha_plane_f16 (lossless α scatter) ------------------------------

/// Scatters a `half::f16` α plane into slot 3 of a packed `R, G, B, A`
/// **`half::f16`** output buffer.
///
/// Only slot 3 of every 4-element tuple is written; R, G, B slots are
/// untouched. Lossless — HDR, NaN, and Inf in the α plane are preserved
/// bit-exact.
///
/// `BE` selects the **byte order** of the encoded source α plane
/// (`false` = LE on disk/wire, e.g. `AV_PIX_FMT_GBRAPF16LE` per the
/// `Gbrapf16Frame` contract; `true` = BE on disk/wire). Each raw f16 is
/// bit-normalised to host-native order via `u16::from_le` / `u16::from_be`
/// BEFORE the slot-3 write so the output buffer always carries host-native
/// `half::f16` (matching the rest of the f16 row kernels). Without this a
/// BE host processing the LE-encoded Frame would emit byte-reversed α bits.
// Only called from the `mod tests` block which is gated on `feature = "std"`.
// Under `cargo test --no-default-features` the test module is compiled out,
// leaving the function without callers; suppress the resulting lint there.
#[cfg_attr(not(feature = "std"), expect(dead_code))]
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn copy_alpha_plane_f16<const BE: bool>(
  alpha: &[half::f16],
  rgba_out: &mut [half::f16],
  width: usize,
) {
  debug_assert!(alpha.len() >= width, "alpha plane too short");
  debug_assert!(rgba_out.len() >= width * 4, "rgba_out too short");
  for n in 0..width {
    let raw = alpha[n].to_bits();
    let host_bits = if BE {
      u16::from_be(raw)
    } else {
      u16::from_le(raw)
    };
    rgba_out[n * 4 + 3] = half::f16::from_bits(host_bits);
  }
}

// ---- Unit tests ------------------------------------------------------------

#[cfg(all(test, feature = "std"))]
mod tests {
  use super::*;

  // ---- shared fixtures ------------------------------------------------------

  /// Pixel count of the shared parity fixture.
  const FIXTURE_WIDTH: usize = 4;
  /// Logical (host-native) sample values for the G plane of the fixture.
  const G_SAMPLES: [f32; FIXTURE_WIDTH] = [0.0, 0.25, 0.5, 1.0];
  /// Logical (host-native) sample values for the B plane of the fixture.
  const B_SAMPLES: [f32; FIXTURE_WIDTH] = [0.1, 0.3, 0.7, 0.9];
  /// Logical (host-native) sample values for the R plane of the fixture.
  const R_SAMPLES: [f32; FIXTURE_WIDTH] = [0.5, 0.8, 0.2, 0.6];
  /// Logical (host-native) sample values for the α plane of the fixture.
  const A_SAMPLES: [f32; FIXTURE_WIDTH] = [0.2, 0.4, 0.6, 0.8];

  /// Converts f32 samples into a host-native `half::f16` row.
  fn f16_row(samples: &[f32]) -> std::vec::Vec<half::f16> {
    samples.iter().map(|&v| half::f16::from_f32(v)).collect()
  }

  /// Raw bit patterns of a row, for comparisons that must not go through
  /// float equality (NaN never compares equal to itself).
  fn bits_of(row: &[half::f16]) -> std::vec::Vec<u16> {
    row.iter().map(|v| v.to_bits()).collect()
  }

  // ---- helpers: host-independent f16 LE / BE byte-storage encoders ----------

  /// Re-encode a host-native f16 slice as LE-encoded byte storage. Kernels
  /// called with `BE = false` recover the host-native logical bit pattern via
  /// `u16::from_le` on both LE (no-op) and BE (byte-swap) hosts.
  fn as_le_f16(host: &[half::f16]) -> std::vec::Vec<half::f16> {
    host
      .iter()
      .map(|v| half::f16::from_bits(u16::from_ne_bytes(v.to_bits().to_le_bytes())))
      .collect()
  }

  /// Mirror of `as_le_f16` for kernels invoked with `BE = true`.
  fn as_be_f16(host: &[half::f16]) -> std::vec::Vec<half::f16> {
    host
      .iter()
      .map(|v| half::f16::from_bits(u16::from_ne_bytes(v.to_bits().to_be_bytes())))
      .collect()
  }

  // -- Scalar references for the BE-parity tests --
  //
  // Walk host-native `intended` planes and reproduce each kernel's pure
  // gather-scatter behaviour without any byte-order conversion. Pinning the
  // LE / BE outputs against these absolute references prevents the parity
  // assertion from passing in lock-step on two equally corrupt decodes.

  fn ref_gbrpf16_to_rgb_f16(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    width: usize,
  ) -> std::vec::Vec<half::f16> {
    (0..width).flat_map(|x| [r[x], g[x], b[x]]).collect()
  }

  fn ref_gbrpf16_to_rgba_f16(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    width: usize,
  ) -> std::vec::Vec<half::f16> {
    let one = half::f16::from_f32(1.0);
    (0..width).flat_map(|x| [r[x], g[x], b[x], one]).collect()
  }

  fn ref_gbrapf16_to_rgba_f16(
    g: &[half::f16],
    b: &[half::f16],
    r: &[half::f16],
    a: &[half::f16],
    width: usize,
  ) -> std::vec::Vec<half::f16> {
    (0..width).flat_map(|x| [r[x], g[x], b[x], a[x]]).collect()
  }

  fn ref_copy_alpha_plane_f16(
    intended_alpha: &[half::f16],
    fill: half::f16,
    width: usize,
  ) -> std::vec::Vec<half::f16> {
    let mut out = std::vec![fill; width * 4];
    for (px, &alpha) in out.chunks_exact_mut(4).zip(&intended_alpha[..width]) {
      px[3] = alpha;
    }
    out
  }

  // ---- gbrpf16_to_rgb_f16_row ----------------------------------------------

  #[test]
  #[cfg_attr(
    miri,
    ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri"
  )]
  fn gbrpf16_to_rgb_f16_channel_reorder() {
    // G=0.25, B=0.5, R=1.0 → packed R=1.0, G=0.25, B=0.5
    let g = [half::f16::from_f32(0.25)];
    let b = [half::f16::from_f32(0.5)];
    let r = [half::f16::from_f32(1.0)];
    let mut out = std::vec![half::f16::ZERO; 3];
    gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut out, 1);
    assert_eq!(out[0], half::f16::from_f32(1.0), "R");
    assert_eq!(out[1], half::f16::from_f32(0.25), "G");
    assert_eq!(out[2], half::f16::from_f32(0.5), "B");
  }

  #[test]
  #[cfg_attr(
    miri,
    ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri"
  )]
  fn gbrpf16_to_rgb_f16_hdr_preserved() {
    // HDR value 2.5 passes through losslessly.
    let hdr = half::f16::from_f32(2.5);
    let g = [hdr];
    let b = [half::f16::from_f32(0.0)];
    let r = [half::f16::from_f32(0.0)];
    let mut out = std::vec![half::f16::ZERO; 3];
    gbrpf16_to_rgb_f16_row::<false>(&g, &b, &r, &mut out, 1);
    assert_eq!(out[1], hdr, "HDR G preserved bit-exact");
  }

  #[test]
  #[cfg_attr(
    miri,
    ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri"
  )]
  fn gbrpf16_to_rgb_f16_be_parity() {
    // Build host-native intended planes; materialise as LE / BE byte storage
    // so each kernel's `from_le` / `from_be` recovers the same logical bits
    // on every host. Pin both outputs against an absolute scalar reference.
    let g_intended = f16_row(&G_SAMPLES);
    let b_intended = f16_row(&B_SAMPLES);
    let r_intended = f16_row(&R_SAMPLES);
    let mut le_out = std::vec![half::f16::ZERO; FIXTURE_WIDTH * 3];
    let mut be_out = std::vec![half::f16::ZERO; FIXTURE_WIDTH * 3];
    gbrpf16_to_rgb_f16_row::<false>(
      &as_le_f16(&g_intended),
      &as_le_f16(&b_intended),
      &as_le_f16(&r_intended),
      &mut le_out,
      FIXTURE_WIDTH,
    );
    gbrpf16_to_rgb_f16_row::<true>(
      &as_be_f16(&g_intended),
      &as_be_f16(&b_intended),
      &as_be_f16(&r_intended),
      &mut be_out,
      FIXTURE_WIDTH,
    );
    let expected = ref_gbrpf16_to_rgb_f16(&g_intended, &b_intended, &r_intended, FIXTURE_WIDTH);
    assert_eq!(le_out, expected, "LE path must match scalar reference");
    assert_eq!(be_out, expected, "BE path must match scalar reference");
    assert_eq!(be_out, le_out, "BE gbrpf16_to_rgb_f16_row must match LE");
  }

  // ---- gbrpf16_to_rgba_f16_row ---------------------------------------------

  #[test]
  #[cfg_attr(
    miri,
    ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri"
  )]
  fn gbrpf16_to_rgba_f16_alpha_is_one() {
    let g = [half::f16::from_f32(0.5)];
    let b = [half::f16::from_f32(0.5)];
    let r = [half::f16::from_f32(0.5)];
    let mut out = std::vec![half::f16::ZERO; 4];
    gbrpf16_to_rgba_f16_row::<false>(&g, &b, &r, &mut out, 1);
    assert_eq!(out[3], half::f16::from_f32(1.0), "alpha must be f16(1.0)");
  }

  #[test]
  #[cfg_attr(
    miri,
    ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri"
  )]
  fn gbrpf16_to_rgba_f16_be_parity() {
    let g_intended = f16_row(&G_SAMPLES);
    let b_intended = f16_row(&B_SAMPLES);
    let r_intended = f16_row(&R_SAMPLES);
    let mut le_out = std::vec![half::f16::ZERO; FIXTURE_WIDTH * 4];
    let mut be_out = std::vec![half::f16::ZERO; FIXTURE_WIDTH * 4];
    gbrpf16_to_rgba_f16_row::<false>(
      &as_le_f16(&g_intended),
      &as_le_f16(&b_intended),
      &as_le_f16(&r_intended),
      &mut le_out,
      FIXTURE_WIDTH,
    );
    gbrpf16_to_rgba_f16_row::<true>(
      &as_be_f16(&g_intended),
      &as_be_f16(&b_intended),
      &as_be_f16(&r_intended),
      &mut be_out,
      FIXTURE_WIDTH,
    );
    let expected = ref_gbrpf16_to_rgba_f16(&g_intended, &b_intended, &r_intended, FIXTURE_WIDTH);
    assert_eq!(le_out, expected, "LE path must match scalar reference");
    assert_eq!(be_out, expected, "BE path must match scalar reference");
    assert_eq!(be_out, le_out, "BE gbrpf16_to_rgba_f16_row must match LE");
  }

  // ---- gbrapf16_to_rgba_f16_row --------------------------------------------

  #[test]
  #[cfg_attr(
    miri,
    ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri"
  )]
  fn gbrapf16_to_rgba_f16_source_alpha_passthrough() {
    let g = [half::f16::from_f32(0.25)];
    let b = [half::f16::from_f32(0.5)];
    let r = [half::f16::from_f32(0.75)];
    let a = [half::f16::from_f32(0.9)];
    let mut out = std::vec![half::f16::ZERO; 4];
    gbrapf16_to_rgba_f16_row::<false>(&g, &b, &r, &a, &mut out, 1);
    assert_eq!(out[0], half::f16::from_f32(0.75), "R");
    assert_eq!(out[1], half::f16::from_f32(0.25), "G");
    assert_eq!(out[2], half::f16::from_f32(0.5), "B");
    assert_eq!(out[3], half::f16::from_f32(0.9), "A from source");
  }

  #[test]
  #[cfg_attr(
    miri,
    ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri"
  )]
  fn gbrapf16_to_rgba_f16_be_parity() {
    let g_intended = f16_row(&G_SAMPLES);
    let b_intended = f16_row(&B_SAMPLES);
    let r_intended = f16_row(&R_SAMPLES);
    let a_intended = f16_row(&A_SAMPLES);
    let mut le_out = std::vec![half::f16::ZERO; FIXTURE_WIDTH * 4];
    let mut be_out = std::vec![half::f16::ZERO; FIXTURE_WIDTH * 4];
    gbrapf16_to_rgba_f16_row::<false>(
      &as_le_f16(&g_intended),
      &as_le_f16(&b_intended),
      &as_le_f16(&r_intended),
      &as_le_f16(&a_intended),
      &mut le_out,
      FIXTURE_WIDTH,
    );
    gbrapf16_to_rgba_f16_row::<true>(
      &as_be_f16(&g_intended),
      &as_be_f16(&b_intended),
      &as_be_f16(&r_intended),
      &as_be_f16(&a_intended),
      &mut be_out,
      FIXTURE_WIDTH,
    );
    let expected = ref_gbrapf16_to_rgba_f16(
      &g_intended,
      &b_intended,
      &r_intended,
      &a_intended,
      FIXTURE_WIDTH,
    );
    assert_eq!(le_out, expected, "LE path must match scalar reference");
    assert_eq!(be_out, expected, "BE path must match scalar reference");
    assert_eq!(be_out, le_out, "BE gbrapf16_to_rgba_f16_row must match LE");
  }

  // ---- copy_alpha_plane_f16 ------------------------------------------------

  #[test]
  #[cfg_attr(
    miri,
    ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri"
  )]
  fn copy_alpha_plane_f16_only_writes_alpha_slot() {
    // LE-encoded fixture so the kernel's `from_le` recovers host-native values
    // on both LE (no-op) and BE (byte-swap) hosts.
    let host_alpha = [half::f16::from_f32(0.7), half::f16::from_f32(0.3)];
    let alpha = as_le_f16(&host_alpha);
    let sentinel = half::f16::from_f32(0.1);
    let mut rgba = std::vec![sentinel; 8];
    copy_alpha_plane_f16::<false>(&alpha, &mut rgba, 2);
    // Only slot 3 written; R, G, B slots (0, 1, 2) must be untouched.
    assert_eq!(rgba[0], sentinel, "R slot 0 untouched");
    assert_eq!(rgba[1], sentinel, "G slot 0 untouched");
    assert_eq!(rgba[2], sentinel, "B slot 0 untouched");
    assert_eq!(rgba[3], half::f16::from_f32(0.7), "A slot 0");
    assert_eq!(rgba[4], sentinel, "R slot 1 untouched");
    assert_eq!(rgba[5], sentinel, "G slot 1 untouched");
    assert_eq!(rgba[6], sentinel, "B slot 1 untouched");
    assert_eq!(rgba[7], half::f16::from_f32(0.3), "A slot 1");
  }

  /// BE parity for `copy_alpha_plane_f16`: byte-swapping the bits of every
  /// f16 in the source α plane and toggling `BE` must produce identical
  /// output. Mirrors the f32 alpha-patch endian-aware fix.
  #[test]
  #[cfg_attr(
    miri,
    ignore = "half::f16 uses inline assembly on aarch64 unsupported by Miri"
  )]
  fn copy_alpha_plane_f16_be_parity_with_swapped_buffer() {
    // Build a single host-native `intended` α plane; materialise as LE / BE
    // byte storage so each kernel's `from_le` / `from_be` recovers the same
    // host-native bits on every host. Pin both outputs against an absolute
    // scalar reference (compared bitwise to be NaN-safe).
    let mut intended = f16_row(&[0.0, 0.25, 0.5, 1.0, 2.5, -1.0]);
    // Non-finite values exercise the bit-exact pass-through the kernel
    // documents; they are the reason the comparison below is on raw bits.
    intended.extend([
      half::f16::NAN,
      half::f16::INFINITY,
      half::f16::NEG_INFINITY,
    ]);
    let width = intended.len();
    let mut rgba_le = std::vec![half::f16::ZERO; width * 4];
    let mut rgba_be = std::vec![half::f16::ZERO; width * 4];
    copy_alpha_plane_f16::<false>(&as_le_f16(&intended), &mut rgba_le, width);
    copy_alpha_plane_f16::<true>(&as_be_f16(&intended), &mut rgba_be, width);
    let expected = ref_copy_alpha_plane_f16(&intended, half::f16::ZERO, width);
    let bits_le = bits_of(&rgba_le);
    let bits_be = bits_of(&rgba_be);
    let bits_expected = bits_of(&expected);
    assert_eq!(
      bits_le, bits_expected,
      "LE path must match scalar reference"
    );
    assert_eq!(
      bits_be, bits_expected,
      "BE path must match scalar reference"
    );
    assert_eq!(
      bits_le, bits_be,
      "BE flag + bit-swapped buffer must match LE path bit-for-bit"
    );
  }
}