// colconv 0.1.0 - Docs.rs

//! Scalar reference kernels for the Tier 12 (DCP / Xyz12) source.
//!
//! Pipeline (per-pixel):
//!
//! ```text
//! xyz_u12  →  xyz_linear (f32)  →  rgb_linear (f32) via M_xyz_to_rgb
//!         →  rgb_gamma (f32) via OETF  →  bgr_u8 / rgb_u8 / etc
//! ```
//!
//! Steps:
//!
//! 1. SMPTE ST 428-1 §8 inverse-OETF:
//!    `xyz_lin = (x_u12 / 4095)^2.6 / 0.91653`. Applied to each X/Y/Z
//!    sample independently.
//! 2. 3x3 matmul against the active gamut's `M_xyz_to_rgb` constant.
//! 3. sRGB-shape OETF (12.92 linear segment + `1.055 * c^(1/2.4) -
//!    0.055` upper segment). Skipped for f32-output paths
//!    (`xyz12_to_rgb_f32_row` / `xyz12_to_xyz_f32_row`).
//! 4. Range scale + integer narrow with round-half-up for u8 / u16
//!    outputs; f16 outputs instead clamp to `[0, 1]` and narrow via
//!    `f16::from_f32`.
//!
//! All kernels that read XYZ12 samples are const-generic over
//! `BE: bool` for source endianness; the branch that matches the host's
//! byte order (`BE = false` on little-endian targets) is a compile-time
//! no-op.

use crate::DcpTargetGamut;

use super::xyz12_constants::{
  INV_4095, OETF_POLY_COEFFS, OETF_POLY_DEGREE, OETF_POLY_SEG_BOUNDS, OETF_POLY_SEG_CENTERS,
  OETF_POLY_SEGMENTS, SAMPLE_MASK, SMPTE428_INV_NORM, xyz_to_rgb_matrix,
};

/// Portable `f32` power function: returns `x` raised to the power `y`.
///
/// At most one of the two `cfg` arms in the body is compiled in:
///
/// * `std` enabled — forwards to the inherent `f32::powf`.
/// * `std` disabled, `alloc` enabled — forwards to `libm::powf` (the
///   `libm` dependency is gated by the `alloc` feature in the crate's
///   `Cargo.toml`).
///
/// `inline(always)` is applied unless `cfg(tarpaulin)` is set.
///
/// NOTE(review): with neither `std` nor `alloc` enabled, no arm is
/// compiled and the body has no tail expression, so the build fails with
/// a type error — presumably the crate requires at least one of the two
/// features; confirm against `Cargo.toml`.
#[cfg_attr(not(tarpaulin), inline(always))]
fn powf32(x: f32, y: f32) -> f32 {
  // `std` build: use the standard library's implementation.
  #[cfg(feature = "std")]
  {
    f32::powf(x, y)
  }
  // `no_std + alloc` build: same operation through the `libm` crate.
  #[cfg(all(not(feature = "std"), feature = "alloc"))]
  {
    libm::powf(x, y)
  }
}

/// Test-only `f64` power helper used by the `oetf_srgb_reference_f64`
/// test oracle: returns `x` raised to the power `y`.
///
/// Compiled **only** under `cfg(all(test, feature = "std"))`. The xyz12
/// `mod tests;` declaration at the bottom of this file is gated on
/// `feature = "std"`, so no-default-features (`alloc`-only) builds never
/// reach this fn — and the inner `cfg(all(not(feature = "std"), ...))`
/// libm branch is therefore dead code in practice (kept as a
/// compile-time fallback in case the outer test gate is ever relaxed
/// to `any(feature = "std", feature = "alloc")` to match `powf32`).
///
/// Production `oetf_srgb` uses the polynomial table in
/// `xyz12_constants`, not `powf`, so this helper is only needed in the
/// test harness.
#[cfg(all(test, feature = "std"))]
#[cfg_attr(not(tarpaulin), inline(always))]
fn powf64(x: f64, y: f64) -> f64 {
  // The only arm reachable under the current outer gate.
  #[cfg(feature = "std")]
  {
    f64::powf(x, y)
  }
  // Unreachable while the outer gate requires `std`; see the doc comment.
  #[cfg(all(not(feature = "std"), feature = "alloc"))]
  {
    libm::pow(x, y)
  }
}
// Helpers — kept `pub(crate)` so SIMD backends can re-use the OETF
// formula in their scalar tail / scalar-`powf` lanes.
/// Decodes one wire-format XYZ12 sample into its active 12-bit code.
///
/// `s` holds the two bytes of the sample exactly as they appear in the
/// source buffer. They are interpreted as big-endian when `BE` is `true`
/// and as little-endian otherwise.
///
/// FFmpeg describes `AV_PIX_FMT_XYZ12LE` / `AV_PIX_FMT_XYZ12BE` as "the
/// same as RGB48LE/BE, but the lower 4 bits of each component are zero":
/// the active code occupies bits `[15:4]` of each `u16`, not bits
/// `[11:0]`. The decoded word is therefore shifted right by 4, and the
/// result is AND-ed with `SAMPLE_MASK` as a defensive 12-bit guard.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn read_xyz12_sample<const BE: bool>(s: u16) -> u16 {
  // Recover the in-memory byte pair, then decode it with the requested
  // endianness; the arm matching the host order is the identity.
  let wire = s.to_ne_bytes();
  let native = if BE {
    u16::from_be_bytes(wire)
  } else {
    u16::from_le_bytes(wire)
  };
  SAMPLE_MASK & (native >> 4)
}

/// SMPTE ST 428-1 §8 inverse OETF: 12-bit code → linear XYZ component
/// as `f32`, i.e. `xyz_lin = (x_u12 / 4095)^2.6 / 0.91653`.
///
/// `x_u12` is expected to be the **active 12-bit code** (`0..=4095`),
/// already extracted from the high-bit-packed wire `u16` (scalar callers
/// go through `read_xyz12_sample`; SIMD backends apply their own `>> 4`
/// in the load path). The value is still AND-ed with `SAMPLE_MASK` here
/// as a belt-and-braces guard against a caller passing an unshifted
/// word.
///
/// The division by 4095 and by the normalisation constant are done as
/// multiplications by `INV_4095` and `SMPTE428_INV_NORM` (presumably the
/// precomputed reciprocals — see `xyz12_constants`).
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn smpte428_inverse_oetf(x_u12: u16) -> f32 {
  let code = x_u12 & SAMPLE_MASK;
  let unit = f32::from(code) * INV_4095;
  let powered = powf32(unit, 2.6_f32);
  powered * SMPTE428_INV_NORM
}

/// Multiplies the 3x3 matrix `m` (row-major) by the column vector `xyz`
/// and returns the resulting three components — linear XYZ in, linear
/// RGB out.
///
/// Each output component is the dot product of one matrix row with
/// `xyz`, accumulated left to right.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn matmul3_xyz_rgb(m: &[[f32; 3]; 3], xyz: [f32; 3]) -> [f32; 3] {
  let [x, y, z] = xyz;
  // One dot product per row; the summation order is kept fixed so the
  // f32 rounding matches bit for bit.
  m.map(|[mx, my, mz]| mx * x + my * y + mz * z)
}

/// sRGB-shape OETF: linear channel value → gamma-encoded value.
///
/// This is the production entry point used by every integer-output
/// scalar kernel, and the per-lane scalar fall-through for the SIMD
/// backends. It is implemented with the piecewise-minimax polynomial
/// (degree 3, 192 segments) generated by
/// `examples/derive_oetf_polynomial.rs`.
///
/// * `c < 0.0031308` — linear toe, `12.92 * c`.
/// * `c >= 0.0031308` — piecewise polynomial evaluated from the
///   `OETF_POLY_*` tables in `xyz12_constants`.
///
/// Reference target (B' decision): the f64-narrowed sRGB OETF
/// `(1.055_f64 * (c as f64).powf(1/2.4) - 0.055) as f32`. The polynomial
/// matches that reference within ≤ 2 ULP at 65 536 sweep points across
/// `[0.0031308, 1.0]`, as checked by
/// `oetf_srgb_polynomial_within_2_ulp_of_reference`. That reference is
/// closer to the mathematically exact sRGB OETF than pure-f32
/// `f32::powf` (~2 ULP off and platform-dependent), and the polynomial
/// also avoids a per-pixel `powf`.
///
/// Scalar↔SIMD parity is 0 ULP by construction: both paths evaluate the
/// same polynomial, in the same order, against the same coefficient
/// tables.
///
/// Out-of-range inputs are not clamped here: `c < 0` yields a small
/// negative value through the linear toe, and `c > 1` extrapolates the
/// top segment's polynomial to somewhat above 1. Callers clamp at the
/// integer narrow (`narrow_unit_to_u{8,16}`) or before the f16 cast.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn oetf_srgb(c: f32) -> f32 {
  if c < 0.0031308_f32 {
    return 12.92_f32 * c;
  }
  // Segment selection: the highest index `i` with `c >= bound[i]`,
  // found by scanning the bounds table from the top down. Falls back to
  // segment 0 when no bound matches (e.g. NaN input). The scan is at
  // most `OETF_POLY_SEGMENTS` compares; SIMD backends can replace it
  // with a compare-tree, the scalar path keeps the linear walk for
  // code-size simplicity.
  let seg_idx = (0..OETF_POLY_SEGMENTS)
    .rev()
    .find(|&i| c >= OETF_POLY_SEG_BOUNDS[i])
    .unwrap_or(0);
  let dx = c - OETF_POLY_SEG_CENTERS[seg_idx];
  // Each segment owns `OETF_POLY_DEGREE + 1` consecutive coefficients
  // (= 4 for degree 3), stored lowest order first.
  let first = seg_idx * (OETF_POLY_DEGREE + 1);
  // Centered Horner, highest order first:
  // `((c[d]·dx + c[d-1])·dx + ...)·dx + c[0]`.
  OETF_POLY_COEFFS[first..=first + OETF_POLY_DEGREE]
    .iter()
    .rev()
    .fold(0.0_f32, |acc, &coeff| acc * dx + coeff)
}

/// Narrows a unit-range `f32` to `u8` with round-half-up.
///
/// The input is clamped to `[0, 1]`, scaled by 255, biased by `0.5`, and
/// truncated by the `as u8` cast — i.e.
/// `(c.clamp(0, 1) * 255 + 0.5) as u8`. A NaN input passes through both
/// clamps as NaN and becomes `0` in the cast.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn narrow_unit_to_u8(c: f32) -> u8 {
  const FULL_SCALE: f32 = u8::MAX as f32;
  let unit = c.clamp(0.0, 1.0);
  let biased = unit * FULL_SCALE + 0.5;
  // Pull the `255.5` produced by `unit == 1.0` back into range before
  // the truncating cast.
  biased.clamp(0.0, FULL_SCALE) as u8
}

/// Narrows a unit-range `f32` to `u16` with round-half-up.
///
/// The input is clamped to `[0, 1]`, scaled by 65535, biased by `0.5`,
/// and truncated by the `as u16` cast — i.e.
/// `(c.clamp(0, 1) * 65535 + 0.5) as u16`. A NaN input passes through
/// both clamps as NaN and becomes `0` in the cast.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn narrow_unit_to_u16(c: f32) -> u16 {
  const FULL_SCALE: f32 = u16::MAX as f32;
  let unit = c.clamp(0.0, 1.0);
  let biased = unit * FULL_SCALE + 0.5;
  // Pull the `65535.5` produced by `unit == 1.0` back into range before
  // the truncating cast.
  biased.clamp(0.0, FULL_SCALE) as u16
}

/// Computes a single pixel's linear RGB from packed XYZ12 input.
/// Steps 1 + 2 of the pipeline (inverse-OETF + matmul). Used by every
/// downstream output kernel.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_pixel_to_rgb_linear<const BE: bool>(
  m: &[[f32; 3]; 3],
  triple: &[u16; 3],
) -> [f32; 3] {
  let x = smpte428_inverse_oetf(read_xyz12_sample::<BE>(triple[0]));
  let y = smpte428_inverse_oetf(read_xyz12_sample::<BE>(triple[1]));
  let z = smpte428_inverse_oetf(read_xyz12_sample::<BE>(triple[2]));
  matmul3_xyz_rgb(m, [x, y, z])
}

/// Converts one packed XYZ12 pixel to linear XYZ — pipeline step 1 only.
///
/// Each of the three wire-format samples in `triple` is decoded with
/// `read_xyz12_sample::<BE>` and passed through `smpte428_inverse_oetf`.
/// Used by `xyz12_to_xyz_f32_row` for lossless XYZ pass-through.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_pixel_to_xyz_linear<const BE: bool>(triple: &[u16; 3]) -> [f32; 3] {
  (*triple).map(|wire| smpte428_inverse_oetf(read_xyz12_sample::<BE>(wire)))
}

// Per-output kernels.
/// XYZ12 → packed RGB (u8), one row.
///
/// Full pipeline per pixel: inverse-OETF + 3x3 matmul
/// (`xyz12_pixel_to_rgb_linear`), sRGB-shape OETF (`oetf_srgb`), then
/// clamp + x255 + round-half-up (`narrow_unit_to_u8`).
///
/// * `xyz` — source samples, three wire-format `u16` per pixel; must
///   hold at least `width * 3` elements.
/// * `rgb_out` — destination, three `u8` per pixel; must hold at least
///   `width * 3` elements. Elements past `width * 3` are left untouched.
/// * `width` — number of pixels to convert.
/// * `target_gamut` — selects the XYZ→RGB matrix via
///   `xyz_to_rgb_matrix`.
///
/// # Panics
///
/// Panics if either slice is shorter than `width * 3`. The length check
/// happens once, before the loop, so a short buffer is rejected before
/// any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_to_rgb_row<const BE: bool>(
  xyz: &[u16],
  rgb_out: &mut [u8],
  width: usize,
  target_gamut: DcpTargetGamut,
) {
  debug_assert!(xyz.len() >= width * 3, "xyz row too short");
  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
  let m = xyz_to_rgb_matrix(target_gamut);
  // Trim both rows to exactly `width` pixels up front: one bounds check
  // instead of one per element access.
  let src = &xyz[..width * 3];
  let dst = &mut rgb_out[..width * 3];
  for (px_in, px_out) in src.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
    let rgb_lin = xyz12_pixel_to_rgb_linear::<BE>(&m, &[px_in[0], px_in[1], px_in[2]]);
    for (channel_out, lin) in px_out.iter_mut().zip(rgb_lin) {
      *channel_out = narrow_unit_to_u8(oetf_srgb(lin));
    }
  }
}

/// XYZ12 → packed RGBA (u8), one row.
///
/// Same colour pipeline as the packed-RGB u8 kernel (inverse-OETF +
/// matmul + sRGB-shape OETF + round-half-up narrow); the alpha channel
/// is forced to `0xFF`.
///
/// * `xyz` — source samples, three wire-format `u16` per pixel; must
///   hold at least `width * 3` elements.
/// * `rgba_out` — destination, four `u8` per pixel; must hold at least
///   `width * 4` elements. Elements past `width * 4` are left untouched.
/// * `width` — number of pixels to convert.
/// * `target_gamut` — selects the XYZ→RGB matrix via
///   `xyz_to_rgb_matrix`.
///
/// # Panics
///
/// Panics if `xyz` is shorter than `width * 3` or `rgba_out` is shorter
/// than `width * 4`. The length check happens once, before the loop, so
/// a short buffer is rejected before any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_to_rgba_row<const BE: bool>(
  xyz: &[u16],
  rgba_out: &mut [u8],
  width: usize,
  target_gamut: DcpTargetGamut,
) {
  debug_assert!(xyz.len() >= width * 3, "xyz row too short");
  debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
  let m = xyz_to_rgb_matrix(target_gamut);
  // Trim both rows to exactly `width` pixels up front: one bounds check
  // instead of one per element access.
  let src = &xyz[..width * 3];
  let dst = &mut rgba_out[..width * 4];
  for (px_in, px_out) in src.chunks_exact(3).zip(dst.chunks_exact_mut(4)) {
    let rgb_lin = xyz12_pixel_to_rgb_linear::<BE>(&m, &[px_in[0], px_in[1], px_in[2]]);
    // The zip stops after the three colour channels; alpha is set below.
    for (channel_out, lin) in px_out.iter_mut().zip(rgb_lin) {
      *channel_out = narrow_unit_to_u8(oetf_srgb(lin));
    }
    px_out[3] = 0xFF;
  }
}

/// XYZ12 → packed RGB (u16), one row.
///
/// Full pipeline per pixel: inverse-OETF + 3x3 matmul
/// (`xyz12_pixel_to_rgb_linear`), sRGB-shape OETF (`oetf_srgb`), then
/// full-range `[0, 1] x 65535` scaling with round-half-up
/// (`narrow_unit_to_u16`).
///
/// * `xyz` — source samples, three wire-format `u16` per pixel; must
///   hold at least `width * 3` elements.
/// * `rgb_out` — destination, three `u16` per pixel; must hold at least
///   `width * 3` elements. Elements past `width * 3` are left untouched.
/// * `width` — number of pixels to convert.
/// * `target_gamut` — selects the XYZ→RGB matrix via
///   `xyz_to_rgb_matrix`.
///
/// # Panics
///
/// Panics if either slice is shorter than `width * 3`. The length check
/// happens once, before the loop, so a short buffer is rejected before
/// any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_to_rgb_u16_row<const BE: bool>(
  xyz: &[u16],
  rgb_out: &mut [u16],
  width: usize,
  target_gamut: DcpTargetGamut,
) {
  debug_assert!(xyz.len() >= width * 3, "xyz row too short");
  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
  let m = xyz_to_rgb_matrix(target_gamut);
  // Trim both rows to exactly `width` pixels up front: one bounds check
  // instead of one per element access.
  let src = &xyz[..width * 3];
  let dst = &mut rgb_out[..width * 3];
  for (px_in, px_out) in src.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
    let rgb_lin = xyz12_pixel_to_rgb_linear::<BE>(&m, &[px_in[0], px_in[1], px_in[2]]);
    for (channel_out, lin) in px_out.iter_mut().zip(rgb_lin) {
      *channel_out = narrow_unit_to_u16(oetf_srgb(lin));
    }
  }
}

/// XYZ12 → packed RGBA (u16), one row.
///
/// Same colour pipeline as the packed-RGB u16 kernel (inverse-OETF +
/// matmul + sRGB-shape OETF + full-range round-half-up narrow); the
/// alpha channel is forced to `0xFFFF`.
///
/// * `xyz` — source samples, three wire-format `u16` per pixel; must
///   hold at least `width * 3` elements.
/// * `rgba_out` — destination, four `u16` per pixel; must hold at least
///   `width * 4` elements. Elements past `width * 4` are left untouched.
/// * `width` — number of pixels to convert.
/// * `target_gamut` — selects the XYZ→RGB matrix via
///   `xyz_to_rgb_matrix`.
///
/// # Panics
///
/// Panics if `xyz` is shorter than `width * 3` or `rgba_out` is shorter
/// than `width * 4`. The length check happens once, before the loop, so
/// a short buffer is rejected before any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_to_rgba_u16_row<const BE: bool>(
  xyz: &[u16],
  rgba_out: &mut [u16],
  width: usize,
  target_gamut: DcpTargetGamut,
) {
  debug_assert!(xyz.len() >= width * 3, "xyz row too short");
  debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
  let m = xyz_to_rgb_matrix(target_gamut);
  // Trim both rows to exactly `width` pixels up front: one bounds check
  // instead of one per element access.
  let src = &xyz[..width * 3];
  let dst = &mut rgba_out[..width * 4];
  for (px_in, px_out) in src.chunks_exact(3).zip(dst.chunks_exact_mut(4)) {
    let rgb_lin = xyz12_pixel_to_rgb_linear::<BE>(&m, &[px_in[0], px_in[1], px_in[2]]);
    // The zip stops after the three colour channels; alpha is set below.
    for (channel_out, lin) in px_out.iter_mut().zip(rgb_lin) {
      *channel_out = narrow_unit_to_u16(oetf_srgb(lin));
    }
    px_out[3] = 0xFFFF;
  }
}

/// XYZ12 → packed linear RGB (f32), one row.
///
/// Runs only pipeline steps 1 + 2 (`xyz12_pixel_to_rgb_linear`:
/// inverse-OETF + 3x3 matmul). **No OETF, no clamp** — out-of-gamut
/// negative R/G/B and values above 1 are stored exactly as computed.
///
/// * `xyz` — source samples, three wire-format `u16` per pixel; must
///   hold at least `width * 3` elements.
/// * `rgb_out` — destination, three `f32` per pixel; must hold at least
///   `width * 3` elements. Elements past `width * 3` are left untouched.
/// * `width` — number of pixels to convert.
/// * `target_gamut` — selects the XYZ→RGB matrix via
///   `xyz_to_rgb_matrix`.
///
/// # Panics
///
/// Panics if either slice is shorter than `width * 3`. The length check
/// happens once, before the loop, so a short buffer is rejected before
/// any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_to_rgb_f32_row<const BE: bool>(
  xyz: &[u16],
  rgb_out: &mut [f32],
  width: usize,
  target_gamut: DcpTargetGamut,
) {
  debug_assert!(xyz.len() >= width * 3, "xyz row too short");
  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
  let m = xyz_to_rgb_matrix(target_gamut);
  // Trim both rows to exactly `width` pixels up front: one bounds check
  // instead of one per element access.
  let src = &xyz[..width * 3];
  let dst = &mut rgb_out[..width * 3];
  for (px_in, px_out) in src.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
    let rgb_lin = xyz12_pixel_to_rgb_linear::<BE>(&m, &[px_in[0], px_in[1], px_in[2]]);
    px_out.copy_from_slice(&rgb_lin);
  }
}

/// XYZ12 → packed linear XYZ (f32), one row.
///
/// Lossless XYZ pass-through: only step 1 of the pipeline (SMPTE ST
/// 428-1 inverse OETF, via `xyz12_pixel_to_xyz_linear`). No matrix, no
/// gamma, no clamp. Useful for callers that want to do their own gamut
/// conversion downstream.
///
/// * `xyz` — source samples, three wire-format `u16` per pixel; must
///   hold at least `width * 3` elements.
/// * `xyz_out` — destination, three `f32` per pixel; must hold at least
///   `width * 3` elements. Elements past `width * 3` are left untouched.
/// * `width` — number of pixels to convert.
///
/// # Panics
///
/// Panics if either slice is shorter than `width * 3`. The length check
/// happens once, before the loop, so a short buffer is rejected before
/// any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_to_xyz_f32_row<const BE: bool>(xyz: &[u16], xyz_out: &mut [f32], width: usize) {
  debug_assert!(xyz.len() >= width * 3, "xyz row too short");
  debug_assert!(xyz_out.len() >= width * 3, "xyz_out row too short");
  // Trim both rows to exactly `width` pixels up front: one bounds check
  // instead of one per element access.
  let src = &xyz[..width * 3];
  let dst = &mut xyz_out[..width * 3];
  for (px_in, px_out) in src.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
    let xyz_lin = xyz12_pixel_to_xyz_linear::<BE>(&[px_in[0], px_in[1], px_in[2]]);
    px_out.copy_from_slice(&xyz_lin);
  }
}

/// XYZ12 → packed RGB (f16), one row.
///
/// Same pipeline as the u8 kernel up to and including the sRGB-shape
/// OETF; the result is then clamped to `[0, 1]` (per the integer-output
/// convention) and narrowed with `f16::from_f32` (IEEE-754 RNE).
///
/// * `xyz` — source samples, three wire-format `u16` per pixel; must
///   hold at least `width * 3` elements.
/// * `rgb_out` — destination, three `f16` per pixel; must hold at least
///   `width * 3` elements. Elements past `width * 3` are left untouched.
/// * `width` — number of pixels to convert.
/// * `target_gamut` — selects the XYZ→RGB matrix via
///   `xyz_to_rgb_matrix`.
///
/// # Panics
///
/// Panics if either slice is shorter than `width * 3`. The length check
/// happens once, before the loop, so a short buffer is rejected before
/// any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_to_rgb_f16_row<const BE: bool>(
  xyz: &[u16],
  rgb_out: &mut [half::f16],
  width: usize,
  target_gamut: DcpTargetGamut,
) {
  debug_assert!(xyz.len() >= width * 3, "xyz row too short");
  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
  let m = xyz_to_rgb_matrix(target_gamut);
  // Trim both rows to exactly `width` pixels up front: one bounds check
  // instead of one per element access.
  let src = &xyz[..width * 3];
  let dst = &mut rgb_out[..width * 3];
  for (px_in, px_out) in src.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
    let rgb_lin = xyz12_pixel_to_rgb_linear::<BE>(&m, &[px_in[0], px_in[1], px_in[2]]);
    for (channel_out, lin) in px_out.iter_mut().zip(rgb_lin) {
      *channel_out = half::f16::from_f32(oetf_srgb(lin).clamp(0.0, 1.0));
    }
  }
}

/// XYZ12 → packed RGBA (f16), one row.
///
/// Same colour pipeline as the packed-RGB f16 kernel (inverse-OETF +
/// matmul + sRGB-shape OETF + `[0, 1]` clamp + `f16::from_f32` narrow);
/// the alpha channel is forced to `1.0_f16`.
///
/// * `xyz` — source samples, three wire-format `u16` per pixel; must
///   hold at least `width * 3` elements.
/// * `rgba_out` — destination, four `f16` per pixel; must hold at least
///   `width * 4` elements. Elements past `width * 4` are left untouched.
/// * `width` — number of pixels to convert.
/// * `target_gamut` — selects the XYZ→RGB matrix via
///   `xyz_to_rgb_matrix`.
///
/// # Panics
///
/// Panics if `xyz` is shorter than `width * 3` or `rgba_out` is shorter
/// than `width * 4`. The length check happens once, before the loop, so
/// a short buffer is rejected before any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_to_rgba_f16_row<const BE: bool>(
  xyz: &[u16],
  rgba_out: &mut [half::f16],
  width: usize,
  target_gamut: DcpTargetGamut,
) {
  debug_assert!(xyz.len() >= width * 3, "xyz row too short");
  debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
  let m = xyz_to_rgb_matrix(target_gamut);
  let one_f16 = half::f16::from_f32(1.0);
  // Trim both rows to exactly `width` pixels up front: one bounds check
  // instead of one per element access.
  let src = &xyz[..width * 3];
  let dst = &mut rgba_out[..width * 4];
  for (px_in, px_out) in src.chunks_exact(3).zip(dst.chunks_exact_mut(4)) {
    let rgb_lin = xyz12_pixel_to_rgb_linear::<BE>(&m, &[px_in[0], px_in[1], px_in[2]]);
    // The zip stops after the three colour channels; alpha is set below.
    for (channel_out, lin) in px_out.iter_mut().zip(rgb_lin) {
      *channel_out = half::f16::from_f32(oetf_srgb(lin).clamp(0.0, 1.0));
    }
    px_out[3] = one_f16;
  }
}

// XYZ12-specific RGB → luma helpers.
//
// Routing the `with_luma` / `with_luma_u16` paths through the YUV-leaning
// `ColorMatrix` enum (BT.709 for both DciP3 and Rec709 targets,
// BT.2020Ncl for Rec2020) biases luma for saturated colours under the
// DCI-P3 target — DCI-P3's perceptual brightness has its own weights
// derived from the DCI-white-pointed RGB→XYZ matrix Y row. These helpers
// take the gamut-derived Q15 weights directly (carried on
// `Xyz12Row::luma_q15()`), bypassing the `ColorMatrix` enum entirely.
//
// No SIMD path: luma cost (one Q15 multiply-add per channel) is dwarfed
// by the upstream per-pixel work — three scalar `powf` calls in the
// inverse-OETF, plus the matmul and the polynomial OETF — so
// vectorising luma here gives no measurable win.
/// XYZ12 luma kernel (u8 output), one row.
///
/// Computes `Y' = (k_r·R + k_g·G + k_b·B + 2^14) >> 15` for each packed
/// RGB pixel and clamps the result to `[0, 255]`. `luma_q15` carries the
/// gamut-matched Q15 coefficients `(k_r, k_g, k_b)` from
/// [`crate::source::luma_weights_q15_for_gamut`]. Output is full-range
/// Y' — XYZ12's gamma-encoded RGB is full-range by construction (no
/// studio-range concept).
///
/// * `rgb` — packed 8-bit RGB, three bytes per pixel; must hold at least
///   `width * 3` elements.
/// * `luma_out` — destination, one byte per pixel; must hold at least
///   `width` elements. Elements past `width` are left untouched.
/// * `width` — number of pixels to convert.
///
/// # Panics
///
/// Panics if `rgb` is shorter than `width * 3` or `luma_out` is shorter
/// than `width`. The length check happens once, before the loop, so a
/// short buffer is rejected before any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_rgb_to_luma_row(
  rgb: &[u8],
  luma_out: &mut [u8],
  width: usize,
  luma_q15: (i32, i32, i32),
) {
  debug_assert!(rgb.len() >= width * 3, "rgb row too short");
  debug_assert!(luma_out.len() >= width, "luma row too short");
  // Half of one Q15 unit: turns the `>> 15` into round-to-nearest.
  const RND: i32 = 1 << 14;
  let (k_r, k_g, k_b) = luma_q15;
  // Trim both rows to exactly `width` pixels up front: one bounds check
  // instead of one per element access.
  let src = &rgb[..width * 3];
  let dst = &mut luma_out[..width];
  for (px, y_out) in src.chunks_exact(3).zip(dst.iter_mut()) {
    let (r, g, b) = (i32::from(px[0]), i32::from(px[1]), i32::from(px[2]));
    let y = (k_r * r + k_g * g + k_b * b + RND) >> 15;
    *y_out = y.clamp(0, 255) as u8;
  }
}

/// XYZ12 luma kernel (u16 output), one row.
///
/// Y' is computed at u8 precision — the same
/// `(k_r·R + k_g·G + k_b·B + 2^14) >> 15` with a `[0, 255]` clamp as the
/// `with_luma` u8 path — and zero-extended to `u16`, preserving the same
/// `[0, 255]` dynamic range. This is the same convention as every other
/// `*_to_luma_u16_row` kernel in colconv.
///
/// * `rgb` — packed 8-bit RGB, three bytes per pixel; must hold at least
///   `width * 3` elements.
/// * `luma_out` — destination, one `u16` per pixel; must hold at least
///   `width` elements. Elements past `width` are left untouched.
/// * `width` — number of pixels to convert.
/// * `luma_q15` — Q15 luma weights `(k_r, k_g, k_b)`.
///
/// # Panics
///
/// Panics if `rgb` is shorter than `width * 3` or `luma_out` is shorter
/// than `width`. The length check happens once, before the loop, so a
/// short buffer is rejected before any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_rgb_to_luma_u16_row(
  rgb: &[u8],
  luma_out: &mut [u16],
  width: usize,
  luma_q15: (i32, i32, i32),
) {
  debug_assert!(rgb.len() >= width * 3, "rgb row too short");
  debug_assert!(luma_out.len() >= width, "luma row too short");
  // Half of one Q15 unit: turns the `>> 15` into round-to-nearest.
  const RND: i32 = 1 << 14;
  let (k_r, k_g, k_b) = luma_q15;
  // Trim both rows to exactly `width` pixels up front: one bounds check
  // instead of one per element access.
  let src = &rgb[..width * 3];
  let dst = &mut luma_out[..width];
  for (px, y_out) in src.chunks_exact(3).zip(dst.iter_mut()) {
    let (r, g, b) = (i32::from(px[0]), i32::from(px[1]), i32::from(px[2]));
    let y = (k_r * r + k_g * g + k_b * b + RND) >> 15;
    *y_out = y.clamp(0, 255) as u16;
  }
}

#[cfg(all(test, feature = "std"))]
mod tests;