colconv 0.1.0

SIMD-dispatched color-conversion kernels covering the FFmpeg AVPixelFormat space, with a Sink-based API so consumers pick which derived outputs (RGB / Luma / HSV / custom) they want without paying for the ones they don't.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
//! Scalar reference kernels for the AYUV64 packed YUV 4:4:4 16-bit
//! family (FFmpeg `AV_PIX_FMT_AYUV64LE`). Each pixel is a 4-u16
//! quadruple `A(16) ‖ Y(16) ‖ U(16) ‖ V(16)`.
//!
//! Source α is real (depth-converted u16 → u8 for u8 RGBA output;
//! written direct as u16 for u16 RGBA output). Type-distinct; no
//! α-as-padding sibling in scope.
//!
//! The `<ALPHA, ALPHA_SRC>` const-generic template is instantiated as
//! `<false, false>` (RGB-only, drops α) and `<true, true>` (RGBA with
//! source α). `<false, true>` is rejected at monomorphization.
//! `<true, false>` (force-max α) compiles but is unused — no AYUV64x
//! sibling.
//!
//! u8 output uses i32 chroma (output-range scaling keeps within i32);
//! u16 output uses **i64 chroma** via `q15_chroma64` (Q15 sums
//! overflow i32 at BITS=16/16, peak ~3.7e9 for BT.2020).
//!
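//! `<const BE: bool>` — when `true`, each `u16` element of the input
//! slice is decoded as big-endian before use (a byte-swap on
//! little-endian hosts, a no-op on big-endian hosts). This handles the
//! `AYUV64BE` big-endian wire format; `BE = false` decodes the elements
//! as little-endian and is the standard LE path.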

use super::*;

/// Split one endian-corrected AYUV64 quadruple into `(u, y, v, a)`.
///
/// The wire order is A, Y, U, V (slots 0 through 3), so alpha leads the
/// pixel (unlike VUYA, which carries A at slot 3). Every sample fills the
/// full 16 bits, so no right-shift is applied. Y, U and V are widened to
/// `i32`; alpha is returned as the raw `u16`.
///
/// Only the first four elements of `quad` are read. Byte-order correction
/// is the caller's job and must already have happened.
#[cfg_attr(not(tarpaulin), inline(always))]
const fn extract_ayuv64(quad: &[u16]) -> (i32, i32, i32, u16) {
  // Read the slots in wire order: A, Y, U, V.
  let (alpha, luma, cb, cr) = (quad[0], quad[1] as i32, quad[2] as i32, quad[3] as i32);
  // Hand them back in the (u, y, v, a) order the chroma pipeline expects.
  (cb, luma, cr, alpha)
}

/// Decode one raw AYUV64 sample from its wire byte order into a
/// host-native `u16`.
///
/// `BE = true` treats `v` as big-endian (`u16::from_be`); `BE = false`
/// treats it as little-endian (`u16::from_le`). Each conversion is a no-op
/// when the wire order already matches the host, so the result is correct
/// on both little- and big-endian hosts (e.g. s390x).
#[cfg_attr(not(tarpaulin), inline(always))]
fn load_ayuv64_u16<const BE: bool>(v: u16) -> u16 {
  match BE {
    true => u16::from_be(v),
    false => u16::from_le(v),
  }
}

// ---- u8 output (i32 chroma) --------------------------------------------

/// Scalar row kernel behind the 8-bit AYUV64 outputs: packed **RGB**
/// (3 bytes per pixel) or packed **RGBA** (4 bytes per pixel).
///
/// Const parameters:
///
/// - `ALPHA` — `true` emits a fourth α byte per pixel, `false` emits RGB
///   only.
/// - `ALPHA_SRC` — `true` fills that byte from the source α, narrowed
///   u16 → u8 with `>> 8` (high byte); `false` writes opaque `0xFF`
///   (unused — no AYUV64x sibling). `ALPHA_SRC = true` with
///   `ALPHA = false` is refused at compile time.
/// - `BE` — wire byte order forwarded to `load_ayuv64_u16`.
///
/// Pixel `n` is read from `packed[n*4]` (A), `packed[n*4+1]` (Y),
/// `packed[n*4+2]` (U) and `packed[n*4+3]` (V). Every channel is a full
/// 16-bit sample, so no shift is applied on load.
///
/// # Panics
///
/// Debug builds assert `packed.len() >= width * 4` and
/// `out.len() >= width * (if ALPHA { 4 } else { 3 })` before the loop.
/// Slice accesses are bounds-checked indexing, so an undersized slice
/// still panics without debug assertions once the loop reaches the
/// first out-of-range element.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn ayuv64_to_rgb_or_rgba_row<
  const ALPHA: bool,
  const ALPHA_SRC: bool,
  const BE: bool,
>(
  packed: &[u16],
  out: &mut [u8],
  width: usize,
  matrix: ColorMatrix,
  full_range: bool,
) {
  // A 3-byte RGB store has no slot for source alpha, so `ALPHA_SRC`
  // without `ALPHA` is rejected when the function is monomorphized.
  const { assert!(ALPHA || !ALPHA_SRC) };

  debug_assert!(packed.len() >= width * 4, "packed row too short");
  // Output stride in bytes: R, G, B plus an optional alpha byte.
  let bpp: usize = match ALPHA {
    true => 4,
    false => 3,
  };
  debug_assert!(out.len() >= width * bpp, "out row too short");

  let coeffs = Coefficients::for_matrix(matrix);
  let (y_off, y_scale, c_scale) = range_params_n::<16, 8>(full_range);
  let bias = chroma_bias::<16>();

  for px in 0..width {
    let src = px * 4;
    // Decode the four wire samples in slot order (A, Y, U, V).
    let samples = [0usize, 1, 2, 3].map(|slot| load_ayuv64_u16::<BE>(packed[src + slot]));
    let (cb, luma, cr, alpha) = extract_ayuv64(&samples);

    // Remove the chroma bias, apply the chroma scale, then mix the two
    // chroma samples into one contribution per output channel.
    let cb_scaled = q15_scale(cb - bias, c_scale);
    let cr_scaled = q15_scale(cr - bias, c_scale);
    let chroma_r = q15_chroma(coeffs.r_u(), cb_scaled, coeffs.r_v(), cr_scaled);
    let chroma_g = q15_chroma(coeffs.g_u(), cb_scaled, coeffs.g_v(), cr_scaled);
    let chroma_b = q15_chroma(coeffs.b_u(), cb_scaled, coeffs.b_v(), cr_scaled);

    // Remove the luma offset and apply the luma scale.
    let luma_scaled = q15_scale(luma - y_off, y_scale);

    let dst = px * bpp;
    out[dst] = clamp_u8(luma_scaled + chroma_r);
    out[dst + 1] = clamp_u8(luma_scaled + chroma_g);
    out[dst + 2] = clamp_u8(luma_scaled + chroma_b);
    if ALPHA {
      out[dst + 3] = match ALPHA_SRC {
        // Source alpha: keep the high byte of the 16-bit sample.
        true => (alpha >> 8) as u8,
        // Forced opaque (unused — no AYUV64x sibling).
        false => 0xFF,
      };
    }
  }
}

// ---- RGB / RGBA u8 thin wrappers ----------------------------------------

/// AYUV64 → packed **RGB** (3 bytes per pixel), scalar path.
///
/// Forwards to `ayuv64_to_rgb_or_rgba_row` with `ALPHA = false` and
/// `ALPHA_SRC = false`, so the source α sample is discarded.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn ayuv64_to_rgb_row<const BE: bool>(
  packed: &[u16],
  rgb_out: &mut [u8],
  width: usize,
  matrix: ColorMatrix,
  full_range: bool,
) {
  // No alpha channel in the output, hence no alpha source either.
  const WITH_ALPHA: bool = false;
  const ALPHA_FROM_SOURCE: bool = false;
  ayuv64_to_rgb_or_rgba_row::<WITH_ALPHA, ALPHA_FROM_SOURCE, BE>(
    packed, rgb_out, width, matrix, full_range,
  );
}

/// AYUV64 → packed **RGBA** (4 bytes per pixel), scalar path.
///
/// Forwards to `ayuv64_to_rgb_or_rgba_row` with `ALPHA = true` and
/// `ALPHA_SRC = true`: the A sample at slot 0 of each pixel quadruple is
/// narrowed u16 → u8 via `>> 8` and stored as the output α byte.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn ayuv64_to_rgba_row<const BE: bool>(
  packed: &[u16],
  rgba_out: &mut [u8],
  width: usize,
  matrix: ColorMatrix,
  full_range: bool,
) {
  // Emit an alpha channel and fill it from the source sample.
  const WITH_ALPHA: bool = true;
  const ALPHA_FROM_SOURCE: bool = true;
  ayuv64_to_rgb_or_rgba_row::<WITH_ALPHA, ALPHA_FROM_SOURCE, BE>(
    packed, rgba_out, width, matrix, full_range,
  );
}

// ---- u16 output (i64 chroma) -------------------------------------------

/// Scalar row kernel behind the 16-bit AYUV64 outputs: packed **RGB u16**
/// (3 x u16 per pixel) or packed **RGBA u16** (4 x u16 per pixel).
///
/// Const parameters:
///
/// - `ALPHA` — `true` emits a fourth α element per pixel, `false` emits
///   RGB only.
/// - `ALPHA_SRC` — `true` copies the source α u16 unchanged; `false`
///   writes opaque `0xFFFF` (unused — no AYUV64x sibling).
///   `ALPHA_SRC = true` with `ALPHA = false` is refused at compile time.
/// - `BE` — wire byte order forwarded to `load_ayuv64_u16`.
///
/// Chroma goes through `q15_chroma64` (**i64 chroma**) because at
/// BITS=16/16 the Q15 chroma sums exceed i32 range (peak ~3.7x10⁹ for
/// BT.2020-NCL at limited range).
///
/// # Panics
///
/// Debug builds assert `packed.len() >= width * 4` and
/// `out.len() >= width * (if ALPHA { 4 } else { 3 })` before the loop.
/// Slice accesses are bounds-checked indexing, so an undersized slice
/// still panics without debug assertions once the loop reaches the
/// first out-of-range element.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn ayuv64_to_rgb_u16_or_rgba_u16_row<
  const ALPHA: bool,
  const ALPHA_SRC: bool,
  const BE: bool,
>(
  packed: &[u16],
  out: &mut [u16],
  width: usize,
  matrix: ColorMatrix,
  full_range: bool,
) {
  // Source alpha needs an RGBA store to land in; reject the combination
  // that has nowhere to put it.
  const { assert!(ALPHA || !ALPHA_SRC) };

  debug_assert!(packed.len() >= width * 4, "packed row too short");
  // Output stride in u16 elements: R, G, B plus an optional alpha.
  let bpp: usize = match ALPHA {
    true => 4,
    false => 3,
  };
  debug_assert!(out.len() >= width * bpp, "out row too short");

  let coeffs = Coefficients::for_matrix(matrix);
  let (y_off, y_scale, c_scale) = range_params_n::<16, 16>(full_range);
  let bias = chroma_bias::<16>();

  for px in 0..width {
    let src = px * 4;
    // Decode the four wire samples in slot order (A, Y, U, V).
    let samples = [0usize, 1, 2, 3].map(|slot| load_ayuv64_u16::<BE>(packed[src + slot]));
    let (cb, luma, cr, alpha) = extract_ayuv64(&samples);

    // `q15_scale` yields i32 values; they are handed to `q15_chroma64`
    // as-is (same call shape as `q15_chroma`), which does the widening
    // to i64 itself.
    let cb_scaled = q15_scale(cb - bias, c_scale);
    let cr_scaled = q15_scale(cr - bias, c_scale);
    let chroma_r = q15_chroma64(coeffs.r_u(), cb_scaled, coeffs.r_v(), cr_scaled);
    let chroma_g = q15_chroma64(coeffs.g_u(), cb_scaled, coeffs.g_v(), cr_scaled);
    let chroma_b = q15_chroma64(coeffs.b_u(), cb_scaled, coeffs.b_v(), cr_scaled);

    // Luma takes the 64-bit scale helper: at BITS=16/16 limited range the
    // product (y - y_off) * y_scale can just exceed i32::MAX for
    // out-of-range inputs.
    let luma_scaled = q15_scale64(luma - y_off, y_scale);

    let dst = px * bpp;
    out[dst] = (luma_scaled + chroma_r).clamp(0, 0xFFFF) as u16;
    out[dst + 1] = (luma_scaled + chroma_g).clamp(0, 0xFFFF) as u16;
    out[dst + 2] = (luma_scaled + chroma_b).clamp(0, 0xFFFF) as u16;
    if ALPHA {
      out[dst + 3] = match ALPHA_SRC {
        // Source alpha is already 16-bit: copy it through untouched.
        true => alpha,
        // Forced opaque (unused — no AYUV64x sibling).
        false => 0xFFFF,
      };
    }
  }
}

// ---- RGB / RGBA u16 thin wrappers ---------------------------------------

/// AYUV64 → packed **RGB u16** (3 x u16 per pixel), scalar path.
///
/// Forwards to `ayuv64_to_rgb_u16_or_rgba_u16_row` with `ALPHA = false`
/// and `ALPHA_SRC = false`, so the source α sample is discarded.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn ayuv64_to_rgb_u16_row<const BE: bool>(
  packed: &[u16],
  rgb_out: &mut [u16],
  width: usize,
  matrix: ColorMatrix,
  full_range: bool,
) {
  // No alpha channel in the output, hence no alpha source either.
  const WITH_ALPHA: bool = false;
  const ALPHA_FROM_SOURCE: bool = false;
  ayuv64_to_rgb_u16_or_rgba_u16_row::<WITH_ALPHA, ALPHA_FROM_SOURCE, BE>(
    packed, rgb_out, width, matrix, full_range,
  );
}

/// AYUV64 → packed **RGBA u16** (4 x u16 per pixel), scalar path.
///
/// Forwards to `ayuv64_to_rgb_u16_or_rgba_u16_row` with `ALPHA = true`
/// and `ALPHA_SRC = true`: the A sample at slot 0 of each pixel quadruple
/// is copied to the output α element unchanged.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn ayuv64_to_rgba_u16_row<const BE: bool>(
  packed: &[u16],
  rgba_out: &mut [u16],
  width: usize,
  matrix: ColorMatrix,
  full_range: bool,
) {
  // Emit an alpha channel and fill it from the source sample.
  const WITH_ALPHA: bool = true;
  const ALPHA_FROM_SOURCE: bool = true;
  ayuv64_to_rgb_u16_or_rgba_u16_row::<WITH_ALPHA, ALPHA_FROM_SOURCE, BE>(
    packed, rgba_out, width, matrix, full_range,
  );
}

// ---- Luma extraction ---------------------------------------------------

/// Fill an 8-bit luma plane from an AYUV64 row.
///
/// For each of the `width` pixels the Y sample (slot 1 of the A, Y, U, V
/// quadruple) is decoded with `load_ayuv64_u16::<BE>` and its high byte
/// (`>> 8`) is stored in `luma_out`. A, U and V are never read.
///
/// Debug builds assert `packed.len() >= width * 4` and
/// `luma_out.len() >= width`.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn ayuv64_to_luma_row<const BE: bool>(
  packed: &[u16],
  luma_out: &mut [u8],
  width: usize,
) {
  debug_assert!(packed.len() >= width * 4, "packed row too short");
  debug_assert!(luma_out.len() >= width, "luma row too short");

  // Pixels are four u16 wide and Y sits one element into each pixel.
  let luma_at = |x: usize| load_ayuv64_u16::<BE>(packed[x * 4 + 1]);
  for x in 0..width {
    luma_out[x] = (luma_at(x) >> 8) as u8;
  }
}

/// Fill a 16-bit luma plane from an AYUV64 row.
///
/// For each of the `width` pixels the Y sample (slot 1 of the A, Y, U, V
/// quadruple) is decoded with `load_ayuv64_u16::<BE>` and stored in
/// `luma_out` as-is — the samples are 16-bit native, so no shift is
/// applied. A, U and V are never read.
///
/// Debug builds assert `packed.len() >= width * 4` and
/// `luma_out.len() >= width`.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn ayuv64_to_luma_u16_row<const BE: bool>(
  packed: &[u16],
  luma_out: &mut [u16],
  width: usize,
) {
  debug_assert!(packed.len() >= width * 4, "packed row too short");
  debug_assert!(luma_out.len() >= width, "luma row too short");

  // Pixels are four u16 wide and Y sits one element into each pixel.
  let luma_at = |x: usize| load_ayuv64_u16::<BE>(packed[x * 4 + 1]);
  for x in 0..width {
    luma_out[x] = luma_at(x);
  }
}

// ---- Tests -------------------------------------------------------------

#[cfg(all(test, feature = "std"))]
mod tests {
  use super::*;
  use crate::ColorMatrix;

  /// Build a 4-u16 AYUV64 pixel (host-native u16 quadruple) from explicit
  /// components, in wire slot order A, Y, U, V.
  fn pack_ayuv64(a: u16, y: u16, u: u16, v: u16) -> [u16; 4] {
    [a, y, u, v]
  }

  /// Re-encode a slice of host-native u16 values as LE-encoded byte storage,
  /// packed back into `Vec<u16>`. On LE host this is a no-op; on BE host
  /// every u16 is byte-swapped relative to its host-native representation.
  /// Kernels called with `BE = false` recover the intended logical values
  /// via `u16::from_le` on both hosts.
  fn as_le_u16(host: &[u16]) -> Vec<u16> {
    host
      .iter()
      .map(|v| u16::from_ne_bytes(v.to_le_bytes()))
      .collect()
  }

  /// Limited-range BT.709, neutral chroma U=V=32768.
  /// Black:  Y=4096  (limited-range black at 16-bit: 16 * 256 = 4096).
  /// White:  Y=60160 (limited-range white at 16-bit: 235 * 256 = 60160).
  #[test]
  fn ayuv64_known_pattern_rgb_limited_range() {
    let p_black = pack_ayuv64(0xFFFF, 4096, 32768, 32768);
    let p_white = pack_ayuv64(0xFFFF, 60160, 32768, 32768);
    let intended: Vec<u16> = [p_black, p_black, p_white, p_white]
      .iter()
      .flatten()
      .copied()
      .collect();
    let packed = as_le_u16(&intended);
    let mut out = vec![0u8; 4 * 3];
    ayuv64_to_rgb_row::<false>(&packed, &mut out, 4, ColorMatrix::Bt709, false);
    // Black pixels → [0, 0, 0]
    assert_eq!(&out[0..3], &[0u8, 0, 0], "black pixel 0");
    assert_eq!(&out[3..6], &[0u8, 0, 0], "black pixel 1");
    // White pixels → [255, 255, 255]
    assert_eq!(&out[6..9], &[255u8, 255, 255], "white pixel 2");
    assert_eq!(&out[9..12], &[255u8, 255, 255], "white pixel 3");
  }

  /// AYUV64 RGBA u8: source α = 0x42AB / 0x99CD must appear depth-converted
  /// (>> 8) as 0x42 / 0x99 in the output α channel.
  ///
  /// Alpha values are deliberately non-byte-palindromic (low byte ≠ high
  /// byte) so a BE host that fails to LE-encode the fixture would see the
  /// alpha decoded as 0xAB42 / 0xCD99 and the assertion would fail. This
  /// is the test's only sentinel for the alpha pass-through path under
  /// real LE byte storage — palindromic alpha (e.g. 0x4242) would mask a
  /// missing `as_le_u16` wrap because byte-swap is a no-op on palindromes.
  #[test]
  fn ayuv64_rgba_passes_source_alpha_depth_converted() {
    let p0 = pack_ayuv64(0x42AB, 60160, 32768, 32768);
    let p1 = pack_ayuv64(0x99CD, 60160, 32768, 32768);
    let intended: Vec<u16> = [p0, p1].iter().flatten().copied().collect();
    let packed = as_le_u16(&intended);
    let mut out = vec![0u8; 2 * 4];
    ayuv64_to_rgba_row::<false>(&packed, &mut out, 2, ColorMatrix::Bt709, false);
    assert_eq!(out[3], 0x42, "pixel 0 alpha (0x42AB >> 8 = 0x42)");
    assert_eq!(out[7], 0x99, "pixel 1 alpha (0x99CD >> 8 = 0x99)");
  }

  /// AYUV64 RGBA u16: source α = 0x42AB / 0x99CD must appear direct
  /// (no conversion) in the output α channel.
  ///
  /// Same non-palindromic-alpha rationale as the u8 variant above:
  /// asserting the full u16 alpha (not just one byte) plus LE-encoding
  /// the fixture turns this from a byte-vacuous check on BE into an
  /// end-to-end alpha pass-through verification under real LE storage.
  #[test]
  fn ayuv64_rgba_u16_passes_source_alpha_direct() {
    let p0 = pack_ayuv64(0x42AB, 60160, 32768, 32768);
    let p1 = pack_ayuv64(0x99CD, 60160, 32768, 32768);
    let intended: Vec<u16> = [p0, p1].iter().flatten().copied().collect();
    let packed = as_le_u16(&intended);
    let mut out = vec![0u16; 2 * 4];
    ayuv64_to_rgba_u16_row::<false>(&packed, &mut out, 2, ColorMatrix::Bt709, false);
    assert_eq!(out[3], 0x42AB, "pixel 0 alpha u16 direct");
    assert_eq!(out[7], 0x99CD, "pixel 1 alpha u16 direct");
  }

  /// Luma u8: Y at slot 1, extracted via >> 8 (high byte only).
  /// Y=0xFFFF → 0xFF; Y=0x4000 → 0x40.
  #[test]
  fn ayuv64_luma_extract_u8_high_byte() {
    let p0 = pack_ayuv64(0, 0xFFFF, 0, 0);
    let p1 = pack_ayuv64(0, 0x4000, 0, 0);
    let intended: Vec<u16> = [p0, p1].iter().flatten().copied().collect();
    let packed = as_le_u16(&intended);
    let mut out = vec![0u8; 2];
    ayuv64_to_luma_row::<false>(&packed, &mut out, 2);
    assert_eq!(&out[..], &[0xFFu8, 0x40], "luma u8 high-byte extract");
  }

  /// Luma u16: Y at slot 1, written direct (no shift).
  /// Y=0xABCD → 0xABCD; Y=0x1234 → 0x1234.
  #[test]
  fn ayuv64_luma_extract_u16_direct() {
    let p0 = pack_ayuv64(0, 0xABCD, 0, 0);
    let p1 = pack_ayuv64(0, 0x1234, 0, 0);
    let intended: Vec<u16> = [p0, p1].iter().flatten().copied().collect();
    let packed = as_le_u16(&intended);
    let mut out = vec![0u16; 2];
    ayuv64_to_luma_u16_row::<false>(&packed, &mut out, 2);
    assert_eq!(&out[..], &[0xABCDu16, 0x1234], "luma u16 direct extract");
  }

  /// The same limited-range white pixel, stored once as LE bytes and once
  /// as BE bytes, must decode to the same RGB through the `BE = false` and
  /// `BE = true` kernels — and that RGB must actually be white.
  #[test]
  fn ayuv64_be_roundtrip_matches_byte_swapped_le() {
    // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes`
    // so semantics are host-independent: on every host, `le` carries the
    // intended values as LE-encoded bytes and `be` carries the same values as
    // BE-encoded bytes. Both kernels should therefore decode to the same
    // intended host-native values (and produce identical RGB output) on both
    // LE and BE hosts. The earlier `swap_bytes` pattern only validated this
    // on LE hosts and degenerated to equal-but-wrong on BE hosts.
    let intended = pack_ayuv64(0xFFFF, 60160, 32768, 32768);
    let le_bytes: Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect();
    let be_bytes: Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect();
    let le_buf: Vec<u16> = le_bytes
      .chunks_exact(2)
      .map(|b| u16::from_ne_bytes([b[0], b[1]]))
      .collect();
    let be_buf: Vec<u16> = be_bytes
      .chunks_exact(2)
      .map(|b| u16::from_ne_bytes([b[0], b[1]]))
      .collect();
    let mut out_le = vec![0u8; 3];
    let mut out_be = vec![0u8; 3];
    ayuv64_to_rgb_row::<false>(&le_buf, &mut out_le, 1, ColorMatrix::Bt709, false);
    ayuv64_to_rgb_row::<true>(&be_buf, &mut out_be, 1, ColorMatrix::Bt709, false);
    assert_eq!(
      out_le, out_be,
      "AYUV64 BE scalar must match byte-swapped LE"
    );
    // Pin the absolute value too: comparing the two outputs only against
    // each other would still pass if both paths mis-decoded the samples in
    // the same way. Y=60160 with neutral chroma is limited-range white.
    assert_eq!(
      &out_be[..],
      &[255u8, 255, 255],
      "AYUV64 BE white pixel must decode to white"
    );
  }
}