colconv 0.1.0

SIMD-dispatched color-conversion kernels covering the FFmpeg AVPixelFormat space, with a Sink-based API so consumers pick which derived outputs (RGB / Luma / HSV / custom) they want without paying for the ones they don't.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
use super::super::*;
use crate::{ColorMatrix, row::scalar};

/// Lane-order regression check for the BITS-generic Y2xx kernels (Y210 / Y212).
///
/// The input row is YUYV-shaped `u16` quadruples, `[Y0, U, Y1, V]` per pixel
/// pair (4:2:2), MSB-aligned: the active value sits in the high `BITS` bits
/// and the low `16 - BITS` bits are zero.
/// - pixel `n` carries luma `(n + 1) << shift`
/// - pair `k` carries `U = (2k + 1) << shift`
/// - every `V` is `0x8000`, the neutral midpoint for both BITS=10 and BITS=12
///
/// Two assertions are made:
/// 1. the u16 luma output comes back as `1..=W` in natural pixel order;
/// 2. the SIMD u16 RGB output is identical to the scalar reference.
///
/// `W = 16` is two full iterations at the NEON kernel's 8 px/iter.
fn check_y2xx_lane_order_per_pixel_y_and_u<const BITS: u32>() {
  const W: usize = 16;
  let shift: u16 = (16 - BITS) as u16;
  // Evaluates to 0x8000 for both BITS=10 and BITS=12.
  let neutral_chroma: u16 = (1u16 << (BITS - 1)) << shift;

  // One `[Y0, U, Y1, V]` quadruple per 2-pixel pair.
  let packed: std::vec::Vec<u16> = (0..W / 2)
    .flat_map(|pair| {
      let first = (2 * pair) as u16 + 1;
      let second = (2 * pair) as u16 + 2;
      [
        first << shift,
        first << shift,
        second << shift,
        neutral_chroma,
      ]
    })
    .collect();

  // Part 1: u16 luma must come out in natural order, with the active
  // BITS in the low bits of each output sample.
  let mut luma = std::vec![0u16; W];
  // NOTE(review): the kernel's safety contract is not visible from this
  // file; `packed` (W * 2 samples) and `luma` (W samples) are sized for
  // `W` pixels — confirm that is all the callee requires.
  unsafe {
    y2xx_n_to_luma_u16_row::<BITS, false>(&packed, &mut luma, W);
  }
  let natural_order: std::vec::Vec<u16> = (0..W).map(|n| n as u16 + 1).collect();
  assert_eq!(
    luma, natural_order,
    "y2xx<BITS={BITS}> luma_u16 reorder bug"
  );

  // Part 2: the SIMD kernel and the scalar reference must agree on u16 RGB.
  let rgb_len = W * 3;
  let mut from_simd = std::vec![0u16; rgb_len];
  let mut from_scalar = std::vec![0u16; rgb_len];
  // NOTE(review): same unverified safety assumption as above; the output
  // buffer holds 3 samples per pixel.
  unsafe {
    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
      &packed,
      &mut from_simd,
      W,
      ColorMatrix::Bt709,
      false,
    );
  }
  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
    &packed,
    &mut from_scalar,
    W,
    ColorMatrix::Bt709,
    false,
  );
  assert_eq!(
    from_simd, from_scalar,
    "y2xx<BITS={BITS}> SIMD vs scalar diverges (u16 RGB)"
  );
}

/// Runs the Y2xx lane-order check instantiated with `BITS = 10`.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
fn neon_y2xx_lane_order_per_pixel_y_and_u_bits10() {
  const BITS: u32 = 10;
  check_y2xx_lane_order_per_pixel_y_and_u::<{ BITS }>();
}

/// Runs the Y2xx lane-order check instantiated with `BITS = 12`.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
fn neon_y2xx_lane_order_per_pixel_y_and_u_bits12() {
  const BITS: u32 = 12;
  check_y2xx_lane_order_per_pixel_y_and_u::<{ BITS }>();
}

/// Builds a deterministic pseudo-random Y210-shaped `u16` buffer.
///
/// Returns `width * 2` samples (one quadruple = 4 `u16` = 2 pixels). Each
/// sample has 10 active bits sitting in the high bits with the low 6 bits
/// zero, matching Y210's MSB-aligned encoding.
///
/// Sample `i` is `((i * seed + seed * 3) & 0x3FF) << 6`. Both products and
/// the sum use wrapping arithmetic, so every `seed` is accepted; a plain
/// `seed * 3` would panic on overflow in debug builds for large seeds.
fn pseudo_random_y210(width: usize, seed: usize) -> std::vec::Vec<u16> {
  // Loop-invariant term, computed once with the same wrapping semantics
  // as the per-sample product.
  let offset = seed.wrapping_mul(3);
  (0..width * 2)
    .map(|i| {
      let s = (i.wrapping_mul(seed).wrapping_add(offset) & 0x3FF) as u16;
      s << 6
    })
    .collect()
}

/// Asserts that the NEON u8 RGB kernel reproduces the scalar reference for
/// one `(width, matrix, full_range)` combination.
///
/// The input row is `pseudo_random_y210(width, 0xAA55)`; both kernels are
/// instantiated with `<BITS, false, false>` and write 3 bytes per pixel.
fn check_rgb<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range: bool) {
  let out_len = width * 3;
  let packed = pseudo_random_y210(width, 0xAA55);
  let mut reference = std::vec![0u8; out_len];
  let mut simd = std::vec![0u8; out_len];

  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, false, false>(
    &packed,
    &mut reference,
    width,
    matrix,
    full_range,
  );
  // NOTE(review): the kernel's safety contract is not visible from this
  // file; buffers are sized for `width` pixels — confirm that suffices.
  unsafe {
    y2xx_n_to_rgb_or_rgba_row::<BITS, false, false>(
      &packed,
      &mut simd,
      width,
      matrix,
      full_range,
    );
  }

  assert_eq!(
    reference, simd,
    "NEON y2xx<{BITS}>→RGB diverges (width={width}, matrix={matrix:?}, full_range={full_range})"
  );
}

/// Asserts that the NEON u8 RGBA kernel reproduces the scalar reference for
/// one `(width, matrix, full_range)` combination.
///
/// The input row is `pseudo_random_y210(width, 0xAA55)`; both kernels are
/// instantiated with `<BITS, true, false>` and write 4 bytes per pixel.
fn check_rgba<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range: bool) {
  let out_len = width * 4;
  let packed = pseudo_random_y210(width, 0xAA55);
  let mut reference = std::vec![0u8; out_len];
  let mut simd = std::vec![0u8; out_len];

  scalar::y2xx_n_to_rgb_or_rgba_row::<BITS, true, false>(
    &packed,
    &mut reference,
    width,
    matrix,
    full_range,
  );
  // NOTE(review): the kernel's safety contract is not visible from this
  // file; buffers are sized for `width` pixels — confirm that suffices.
  unsafe {
    y2xx_n_to_rgb_or_rgba_row::<BITS, true, false>(
      &packed,
      &mut simd,
      width,
      matrix,
      full_range,
    );
  }

  assert_eq!(
    reference, simd,
    "NEON y2xx<{BITS}>→RGBA diverges (width={width}, matrix={matrix:?}, full_range={full_range})"
  );
}

/// Asserts that the NEON u16 RGB kernel reproduces the scalar reference for
/// one `(width, matrix, full_range)` combination.
///
/// The input row is `pseudo_random_y210(width, 0xAA55)`; both kernels are
/// instantiated with `<BITS, false, false>` and write 3 `u16` per pixel.
fn check_rgb_u16<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range: bool) {
  let out_len = width * 3;
  let packed = pseudo_random_y210(width, 0xAA55);
  let mut reference = std::vec![0u16; out_len];
  let mut simd = std::vec![0u16; out_len];

  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
    &packed,
    &mut reference,
    width,
    matrix,
    full_range,
  );
  // NOTE(review): the kernel's safety contract is not visible from this
  // file; buffers are sized for `width` pixels — confirm that suffices.
  unsafe {
    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, false, false>(
      &packed,
      &mut simd,
      width,
      matrix,
      full_range,
    );
  }

  assert_eq!(
    reference, simd,
    "NEON y2xx<{BITS}>→RGB u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})"
  );
}

/// Asserts that the NEON u16 RGBA kernel reproduces the scalar reference for
/// one `(width, matrix, full_range)` combination.
///
/// The input row is `pseudo_random_y210(width, 0xAA55)`; both kernels are
/// instantiated with `<BITS, true, false>` and write 4 `u16` per pixel.
fn check_rgba_u16<const BITS: u32>(width: usize, matrix: ColorMatrix, full_range: bool) {
  let out_len = width * 4;
  let packed = pseudo_random_y210(width, 0xAA55);
  let mut reference = std::vec![0u16; out_len];
  let mut simd = std::vec![0u16; out_len];

  scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true, false>(
    &packed,
    &mut reference,
    width,
    matrix,
    full_range,
  );
  // NOTE(review): the kernel's safety contract is not visible from this
  // file; buffers are sized for `width` pixels — confirm that suffices.
  unsafe {
    y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, true, false>(
      &packed,
      &mut simd,
      width,
      matrix,
      full_range,
    );
  }

  assert_eq!(
    reference, simd,
    "NEON y2xx<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})"
  );
}

/// Asserts that the NEON u8 luma kernel reproduces the scalar reference on
/// the `pseudo_random_y210(width, 0xC001)` row (one output byte per pixel).
fn check_luma<const BITS: u32>(width: usize) {
  let packed = pseudo_random_y210(width, 0xC001);
  let mut reference = std::vec![0u8; width];
  let mut simd = std::vec![0u8; width];

  scalar::y2xx_n_to_luma_row::<BITS, false>(&packed, &mut reference, width);
  // NOTE(review): the kernel's safety contract is not visible from this
  // file; buffers are sized for `width` pixels — confirm that suffices.
  unsafe {
    y2xx_n_to_luma_row::<BITS, false>(&packed, &mut simd, width);
  }

  assert_eq!(
    reference, simd,
    "NEON y2xx<{BITS}>→luma diverges (width={width})"
  );
}

/// Asserts that the NEON u16 luma kernel reproduces the scalar reference on
/// the `pseudo_random_y210(width, 0xC001)` row (one output `u16` per pixel).
fn check_luma_u16<const BITS: u32>(width: usize) {
  let packed = pseudo_random_y210(width, 0xC001);
  let mut reference = std::vec![0u16; width];
  let mut simd = std::vec![0u16; width];

  scalar::y2xx_n_to_luma_u16_row::<BITS, false>(&packed, &mut reference, width);
  // NOTE(review): the kernel's safety contract is not visible from this
  // file; buffers are sized for `width` pixels — confirm that suffices.
  unsafe {
    y2xx_n_to_luma_u16_row::<BITS, false>(&packed, &mut simd, width);
  }

  assert_eq!(
    reference, simd,
    "NEON y2xx<{BITS}>→luma u16 diverges (width={width})"
  );
}

/// Y210 (`BITS = 10`) at a fixed width of 16: every listed colour matrix,
/// in both full and limited range, across all four RGB/RGBA output variants.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
fn neon_y210_rgb_matches_scalar_all_matrices() {
  const WIDTH: usize = 16;
  let matrices = [
    ColorMatrix::Bt601,
    ColorMatrix::Bt709,
    ColorMatrix::Bt2020Ncl,
    ColorMatrix::Smpte240m,
    ColorMatrix::Fcc,
    ColorMatrix::YCgCo,
  ];
  for matrix in matrices {
    for full_range in [true, false] {
      check_rgb::<10>(WIDTH, matrix, full_range);
      check_rgba::<10>(WIDTH, matrix, full_range);
      check_rgb_u16::<10>(WIDTH, matrix, full_range);
      check_rgba_u16::<10>(WIDTH, matrix, full_range);
    }
  }
}

/// Y210 (`BITS = 10`) RGB/RGBA parity over a spread of row widths, with a
/// different matrix / range pairing for each output variant.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
fn neon_y210_matches_scalar_widths() {
  const WIDTHS: [usize; 13] = [2, 4, 14, 16, 18, 30, 32, 34, 62, 64, 66, 1920, 1922];
  for width in WIDTHS {
    check_rgb::<10>(width, ColorMatrix::Bt709, false);
    check_rgba::<10>(width, ColorMatrix::Bt709, true);
    check_rgb_u16::<10>(width, ColorMatrix::Bt2020Ncl, true);
    check_rgba_u16::<10>(width, ColorMatrix::Bt601, false);
  }
}

/// Y210 (`BITS = 10`) u8 and u16 luma parity over a spread of row widths.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
fn neon_y210_luma_matches_scalar_widths() {
  const WIDTHS: [usize; 13] = [2, 4, 14, 16, 18, 30, 32, 34, 62, 64, 66, 1920, 1922];
  for width in WIDTHS {
    check_luma::<10>(width);
    check_luma_u16::<10>(width);
  }
}

/// Y212 (`BITS = 12`) SIMD-vs-scalar parity over a spread of row widths,
/// covering u8 RGB, u16 RGBA, u8 luma and u16 luma on the same input row.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
fn neon_y212_matches_scalar_widths() {
  // 12-bit MSB-aligned generator: 12 active bits, shifted up by 4
  // (the 10-bit generator shifts by 6).
  fn pseudo_random_y212(width: usize, seed: usize) -> std::vec::Vec<u16> {
    let mut samples = std::vec::Vec::with_capacity(width * 2);
    for i in 0..width * 2 {
      let low12 = (i.wrapping_mul(seed).wrapping_add(seed * 3)) & 0xFFF;
      samples.push((low12 as u16) << 4);
    }
    samples
  }

  const WIDTHS: [usize; 13] = [2, 4, 14, 16, 18, 30, 32, 34, 62, 64, 66, 1920, 1922];
  for width in WIDTHS {
    let packed = pseudo_random_y212(width, 0xAA55);

    // u8 RGB, Bt709, limited range.
    {
      let mut reference = std::vec![0u8; width * 3];
      let mut simd = std::vec![0u8; width * 3];
      scalar::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(
        &packed,
        &mut reference,
        width,
        ColorMatrix::Bt709,
        false,
      );
      // NOTE(review): the kernel's safety contract is not visible from this
      // file; buffers are sized for `width` pixels — confirm that suffices.
      unsafe {
        y2xx_n_to_rgb_or_rgba_row::<12, false, false>(
          &packed,
          &mut simd,
          width,
          ColorMatrix::Bt709,
          false,
        );
      }
      assert_eq!(
        reference, simd,
        "NEON y2xx<12>→RGB diverges (width={width})"
      );
    }

    // u16 RGBA, Bt2020Ncl, full range.
    {
      let mut reference = std::vec![0u16; width * 4];
      let mut simd = std::vec![0u16; width * 4];
      scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(
        &packed,
        &mut reference,
        width,
        ColorMatrix::Bt2020Ncl,
        true,
      );
      // NOTE(review): same unverified safety assumption as above.
      unsafe {
        y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(
          &packed,
          &mut simd,
          width,
          ColorMatrix::Bt2020Ncl,
          true,
        );
      }
      assert_eq!(
        reference, simd,
        "NEON y2xx<12>→RGBA u16 diverges (width={width})"
      );
    }

    // u8 luma.
    {
      let mut reference = std::vec![0u8; width];
      let mut simd = std::vec![0u8; width];
      scalar::y2xx_n_to_luma_row::<12, false>(&packed, &mut reference, width);
      // NOTE(review): same unverified safety assumption as above.
      unsafe {
        y2xx_n_to_luma_row::<12, false>(&packed, &mut simd, width);
      }
      assert_eq!(
        reference, simd,
        "NEON y2xx<12>→luma diverges (width={width})"
      );
    }

    // u16 luma.
    {
      let mut reference = std::vec![0u16; width];
      let mut simd = std::vec![0u16; width];
      scalar::y2xx_n_to_luma_u16_row::<12, false>(&packed, &mut reference, width);
      // NOTE(review): same unverified safety assumption as above.
      unsafe {
        y2xx_n_to_luma_u16_row::<12, false>(&packed, &mut simd, width);
      }
      assert_eq!(
        reference, simd,
        "NEON y2xx<12>→luma u16 diverges (width={width})"
      );
    }
  }
}

// Host-independent BE/LE SIMD parity tests.
//
// Constructs LE/BE buffers from raw bytes via `to_le_bytes` /
// `to_be_bytes` and reinterprets as host-native `u16` via `from_ne_bytes`.
// The byte-level encoding is host-independent — on every host the LE
// buffer carries the intended values as LE-encoded bytes and the BE
// buffer carries the same values as BE-encoded bytes — so both kernel
// monomorphizations decode to the same logical values and produce
// byte-identical output on both LE and BE hosts. Locks down the
// `BE == HOST_NATIVE_BE` host-endian gate on the NEON Y2xx SIMD bodies.

/// Builds intended Y2xx-shaped values, then materializes both an LE-encoded
/// and a BE-encoded `u16` plane from them (host-independent).
///
/// `BITS` is the active bit depth and must be in `1..=16`. Each intended
/// sample is MSB-aligned: `BITS` pseudo-random bits in the high bits and
/// `16 - BITS` zero bits below. Sample `i` is
/// `((i * seed + seed * 3) & mask) << (16 - BITS)`, evaluated with wrapping
/// arithmetic so any `seed` is accepted.
///
/// Returns `(le, be)`, each holding `width * 2` samples. The in-memory bytes
/// of `le[i]` are the little-endian encoding of intended sample `i`, and the
/// bytes of `be[i]` are its big-endian encoding, whatever the host's
/// endianness is.
fn build_le_be_y2xx<const BITS: u32>(
  width: usize,
  seed: usize,
) -> (std::vec::Vec<u16>, std::vec::Vec<u16>) {
  let shift = 16 - BITS;
  // `u16::MAX >> shift` sets exactly the low BITS bits and, unlike
  // `(1 << BITS) - 1`, does not overflow when BITS == 16.
  let mask = usize::from(u16::MAX >> shift);
  // Loop-invariant term; wrapping to match the per-sample product.
  let offset = seed.wrapping_mul(3);
  let intended: std::vec::Vec<u16> = (0..width * 2)
    .map(|i| {
      let s = (i.wrapping_mul(seed).wrapping_add(offset) & mask) as u16;
      s << shift
    })
    .collect();
  // Reinterpreting the LE / BE byte pair as a host-native `u16` yields a
  // value whose in-memory bytes are exactly that encoding.
  let le: std::vec::Vec<u16> = intended
    .iter()
    .map(|v| u16::from_ne_bytes(v.to_le_bytes()))
    .collect();
  let be: std::vec::Vec<u16> = intended
    .iter()
    .map(|v| u16::from_ne_bytes(v.to_be_bytes()))
    .collect();
  (le, be)
}

/// For `BITS = 10`, the LE kernel on an LE-encoded row and the BE kernel on
/// the BE encoding of the same row must produce identical u8 RGB, u16 RGB,
/// u8 luma and u16 luma output.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
fn neon_y2xx_be_le_simd_parity_bits10() {
  // Widths span the tail-only case (< 8 px), whole SIMD bodies (8 px per
  // iteration) and body + tail, so both code paths run on each host.
  const WIDTHS: [usize; 7] = [4, 8, 14, 16, 22, 32, 1920];
  for w in WIDTHS {
    let (le, be) = build_le_be_y2xx::<10>(w, 0xBEEF);

    // u8 RGB
    {
      let mut from_le = std::vec![0u8; w * 3];
      let mut from_be = std::vec![0u8; w * 3];
      // NOTE(review): the kernel's safety contract is not visible from this
      // file; buffers are sized for `w` pixels — confirm that suffices.
      unsafe {
        y2xx_n_to_rgb_or_rgba_row::<10, false, false>(
          &le,
          &mut from_le,
          w,
          ColorMatrix::Bt709,
          false,
        );
        y2xx_n_to_rgb_or_rgba_row::<10, false, true>(
          &be,
          &mut from_be,
          w,
          ColorMatrix::Bt709,
          false,
        );
      }
      assert_eq!(
        from_le, from_be,
        "y2xx<10> NEON LE vs BE RGB parity (w={w})"
      );
    }

    // u16 RGB
    {
      let mut from_le = std::vec![0u16; w * 3];
      let mut from_be = std::vec![0u16; w * 3];
      // NOTE(review): same unverified safety assumption as above.
      unsafe {
        y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>(
          &le,
          &mut from_le,
          w,
          ColorMatrix::Bt709,
          false,
        );
        y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>(
          &be,
          &mut from_be,
          w,
          ColorMatrix::Bt709,
          false,
        );
      }
      assert_eq!(
        from_le, from_be,
        "y2xx<10> NEON LE vs BE RGB u16 parity (w={w})"
      );
    }

    // luma u8
    {
      let mut from_le = std::vec![0u8; w];
      let mut from_be = std::vec![0u8; w];
      // NOTE(review): same unverified safety assumption as above.
      unsafe {
        y2xx_n_to_luma_row::<10, false>(&le, &mut from_le, w);
        y2xx_n_to_luma_row::<10, true>(&be, &mut from_be, w);
      }
      assert_eq!(
        from_le, from_be,
        "y2xx<10> NEON LE vs BE luma u8 parity (w={w})"
      );
    }

    // luma u16
    {
      let mut from_le = std::vec![0u16; w];
      let mut from_be = std::vec![0u16; w];
      // NOTE(review): same unverified safety assumption as above.
      unsafe {
        y2xx_n_to_luma_u16_row::<10, false>(&le, &mut from_le, w);
        y2xx_n_to_luma_u16_row::<10, true>(&be, &mut from_be, w);
      }
      assert_eq!(
        from_le, from_be,
        "y2xx<10> NEON LE vs BE luma u16 parity (w={w})"
      );
    }
  }
}

/// For `BITS = 12`, the LE kernel on an LE-encoded row and the BE kernel on
/// the BE encoding of the same row must produce identical u8 RGBA and u16
/// luma output.
#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
fn neon_y2xx_be_le_simd_parity_bits12() {
  const WIDTHS: [usize; 7] = [4, 8, 14, 16, 22, 32, 1920];
  for w in WIDTHS {
    let (le, be) = build_le_be_y2xx::<12>(w, 0xC0DE);

    // u8 RGBA, Bt2020Ncl, full range.
    {
      let mut from_le = std::vec![0u8; w * 4];
      let mut from_be = std::vec![0u8; w * 4];
      // NOTE(review): the kernel's safety contract is not visible from this
      // file; buffers are sized for `w` pixels — confirm that suffices.
      unsafe {
        y2xx_n_to_rgb_or_rgba_row::<12, true, false>(
          &le,
          &mut from_le,
          w,
          ColorMatrix::Bt2020Ncl,
          true,
        );
        y2xx_n_to_rgb_or_rgba_row::<12, true, true>(
          &be,
          &mut from_be,
          w,
          ColorMatrix::Bt2020Ncl,
          true,
        );
      }
      assert_eq!(
        from_le, from_be,
        "y2xx<12> NEON LE vs BE RGBA parity (w={w})"
      );
    }

    // u16 luma.
    {
      let mut from_le = std::vec![0u16; w];
      let mut from_be = std::vec![0u16; w];
      // NOTE(review): same unverified safety assumption as above.
      unsafe {
        y2xx_n_to_luma_u16_row::<12, false>(&le, &mut from_le, w);
        y2xx_n_to_luma_u16_row::<12, true>(&be, &mut from_be, w);
      }
      assert_eq!(
        from_le, from_be,
        "y2xx<12> NEON LE vs BE luma u16 parity (w={w})"
      );
    }
  }
}