colconv 0.1.0

SIMD-dispatched color-conversion kernels covering the FFmpeg AVPixelFormat space, with a Sink-based API so consumers pick which derived outputs (RGB / Luma / HSV / custom) they want without paying for the ones they don't.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
//! Scalar reference kernels for the Tier 12 (DCP / Xyz12) source.
//!
//! Pipeline (per-pixel):
//!
//! ```text
//! xyz_u12  →  xyz_linear (f32)  →  rgb_linear (f32) via M_xyz_to_rgb
//!         →  rgb_gamma (f32) via OETF  →  bgr_u8 / rgb_u8 / etc
//! ```
//!
//! Steps:
//!
//! 1. SMPTE ST 428-1 §8 inverse-OETF:
//!    `xyz_lin = (x_u12 / 4095)^2.6 / 0.91653`. Applied to each X/Y/Z
//!    sample independently.
//! 2. 3x3 matmul against the active gamut's `M_xyz_to_rgb` constant.
//! 3. sRGB-shape OETF (12.92 linear segment + `1.055 * c^(1/2.4) -
//!    0.055` upper segment). Skipped for f32-output paths
//!    (`xyz12_to_rgb_f32_row` / `xyz12_to_xyz_f32_row`).
//! 4. Range scale + integer narrow with round-half-up — only for u8 /
//!    u16 outputs.
//!
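//! All kernels are const-generic over `BE: bool` for source endianness.
//! The branch that matches the host byte order is a compile-time no-op
//! (`BE = false` on little-endian targets); the other branch performs a
//! per-sample byte swap.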

use crate::DcpTargetGamut;

use super::xyz12_constants::{
  INV_4095, OETF_POLY_COEFFS, OETF_POLY_DEGREE, OETF_POLY_SEG_BOUNDS, OETF_POLY_SEG_CENTERS,
  OETF_POLY_SEGMENTS, SAMPLE_MASK, SMPTE428_INV_NORM, xyz_to_rgb_matrix,
};

/// `f32` `powf` portable across `std` and `no_std + alloc` builds.
/// `std` provides `f32::powf` directly via libm; `no_std` builds opt
/// into the same routine via the `libm` crate (gated by the `alloc`
/// feature in the crate's `Cargo.toml`).
///
/// Returns `x` raised to the power `y`.
///
/// Exactly one of the two `cfg` blocks below is expected to survive
/// feature resolution and become the function's tail expression. If
/// neither the `std` nor the `alloc` feature is enabled, both blocks
/// are stripped and the empty body no longer matches the `f32` return
/// type, so the build fails.
#[cfg_attr(not(tarpaulin), inline(always))]
fn powf32(x: f32, y: f32) -> f32 {
  // `std` build: use the inherent `f32::powf`.
  #[cfg(feature = "std")]
  {
    f32::powf(x, y)
  }
  // `alloc`-only build (no `std`): fall back to the `libm` crate.
  #[cfg(all(not(feature = "std"), feature = "alloc"))]
  {
    libm::powf(x, y)
  }
}

/// Test-only helper used by the `oetf_srgb_reference_f64` test oracle.
///
/// Returns `x` raised to the power `y` in `f64`.
///
/// Compiled **only** under `cfg(all(test, feature = "std"))`. The xyz12
/// `mod tests;` declaration at the bottom of this file is gated on
/// `feature = "std"`, so no-default-features (`alloc`-only) builds never
/// reach this fn — and the inner `cfg(all(not(feature = "std"), ...))`
/// libm branch is therefore dead code in practice (kept as a
/// compile-time fallback in case the outer test gate is ever relaxed
/// to `any(feature = "std", feature = "alloc")` to match `powf32`).
/// Production `oetf_srgb` uses the polynomial table in
/// `xyz12_constants`, not `powf`, so this helper is only needed in the
/// test harness.
#[cfg(all(test, feature = "std"))]
#[cfg_attr(not(tarpaulin), inline(always))]
fn powf64(x: f64, y: f64) -> f64 {
  // Always taken under the current outer gate, which already requires
  // `feature = "std"`.
  #[cfg(feature = "std")]
  {
    f64::powf(x, y)
  }
  // Unreachable under the current outer gate; see the doc comment above.
  #[cfg(all(not(feature = "std"), feature = "alloc"))]
  {
    libm::pow(x, y)
  }
}
// Helpers — kept `pub(crate)` so SIMD backends can re-use the OETF
// formula in their scalar tail / scalar-`powf` lanes.
/// Decodes one wire-format XYZ12 sample into its 12-bit code value.
///
/// `BE` selects the byte order of the source: `true` interprets `s` as
/// big-endian, `false` as little-endian. The value is first converted
/// to host order, then shifted right by 4 and masked with
/// `SAMPLE_MASK`.
///
/// FFmpeg describes `AV_PIX_FMT_XYZ12LE` / `AV_PIX_FMT_XYZ12BE` as "the
/// same as RGB48LE/BE, but the lower 4 bits of each component are
/// zero": the active code occupies bits `[15:4]` of each `u16`, not
/// bits `[11:0]`. The shift discards whatever a producer left in the
/// low 4 bits; the mask is kept as an explicit 12-bit guard.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn read_xyz12_sample<const BE: bool>(s: u16) -> u16 {
  // `BE` is a const generic, so only one arm survives monomorphisation.
  let native = match BE {
    true => u16::from_be(s),
    false => u16::from_le(s),
  };
  // Move the code from bits [15:4] down to bits [11:0].
  let code = native >> 4;
  code & SAMPLE_MASK
}

/// SMPTE ST 428-1 §8 inverse OETF: maps a 12-bit code to a linear XYZ
/// component in `f32`, `xyz_lin = (x_u12 / 4095)^2.6 / 0.91653`.
///
/// `x_u12` is expected to be the **active 12-bit code** (`0..=4095`),
/// i.e. already extracted from the high-bit-packed wire `u16`. Scalar
/// callers obtain it from `read_xyz12_sample`; SIMD backends apply the
/// `>> 4` shift in their load path. The `& SAMPLE_MASK` here is a
/// defensive clamp in case a caller passes an unshifted value.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn smpte428_inverse_oetf(x_u12: u16) -> f32 {
  let code = x_u12 & SAMPLE_MASK;
  // Scale the code to a unit-range value before the power curve.
  let unit = f32::from(code) * INV_4095;
  let curved = powf32(unit, 2.6_f32);
  curved * SMPTE428_INV_NORM
}

/// Multiplies the 3x3 matrix `m` (row-major) by the column vector
/// `xyz` and returns the resulting 3-vector (linear RGB when `m` is an
/// XYZ→RGB matrix).
///
/// Each output component is the dot product of one matrix row with
/// `xyz`, accumulated left to right.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn matmul3_xyz_rgb(m: &[[f32; 3]; 3], xyz: [f32; 3]) -> [f32; 3] {
  // Row · vector, summed in column order 0, 1, 2.
  let dot = |row: &[f32; 3]| row[0] * xyz[0] + row[1] * xyz[1] + row[2] * xyz[2];
  [dot(&m[0]), dot(&m[1]), dot(&m[2])]
}

/// sRGB-shape OETF — the production entry point for every integer-
/// output scalar kernel, and the per-lane scalar fall-through used by
/// the SIMD backends.
///
/// * `c < 0.0031308`: linear toe, `12.92 * c`.
/// * `c >= 0.0031308`: piecewise polynomial (degree 3, 192 segments)
///   evaluated from the `OETF_POLY_*` tables in `xyz12_constants`,
///   which are generated by `examples/derive_oetf_polynomial.rs`.
///
/// Reference target (B' decision): the f64-narrowed sRGB OETF
/// `(1.055_f64 * (c as f64).powf(1/2.4) - 0.055) as f32`. The
/// polynomial tracks that reference within ≤ 2 ULP at 65 536 sweep
/// points across `[0.0031308, 1.0]`, as checked by
/// `oetf_srgb_polynomial_within_2_ulp_of_reference`. Because the
/// reference is closer to the true sRGB OETF than a pure-f32
/// `f32::powf` (about 2 ULP off and platform-dependent), the polynomial
/// is both more accurate and cheaper than a per-pixel `powf`.
///
/// Scalar↔SIMD parity is 0 ULP by construction: both paths evaluate
/// the same polynomial against the same coefficient tables. SIMD
/// backends vectorise the Horner evaluation across f32 lanes and call
/// this function only where the vectorised path exits early (e.g. a
/// single trailing element).
///
/// Out-of-range inputs are not clamped here: `c < 0` yields a small
/// negative value through the linear toe, and `c > 1` extrapolates the
/// topmost segment to something above 1. Callers clamp at the integer
/// narrow (`narrow_unit_to_u{8,16}`) or before the f16 cast.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn oetf_srgb(c: f32) -> f32 {
  if c < 0.0031308_f32 {
    return 12.92_f32 * c;
  }
  // Segment lookup: scan the lower bounds from the top segment down and
  // take the first one that `c` reaches. If none matches (e.g. `c` is
  // NaN), fall back to segment 0. With 192 segments this is a bounded
  // linear scan; SIMD backends may use a compare-tree instead.
  let seg = (0..OETF_POLY_SEGMENTS)
    .rev()
    .find(|&i| c >= OETF_POLY_SEG_BOUNDS[i])
    .unwrap_or(0);
  let dx = c - OETF_POLY_SEG_CENTERS[seg];
  // Each segment owns `OETF_POLY_DEGREE + 1` consecutive coefficients,
  // stored lowest order first.
  let per_seg = OETF_POLY_DEGREE + 1;
  let first = seg * per_seg;
  // Centred Horner: `((c[d]·dx + c[d-1])·dx + ...)·dx + c[0]`, folding
  // from the highest-order coefficient down.
  OETF_POLY_COEFFS[first..first + per_seg]
    .iter()
    .rev()
    .fold(0.0_f32, |acc, &coeff| acc * dx + coeff)
}

/// Narrows a unit-range `f32` to `u8` with round-half-up.
///
/// The input is clamped to `[0, 1]`, scaled by 255, offset by 0.5 and
/// truncated: `(c.clamp(0, 1) * 255 + 0.5) as u8`. Values outside
/// `[0, 1]` therefore saturate to 0 or 255; NaN narrows to 0.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn narrow_unit_to_u8(c: f32) -> u8 {
  const FULL_SCALE: f32 = 255.0_f32;
  let unit = c.clamp(0.0_f32, 1.0_f32);
  let rounded = unit * FULL_SCALE + 0.5_f32;
  // Second clamp keeps the 255.5 produced by `unit == 1` inside range
  // before the truncating cast.
  rounded.clamp(0.0_f32, FULL_SCALE) as u8
}

/// Narrows a unit-range `f32` to `u16` with round-half-up.
///
/// The input is clamped to `[0, 1]`, scaled by 65535, offset by 0.5
/// and truncated: `(c.clamp(0, 1) * 65535 + 0.5) as u16`. Values
/// outside `[0, 1]` therefore saturate to 0 or 65535; NaN narrows to 0.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn narrow_unit_to_u16(c: f32) -> u16 {
  const FULL_SCALE: f32 = 65535.0_f32;
  let unit = c.clamp(0.0_f32, 1.0_f32);
  let rounded = unit * FULL_SCALE + 0.5_f32;
  // Second clamp keeps the 65535.5 produced by `unit == 1` inside range
  // before the truncating cast.
  rounded.clamp(0.0_f32, FULL_SCALE) as u16
}

/// Converts one packed XYZ12 pixel to linear RGB.
///
/// Covers steps 1 + 2 of the pipeline: each of the three wire samples
/// in `triple` is decoded with `read_xyz12_sample::<BE>` and linearised
/// with `smpte428_inverse_oetf`, then the resulting XYZ vector is
/// multiplied by `m`. Shared by every downstream output kernel.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_pixel_to_rgb_linear<const BE: bool>(
  m: &[[f32; 3]; 3],
  triple: &[u16; 3],
) -> [f32; 3] {
  // Wire sample → 12-bit code → linear component.
  let linearise = |wire: u16| smpte428_inverse_oetf(read_xyz12_sample::<BE>(wire));
  let [wire_x, wire_y, wire_z] = *triple;
  let xyz_lin = [linearise(wire_x), linearise(wire_y), linearise(wire_z)];
  matmul3_xyz_rgb(m, xyz_lin)
}

/// Converts one packed XYZ12 pixel to linear XYZ (pipeline step 1
/// only): each wire sample is decoded with `read_xyz12_sample::<BE>`
/// and linearised with `smpte428_inverse_oetf`. Used by
/// `xyz12_to_xyz_f32_row` for XYZ pass-through.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_pixel_to_xyz_linear<const BE: bool>(triple: &[u16; 3]) -> [f32; 3] {
  let [wire_x, wire_y, wire_z] = *triple;
  let x_lin = smpte428_inverse_oetf(read_xyz12_sample::<BE>(wire_x));
  let y_lin = smpte428_inverse_oetf(read_xyz12_sample::<BE>(wire_y));
  let z_lin = smpte428_inverse_oetf(read_xyz12_sample::<BE>(wire_z));
  [x_lin, y_lin, z_lin]
}

// Per-output kernels.
/// XYZ12 → packed RGB (u8). Full pipeline: inverse-OETF + matmul +
/// sRGB OETF + clamp + x255 + round-half-up.
///
/// * `xyz` — packed wire samples, 3 `u16` per pixel; at least
///   `width * 3` elements.
/// * `rgb_out` — destination, 3 `u8` per pixel; at least `width * 3`
///   elements. Only the first `width` pixels are written.
/// * `width` — number of pixels to convert.
/// * `target_gamut` — selects the XYZ→RGB matrix via
///   `xyz_to_rgb_matrix`.
///
/// # Panics
///
/// Panics if either buffer holds fewer than `width` pixels. The check
/// happens before any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_to_rgb_row<const BE: bool>(
  xyz: &[u16],
  rgb_out: &mut [u8],
  width: usize,
  target_gamut: DcpTargetGamut,
) {
  debug_assert!(xyz.len() >= width * 3, "xyz row too short");
  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
  let m = xyz_to_rgb_matrix(target_gamut);
  // Range-check both buffers once so the pixel loop is free of
  // per-access bounds checks.
  let src = &xyz[..width * 3];
  let dst = &mut rgb_out[..width * 3];
  for (px_in, px_out) in src.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
    let triple = [px_in[0], px_in[1], px_in[2]];
    let rgb_lin = xyz12_pixel_to_rgb_linear::<BE>(&m, &triple);
    px_out[0] = narrow_unit_to_u8(oetf_srgb(rgb_lin[0]));
    px_out[1] = narrow_unit_to_u8(oetf_srgb(rgb_lin[1]));
    px_out[2] = narrow_unit_to_u8(oetf_srgb(rgb_lin[2]));
  }
}

/// XYZ12 → packed RGBA (u8). Same pipeline as the RGB (u8) kernel;
/// alpha is forced to `0xFF`.
///
/// * `xyz` — packed wire samples, 3 `u16` per pixel; at least
///   `width * 3` elements.
/// * `rgba_out` — destination, 4 `u8` per pixel; at least `width * 4`
///   elements. Only the first `width` pixels are written.
/// * `width` — number of pixels to convert.
/// * `target_gamut` — selects the XYZ→RGB matrix via
///   `xyz_to_rgb_matrix`.
///
/// # Panics
///
/// Panics if either buffer holds fewer than `width` pixels. The check
/// happens before any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_to_rgba_row<const BE: bool>(
  xyz: &[u16],
  rgba_out: &mut [u8],
  width: usize,
  target_gamut: DcpTargetGamut,
) {
  debug_assert!(xyz.len() >= width * 3, "xyz row too short");
  debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
  let m = xyz_to_rgb_matrix(target_gamut);
  // Range-check both buffers once so the pixel loop is free of
  // per-access bounds checks.
  let src = &xyz[..width * 3];
  let dst = &mut rgba_out[..width * 4];
  for (px_in, px_out) in src.chunks_exact(3).zip(dst.chunks_exact_mut(4)) {
    let triple = [px_in[0], px_in[1], px_in[2]];
    let rgb_lin = xyz12_pixel_to_rgb_linear::<BE>(&m, &triple);
    px_out[0] = narrow_unit_to_u8(oetf_srgb(rgb_lin[0]));
    px_out[1] = narrow_unit_to_u8(oetf_srgb(rgb_lin[1]));
    px_out[2] = narrow_unit_to_u8(oetf_srgb(rgb_lin[2]));
    px_out[3] = 0xFF;
  }
}

/// XYZ12 → packed RGB (u16). Full pipeline; full-range scaling
/// `[0, 1] x 65535 + round-half-up`.
///
/// * `xyz` — packed wire samples, 3 `u16` per pixel; at least
///   `width * 3` elements.
/// * `rgb_out` — destination, 3 `u16` per pixel; at least `width * 3`
///   elements. Only the first `width` pixels are written.
/// * `width` — number of pixels to convert.
/// * `target_gamut` — selects the XYZ→RGB matrix via
///   `xyz_to_rgb_matrix`.
///
/// # Panics
///
/// Panics if either buffer holds fewer than `width` pixels. The check
/// happens before any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_to_rgb_u16_row<const BE: bool>(
  xyz: &[u16],
  rgb_out: &mut [u16],
  width: usize,
  target_gamut: DcpTargetGamut,
) {
  debug_assert!(xyz.len() >= width * 3, "xyz row too short");
  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
  let m = xyz_to_rgb_matrix(target_gamut);
  // Range-check both buffers once so the pixel loop is free of
  // per-access bounds checks.
  let src = &xyz[..width * 3];
  let dst = &mut rgb_out[..width * 3];
  for (px_in, px_out) in src.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
    let triple = [px_in[0], px_in[1], px_in[2]];
    let rgb_lin = xyz12_pixel_to_rgb_linear::<BE>(&m, &triple);
    px_out[0] = narrow_unit_to_u16(oetf_srgb(rgb_lin[0]));
    px_out[1] = narrow_unit_to_u16(oetf_srgb(rgb_lin[1]));
    px_out[2] = narrow_unit_to_u16(oetf_srgb(rgb_lin[2]));
  }
}

/// XYZ12 → packed RGBA (u16). Same pipeline as the RGB (u16) kernel;
/// alpha is forced to `0xFFFF`.
///
/// * `xyz` — packed wire samples, 3 `u16` per pixel; at least
///   `width * 3` elements.
/// * `rgba_out` — destination, 4 `u16` per pixel; at least `width * 4`
///   elements. Only the first `width` pixels are written.
/// * `width` — number of pixels to convert.
/// * `target_gamut` — selects the XYZ→RGB matrix via
///   `xyz_to_rgb_matrix`.
///
/// # Panics
///
/// Panics if either buffer holds fewer than `width` pixels. The check
/// happens before any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_to_rgba_u16_row<const BE: bool>(
  xyz: &[u16],
  rgba_out: &mut [u16],
  width: usize,
  target_gamut: DcpTargetGamut,
) {
  debug_assert!(xyz.len() >= width * 3, "xyz row too short");
  debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
  let m = xyz_to_rgb_matrix(target_gamut);
  // Range-check both buffers once so the pixel loop is free of
  // per-access bounds checks.
  let src = &xyz[..width * 3];
  let dst = &mut rgba_out[..width * 4];
  for (px_in, px_out) in src.chunks_exact(3).zip(dst.chunks_exact_mut(4)) {
    let triple = [px_in[0], px_in[1], px_in[2]];
    let rgb_lin = xyz12_pixel_to_rgb_linear::<BE>(&m, &triple);
    px_out[0] = narrow_unit_to_u16(oetf_srgb(rgb_lin[0]));
    px_out[1] = narrow_unit_to_u16(oetf_srgb(rgb_lin[1]));
    px_out[2] = narrow_unit_to_u16(oetf_srgb(rgb_lin[2]));
    px_out[3] = 0xFFFF;
  }
}

/// XYZ12 → packed linear RGB (f32). Lossless after the matrix; **no
/// OETF, no clamp** — out-of-gamut negative R/G/B and HDR > 1 values
/// are emitted bit-exact.
///
/// * `xyz` — packed wire samples, 3 `u16` per pixel; at least
///   `width * 3` elements.
/// * `rgb_out` — destination, 3 `f32` per pixel; at least `width * 3`
///   elements. Only the first `width` pixels are written.
/// * `width` — number of pixels to convert.
/// * `target_gamut` — selects the XYZ→RGB matrix via
///   `xyz_to_rgb_matrix`.
///
/// # Panics
///
/// Panics if either buffer holds fewer than `width` pixels. The check
/// happens before any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_to_rgb_f32_row<const BE: bool>(
  xyz: &[u16],
  rgb_out: &mut [f32],
  width: usize,
  target_gamut: DcpTargetGamut,
) {
  debug_assert!(xyz.len() >= width * 3, "xyz row too short");
  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
  let m = xyz_to_rgb_matrix(target_gamut);
  // Range-check both buffers once so the pixel loop is free of
  // per-access bounds checks.
  let src = &xyz[..width * 3];
  let dst = &mut rgb_out[..width * 3];
  for (px_in, px_out) in src.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
    let triple = [px_in[0], px_in[1], px_in[2]];
    let rgb_lin = xyz12_pixel_to_rgb_linear::<BE>(&m, &triple);
    // Chunk and array are both exactly three elements long.
    px_out.copy_from_slice(&rgb_lin);
  }
}

/// XYZ12 → packed linear XYZ (f32). Lossless XYZ pass-through — only
/// step 1 of the pipeline (SMPTE ST 428-1 inverse OETF). No matrix, no
/// gamma, no clamp. Useful for callers that want to do their own gamut
/// conversion downstream.
///
/// * `xyz` — packed wire samples, 3 `u16` per pixel; at least
///   `width * 3` elements.
/// * `xyz_out` — destination, 3 `f32` per pixel; at least `width * 3`
///   elements. Only the first `width` pixels are written.
/// * `width` — number of pixels to convert.
///
/// # Panics
///
/// Panics if either buffer holds fewer than `width` pixels. The check
/// happens before any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_to_xyz_f32_row<const BE: bool>(xyz: &[u16], xyz_out: &mut [f32], width: usize) {
  debug_assert!(xyz.len() >= width * 3, "xyz row too short");
  debug_assert!(xyz_out.len() >= width * 3, "xyz_out row too short");
  // Range-check both buffers once so the pixel loop is free of
  // per-access bounds checks.
  let src = &xyz[..width * 3];
  let dst = &mut xyz_out[..width * 3];
  for (px_in, px_out) in src.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
    let triple = [px_in[0], px_in[1], px_in[2]];
    let xyz_lin = xyz12_pixel_to_xyz_linear::<BE>(&triple);
    // Chunk and array are both exactly three elements long.
    px_out.copy_from_slice(&xyz_lin);
  }
}

/// XYZ12 → packed RGB (f16). Full pipeline like u8 but f16 narrow at
/// the end (IEEE-754 RNE via `f16::from_f32`). Each channel is clamped
/// to `[0, 1]` before narrowing, per the integer-output convention.
///
/// * `xyz` — packed wire samples, 3 `u16` per pixel; at least
///   `width * 3` elements.
/// * `rgb_out` — destination, 3 `f16` per pixel; at least `width * 3`
///   elements. Only the first `width` pixels are written.
/// * `width` — number of pixels to convert.
/// * `target_gamut` — selects the XYZ→RGB matrix via
///   `xyz_to_rgb_matrix`.
///
/// # Panics
///
/// Panics if either buffer holds fewer than `width` pixels. The check
/// happens before any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_to_rgb_f16_row<const BE: bool>(
  xyz: &[u16],
  rgb_out: &mut [half::f16],
  width: usize,
  target_gamut: DcpTargetGamut,
) {
  debug_assert!(xyz.len() >= width * 3, "xyz row too short");
  debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
  let m = xyz_to_rgb_matrix(target_gamut);
  // Range-check both buffers once so the pixel loop is free of
  // per-access bounds checks.
  let src = &xyz[..width * 3];
  let dst = &mut rgb_out[..width * 3];
  for (px_in, px_out) in src.chunks_exact(3).zip(dst.chunks_exact_mut(3)) {
    let triple = [px_in[0], px_in[1], px_in[2]];
    let rgb_lin = xyz12_pixel_to_rgb_linear::<BE>(&m, &triple);
    px_out[0] = half::f16::from_f32(oetf_srgb(rgb_lin[0]).clamp(0.0, 1.0));
    px_out[1] = half::f16::from_f32(oetf_srgb(rgb_lin[1]).clamp(0.0, 1.0));
    px_out[2] = half::f16::from_f32(oetf_srgb(rgb_lin[2]).clamp(0.0, 1.0));
  }
}

/// XYZ12 → packed RGBA (f16). Same pipeline as the RGB (f16) kernel;
/// alpha is forced to `1.0_f16`.
///
/// * `xyz` — packed wire samples, 3 `u16` per pixel; at least
///   `width * 3` elements.
/// * `rgba_out` — destination, 4 `f16` per pixel; at least `width * 4`
///   elements. Only the first `width` pixels are written.
/// * `width` — number of pixels to convert.
/// * `target_gamut` — selects the XYZ→RGB matrix via
///   `xyz_to_rgb_matrix`.
///
/// # Panics
///
/// Panics if either buffer holds fewer than `width` pixels. The check
/// happens before any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_to_rgba_f16_row<const BE: bool>(
  xyz: &[u16],
  rgba_out: &mut [half::f16],
  width: usize,
  target_gamut: DcpTargetGamut,
) {
  debug_assert!(xyz.len() >= width * 3, "xyz row too short");
  debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
  let m = xyz_to_rgb_matrix(target_gamut);
  // Opaque alpha, converted once outside the loop.
  let one_f16 = half::f16::from_f32(1.0);
  // Range-check both buffers once so the pixel loop is free of
  // per-access bounds checks.
  let src = &xyz[..width * 3];
  let dst = &mut rgba_out[..width * 4];
  for (px_in, px_out) in src.chunks_exact(3).zip(dst.chunks_exact_mut(4)) {
    let triple = [px_in[0], px_in[1], px_in[2]];
    let rgb_lin = xyz12_pixel_to_rgb_linear::<BE>(&m, &triple);
    px_out[0] = half::f16::from_f32(oetf_srgb(rgb_lin[0]).clamp(0.0, 1.0));
    px_out[1] = half::f16::from_f32(oetf_srgb(rgb_lin[1]).clamp(0.0, 1.0));
    px_out[2] = half::f16::from_f32(oetf_srgb(rgb_lin[2]).clamp(0.0, 1.0));
    px_out[3] = one_f16;
  }
}

// XYZ12-specific RGB → luma helpers.
//
// Routing the `with_luma` / `with_luma_u16` paths through the YUV-leaning
// `ColorMatrix` enum (BT.709 for both DciP3 and Rec709 targets,
// BT.2020Ncl for Rec2020) biases luma for saturated colours under the
// DCI-P3 target — DCI-P3's perceptual brightness has its own weights
// derived from the DCI-white-pointed RGB→XYZ matrix Y row. These helpers
// take the gamut-derived Q15 weights directly (carried on
// `Xyz12Row::luma_q15()`), bypassing the `ColorMatrix` enum entirely.
//
// No SIMD path: luma cost (one Q15 multiply-add per channel) is dwarfed
// by the upstream per-pixel work in the scalar pipeline (three scalar
// `powf` calls in the inverse-OETF stage, plus the matmul and the
// polynomial sRGB OETF) — vectorising luma here gives no measurable win.
/// XYZ12 luma kernel (u8 output). `luma_q15` carries the gamut-matched
/// Q15 coefficients `(k_r, k_g, k_b)` from
/// [`crate::source::luma_weights_q15_for_gamut`]. Output is full-range Y'
/// in `[0, 255]` — XYZ12's gamma-encoded RGB is full-range by
/// construction (no studio-range concept).
///
/// * `rgb` — packed RGB, 3 `u8` per pixel; at least `width * 3`
///   elements.
/// * `luma_out` — destination, one `u8` per pixel; at least `width`
///   elements. Only the first `width` entries are written.
/// * `width` — number of pixels to convert.
///
/// Per pixel: `Y' = (k_r·R + k_g·G + k_b·B + 2^14) >> 15`, clamped to
/// `[0, 255]`.
///
/// # Panics
///
/// Panics if either buffer holds fewer than `width` pixels. The check
/// happens before any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_rgb_to_luma_row(
  rgb: &[u8],
  luma_out: &mut [u8],
  width: usize,
  luma_q15: (i32, i32, i32),
) {
  debug_assert!(rgb.len() >= width * 3, "rgb row too short");
  debug_assert!(luma_out.len() >= width, "luma row too short");
  // Half of the Q15 unit (2^15), added before the shift to round to nearest.
  const RND: i32 = 1 << 14;
  let (k_r, k_g, k_b) = luma_q15;
  // Range-check both buffers once so the pixel loop is free of
  // per-access bounds checks.
  let src = &rgb[..width * 3];
  let dst = &mut luma_out[..width];
  for (px, y_out) in src.chunks_exact(3).zip(dst.iter_mut()) {
    let r = i32::from(px[0]);
    let g = i32::from(px[1]);
    let b = i32::from(px[2]);
    let y = (k_r * r + k_g * g + k_b * b + RND) >> 15;
    *y_out = y.clamp(0, 255) as u8;
  }
}

/// XYZ12 luma kernel (u16 output). Y' is computed at u8 precision
/// (matches the `with_luma` u8 path) and zero-extended to `u16`,
/// preserving the same `[0, 255]` dynamic range — same convention as
/// every other `*_to_luma_u16_row` kernel in colconv.
///
/// * `rgb` — packed RGB, 3 `u8` per pixel; at least `width * 3`
///   elements.
/// * `luma_out` — destination, one `u16` per pixel; at least `width`
///   elements. Only the first `width` entries are written.
/// * `width` — number of pixels to convert.
/// * `luma_q15` — Q15 luma weights `(k_r, k_g, k_b)`.
///
/// Per pixel: `Y' = (k_r·R + k_g·G + k_b·B + 2^14) >> 15`, clamped to
/// `[0, 255]`.
///
/// # Panics
///
/// Panics if either buffer holds fewer than `width` pixels. The check
/// happens before any output is written.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn xyz12_rgb_to_luma_u16_row(
  rgb: &[u8],
  luma_out: &mut [u16],
  width: usize,
  luma_q15: (i32, i32, i32),
) {
  debug_assert!(rgb.len() >= width * 3, "rgb row too short");
  debug_assert!(luma_out.len() >= width, "luma row too short");
  // Half of the Q15 unit (2^15), added before the shift to round to nearest.
  const RND: i32 = 1 << 14;
  let (k_r, k_g, k_b) = luma_q15;
  // Range-check both buffers once so the pixel loop is free of
  // per-access bounds checks.
  let src = &rgb[..width * 3];
  let dst = &mut luma_out[..width];
  for (px, y_out) in src.chunks_exact(3).zip(dst.iter_mut()) {
    let r = i32::from(px[0]);
    let g = i32::from(px[1]);
    let b = i32::from(px[2]);
    let y = (k_r * r + k_g * g + k_b * b + RND) >> 15;
    // Deliberately clamped to the u8 range, then widened.
    *y_out = y.clamp(0, 255) as u16;
  }
}

#[cfg(all(test, feature = "std"))]
mod tests;