colconv 0.1.0

SIMD-dispatched color-conversion kernels covering the FFmpeg AVPixelFormat space, with a Sink-based API so consumers pick which derived outputs (RGB / Luma / HSV / custom) they want without paying for the ones they don't.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
//! Scalar reference implementations of the row primitives.
//!
//! Always compiled. SIMD backends live in [`super::arch`] and dispatch
//! to these as their tail fallback. Per-call dispatch in
//! [`super`]`::{yuv_420_to_rgb_row, rgb_to_hsv_row}` picks the best
//! backend at the module boundary.
//!
//! # Rounding convention
//!
//! The crate uses two distinct rounding strategies — choose based on
//! whether the operation is *precision-critical* or *bookkeeping*:
//!
//! - **Q15 chroma + Y arithmetic (final RGB output)**: round-to-nearest,
//!   implemented as `(value + (1 << 14)) >> 15` (or via the `q15_shift`
//!   helper). Maximum error: ±0.5 LSB symmetric. Used in every YUV→RGB
//!   pixel computation across all formats × backends.
//!
//! - **Wider→narrower depth conversions** (e.g., 16-bit luma → 8-bit
//!   luma via `Y_u16 >> 8`, or 10-bit packed → 8-bit RGB via `>> 2`):
//!   plain truncation, no rounding constant added. Error range:
//!   (-1, 0] LSB (always downward, about -0.5 LSB on average). Used in
//!   every `*_to_luma_row` (u8 variant) for high-bit-depth sources, and
//!   in the `X2RGB10`/`X2BGR10` → u8 RGB conversion at the final
//!   narrowing step.
//!
//! The asymmetry is intentional: precision-critical arithmetic earns
//! the rounding bias's symmetric error bound; depth-conversion is
//! bookkeeping where consistent downward-truncation matches FFmpeg's
//! `swscale` behavior and preserves "no-clip-into-overflow" guarantees.
//! Cross-format consistency on this distinction is verified by the
//! per-arch SIMD-vs-scalar parity tests.

use crate::ColorMatrix;

// Per-conversion-family submodules. Each holds a self-contained
// cluster of scalar reference kernels; `mod.rs` retains only the
// cross-cutting helpers (`clamp_u8`, `q15_*`, `bits_mask`,
// `Coefficients`, …) that every family pulls in.
// Consumers: source families with a source-α channel (`gbr` Gbrap,
// `gray` Ya8 / Ya16, `rgb` 16-bit RGBA at_3, `yuv-444-packed`
// AYUV64 / VUYA, `yuva` planar α).
#[cfg(any(
  feature = "gbr",
  feature = "gray",
  feature = "rgb",
  feature = "yuv-444-packed",
  feature = "yuva",
))]
pub(crate) mod alpha_extract;
#[cfg(feature = "yuv-444-packed")]
mod ayuv64;
#[cfg(feature = "bayer")]
mod bayer;
#[cfg(feature = "gray")]
pub(crate) mod gray;
#[cfg(feature = "gray")]
pub(crate) mod grayf32;
mod hsv;
#[cfg(feature = "rgb-legacy")]
pub(crate) mod legacy_rgb;
#[cfg(feature = "mono")]
pub(crate) mod mono1bit;
#[cfg(feature = "rgb")]
mod packed_rgb;
#[cfg(feature = "rgb")]
mod packed_rgb_16bit;
#[cfg(feature = "rgb-float")]
mod packed_rgb_float;
#[cfg(feature = "yuv-packed")]
mod packed_yuv_4_1_1;
#[cfg(feature = "yuv-packed")]
mod packed_yuv_8bit;
#[cfg(feature = "mono")]
pub(crate) mod pal8;
#[cfg(feature = "gbr")]
mod planar_gbr;
#[cfg(feature = "gbr")]
pub(crate) mod planar_gbr_f16;
#[cfg(feature = "gbr")]
pub(crate) mod planar_gbr_float;
#[cfg(feature = "gbr")]
pub(crate) mod planar_gbr_high_bit;
mod rgb_expand;
#[cfg(feature = "yuv-semi-planar")]
mod semi_planar_8bit;
// `subsampled_high_bit_pn` provides the scalar reference kernels for
// both the 4:2:0 (P010 / P012 / P016) and 4:4:4 (P410 / P412 / P416)
// families. The 4:4:4 helpers are consumed by `dispatch::pn`
// (yuv-semi-planar-gated, no yuv-planar dep), so a single
// `yuv-semi-planar` gate keeps them reachable. The 4:2:0 helpers are
// flagged unused under yuv-semi-planar alone (their dispatchers live
// under the yuv-planar-gated `dispatch::yuv420` parent) — see the
// per-fn cfg in `subsampled_high_bit_pn.rs` itself.
#[cfg(feature = "yuv-semi-planar")]
mod subsampled_high_bit_pn;
#[cfg(feature = "v210")]
mod v210;
#[cfg(feature = "yuv-444-packed")]
mod v30x;
#[cfg(feature = "yuv-444-packed")]
mod v410;
#[cfg(feature = "yuv-444-packed")]
mod vuya;
#[cfg(feature = "yuv-444-packed")]
mod xv36;
#[cfg(all(feature = "xyz", any(feature = "std", feature = "alloc")))]
pub(crate) mod xyz12;
#[cfg(all(feature = "xyz", any(feature = "std", feature = "alloc")))]
pub(crate) mod xyz12_constants;
#[cfg(feature = "y2xx")]
mod y216;
#[cfg(feature = "y2xx")]
mod y2xx;
// See `dispatch::mod.rs` for the consumer list.
#[cfg(any(
  feature = "gray",
  feature = "yuv-planar",
  feature = "yuv-semi-planar",
  feature = "yuva",
))]
pub(crate) mod y_plane_to_luma_u16;
#[cfg(feature = "gray")]
pub(crate) mod ya16;
#[cfg(feature = "gray")]
pub(crate) mod ya8;
// yuv_planar_16bit also contains the P016 semi-planar 4:2:0 / P216
// semi-planar 4:2:2 / P416 semi-planar 4:4:4 16-bit kernels (`p16_to_rgb*_row`),
// so compile whenever either `yuv-planar` or `yuv-semi-planar` is enabled.
#[cfg(any(feature = "yuv-planar", feature = "yuv-semi-planar"))]
mod yuv_planar_16bit;
#[cfg(feature = "yuv-planar")]
mod yuv_planar_8bit;
#[cfg(feature = "yuv-planar")]
mod yuv_planar_high_bit;

// alpha_extract functions are imported directly by dispatch::alpha_extract
// via `crate::row::scalar::alpha_extract as scalar` (the module path).
// This glob re-exports into `crate::row::scalar::*` for Task 8+ callers;
// suppress unused-imports until then.
#[cfg(any(
  feature = "gbr",
  feature = "gray",
  feature = "rgb",
  feature = "yuv-444-packed",
  feature = "yuva",
))]
#[allow(unused_imports)]
pub(crate) use alpha_extract::*;
#[cfg(feature = "yuv-444-packed")]
pub(crate) use ayuv64::*;
#[cfg(feature = "bayer")]
pub(crate) use bayer::*;
// legacy_rgb functions are consumed by the dispatcher via `use crate::row::{..., scalar};`
// and called as `scalar::legacy_rgb::...`.
// This glob re-exports them into the scalar namespace for direct callers (SIMD tails, tests).
#[cfg(feature = "rgb-legacy")]
#[allow(unused_imports)]
pub(crate) use legacy_rgb::*;
// gray functions are consumed by dispatch::gray via `crate::row::scalar::gray as scalar`.
// This glob re-exports them into the scalar namespace for direct callers (SIMD tails, tests).
#[cfg(feature = "gray")]
#[allow(unused_imports)]
pub(crate) use gray::*;
#[cfg(feature = "gray")]
#[allow(unused_imports)]
pub(crate) use grayf32::*;
pub(crate) use hsv::*;
// mono1bit functions are consumed by dispatch via the module path.
#[cfg(feature = "mono")]
#[allow(unused_imports)]
pub(crate) use mono1bit::*;
#[cfg(feature = "rgb")]
pub(crate) use packed_rgb::*;
#[cfg(feature = "rgb")]
pub(crate) use packed_rgb_16bit::*;
#[cfg(feature = "rgb-float")]
pub(crate) use packed_rgb_float::*;
#[cfg(feature = "yuv-packed")]
pub(crate) use packed_yuv_4_1_1::*;
#[cfg(feature = "yuv-packed")]
pub(crate) use packed_yuv_8bit::*;
#[cfg(feature = "gbr")]
pub(crate) use planar_gbr::*;
#[cfg(feature = "gbr")]
#[allow(unused_imports)]
pub(crate) use planar_gbr_f16::*;
#[cfg(feature = "gbr")]
#[allow(unused_imports)]
pub(crate) use planar_gbr_float::*;
#[cfg(feature = "gbr")]
pub(crate) use planar_gbr_high_bit::*;
// Same consumer set as the `rgb_expand` helpers themselves: every source
// family that fans an RGB row out to an RGBA row via Strategy A
// (Bayer is RGB-only, mono / rgb-float / rgb-legacy / xyz never go
// through the fan-out, so they're excluded).
#[cfg(all(
  any(feature = "std", feature = "alloc"),
  any(
    feature = "gbr",
    feature = "gray",
    feature = "rgb",
    feature = "v210",
    feature = "y2xx",
    feature = "yuv-444-packed",
    feature = "yuv-packed",
    feature = "yuv-planar",
    feature = "yuv-semi-planar",
    feature = "yuva",
  ),
))]
pub(crate) use rgb_expand::*;
#[cfg(feature = "yuv-semi-planar")]
pub(crate) use semi_planar_8bit::*;
#[cfg(feature = "yuv-semi-planar")]
pub(crate) use subsampled_high_bit_pn::*;
#[cfg(feature = "yuv-444-packed")]
pub(crate) use v30x::*;
#[cfg(feature = "v210")]
pub(crate) use v210::*;
#[cfg(feature = "yuv-444-packed")]
pub(crate) use v410::*;
#[cfg(feature = "yuv-444-packed")]
pub(crate) use vuya::*;
#[cfg(feature = "yuv-444-packed")]
pub(crate) use xv36::*;
// `xyz12` and `xyz12_constants` are crate-internal modules; consumers (dispatcher
// + SIMD tails) reach in via `crate::row::scalar::xyz12::xyz12_to_rgb_row::<BE>`
// rather than a glob re-export, so the constants table and helpers stay
// addressable without polluting the scalar namespace.
#[cfg(feature = "y2xx")]
pub(crate) use y2xx::*;
#[cfg(feature = "y2xx")]
pub(crate) use y216::*;
#[cfg(feature = "gray")]
#[allow(unused_imports)]
pub(crate) use ya8::*;
#[cfg(feature = "gray")]
#[allow(unused_imports)]
pub(crate) use ya16::*;
#[cfg(feature = "yuv-planar")]
pub(crate) use yuv_planar_8bit::*;
// The file is compiled whenever either family is on, but its public
// items are gated more tightly: `yuv_{420,444}p16_to_*` need
// `yuv-planar`, and `p16_to_*` needs both `yuv-planar` and
// `yuv-semi-planar`. So the re-export only carries items when
// `yuv-planar` is enabled.
#[cfg(feature = "yuv-planar")]
pub(crate) use yuv_planar_16bit::*;
#[cfg(feature = "yuv-planar")]
pub(crate) use yuv_planar_high_bit::*;

// ---- Shared scalar helpers (used across all conversion families) -------

/// Reads one `u16` from the byte address `ptr` in the endianness
/// indicated by `BE`. `BE = false` → little-endian (native v210/Y2xx
/// on-wire format); `BE = true` → big-endian. The unused branch is
/// eliminated by the compiler when the caller is monomorphized.
///
/// **Target-endian aware** — this matches the SIMD `load_endian_u16x*`
/// helpers' semantics: `u16::from_be_bytes` / `u16::from_le_bytes`
/// each emit a `bswap` only when the source byte order differs from
/// the host CPU's native order. On a BE host the `BE = true` branch
/// is a plain load (no swap) and the `BE = false` branch swaps; on
/// an LE host the polarity reverses. Prefer this over a naive
/// `if BE { x.swap_bytes() }` pattern, which is only correct on LE
/// hosts and would corrupt rows on s390x / other BE hosts.
///
/// # Safety
///
/// `ptr` must point to at least 2 readable bytes.
#[cfg(feature = "y2xx")]
#[cfg_attr(not(tarpaulin), inline(always))]
pub(super) unsafe fn load_endian_u16<const BE: bool>(ptr: *const u8) -> u16 {
  // SAFETY: the caller guarantees `ptr` addresses at least 2 readable
  // bytes. `[u8; 2]` has alignment 1, so viewing the byte pointer as a
  // pointer to the array adds no alignment requirement.
  let raw: [u8; 2] = unsafe { ptr.cast::<[u8; 2]>().read() };
  // `BE` is a const generic: only one arm survives monomorphization.
  match BE {
    true => u16::from_be_bytes(raw),
    false => u16::from_le_bytes(raw),
  }
}

/// Reads one `u32` from the byte address `ptr` in the endianness
/// indicated by `BE`. `BE = false` → little-endian; `BE = true` →
/// big-endian. The unused branch is eliminated by the compiler when
/// the caller is monomorphized.
///
/// **Target-endian aware** — `u32::from_be_bytes` / `u32::from_le_bytes`
/// each emit a `bswap` only when the source byte order differs from
/// the host CPU's native order, matching the SIMD `load_endian_u32x*`
/// helpers. See [`load_endian_u16`] for the full target-endian
/// contract.
///
/// # Safety
///
/// `ptr` must point to at least 4 readable bytes.
#[cfg(feature = "v210")]
#[cfg_attr(not(tarpaulin), inline(always))]
pub(super) unsafe fn load_endian_u32<const BE: bool>(ptr: *const u8) -> u32 {
  // SAFETY: the caller guarantees `ptr` addresses at least 4 readable
  // bytes. `[u8; 4]` has alignment 1, so viewing the byte pointer as a
  // pointer to the array adds no alignment requirement.
  let raw: [u8; 4] = unsafe { ptr.cast::<[u8; 4]>().read() };
  // `BE` is a const generic: only one arm survives monomorphization.
  match BE {
    true => u32::from_be_bytes(raw),
    false => u32::from_le_bytes(raw),
  }
}

#[cfg(any(
  feature = "v210",
  feature = "y2xx",
  feature = "yuv-444-packed",
  feature = "yuv-packed",
  feature = "yuv-planar",
  feature = "yuv-semi-planar",
  feature = "yuva",
))]
/// Saturates a signed intermediate to the `u8` range: anything below 0
/// becomes 0, anything above 255 becomes 255, and in-range values pass
/// through unchanged.
#[cfg_attr(not(tarpaulin), inline(always))]
pub(super) fn clamp_u8(v: i32) -> u8 {
  if v <= 0 {
    0
  } else if v >= 255 {
    u8::MAX
  } else {
    // 0 < v < 255 here, so the narrowing cast is lossless.
    v as u8
  }
}

/// Normalize a `u16` sample (just read host-native from memory) to the
/// host-native interpretation of the source byte order indicated by `BE`.
/// `BE = false` → little-endian source; `BE = true` → big-endian source.
/// The `if BE` branch is dead-code-eliminated per monomorphization, so
/// the matching-endian path is a zero-overhead no-op.
///
/// **Target-endian aware** — matches the SIMD `load_endian_u16x*::<BE>`
/// helpers' semantics: `u16::from_be` / `u16::from_le` each emit a
/// `bswap` only when the source byte order differs from the host CPU's
/// native order. On a BE host the `BE = true` branch is a plain pass-
/// through (no swap) and the `BE = false` branch swaps; on an LE host
/// the polarity reverses. Prefer this over a naive
/// `if BE { v.swap_bytes() }` pattern, which is only correct on LE
/// hosts and would corrupt rows on s390x / other BE hosts.
#[cfg(any(feature = "yuv-planar", feature = "yuv-semi-planar", feature = "yuva",))]
#[cfg_attr(not(tarpaulin), inline(always))]
pub(super) const fn load_u16<const BE: bool>(v: u16) -> u16 {
  // `v` holds the source bytes as loaded in host order; reinterpret
  // them according to the declared source byte order. Only one arm
  // survives monomorphization because `BE` is a const generic.
  match BE {
    true => u16::from_be(v),
    false => u16::from_le(v),
  }
}

/// `(sample * scale_q15 + RND) >> 15`. With input masked to BITS,
/// the `sample * scale` product cannot overflow i32 for any
/// reasonable `OUT_BITS ≤ 16`, so plain arithmetic is sufficient.
#[cfg(any(
  feature = "v210",
  feature = "y2xx",
  feature = "yuv-444-packed",
  feature = "yuv-planar",
  feature = "yuv-semi-planar",
  feature = "yuva",
))]
#[cfg_attr(not(tarpaulin), inline(always))]
pub(super) fn q15_scale(sample: i32, scale_q15: i32) -> i32 {
  // Half of one Q15 unit: adding it before the shift rounds to nearest.
  const HALF: i32 = 1 << 14;
  let product = sample * scale_q15;
  (product + HALF) >> 15
}

/// `(c_u * u_d + c_v * v_d + RND) >> 15`. Chroma sum max ≈ 10⁹ for
/// 14‑bit masked input, well within i32.
#[cfg(any(
  feature = "v210",
  feature = "y2xx",
  feature = "yuv-444-packed",
  feature = "yuv-planar",
  feature = "yuv-semi-planar",
  feature = "yuva",
))]
#[cfg_attr(not(tarpaulin), inline(always))]
pub(super) fn q15_chroma(c_u: i32, u_d: i32, c_v: i32, v_d: i32) -> i32 {
  // Half of one Q15 unit: adding it before the shift rounds to nearest.
  const HALF: i32 = 1 << 14;
  let u_term = c_u * u_d;
  let v_term = c_v * v_d;
  (u_term + v_term + HALF) >> 15
}

/// `(c_u * u_d + c_v * v_d + RND) >> 15` computed in i64. Chroma sum
/// max ≈ 4.3·10⁹ at 16-bit limited range — above i32 but well within
/// i64. Result after the shift is bounded by ~130 000 so the final
/// `as i32` narrow is lossless.
#[cfg(any(
  feature = "y2xx",
  feature = "yuv-444-packed",
  feature = "yuv-planar",
  feature = "yuv-semi-planar",
  feature = "yuva",
))]
#[cfg_attr(not(tarpaulin), inline(always))]
pub(super) fn q15_chroma64(c_u: i32, u_d: i32, c_v: i32, v_d: i32) -> i32 {
  // Half of one Q15 unit: adding it before the shift rounds to nearest.
  const HALF: i64 = 1 << 14;
  // Widen every operand before multiplying so neither product nor
  // their sum can wrap.
  let u_term = i64::from(c_u) * i64::from(u_d);
  let v_term = i64::from(c_v) * i64::from(v_d);
  let rounded = (u_term + v_term + HALF) >> 15;
  rounded as i32
}

/// `(sample * scale_q15 + RND) >> 15` computed in i64. For 16-bit
/// samples at limited-range 16 → u16 scaling, `sample * y_scale` can
/// reach ~2.35·10⁹ — just over i32::MAX — when unclamped `u16` input
/// exceeds the nominal limited-range Y max. Result after the shift
/// is bounded by ~65 536 so the final `as i32` narrow is lossless.
#[cfg(any(
  feature = "y2xx",
  feature = "yuv-444-packed",
  feature = "yuv-planar",
  feature = "yuv-semi-planar",
  feature = "yuva",
))]
#[cfg_attr(not(tarpaulin), inline(always))]
pub(super) fn q15_scale64(sample: i32, scale_q15: i32) -> i32 {
  // Half of one Q15 unit: adding it before the shift rounds to nearest.
  const HALF: i64 = 1 << 14;
  // Widen before multiplying so the product cannot wrap.
  let product = i64::from(sample) * i64::from(scale_q15);
  let rounded = (product + HALF) >> 15;
  rounded as i32
}

/// Compile‑time sample mask for `BITS`: `(1 << BITS) - 1` as `u16`.
/// Returns `0x03FF` for 10‑bit, `0x0FFF` for 12‑bit, `0x3FFF` for
/// 14‑bit. SIMD backends splat this into a vector constant and AND
/// every load against it.
#[cfg(any(feature = "gray", feature = "yuv-planar", feature = "yuva"))]
#[cfg_attr(not(tarpaulin), inline(always))]
pub(super) const fn bits_mask<const BITS: u32>() -> u16 {
  // Shift an all-ones word left by `BITS` to clear the low `BITS` bits,
  // then invert so exactly those low bits are set.
  let low_bits_set: u32 = !(u32::MAX << BITS);
  low_bits_set as u16
}

/// Chroma bias for input bit depth `BITS` — `128 << (BITS - 8)`.
/// 128 for 8‑bit, 512 for 10‑bit, 2048 for 12‑bit, 8192 for 14‑bit.
/// Exposed at module visibility so SIMD backends can reuse it.
#[cfg(any(
  feature = "v210",
  feature = "y2xx",
  feature = "yuv-444-packed",
  feature = "yuv-planar",
  feature = "yuv-semi-planar",
  feature = "yuva",
))]
#[cfg_attr(not(tarpaulin), inline(always))]
pub(super) const fn chroma_bias<const BITS: u32>() -> i32 {
  // Chroma midpoint at 8 bits; deeper formats scale it by the number
  // of bits beyond 8.
  const MID_8BIT: i32 = 128;
  let extra_bits = BITS - 8;
  MID_8BIT << extra_bits
}

/// Range‑scaling params `(y_off, y_scale_q15, c_scale_q15)` for the
/// high‑bit‑depth kernel family.
///
/// `BITS` is the input bit depth (10 / 12 / 14); `OUT_BITS` is the
/// target output range (8 for u8‑packed RGB, equal to `BITS` for
/// native‑depth `u16` output).
///
/// The scales are chosen so that after `((sample - y_off) * scale + RND) >> 15`
/// the result lies in `[0, (1 << OUT_BITS) - 1]` without further
/// downshifting. This keeps the fast path a single Q15 multiply for
/// both output widths.
///
/// - Full range: luma and chroma both use the same scale, mapping
///   `[0, in_max]` to `[0, out_max]`. Same shape as 8‑bit's
///   `(0, 1<<15, 1<<15)` for `BITS == OUT_BITS`.
/// - Limited range: luma maps `[16·k, 235·k]` to `[0, out_max]`,
///   chroma maps `[16·k, 240·k]` to `[0, out_max]`, where
///   `k = 1 << (BITS - 8)`. Matches FFmpeg's `AVCOL_RANGE_MPEG`
///   semantics.
#[cfg(any(
  feature = "v210",
  feature = "y2xx",
  feature = "yuv-444-packed",
  feature = "yuv-packed",
  feature = "yuv-planar",
  feature = "yuv-semi-planar",
  feature = "yuva",
))]
#[cfg_attr(not(tarpaulin), inline(always))]
pub(super) const fn range_params_n<const BITS: u32, const OUT_BITS: u32>(
  full_range: bool,
) -> (i32, i32, i32) {
  // `round((out_peak << 15) / span)`: the Q15 gain that maps an input
  // span of `span` code values onto `[0, out_peak]`. Computed in i64
  // and narrowed only at the end.
  const fn q15_ratio(out_peak: i64, span: i64) -> i32 {
    (((out_peak << 15) + span / 2) / span) as i32
  }

  let in_peak: i64 = (1i64 << BITS) - 1;
  let out_peak: i64 = (1i64 << OUT_BITS) - 1;

  if full_range {
    // Luma and chroma share one gain and there is no luma offset.
    // For `BITS == OUT_BITS` the gain is exactly `1 << 15`; for
    // 10‑bit→8‑bit it is `(255 << 15) / 1023 ≈ 8167.98`, which rounds
    // to 8168.
    let gain = q15_ratio(out_peak, in_peak);
    return (0, gain, gain);
  }

  // Limited range: the 8‑bit constants (offset 16, luma span 219,
  // chroma span 224) scaled up by the bits beyond 8.
  let depth_shift = BITS - 8;
  let luma_floor = 16i32 << depth_shift;
  let luma_span: i64 = 219i64 << depth_shift;
  let chroma_span: i64 = 224i64 << depth_shift;
  (
    luma_floor,
    q15_ratio(out_peak, luma_span),
    q15_ratio(out_peak, chroma_span),
  )
}

/// Q15 YUV → RGB coefficients for a given matrix.
///
/// Full generalized 3×3 matrix:
/// - `R = Y + r_u·u_d + r_v·v_d`
/// - `G = Y + g_u·u_d + g_v·v_d`
/// - `B = Y + b_u·u_d + b_v·v_d`
///
/// where `u_d = U - 128`, `v_d = V - 128`. Standard matrices
/// (BT.601, BT.709, BT.2020-NCL, SMPTE 240M, FCC) have sparse layout
/// with `r_u = b_v = 0`; YCgCo uses all six entries.
///
/// Fields are private; values are read through the same-named accessor
/// methods and constructed via `for_matrix`.
#[cfg(any(
  feature = "v210",
  feature = "y2xx",
  feature = "yuv-444-packed",
  feature = "yuv-packed",
  feature = "yuv-planar",
  feature = "yuv-semi-planar",
  feature = "yuva",
))]
pub(super) struct Coefficients {
  /// Q15 weight of `u_d` in the R equation (0 for the sparse matrices).
  r_u: i32,
  /// Q15 weight of `v_d` in the R equation.
  r_v: i32,
  /// Q15 weight of `u_d` in the G equation.
  g_u: i32,
  /// Q15 weight of `v_d` in the G equation.
  g_v: i32,
  /// Q15 weight of `u_d` in the B equation.
  b_u: i32,
  /// Q15 weight of `v_d` in the B equation (0 for the sparse matrices).
  b_v: i32,
}

#[cfg(any(
  feature = "v210",
  feature = "y2xx",
  feature = "yuv-444-packed",
  feature = "yuv-packed",
  feature = "yuv-planar",
  feature = "yuv-semi-planar",
  feature = "yuva",
))]
impl Coefficients {
  /// Builds the sparse layout shared by every matrix whose `r_u` and
  /// `b_v` entries are zero. Arguments are Q15 values in the order
  /// `r_v, g_u, g_v, b_u`.
  #[cfg_attr(not(tarpaulin), inline(always))]
  const fn sparse(r_v: i32, g_u: i32, g_v: i32, b_u: i32) -> Self {
    Self {
      r_u: 0,
      r_v,
      g_u,
      g_v,
      b_u,
      b_v: 0,
    }
  }

  /// Returns the Q15 coefficient set for matrix `m`.
  ///
  /// Variants without a dedicated arm fall back to the BT.709 values.
  #[cfg_attr(not(tarpaulin), inline(always))]
  pub(super) const fn for_matrix(m: ColorMatrix) -> Self {
    match m {
      // BT.601: r_v=1.402, g_u=-0.344136, g_v=-0.714136, b_u=1.772.
      // FCC shares the BT.601 table here.
      ColorMatrix::Bt601 | ColorMatrix::Fcc => Self::sparse(45941, -11277, -23401, 58065),
      // BT.709: r_v=1.5748, g_u=-0.1873, g_v=-0.4681, b_u=1.8556.
      ColorMatrix::Bt709 => Self::sparse(51606, -6136, -15339, 60808),
      // BT.2020-NCL: r_v=1.4746, g_u=-0.164553, g_v=-0.571353, b_u=1.8814.
      ColorMatrix::Bt2020Ncl => Self::sparse(48325, -5391, -18722, 61653),
      // SMPTE 240M: r_v=1.576, g_u=-0.2253, g_v=-0.4767, b_u=1.826.
      // These are the rounded table values published in SMPTE 240M-1999,
      // NOT values re-derived from KR/KB. Re-deriving from KR=0.212,
      // KB=0.087, KG=0.701 gives g_u ≈ -0.2266 (Q15 ≈ -7423), ~0.13%
      // (~43 LSB pre-Q15-shift) away. That is within rounding tolerance
      // and the table matches the standard's text — do not "fix" it to
      // the analytic value without coordinating with downstream
      // pipelines that also use the published table.
      ColorMatrix::Smpte240m => Self::sparse(51642, -7383, -15620, 59834),
      // YCgCo per H.273 MatrixCoefficients = 8.
      //   U plane → Cg, V plane → Co (biased by 128 each).
      //   R = Y - (Cg - 128) + (Co - 128) = Y - u_d + v_d
      //   G = Y + (Cg - 128)              = Y + u_d
      //   B = Y - (Cg - 128) - (Co - 128) = Y - u_d - v_d
      // Every non-zero coefficient is ±1.0, i.e. ±32768 in Q15.
      ColorMatrix::YCgCo => {
        const ONE_Q15: i32 = 1 << 15;
        Self {
          r_u: -ONE_Q15,
          r_v: ONE_Q15,
          g_u: ONE_Q15,
          g_v: 0,
          b_u: -ONE_Q15,
          b_v: -ONE_Q15,
        }
      }
      // ColorMatrix is #[non_exhaustive] in mediaframe; any variant
      // added there before colconv is updated gets the BT.709 table.
      _ => Self::sparse(51606, -6136, -15339, 60808),
    }
  }

  /// Q15 weight of `u_d` in the R equation.
  #[cfg_attr(not(tarpaulin), inline(always))]
  pub(super) const fn r_u(&self) -> i32 {
    self.r_u
  }

  /// Q15 weight of `v_d` in the R equation.
  #[cfg_attr(not(tarpaulin), inline(always))]
  pub(super) const fn r_v(&self) -> i32 {
    self.r_v
  }

  /// Q15 weight of `u_d` in the G equation.
  #[cfg_attr(not(tarpaulin), inline(always))]
  pub(super) const fn g_u(&self) -> i32 {
    self.g_u
  }

  /// Q15 weight of `v_d` in the G equation.
  #[cfg_attr(not(tarpaulin), inline(always))]
  pub(super) const fn g_v(&self) -> i32 {
    self.g_v
  }

  /// Q15 weight of `u_d` in the B equation.
  #[cfg_attr(not(tarpaulin), inline(always))]
  pub(super) const fn b_u(&self) -> i32 {
    self.b_u
  }

  /// Q15 weight of `v_d` in the B equation.
  #[cfg_attr(not(tarpaulin), inline(always))]
  pub(super) const fn b_v(&self) -> i32 {
    self.b_v
  }
}

// ---- BGR ↔ RGB byte swap ------------------------------------------------

/// Swaps the outer two channels of each packed RGB / BGR triple
/// (byte 0 ↔ byte 2), leaving the middle byte (G) untouched.
///
/// This is the shared implementation behind both `bgr_to_rgb_row` and
/// `rgb_to_bgr_row` — the transformation is a self‑inverse.
#[cfg(feature = "rgb")]
#[cfg_attr(not(tarpaulin), inline(always))]
pub(crate) fn bgr_rgb_swap_row(input: &[u8], output: &mut [u8], width: usize) {
  let row_bytes = width * 3;
  debug_assert!(input.len() >= row_bytes, "input row too short");
  debug_assert!(output.len() >= row_bytes, "output row too short");

  // Walk both rows one 3-byte pixel at a time. Only the first `width`
  // pixels are touched; any trailing bytes in `output` are left alone.
  let src_pixels = input[..row_bytes].chunks_exact(3);
  let dst_pixels = output[..row_bytes].chunks_exact_mut(3);
  for (dst, src) in dst_pixels.zip(src_pixels) {
    // Outer channels trade places, the middle channel is copied.
    dst[0] = src[2];
    dst[1] = src[1];
    dst[2] = src[0];
  }
}

#[cfg(all(test, feature = "std"))]
mod tests;