colconv 0.1.0

SIMD-dispatched color-conversion kernels covering the FFmpeg AVPixelFormat space, with a Sink-based API so consumers pick which derived outputs (RGB / Luma / HSV / custom) they want without paying for the ones they don't.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
//! AVX-512 VUYA / VUYX (packed YUV 4:4:4, 8-bit) kernels.
//!
//! ## Layout
//!
//! Four `u8` elements per pixel: `V(8) ‖ U(8) ‖ Y(8) ‖ A(8)`.
//! VUYA carries a real alpha channel in byte 3. VUYX treats byte 3 as
//! padding and forces output α to `0xFF`.
//!
//! ## Per-iter pipeline (64 px / iter)
//!
//! Four contiguous `_mm512_loadu_si512` loads fetch 256 bytes = 64
//! pixels of `V U Y A`. Each 512-bit register holds 16 pixels (4 in
//! each of its four 128-bit lanes). Per-128-bit-lane `_mm512_shuffle_epi8`
//! gathers each channel's 4 bytes from each lane into the lane's low
//! 32 bits (one `i32` lane); the upper 12 bytes of each lane are zeroed.
//!
//! After per-lane shuffle (e.g. for V on `raw_c0` covering pixels 0..15):
//! ```text
//!   v_c0 i32 lanes:
//!     [V0..V3, _, _, _, V4..V7, _, _, _, V8..V11, _, _, _, V12..V15, _, _, _]
//! ```
//! (lane 0 holds the 4-byte i32 packing `V0|V1|V2|V3`; lanes 1, 2, 3
//! are zero; lane 4 holds `V4|V5|V6|V7`; etc.) Two rounds of
//! `_mm512_permutex2var_epi32` (cross-vector i32 gather) then
//! consolidate the 16 valid i32 lanes scattered across the four
//! per-load partials into a single naturally-ordered 64-byte channel
//! vector:
//!
//! - Round 1: from `(v_c0, v_c1)` pick valid i32 lanes (0, 4, 8, 12) of
//!   each, producing 8 valid lanes covering V0..V31 in i32-lanes 0..7
//!   of the result. Same for `(v_c2, v_c3)` covering V32..V63.
//! - Round 2: combine the two half-results into a full 16-lane vector
//!   via `_mm512_permutex2var_epi32` with index `[0..7, 16..23]`.
//!
//! Net deinterleave: 4 loads + 16 shuffles + 12 permutes for all four
//! channels (each per-load shuffle is per-channel; channels are
//! independent and parallelizable). The cross-lane primitive used is
//! `vpermt2d` (i32) from AVX-512F — no AVX-512VBMI required (which
//! would be needed for `_mm512_permutex2var_epi8`).
//!
//! After deinterleave: zero-extend each 64-byte channel to two
//! `i16x32` halves via `_mm512_cvtepu8_epi16` on the low / high 256-bit
//! halves. The Q15 chroma + Y pipeline at BITS=8 is byte-identical to
//! the AVX-512 NV24 / packed YUV422 kernels — `chroma_i16x32`,
//! `scale_y`, `narrow_u8x64` from `mod.rs`.
//!
//! ## α handling
//!
//! When `ALPHA && ALPHA_SRC`, the A channel from the deinterleave is
//! passed straight through. When `ALPHA && !ALPHA_SRC`,
//! `_mm512_set1_epi8(-1)` (= 0xFF) is used. The `<false, false>`
//! monomorphization (RGB) drops the A channel entirely. The
//! `<false, true>` combination is rejected at monomorphization via
//! `const { assert! }`.
//!
//! ## Tail
//!
//! `width % 64` remaining pixels fall through to
//! `scalar::vuya_to_rgb_or_rgba_row`.
use core::arch::x86_64::*;

use super::*;
use crate::{ColorMatrix, row::scalar};

// ---- Static permute index tables ----------------------------------------
//
// VUYA layout per pixel: [V, U, Y, A] (4 u8 per pixel).
// 64 pixels occupy 256 bytes across 4 × __m512i loads:
//   raw_c0: bytes  0..63  → pixels  0..15
//   raw_c1: bytes 64..127 → pixels 16..31
//   raw_c2: bytes 128..191 → pixels 32..47
//   raw_c3: bytes 192..255 → pixels 48..63
//
// Within one __m512i, each of its 4 × 128-bit lanes holds 4 pixels' worth
// of VUYA (16 bytes). Per-128-bit-lane shuffle masks gather each channel's
// 4 bytes into the low 4 bytes of each lane (one `i32` lane in the low
// 32 bits of the lane); the upper 12 bytes of each lane are zeroed.
//
// After per-lane shuffle, the result holds the channel's bytes at i32
// lanes 0, 4, 8, 12 (one valid i32 per 128-bit lane). All other i32
// lanes are zero.
//
// Two rounds of `_mm512_permutex2var_epi32` then consolidate four
// such per-load partials into a single 16-lane (i32) channel vector
// holding 64 bytes in natural pixel order.

/// Round-1 index for `_mm512_permutex2var_epi32(a, idx, b)`: gather the
/// valid i32 lanes (0, 4, 8, 12) of `a` (covering 16 pixels of one
/// channel) and the valid i32 lanes (0, 4, 8, 12) of `b` (covering the
/// next 16 pixels) into the low 8 lanes of the result. Lanes 8..16 are
/// don't-care (use lane 0 as a safe index).
///
/// For `_mm512_permutex2var_epi32`:
///   `idx[i] < 16`  → output lane i = a[idx[i]]
///   `idx[i] >= 16` → output lane i = b[idx[i] - 16]
#[rustfmt::skip]
static GATHER_PAIR_IDX: [i32; 16] = [
  // From a (lanes 0..4 of output): 16 pixels of channel from one __m512i.
  0, 4, 8, 12,
  // From b (lanes 4..8 of output): 16 pixels from the next __m512i.
  16, 20, 24, 28,
  // Don't-care lanes 8..16: safe index 0.
  0, 0, 0, 0, 0, 0, 0, 0,
];

/// Round-2 index: combine two 8-i32-lane half-vectors into a full
/// 16-lane vector. Low 8 lanes from `a` (pixels 0..31), high 8 lanes
/// from `b` (pixels 32..63).
#[rustfmt::skip]
static COMBINE_IDX: [i32; 16] = [
  0, 1, 2, 3, 4, 5, 6, 7,
  16, 17, 18, 19, 20, 21, 22, 23,
];

// ---- Deinterleave helper ------------------------------------------------

/// Loads 64 VUYA quadruples (256 bytes = 64 pixels) from `ptr` and
/// unpacks them into four `__m512i` channel vectors holding 64 bytes
/// in natural pixel order (lane n = byte from pixel n):
/// - `v_vec`, `u_vec`, `y_vec`, `a_vec` — each 64 bytes, lane n = V/U/Y/A
///   from pixel n.
///
/// Strategy:
/// 1. 4 × `_mm512_loadu_si512` (one per 16-pixel group).
/// 2. Per-128-bit-lane `_mm512_shuffle_epi8` per channel, gathering each
///    channel's 4 bytes from each of the 4 × 128-bit lanes into the
///    low 4 bytes of each lane (one `i32` lane). Upper bytes zeroed.
/// 3. Two rounds of `_mm512_permutex2var_epi32` per channel: round 1
///    gathers the valid i32 lanes (0, 4, 8, 12) from each of two
///    consecutive `__m512i` partials into the low 8 lanes of an
///    intermediate; round 2 combines the two 8-lane halves into the
///    full 16-lane channel vector.
///
/// # Safety
///
/// `ptr` must point to at least 256 readable bytes (64 VUYA quadruples).
/// Caller's `target_feature` must include AVX-512F + AVX-512BW (BW
/// provides `_mm512_shuffle_epi8`; F provides `_mm512_permutex2var_epi32`).
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn deinterleave_vuya_avx512(ptr: *const u8) -> (__m512i, __m512i, __m512i, __m512i) {
  // SAFETY: caller obligation — `ptr` has 256 bytes readable; AVX-512F
  // + AVX-512BW are available.
  unsafe {
    // Load 4 × __m512i contiguously (64 pixels × 4 channels × u8 = 256 bytes).
    //
    // Each load covers 16 contiguous pixels (4 pixels per 128-bit lane):
    //   raw_c0 lanes: P0..3, P4..7, P8..11, P12..15
    //   raw_c1 lanes: P16..19, P20..23, P24..27, P28..31
    //   raw_c2 lanes: P32..35, P36..39, P40..43, P44..47
    //   raw_c3 lanes: P48..51, P52..55, P56..59, P60..63
    let raw_c0 = _mm512_loadu_si512(ptr.cast());
    let raw_c1 = _mm512_loadu_si512(ptr.add(64).cast());
    let raw_c2 = _mm512_loadu_si512(ptr.add(128).cast());
    let raw_c3 = _mm512_loadu_si512(ptr.add(192).cast());

    // Per-128-bit-lane shuffle masks: gather each channel's 4 bytes from
    // a 128-bit lane (4 pixels of VUYA = 16 bytes) into the low 4 bytes
    // of the lane. -1 zeroes the upper 12 bytes of each lane.
    //
    // `_mm512_broadcast_i32x4` replicates a 16-byte mask across all four
    // 128-bit lanes of the __m512i.
    let v_lane_mask = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    let u_lane_mask = _mm_setr_epi8(1, 5, 9, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    let y_lane_mask = _mm_setr_epi8(2, 6, 10, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    let a_lane_mask = _mm_setr_epi8(3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    let v_mask = _mm512_broadcast_i32x4(v_lane_mask);
    let u_mask = _mm512_broadcast_i32x4(u_lane_mask);
    let y_mask = _mm512_broadcast_i32x4(y_lane_mask);
    let a_mask = _mm512_broadcast_i32x4(a_lane_mask);

    // Apply the per-channel masks to each load. Each result holds
    // the channel's 16 valid bytes spread across i32 lanes 0, 4, 8, 12.
    //
    // For V (analogous for U/Y/A), v_c0 (covering pixels 0..15) holds:
    //   i32-lane 0:  V0V1V2V3   (4 bytes, low 32 of lane 0)
    //   i32-lane 1:  zero
    //   i32-lane 2:  zero
    //   i32-lane 3:  zero
    //   i32-lane 4:  V4V5V6V7
    //   i32-lane 5:  zero
    //   i32-lane 6:  zero
    //   i32-lane 7:  zero
    //   i32-lane 8:  V8V9V10V11
    //   ...
    //   i32-lane 12: V12V13V14V15
    //   i32-lanes 13, 14, 15: zero
    let v_c0 = _mm512_shuffle_epi8(raw_c0, v_mask);
    let v_c1 = _mm512_shuffle_epi8(raw_c1, v_mask);
    let v_c2 = _mm512_shuffle_epi8(raw_c2, v_mask);
    let v_c3 = _mm512_shuffle_epi8(raw_c3, v_mask);
    let u_c0 = _mm512_shuffle_epi8(raw_c0, u_mask);
    let u_c1 = _mm512_shuffle_epi8(raw_c1, u_mask);
    let u_c2 = _mm512_shuffle_epi8(raw_c2, u_mask);
    let u_c3 = _mm512_shuffle_epi8(raw_c3, u_mask);
    let y_c0 = _mm512_shuffle_epi8(raw_c0, y_mask);
    let y_c1 = _mm512_shuffle_epi8(raw_c1, y_mask);
    let y_c2 = _mm512_shuffle_epi8(raw_c2, y_mask);
    let y_c3 = _mm512_shuffle_epi8(raw_c3, y_mask);
    let a_c0 = _mm512_shuffle_epi8(raw_c0, a_mask);
    let a_c1 = _mm512_shuffle_epi8(raw_c1, a_mask);
    let a_c2 = _mm512_shuffle_epi8(raw_c2, a_mask);
    let a_c3 = _mm512_shuffle_epi8(raw_c3, a_mask);

    // Permute index tables.
    let pair_idx = _mm512_loadu_si512(GATHER_PAIR_IDX.as_ptr().cast());
    let comb_idx = _mm512_loadu_si512(COMBINE_IDX.as_ptr().cast());

    // Round 1: gather valid i32 lanes (0, 4, 8, 12) from each pair of
    // partials. Result has 8 valid lanes (0..8) covering 32 pixels'
    // worth of channel bytes; lanes 8..16 are don't-care.
    //
    // For V: v_01 covers pixels 0..31 (lanes 0..8 = V0..V31 packed
    // 4 bytes per i32-lane); v_23 covers pixels 32..63.
    let v_01 = _mm512_permutex2var_epi32(v_c0, pair_idx, v_c1);
    let v_23 = _mm512_permutex2var_epi32(v_c2, pair_idx, v_c3);
    let u_01 = _mm512_permutex2var_epi32(u_c0, pair_idx, u_c1);
    let u_23 = _mm512_permutex2var_epi32(u_c2, pair_idx, u_c3);
    let y_01 = _mm512_permutex2var_epi32(y_c0, pair_idx, y_c1);
    let y_23 = _mm512_permutex2var_epi32(y_c2, pair_idx, y_c3);
    let a_01 = _mm512_permutex2var_epi32(a_c0, pair_idx, a_c1);
    let a_23 = _mm512_permutex2var_epi32(a_c2, pair_idx, a_c3);

    // Round 2: combine two 8-lane half-vectors into a full 16-lane
    // (i32) channel vector. `COMBINE_IDX` picks lanes 0..8 from `a` and
    // lanes 16..24 (= `b` lanes 0..8) for the high half.
    //
    // For V: v_vec is a __m512i with 64 bytes in natural pixel order:
    // byte n = V from pixel n.
    let v_vec = _mm512_permutex2var_epi32(v_01, comb_idx, v_23);
    let u_vec = _mm512_permutex2var_epi32(u_01, comb_idx, u_23);
    let y_vec = _mm512_permutex2var_epi32(y_01, comb_idx, y_23);
    let a_vec = _mm512_permutex2var_epi32(a_01, comb_idx, a_23);

    (v_vec, u_vec, y_vec, a_vec)
  }
}

// ---- Shared RGB / RGBA kernel (64 px/iter) ------------------------------

/// AVX-512 VUYA / VUYX → packed u8 RGB or RGBA.
///
/// Byte-identical to `scalar::vuya_to_rgb_or_rgba_row::<ALPHA, ALPHA_SRC>`.
///
/// Block size: 64 pixels per SIMD iteration (four `_mm512_loadu_si512`
/// loads, 256 bytes total).
///
/// The three valid monomorphizations are:
/// - `<false, false>` — RGB (drops α)
/// - `<true, true>`  — RGBA, source α pass-through (VUYA)
/// - `<true, false>` — RGBA, force α = `0xFF` (VUYX)
///
/// `<false, true>` is rejected at monomorphization via `const { assert! }`.
///
/// # Safety
///
/// 1. **AVX-512F + AVX-512BW must be available.**
/// 2. `packed.len() >= width * 4`.
/// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn vuya_to_rgb_or_rgba_row<const ALPHA: bool, const ALPHA_SRC: bool>(
  packed: &[u8],
  out: &mut [u8],
  width: usize,
  matrix: ColorMatrix,
  full_range: bool,
) {
  // Source alpha requires RGBA output.
  const { assert!(!ALPHA_SRC || ALPHA) };
  debug_assert!(packed.len() >= width * 4, "packed row too short");
  let bpp: usize = if ALPHA { 4 } else { 3 };
  debug_assert!(out.len() >= width * bpp, "out row too short");

  let coeffs = scalar::Coefficients::for_matrix(matrix);
  let (y_off, y_scale, c_scale) = scalar::range_params_n::<8, 8>(full_range);
  let bias = scalar::chroma_bias::<8>();
  const RND: i32 = 1 << 14;

  // SAFETY: AVX-512F + AVX-512BW availability is the caller's obligation.
  unsafe {
    let rnd_v = _mm512_set1_epi32(RND);
    let y_off_v = _mm512_set1_epi16(y_off as i16);
    let y_scale_v = _mm512_set1_epi32(y_scale);
    let c_scale_v = _mm512_set1_epi32(c_scale);
    let bias_v = _mm512_set1_epi16(bias as i16);
    let cru = _mm512_set1_epi32(coeffs.r_u());
    let crv = _mm512_set1_epi32(coeffs.r_v());
    let cgu = _mm512_set1_epi32(coeffs.g_u());
    let cgv = _mm512_set1_epi32(coeffs.g_v());
    let cbu = _mm512_set1_epi32(coeffs.b_u());
    let cbv = _mm512_set1_epi32(coeffs.b_v());
    let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
    // 0xFF for VUYX forced-opaque path.
    let alpha_u8 = _mm512_set1_epi8(-1i8);

    let mut x = 0usize;
    while x + 64 <= width {
      // Deinterleave 64 VUYA quadruples → V, U, Y, A as u8x64 in
      // natural pixel order.
      let (v_u8, u_u8, y_u8, a_u8) = deinterleave_vuya_avx512(packed.as_ptr().add(x * 4));

      // Zero-extend each channel to two i16x32 halves (low 32 bytes →
      // pixels 0..31, high 32 bytes → pixels 32..63).
      let v_lo_i16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(v_u8));
      let v_hi_i16 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64::<1>(v_u8));
      let u_lo_i16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(u_u8));
      let u_hi_i16 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64::<1>(u_u8));
      let y_lo_i16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(y_u8));
      let y_hi_i16 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64::<1>(y_u8));

      // Subtract chroma bias (128 for 8-bit).
      let u_lo_sub = _mm512_sub_epi16(u_lo_i16, bias_v);
      let u_hi_sub = _mm512_sub_epi16(u_hi_i16, bias_v);
      let v_lo_sub = _mm512_sub_epi16(v_lo_i16, bias_v);
      let v_hi_sub = _mm512_sub_epi16(v_hi_i16, bias_v);

      // Widen each i16x32 chroma half into two i32x16 halves for Q15
      // multiply.
      let u_lo_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_lo_sub));
      let u_lo_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_lo_sub));
      let u_hi_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_hi_sub));
      let u_hi_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_hi_sub));
      let v_lo_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_lo_sub));
      let v_lo_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_lo_sub));
      let v_hi_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_hi_sub));
      let v_hi_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_hi_sub));

      // u_d / v_d = (u * c_scale + RND) >> 15.
      let u_d_lo_a = q15_shift(_mm512_add_epi32(
        _mm512_mullo_epi32(u_lo_a, c_scale_v),
        rnd_v,
      ));
      let u_d_lo_b = q15_shift(_mm512_add_epi32(
        _mm512_mullo_epi32(u_lo_b, c_scale_v),
        rnd_v,
      ));
      let u_d_hi_a = q15_shift(_mm512_add_epi32(
        _mm512_mullo_epi32(u_hi_a, c_scale_v),
        rnd_v,
      ));
      let u_d_hi_b = q15_shift(_mm512_add_epi32(
        _mm512_mullo_epi32(u_hi_b, c_scale_v),
        rnd_v,
      ));
      let v_d_lo_a = q15_shift(_mm512_add_epi32(
        _mm512_mullo_epi32(v_lo_a, c_scale_v),
        rnd_v,
      ));
      let v_d_lo_b = q15_shift(_mm512_add_epi32(
        _mm512_mullo_epi32(v_lo_b, c_scale_v),
        rnd_v,
      ));
      let v_d_hi_a = q15_shift(_mm512_add_epi32(
        _mm512_mullo_epi32(v_hi_a, c_scale_v),
        rnd_v,
      ));
      let v_d_hi_b = q15_shift(_mm512_add_epi32(
        _mm512_mullo_epi32(v_hi_b, c_scale_v),
        rnd_v,
      ));

      // 64 chroma per channel: two `chroma_i16x32` calls per channel
      // (no chroma duplication at 4:4:4 — one chroma sample per Y pixel).
      let r_chroma_lo = chroma_i16x32(
        cru, crv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup,
      );
      let r_chroma_hi = chroma_i16x32(
        cru, crv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup,
      );
      let g_chroma_lo = chroma_i16x32(
        cgu, cgv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup,
      );
      let g_chroma_hi = chroma_i16x32(
        cgu, cgv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup,
      );
      let b_chroma_lo = chroma_i16x32(
        cbu, cbv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup,
      );
      let b_chroma_hi = chroma_i16x32(
        cbu, cbv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup,
      );

      // Y path: scale each i16x32 half independently.
      let y_scaled_lo = scale_y(y_lo_i16, y_off_v, y_scale_v, rnd_v, pack_fixup);
      let y_scaled_hi = scale_y(y_hi_i16, y_off_v, y_scale_v, rnd_v, pack_fixup);

      // Saturating i16 add Y + chroma per channel, then narrow to u8x64
      // with natural lane order via `narrow_u8x64`.
      let r_lo = _mm512_adds_epi16(y_scaled_lo, r_chroma_lo);
      let r_hi = _mm512_adds_epi16(y_scaled_hi, r_chroma_hi);
      let g_lo = _mm512_adds_epi16(y_scaled_lo, g_chroma_lo);
      let g_hi = _mm512_adds_epi16(y_scaled_hi, g_chroma_hi);
      let b_lo = _mm512_adds_epi16(y_scaled_lo, b_chroma_lo);
      let b_hi = _mm512_adds_epi16(y_scaled_hi, b_chroma_hi);

      let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup);
      let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup);
      let b_u8 = narrow_u8x64(b_lo, b_hi, pack_fixup);

      let out_ptr = out.as_mut_ptr().add(x * bpp);
      if ALPHA {
        let a_vec = if ALPHA_SRC { a_u8 } else { alpha_u8 };
        write_rgba_64(r_u8, g_u8, b_u8, a_vec, out_ptr);
      } else {
        write_rgb_64(r_u8, g_u8, b_u8, out_ptr);
      }

      x += 64;
    }

    // Scalar tail — remaining < 64 pixels.
    if x < width {
      scalar::vuya_to_rgb_or_rgba_row::<ALPHA, ALPHA_SRC>(
        &packed[x * 4..],
        &mut out[x * bpp..],
        width - x,
        matrix,
        full_range,
      );
    }
  }
}

// ---- Thin wrappers ------------------------------------------------------

/// AVX-512 VUYA / VUYX → packed **RGB** (3 bpp). Alpha byte in source is
/// discarded — RGB output has no alpha channel.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn vuya_to_rgb_row(
  packed: &[u8],
  rgb_out: &mut [u8],
  width: usize,
  matrix: ColorMatrix,
  full_range: bool,
) {
  // SAFETY: AVX-512F + AVX-512BW availability is the caller's obligation.
  unsafe {
    vuya_to_rgb_or_rgba_row::<false, false>(packed, rgb_out, width, matrix, full_range);
  }
}

/// AVX-512 VUYA → packed **RGBA** (4 bpp). Source A byte is passed
/// through verbatim.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn vuya_to_rgba_row(
  packed: &[u8],
  rgba_out: &mut [u8],
  width: usize,
  matrix: ColorMatrix,
  full_range: bool,
) {
  // SAFETY: AVX-512F + AVX-512BW availability is the caller's obligation.
  unsafe {
    vuya_to_rgb_or_rgba_row::<true, true>(packed, rgba_out, width, matrix, full_range);
  }
}

/// AVX-512 VUYX → packed **RGBA** (4 bpp). Source A byte is padding;
/// output α is forced to `0xFF` (opaque).
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn vuyx_to_rgba_row(
  packed: &[u8],
  rgba_out: &mut [u8],
  width: usize,
  matrix: ColorMatrix,
  full_range: bool,
) {
  // SAFETY: AVX-512F + AVX-512BW availability is the caller's obligation.
  unsafe {
    vuya_to_rgb_or_rgba_row::<true, false>(packed, rgba_out, width, matrix, full_range);
  }
}

// ---- Luma extraction (64 px/iter) ---------------------------------------

/// AVX-512 VUYA / VUYX → u8 luma. Y is the third byte (offset 2) of each
/// pixel quadruple.
///
/// Byte-identical to `scalar::vuya_to_luma_row`.
///
/// Block size: 64 pixels per SIMD iteration. Reuses the full 4-channel
/// deinterleave and discards V/U/A: keeping the same code path gives the
/// lane-order regression test the strongest possible coverage — any
/// deinterleave bug in the V/U/Y path manifests identically here.
///
/// # Safety
///
/// 1. **AVX-512F + AVX-512BW must be available.**
/// 2. `packed.len() >= width * 4`.
/// 3. `luma_out.len() >= width`.
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn vuya_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) {
  debug_assert!(packed.len() >= width * 4, "packed row too short");
  debug_assert!(luma_out.len() >= width, "luma row too short");

  // SAFETY: AVX-512F + AVX-512BW availability is the caller's obligation.
  unsafe {
    let mut x = 0usize;
    while x + 64 <= width {
      let (_v, _u, y_vec, _a) = deinterleave_vuya_avx512(packed.as_ptr().add(x * 4));
      _mm512_storeu_si512(luma_out.as_mut_ptr().add(x).cast(), y_vec);
      x += 64;
    }

    // Scalar tail — remaining < 64 pixels.
    if x < width {
      scalar::vuya_to_luma_row(&packed[x * 4..], &mut luma_out[x..], width - x);
    }
  }
}

/// AVX-512 VUYA → u16 luma (zero-extended Y bytes). Y is the third byte
/// (offset 2) of each pixel quadruple. 64 pixels per SIMD iteration.
///
/// Strategy: deinterleave 64 VUYA quadruples to get a `__m512i` of 64 Y
/// bytes. `_mm512_cvtepu8_epi16` widens the low 32 bytes (low 256-bit lane)
/// to u16x32; same for the high 32 bytes. Two `_mm512_storeu_si512` writes
/// output 64 u16 values.
///
/// Byte-identical to `scalar::vuya_to_luma_u16_row`.
///
/// # Safety
///
/// 1. **AVX-512F + AVX-512BW must be available.**
/// 2. `packed.len() >= width * 4`.
/// 3. `out.len() >= width`.
#[cfg_attr(not(any(feature = "std", feature = "alloc")), allow(dead_code))]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn vuya_to_luma_u16_row(packed: &[u8], out: &mut [u16], width: usize) {
  debug_assert!(packed.len() >= width * 4, "packed row too short");
  debug_assert!(out.len() >= width, "out too short");

  // SAFETY: AVX-512F + AVX-512BW availability is the caller's obligation.
  unsafe {
    let mut x = 0usize;
    while x + 64 <= width {
      // Deinterleave 64 VUYA quadruples; channel 2 = Y (u8 × 64).
      let (_v, _u, y_u8, _a) = deinterleave_vuya_avx512(packed.as_ptr().add(x * 4));

      // Widen low 32 Y bytes → u16x32 (pixels 0-31).
      let lo_u16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(y_u8));
      // Widen high 32 Y bytes → u16x32 (pixels 32-63).
      let hi_u16 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64::<1>(y_u8));

      _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), lo_u16);
      _mm512_storeu_si512(out.as_mut_ptr().add(x + 32).cast(), hi_u16);
      x += 64;
    }

    // Scalar tail — remaining < 64 pixels.
    if x < width {
      scalar::vuya_to_luma_u16_row(&packed[x * 4..], &mut out[x..], width - x);
    }
  }
}

/// AVX-512 VUYX → u16 luma (zero-extended Y bytes). Byte-identical to
/// [`vuya_to_luma_u16_row`] — Y is at byte offset 2 of each quadruple
/// regardless of α semantics; the X byte is discarded.
///
/// # Safety
///
/// 1. **AVX-512F + AVX-512BW must be available.**
/// 2. `packed.len() >= width * 4`.
/// 3. `out.len() >= width`.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "avx512f,avx512bw")]
pub(crate) unsafe fn vuyx_to_luma_u16_row(packed: &[u8], out: &mut [u16], width: usize) {
  // SAFETY: AVX-512F + AVX-512BW availability is the caller's obligation.
  unsafe {
    vuya_to_luma_u16_row(packed, out, width);
  }
}