colconv 0.1.0

SIMD-dispatched color-conversion kernels covering the FFmpeg AVPixelFormat space, with a Sink-based API so consumers pick which derived outputs (RGB / Luma / HSV / custom) they want without paying for the ones they don't.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
//! SSE4.1 kernels for VUYA / VUYX packed YUV 4:4:4 8-bit family.
//!
//! ## Layout
//!
//! Four `u8` elements per pixel: `V(8) ‖ U(8) ‖ Y(8) ‖ A(8)`.
//! VUYA carries a real alpha channel in byte 3. VUYX treats byte 3 as
//! padding and forces output α to `0xFF`.
//!
//! ## Per-iter pipeline (16 px / iter)
//!
//! Four `_mm_loadu_si128` loads fetch 64 bytes = 16 pixels of
//! `V U Y A V U Y A V U Y A V U Y A`. Each 16-byte register holds 4
//! pixels. `_mm_shuffle_epi8` gathers the bytes at positions
//! 0/4/8/12 (V), 1/5/9/13 (U), 2/6/10/14 (Y), 3/7/11/15 (A) of each
//! register into one contiguous 4-byte group per channel.
//! A 32-bit then 64-bit unpack cascade (`_mm_unpack*_epi32` →
//! `_mm_unpack*_epi64`) merges the 4 × 4-byte groups from the four loads
//! into a full 16-byte channel vector.
//!
//! For each combined channel vector (V/U/Y), zero-extend low/high halves
//! to i16x8 via `_mm_cvtepu8_epi16`, subtract chroma bias (128), widen
//! to i32x4, and run the Q15 pipeline identical to the NV24 and XView36
//! SSE4.1 siblings. Pack RGB output via `_mm_packus_epi16` cascades.
//!
//! α handling: when `ALPHA && ALPHA_SRC`, use the A vector from the
//! deinterleave. When `ALPHA && !ALPHA_SRC`, use `_mm_set1_epi8(-1)`
//! (= 0xFF). RGB interleave via `write_rgb_16`; RGBA via `write_rgba_16`.
//!
//! ## Tail
//!
//! `width % 16` remaining pixels fall through to
//! `scalar::vuya_to_rgb_or_rgba_row`.
use core::arch::x86_64::*;

use super::*;
use crate::{ColorMatrix, row::scalar};

// ---- Deinterleave helper ------------------------------------------------

/// Deinterleaves 16 VUYA quadruples (64 bytes = 16 pixels) from `ptr`
/// into `(v_vec, u_vec, y_vec, a_vec)` — four `__m128i` vectors each
/// holding 16 `u8` samples in pixel order.
///
/// ## Strategy
///
/// Load 4 × 16 bytes (4 pixels each). Each load contains bytes in the
/// order `V U Y A V U Y A V U Y A V U Y A`. One `_mm_shuffle_epi8` per
/// load regroups it by channel into four 32-bit words:
/// `[V0..V3 | U0..U3 | Y0..Y3 | A0..A3]`.
///
/// The four regrouped registers then form a 4 × 4 matrix of 32-bit words
/// (rows = loads, columns = channels). Transposing that matrix with
/// `_mm_unpacklo_epi32` / `_mm_unpackhi_epi32` followed by
/// `_mm_unpacklo_epi64` / `_mm_unpackhi_epi64` yields one 16-byte vector
/// per channel. In total: 4 shuffles + 8 unpacks per 16 pixels.
///
/// # Safety
///
/// `ptr` must point to at least 64 readable bytes (16 VUYA quadruples).
/// Caller's `target_feature` must include SSE4.1 (implies SSSE3 for
/// `_mm_shuffle_epi8`).
#[inline]
#[target_feature(enable = "sse4.1")]
unsafe fn deinterleave_vuya(ptr: *const u8) -> (__m128i, __m128i, __m128i, __m128i) {
  // SAFETY: the caller guarantees 64 readable bytes at `ptr` and SSE4.1
  // support; the four unaligned loads cover exactly bytes 0..64.
  unsafe {
    // Load 4 × 16 bytes (4 pixels each).
    let raw0 = _mm_loadu_si128(ptr.cast()); // pixels 0-3
    let raw1 = _mm_loadu_si128(ptr.add(16).cast()); // pixels 4-7
    let raw2 = _mm_loadu_si128(ptr.add(32).cast()); // pixels 8-11
    let raw3 = _mm_loadu_si128(ptr.add(48).cast()); // pixels 12-15

    // Per-register layout: V0 U0 Y0 A0 V1 U1 Y1 A1 V2 U2 Y2 A2 V3 U3 Y3 A3.
    // The mask regroups the bytes by channel so that each 32-bit word of
    // the result holds the four samples of one channel:
    //   word 0 = V (bytes 0, 4,  8, 12)
    //   word 1 = U (bytes 1, 5,  9, 13)
    //   word 2 = Y (bytes 2, 6, 10, 14)
    //   word 3 = A (bytes 3, 7, 11, 15)
    let by_channel = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);

    let g0 = _mm_shuffle_epi8(raw0, by_channel); // [V0..3   | U0..3   | Y0..3   | A0..3  ]
    let g1 = _mm_shuffle_epi8(raw1, by_channel); // [V4..7   | U4..7   | Y4..7   | A4..7  ]
    let g2 = _mm_shuffle_epi8(raw2, by_channel); // [V8..11  | U8..11  | Y8..11  | A8..11 ]
    let g3 = _mm_shuffle_epi8(raw3, by_channel); // [V12..15 | U12..15 | Y12..15 | A12..15]

    // 4 × 4 transpose of 32-bit words, step 1.
    // unpacklo_epi32(a, b) = [a0, b0, a1, b1]; unpackhi_epi32(a, b) = [a2, b2, a3, b3].
    let vu_01 = _mm_unpacklo_epi32(g0, g1); // [V0..3,  V4..7,   U0..3,  U4..7  ]
    let ya_01 = _mm_unpackhi_epi32(g0, g1); // [Y0..3,  Y4..7,   A0..3,  A4..7  ]
    let vu_23 = _mm_unpacklo_epi32(g2, g3); // [V8..11, V12..15, U8..11, U12..15]
    let ya_23 = _mm_unpackhi_epi32(g2, g3); // [Y8..11, Y12..15, A8..11, A12..15]

    // Step 2: pair up the 64-bit halves that belong to the same channel.
    let v_vec = _mm_unpacklo_epi64(vu_01, vu_23); // [V0..V15]
    let u_vec = _mm_unpackhi_epi64(vu_01, vu_23); // [U0..U15]
    let y_vec = _mm_unpacklo_epi64(ya_01, ya_23); // [Y0..Y15]
    let a_vec = _mm_unpackhi_epi64(ya_01, ya_23); // [A0..A15]

    (v_vec, u_vec, y_vec, a_vec)
  }
}

// ---- shared RGB/RGBA kernel (16 px/iter) --------------------------------

/// SSE4.1 VUYA/VUYX → packed u8 RGB or RGBA, 16 pixels per SIMD iteration.
///
/// Byte-identical to `scalar::vuya_to_rgb_or_rgba_row::<ALPHA, ALPHA_SRC>`;
/// the trailing `width % 16` pixels are delegated to that scalar routine.
///
/// The const parameters select the output shape:
/// - `<false, false>` — RGB, 3 bytes per pixel (source byte 3 is dropped)
/// - `<true, true>`  — RGBA, α copied from the source (VUYA)
/// - `<true, false>` — RGBA, α forced to `0xFF` (VUYX)
///
/// `<false, true>` is rejected at monomorphization by the
/// `const { assert! }` at the top of the body.
///
/// # Safety
///
/// 1. **SSE4.1 must be available.**
/// 2. `packed.len() >= width * 4`.
/// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn vuya_to_rgb_or_rgba_row<const ALPHA: bool, const ALPHA_SRC: bool>(
  packed: &[u8],
  out: &mut [u8],
  width: usize,
  matrix: ColorMatrix,
  full_range: bool,
) {
  // Forwarding source α only makes sense when the output has an α slot.
  const { assert!(!ALPHA_SRC || ALPHA) };
  debug_assert!(packed.len() >= width * 4, "packed row too short");
  let out_bpp: usize = if ALPHA { 4 } else { 3 };
  debug_assert!(out.len() >= width * out_bpp, "out row too short");

  let coeffs = scalar::Coefficients::for_matrix(matrix);
  let (y_off, y_scale, c_scale) = scalar::range_params_n::<8, 8>(full_range);
  let chroma_bias = scalar::chroma_bias::<8>();
  // Rounding term added before each Q15 shift.
  const Q15_ROUND: i32 = 1 << 14;

  // Number of leading pixels handled by the 16-wide SIMD loop.
  let simd_width = width & !15;

  // SAFETY: the caller guarantees SSE4.1 and the slice lengths listed under
  // `# Safety`; every SIMD iteration touches pixels `x..x + 16` with
  // `x + 16 <= simd_width <= width`.
  unsafe {
    // Loop-invariant broadcasts.
    let rnd_v = _mm_set1_epi32(Q15_ROUND);
    let y_off_v = _mm_set1_epi16(y_off as i16);
    let y_scale_v = _mm_set1_epi32(y_scale);
    let c_scale_v = _mm_set1_epi32(c_scale);
    let bias_v = _mm_set1_epi16(chroma_bias as i16);
    let cru = _mm_set1_epi32(coeffs.r_u());
    let crv = _mm_set1_epi32(coeffs.r_v());
    let cgu = _mm_set1_epi32(coeffs.g_u());
    let cgv = _mm_set1_epi32(coeffs.g_v());
    let cbu = _mm_set1_epi32(coeffs.b_u());
    let cbv = _mm_set1_epi32(coeffs.b_v());
    // All-ones bytes: the α value written on the VUYX (forced-opaque) path.
    let opaque = _mm_set1_epi8(-1i8);

    let src_base = packed.as_ptr();
    let dst_base = out.as_mut_ptr();

    let mut x = 0usize;
    while x < simd_width {
      // Split 16 packed pixels into one u8x16 vector per channel.
      let (v_px, u_px, y_px, a_px) = deinterleave_vuya(src_base.add(x * 4));

      // ---- pixels 0..8 of this batch ----------------------------------

      // Zero-extend to i16x8 and remove the chroma bias.
      let u_lo = _mm_sub_epi16(_mm_cvtepu8_epi16(u_px), bias_v);
      let v_lo = _mm_sub_epi16(_mm_cvtepu8_epi16(v_px), bias_v);

      // Widen each i16x8 to two i32x4 and apply the Q15 chroma scale.
      let u_lo_a = q15_shift(_mm_add_epi32(
        _mm_mullo_epi32(_mm_cvtepi16_epi32(u_lo), c_scale_v),
        rnd_v,
      ));
      let u_lo_b = q15_shift(_mm_add_epi32(
        _mm_mullo_epi32(_mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_lo)), c_scale_v),
        rnd_v,
      ));
      let v_lo_a = q15_shift(_mm_add_epi32(
        _mm_mullo_epi32(_mm_cvtepi16_epi32(v_lo), c_scale_v),
        rnd_v,
      ));
      let v_lo_b = q15_shift(_mm_add_epi32(
        _mm_mullo_epi32(_mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_lo)), c_scale_v),
        rnd_v,
      ));

      // Per-channel chroma contribution (i16x8) and scaled luma.
      let r_c_lo = chroma_i16x8(cru, crv, u_lo_a, v_lo_a, u_lo_b, v_lo_b, rnd_v);
      let g_c_lo = chroma_i16x8(cgu, cgv, u_lo_a, v_lo_a, u_lo_b, v_lo_b, rnd_v);
      let b_c_lo = chroma_i16x8(cbu, cbv, u_lo_a, v_lo_a, u_lo_b, v_lo_b, rnd_v);
      let luma_lo = scale_y(_mm_cvtepu8_epi16(y_px), y_off_v, y_scale_v, rnd_v);

      // ---- pixels 8..16 of this batch ---------------------------------

      let u_hi = _mm_sub_epi16(_mm_cvtepu8_epi16(_mm_srli_si128::<8>(u_px)), bias_v);
      let v_hi = _mm_sub_epi16(_mm_cvtepu8_epi16(_mm_srli_si128::<8>(v_px)), bias_v);

      let u_hi_a = q15_shift(_mm_add_epi32(
        _mm_mullo_epi32(_mm_cvtepi16_epi32(u_hi), c_scale_v),
        rnd_v,
      ));
      let u_hi_b = q15_shift(_mm_add_epi32(
        _mm_mullo_epi32(_mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_hi)), c_scale_v),
        rnd_v,
      ));
      let v_hi_a = q15_shift(_mm_add_epi32(
        _mm_mullo_epi32(_mm_cvtepi16_epi32(v_hi), c_scale_v),
        rnd_v,
      ));
      let v_hi_b = q15_shift(_mm_add_epi32(
        _mm_mullo_epi32(_mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_hi)), c_scale_v),
        rnd_v,
      ));

      let r_c_hi = chroma_i16x8(cru, crv, u_hi_a, v_hi_a, u_hi_b, v_hi_b, rnd_v);
      let g_c_hi = chroma_i16x8(cgu, cgv, u_hi_a, v_hi_a, u_hi_b, v_hi_b, rnd_v);
      let b_c_hi = chroma_i16x8(cbu, cbv, u_hi_a, v_hi_a, u_hi_b, v_hi_b, rnd_v);
      let luma_hi = scale_y(
        _mm_cvtepu8_epi16(_mm_srli_si128::<8>(y_px)),
        y_off_v,
        y_scale_v,
        rnd_v,
      );

      // ---- combine ----------------------------------------------------

      // Saturating luma + chroma per half, then saturating narrow to u8x16.
      let r_u8 = _mm_packus_epi16(
        _mm_adds_epi16(luma_lo, r_c_lo),
        _mm_adds_epi16(luma_hi, r_c_hi),
      );
      let g_u8 = _mm_packus_epi16(
        _mm_adds_epi16(luma_lo, g_c_lo),
        _mm_adds_epi16(luma_hi, g_c_hi),
      );
      let b_u8 = _mm_packus_epi16(
        _mm_adds_epi16(luma_lo, b_c_lo),
        _mm_adds_epi16(luma_hi, b_c_hi),
      );

      // Interleave and store; the const parameters pick the writer and α source.
      let dst = dst_base.add(x * out_bpp);
      match (ALPHA, ALPHA_SRC) {
        (false, _) => {
          write_rgb_16(r_u8, g_u8, b_u8, dst);
        }
        (true, true) => {
          write_rgba_16(r_u8, g_u8, b_u8, a_px, dst);
        }
        (true, false) => {
          write_rgba_16(r_u8, g_u8, b_u8, opaque, dst);
        }
      }

      x += 16;
    }

    // Fewer than 16 pixels remain: hand them to the scalar implementation.
    if simd_width < width {
      scalar::vuya_to_rgb_or_rgba_row::<ALPHA, ALPHA_SRC>(
        &packed[simd_width * 4..],
        &mut out[simd_width * out_bpp..],
        width - simd_width,
        matrix,
        full_range,
      );
    }
  }
}

// ---- thin wrappers -------------------------------------------------------

/// SSE4.1 VUYA / VUYX → packed **RGB** (3 bytes per pixel). The fourth
/// source byte (α or padding) is dropped because RGB output has no alpha
/// channel.
///
/// Thin wrapper over [`vuya_to_rgb_or_rgba_row`] with
/// `ALPHA = false, ALPHA_SRC = false`.
///
/// # Safety
///
/// Same contract as [`vuya_to_rgb_or_rgba_row`]: SSE4.1 must be available,
/// `packed.len() >= width * 4`, and `rgb_out.len() >= width * 3`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn vuya_to_rgb_row(
  packed: &[u8],
  rgb_out: &mut [u8],
  width: usize,
  matrix: ColorMatrix,
  full_range: bool,
) {
  // No alpha slot in the output, so there is nothing to source α from.
  const ALPHA: bool = false;
  const ALPHA_SRC: bool = false;

  // SAFETY: SSE4.1 availability and the slice-length requirements are the
  // caller's obligation and are forwarded unchanged.
  unsafe { vuya_to_rgb_or_rgba_row::<ALPHA, ALPHA_SRC>(packed, rgb_out, width, matrix, full_range) }
}

/// SSE4.1 VUYA → packed **RGBA** (4 bytes per pixel). The source A byte
/// is copied to the output α unchanged.
///
/// Thin wrapper over [`vuya_to_rgb_or_rgba_row`] with
/// `ALPHA = true, ALPHA_SRC = true`.
///
/// # Safety
///
/// Same contract as [`vuya_to_rgb_or_rgba_row`]: SSE4.1 must be available,
/// `packed.len() >= width * 4`, and `rgba_out.len() >= width * 4`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn vuya_to_rgba_row(
  packed: &[u8],
  rgba_out: &mut [u8],
  width: usize,
  matrix: ColorMatrix,
  full_range: bool,
) {
  // RGBA output with α taken from the source pixel.
  const ALPHA: bool = true;
  const ALPHA_SRC: bool = true;

  // SAFETY: SSE4.1 availability and the slice-length requirements are the
  // caller's obligation and are forwarded unchanged.
  unsafe { vuya_to_rgb_or_rgba_row::<ALPHA, ALPHA_SRC>(packed, rgba_out, width, matrix, full_range) }
}

/// SSE4.1 VUYX → packed **RGBA** (4 bytes per pixel). The fourth source
/// byte is padding, so output α is forced to `0xFF` (opaque).
///
/// Thin wrapper over [`vuya_to_rgb_or_rgba_row`] with
/// `ALPHA = true, ALPHA_SRC = false`.
///
/// # Safety
///
/// Same contract as [`vuya_to_rgb_or_rgba_row`]: SSE4.1 must be available,
/// `packed.len() >= width * 4`, and `rgba_out.len() >= width * 4`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn vuyx_to_rgba_row(
  packed: &[u8],
  rgba_out: &mut [u8],
  width: usize,
  matrix: ColorMatrix,
  full_range: bool,
) {
  // RGBA output with a constant opaque α instead of the source byte.
  const ALPHA: bool = true;
  const ALPHA_SRC: bool = false;

  // SAFETY: SSE4.1 availability and the slice-length requirements are the
  // caller's obligation and are forwarded unchanged.
  unsafe { vuya_to_rgb_or_rgba_row::<ALPHA, ALPHA_SRC>(packed, rgba_out, width, matrix, full_range) }
}

// ---- luma extraction (16 px/iter) ---------------------------------------

/// SSE4.1 VUYA / VUYX → u8 luma: copies the Y byte (offset 2 of every
/// 4-byte pixel) into `luma_out`, 16 pixels per SIMD iteration.
///
/// Byte-identical to `scalar::vuya_to_luma_row`, which also handles the
/// trailing `width % 16` pixels.
///
/// # Safety
///
/// 1. **SSE4.1 must be available.**
/// 2. `packed.len() >= width * 4`.
/// 3. `luma_out.len() >= width`.
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn vuya_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) {
  debug_assert!(packed.len() >= width * 4, "packed row too short");
  debug_assert!(luma_out.len() >= width, "luma row too short");

  // Number of leading pixels handled by the 16-wide SIMD loop.
  let simd_width = width & !15;

  // SAFETY: the caller guarantees SSE4.1 and the slice lengths above; each
  // iteration reads 64 bytes at pixel `x` and writes 16 bytes at `x`, with
  // `x + 16 <= simd_width <= width`.
  unsafe {
    // Moves bytes 2, 6, 10, 14 (the four Y samples of one load) into the
    // low 32 bits; indices with the high bit set zero the other lanes.
    let gather_y = _mm_setr_epi8(2, 6, 10, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);

    let src_base = packed.as_ptr();
    let dst_base = luma_out.as_mut_ptr();

    let mut x = 0usize;
    while x < simd_width {
      let src = src_base.add(x * 4);

      // Four loads of 4 pixels each, reduced to 4 Y bytes apiece.
      let q0 = _mm_shuffle_epi8(_mm_loadu_si128(src.cast()), gather_y);
      let q1 = _mm_shuffle_epi8(_mm_loadu_si128(src.add(16).cast()), gather_y);
      let q2 = _mm_shuffle_epi8(_mm_loadu_si128(src.add(32).cast()), gather_y);
      let q3 = _mm_shuffle_epi8(_mm_loadu_si128(src.add(48).cast()), gather_y);

      // 4 + 4 bytes → low 8 bytes, then 8 + 8 bytes → full 16-byte vector.
      let first8 = _mm_unpacklo_epi32(q0, q1);
      let last8 = _mm_unpacklo_epi32(q2, q3);
      _mm_storeu_si128(dst_base.add(x).cast(), _mm_unpacklo_epi64(first8, last8));

      x += 16;
    }

    // Fewer than 16 pixels remain: hand them to the scalar implementation.
    if simd_width < width {
      scalar::vuya_to_luma_row(
        &packed[simd_width * 4..],
        &mut luma_out[simd_width..],
        width - simd_width,
      );
    }
  }
}

/// SSE4.1 VUYA → u16 luma: zero-extends the Y byte (offset 2 of every
/// 4-byte pixel) to `u16`, 16 pixels per SIMD iteration.
///
/// Strategy: four loads, each reduced to its 4 Y bytes with
/// `_mm_shuffle_epi8`, are merged into one 16-byte Y vector. The low and
/// high 8 bytes of that vector are widened with `_mm_cvtepu8_epi16` and
/// written with two `_mm_storeu_si128` stores (16 `u16` elements).
///
/// Byte-identical to `scalar::vuya_to_luma_u16_row`, which also handles
/// the trailing `width % 16` pixels.
///
/// # Safety
///
/// 1. **SSE4.1 must be available.**
/// 2. `packed.len() >= width * 4`.
/// 3. `out.len() >= width`.
#[cfg_attr(not(any(feature = "std", feature = "alloc")), allow(dead_code))]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn vuya_to_luma_u16_row(packed: &[u8], out: &mut [u16], width: usize) {
  debug_assert!(packed.len() >= width * 4, "packed row too short");
  debug_assert!(out.len() >= width, "out too short");

  // Number of leading pixels handled by the 16-wide SIMD loop.
  let simd_width = width & !15;

  // SAFETY: the caller guarantees SSE4.1 and the slice lengths above; each
  // iteration reads 64 bytes at pixel `x` and writes 16 `u16` at `x`, with
  // `x + 16 <= simd_width <= width`.
  unsafe {
    // Moves bytes 2, 6, 10, 14 (the four Y samples of one load) into the
    // low 32 bits; indices with the high bit set zero the other lanes.
    let gather_y = _mm_setr_epi8(2, 6, 10, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);

    let src_base = packed.as_ptr();
    let dst_base = out.as_mut_ptr();

    let mut x = 0usize;
    while x < simd_width {
      let src = src_base.add(x * 4);

      // Four loads of 4 pixels each, reduced to 4 Y bytes apiece.
      let q0 = _mm_shuffle_epi8(_mm_loadu_si128(src.cast()), gather_y);
      let q1 = _mm_shuffle_epi8(_mm_loadu_si128(src.add(16).cast()), gather_y);
      let q2 = _mm_shuffle_epi8(_mm_loadu_si128(src.add(32).cast()), gather_y);
      let q3 = _mm_shuffle_epi8(_mm_loadu_si128(src.add(48).cast()), gather_y);

      // Y[0..16] as u8x16.
      let y_bytes = _mm_unpacklo_epi64(_mm_unpacklo_epi32(q0, q1), _mm_unpacklo_epi32(q2, q3));

      // Widen each 8-byte half to u16x8 and store the halves back to back.
      let first8 = _mm_cvtepu8_epi16(y_bytes);
      let last8 = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(y_bytes));
      _mm_storeu_si128(dst_base.add(x).cast(), first8);
      _mm_storeu_si128(dst_base.add(x + 8).cast(), last8);

      x += 16;
    }

    // Fewer than 16 pixels remain: hand them to the scalar implementation.
    if simd_width < width {
      scalar::vuya_to_luma_u16_row(
        &packed[simd_width * 4..],
        &mut out[simd_width..],
        width - simd_width,
      );
    }
  }
}

/// SSE4.1 VUYX → u16 luma (zero-extended Y bytes).
///
/// Forwards to [`vuya_to_luma_u16_row`]: Y sits at byte offset 2 of each
/// quadruple whether byte 3 is α or padding, and the X byte is never read
/// into the output.
///
/// # Safety
///
/// 1. **SSE4.1 must be available.**
/// 2. `packed.len() >= width * 4`.
/// 3. `out.len() >= width`.
// NOTE(review): unconditional `allow(dead_code)` here, while the VUYA
// sibling gates it on the `std` / `alloc` features — confirm this is intended.
#[allow(dead_code)]
#[inline]
#[target_feature(enable = "sse4.1")]
pub(crate) unsafe fn vuyx_to_luma_u16_row(packed: &[u8], out: &mut [u16], width: usize) {
  // SAFETY: SSE4.1 availability and the slice-length requirements are the
  // caller's obligation and are forwarded unchanged.
  unsafe { vuya_to_luma_u16_row(packed, out, width) }
}