zenavif 0.1.6

Pure Rust AVIF image codec powered by rav1d and zenravif
//! Fast YUV to RGB conversion using integer arithmetic
//!
//! Key optimizations:
//! - Fixed-point integer math (much faster than float)
//! - Process 32 pixels at once (AVX2) or 16 pixels (NEON)
//! - Process 2 rows simultaneously for YUV420
//! - Use AVX2/NEON intrinsics for proper SIMD vectorization
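//!
//! The SIMD paths mirror this scalar reference (a sketch of the same Q13
//! fixed-point math; the coefficients are defined in the functions below):
//!
//! ```
//! // Single-pixel limited-range YUV to RGB conversion, Q13 (8192 = 1.0).
//! fn yuv_pixel_to_rgb8(y: u8, u: u8, v: u8) -> (u8, u8, u8) {
//!     let (y, u, v) = (y as i32 - 16, u as i32 - 128, v as i32 - 128);
//!     let y_scaled = (y * 9539) >> 13; // 1.164 luma range expansion
//!     let r = y_scaled + ((v * 13075) >> 13); // + 1.596 * Cr
//!     let g = y_scaled - ((v * 6660 + u * 3209) >> 13); // - 0.813 Cr - 0.392 Cb
//!     let b = y_scaled + ((u * 16525) >> 13); // + 2.017 * Cb
//!     (r.clamp(0, 255) as u8, g.clamp(0, 255) as u8, b.clamp(0, 255) as u8)
//! }
//! assert_eq!(yuv_pixel_to_rgb8(235, 128, 128), (255, 255, 255));
//! ```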

// These unsafe fn helpers use SIMD intrinsics that are safe within target_feature context.
#![allow(unsafe_op_in_unsafe_fn)]
// YUV conversion functions naturally require many plane/stride/coefficient parameters.
#![allow(clippy::too_many_arguments)]

use archmage::prelude::*;
use imgref::ImgVec;
use rgb::RGB8;

/// Fast YUV420 to RGB8 using integer arithmetic (AVX2 path)
#[cfg(target_arch = "x86_64")]
#[arcane]
pub fn yuv420_to_rgb8_fast(
    token: Desktop64,
    y_plane: &[u8],
    y_stride: usize,
    u_plane: &[u8],
    u_stride: usize,
    v_plane: &[u8],
    v_stride: usize,
    width: usize,
    height: usize,
) -> ImgVec<RGB8> {
    let mut out = vec![RGB8::default(); width * height];

    // BT.601 limited-range coefficients in fixed-point (Q13 format: 8192 = 1.0)
    // Values from the yuv crate; the 16/128 biases below assume studio-swing
    // input (Y' in [16, 235], Cb/Cr centered on 128)
    let y_coef: i16 = 9539; // 1.164 * 8192 (255/219 range expansion)
    let cr_coef: i16 = 13075; // 1.596 * 8192
    let cb_coef: i16 = 16525; // 2.017 * 8192
    let g_coef_1: i16 = 6660; // 0.813 * 8192, applied to V (formula subtracts this)
    let g_coef_2: i16 = 3209; // 0.392 * 8192, applied to U (formula subtracts this)

    // Bias values
    let y_bias: i16 = 16;
    let uv_bias: i16 = 128;
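
    // Worked check: video white (Y = 235, U = V = 128) maps to RGB white, since
    // (235 - 16) * 9539 >> 13 = 255 and both chroma offsets are zero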

    // Process 2 rows at a time for YUV420
    for y in (0..height).step_by(2) {
        let y0_row = y;
        let y1_row = (y + 1).min(height - 1);
        let chroma_row = y / 2;

        // Process 32 pixels at a time
        for x in (0..width).step_by(32) {
            let pixels_remaining = (width - x).min(32);

            if pixels_remaining < 32 || y0_row == y1_row {
                // Handle the ragged right edge, and the duplicated final row of
                // odd-height images (where the split_at_mut below would produce
                // an out-of-bounds slice), with scalar code
                for i in 0..pixels_remaining {
                    for row in [y0_row, y1_row] {
                        if row >= height {
                            continue;
                        }
                        let px = x + i;
                        let chroma_x = px / 2;

                        let y_val = y_plane[row * y_stride + px] as i32 - y_bias as i32;
                        let u_val =
                            u_plane[chroma_row * u_stride + chroma_x] as i32 - uv_bias as i32;
                        let v_val =
                            v_plane[chroma_row * v_stride + chroma_x] as i32 - uv_bias as i32;

                        let y_scaled = (y_val * y_coef as i32) >> 13;
                        let r = y_scaled + ((v_val * cr_coef as i32) >> 13);
                        let g =
                            y_scaled - ((v_val * g_coef_1 as i32 + u_val * g_coef_2 as i32) >> 13);
                        let b = y_scaled + ((u_val * cb_coef as i32) >> 13);

                        out[row * width + px] = RGB8 {
                            r: r.clamp(0, 255) as u8,
                            g: g.clamp(0, 255) as u8,
                            b: b.clamp(0, 255) as u8,
                        };
                    }
                }
                continue;
            }

            // SIMD path for 32 pixels
            // Split the output buffer between row 0 and row 1
            let split_point = y1_row * width;
            let (top_rows, bottom_rows) = out.split_at_mut(split_point);
            let row0_out = &mut top_rows[y0_row * width + x..];
            let row1_out = &mut bottom_rows[x..];

            process_32_pixels_420(
                token,
                &y_plane[y0_row * y_stride + x..],
                &y_plane[y1_row * y_stride + x..],
                &u_plane[chroma_row * u_stride + x / 2..],
                &v_plane[chroma_row * v_stride + x / 2..],
                row0_out,
                row1_out,
                y_coef,
                cr_coef,
                cb_coef,
                g_coef_1,
                g_coef_2,
                y_bias,
                uv_bias,
            );
        }
    }

    ImgVec::new(out, width, height)
}

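/// Process 32 pixels of two YUV420 rows using AVX2 intrinsics
///
/// Loads 32 Y values per row and 16 chroma values, expands chroma to 32 by
/// duplication, then converts in i16 fixed-point with saturating packs back
/// to u8. Mirrors `process_16_pixels_420_neon` below.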
#[cfg(target_arch = "x86_64")]
#[rite]
fn process_32_pixels_420(
    _token: Desktop64,
    y0: &[u8],
    y1: &[u8],
    u: &[u8],
    v: &[u8],
    out0: &mut [RGB8],
    out1: &mut [RGB8],
    y_coef: i16,
    cr_coef: i16,
    cb_coef: i16,
    g_coef_1: i16,
    g_coef_2: i16,
    y_bias: i16,
    uv_bias: i16,
) {
    use core::arch::x86_64::*;

    // Take only the 32 pixels we're processing
    let out0 = &mut out0[..32];
    let out1 = &mut out1[..32];

    unsafe {
        // Load 32 Y values for each row
        let y0_vals = _mm256_loadu_si256(y0.as_ptr() as *const __m256i);
        let y1_vals = _mm256_loadu_si256(y1.as_ptr() as *const __m256i);

        // Load 16 U and V values (half resolution for 4:2:0)
        let u_vals = _mm_loadu_si128(u.as_ptr() as *const __m128i);
        let v_vals = _mm_loadu_si128(v.as_ptr() as *const __m128i);

        // Broadcast the Y bias, and the UV bias pre-scaled to the 10-bit domain:
        // (128 << 2) | (128 >> 6) = 514 = (128 * 257) >> 6, matching the
        // duplicate-and-shift expansion applied to the samples below
        let y_corr = _mm256_set1_epi8(y_bias as i8);
        let uv_corr = _mm256_set1_epi16((uv_bias << 2) | (uv_bias >> 6));

        // Broadcast coefficients
        let v_y_coef = _mm256_set1_epi16(y_coef);
        let v_cr_coef = _mm256_set1_epi16(cr_coef);
        let v_cb_coef = _mm256_set1_epi16(cb_coef);
        let v_g_coef_1 = _mm256_set1_epi16(g_coef_1);
        let v_g_coef_2 = _mm256_set1_epi16(g_coef_2);

        // Subtract Y bias
        let y0_sub = _mm256_subs_epu8(y0_vals, y_corr);
        let y1_sub = _mm256_subs_epu8(y1_vals, y_corr);

        // Expand chroma from 16 to 32 values using shuffle
        // Create a shuffle mask that duplicates each byte: [0,0,1,1,2,2,...]
        let shuf_expand = _mm256_setr_epi8(
            0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
            13, 14, 14, 15, 15,
        );

        // Broadcast 128-bit chroma to both lanes of 256-bit register
        let u_256 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(u_vals), u_vals);
        let v_256 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(v_vals), v_vals);

        // Expand each chroma sample to cover 2 pixels
        let u_expanded = _mm256_shuffle_epi8(u_256, shuf_expand);
        let v_expanded = _mm256_shuffle_epi8(v_256, shuf_expand);

        // Expand u8 to i16 by unpacking (creates 10-bit representation)
        let y0_lo = expand_u8_to_i16_lo(y0_sub);
        let y0_hi = expand_u8_to_i16_hi(y0_sub);
        let y1_lo = expand_u8_to_i16_lo(y1_sub);
        let y1_hi = expand_u8_to_i16_hi(y1_sub);

        let u_lo = expand_u8_to_i16_lo(u_expanded);
        let u_hi = expand_u8_to_i16_hi(u_expanded);
        let v_lo = expand_u8_to_i16_lo(v_expanded);
        let v_hi = expand_u8_to_i16_hi(v_expanded);

        // Subtract UV bias
        let u_lo = _mm256_sub_epi16(u_lo, uv_corr);
        let u_hi = _mm256_sub_epi16(u_hi, uv_corr);
        let v_lo = _mm256_sub_epi16(v_lo, uv_corr);
        let v_hi = _mm256_sub_epi16(v_hi, uv_corr);

        // Process low 16 pixels of row 0
        let (r0_lo, g0_lo, b0_lo) = yuv_to_rgb_i16(
            y0_lo, u_lo, v_lo, v_y_coef, v_cr_coef, v_cb_coef, v_g_coef_1, v_g_coef_2,
        );

        // Process high 16 pixels of row 0
        let (r0_hi, g0_hi, b0_hi) = yuv_to_rgb_i16(
            y0_hi, u_hi, v_hi, v_y_coef, v_cr_coef, v_cb_coef, v_g_coef_1, v_g_coef_2,
        );

        // Process low 16 pixels of row 1
        let (r1_lo, g1_lo, b1_lo) = yuv_to_rgb_i16(
            y1_lo, u_lo, v_lo, v_y_coef, v_cr_coef, v_cb_coef, v_g_coef_1, v_g_coef_2,
        );

        // Process high 16 pixels of row 1
        let (r1_hi, g1_hi, b1_hi) = yuv_to_rgb_i16(
            y1_hi, u_hi, v_hi, v_y_coef, v_cr_coef, v_cb_coef, v_g_coef_1, v_g_coef_2,
        );

        // Pack i16 back to u8 with saturation
        let r0 = _mm256_packus_epi16(r0_lo, r0_hi);
        let g0 = _mm256_packus_epi16(g0_lo, g0_hi);
        let b0 = _mm256_packus_epi16(b0_lo, b0_hi);

        let r1 = _mm256_packus_epi16(r1_lo, r1_hi);
        let g1 = _mm256_packus_epi16(g1_lo, g1_hi);
        let b1 = _mm256_packus_epi16(b1_lo, b1_hi);

        // Deinterleave and store RGB values
        store_rgb_row(out0, r0, g0, b0);
        store_rgb_row(out1, r1, g1, b1);
    }
}

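/// Expand the low half of each 128-bit lane from u8 to i16 by duplicating
/// each byte and shifting right by 6: x becomes (x * 257) >> 6, roughly 4 * x,
/// a 10-bit representation with headroom for the fixed-point multiplies.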
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn expand_u8_to_i16_lo(v: __m256i) -> __m256i {
    use core::arch::x86_64::*;
    let v_dup = _mm256_unpacklo_epi8(v, v);
    _mm256_srli_epi16::<6>(v_dup)
}

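/// Expand the high half of each 128-bit lane from u8 to i16 (same
/// duplicate-and-shift scheme as `expand_u8_to_i16_lo`).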
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn expand_u8_to_i16_hi(v: __m256i) -> __m256i {
    use core::arch::x86_64::*;
    let v_dup = _mm256_unpackhi_epi8(v, v);
    _mm256_srli_epi16::<6>(v_dup)
}

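/// YUV to RGB conversion in i16 fixed-point (AVX2)
///
/// `_mm256_mulhrs_epi16` computes (a * b + (1 << 14)) >> 15 with rounding, so
/// multiplying the 10-bit samples by the Q13 coefficients lands back on the
/// 8-bit scale.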
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn yuv_to_rgb_i16(
    y: __m256i,
    u: __m256i,
    v: __m256i,
    y_coef: __m256i,
    cr_coef: __m256i,
    cb_coef: __m256i,
    g_coef_1: __m256i,
    g_coef_2: __m256i,
) -> (__m256i, __m256i, __m256i) {
    use core::arch::x86_64::*;

    // Scale Y with luma coefficient
    let y_scaled = _mm256_mulhrs_epi16(y, y_coef);

    // Compute color components: g_coef_1 applies to V and g_coef_2 to U,
    // matching the scalar path
    let v_cr = _mm256_mulhrs_epi16(v, cr_coef);
    let u_cb = _mm256_mulhrs_epi16(u, cb_coef);
    let v_g = _mm256_mulhrs_epi16(v, g_coef_1);
    let u_g = _mm256_mulhrs_epi16(u, g_coef_2);

    let r = _mm256_add_epi16(y_scaled, v_cr);
    let b = _mm256_add_epi16(y_scaled, u_cb);
    let g = _mm256_sub_epi16(y_scaled, _mm256_add_epi16(v_g, u_g));

    (r, g, b)
}

#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn store_rgb_row(out: &mut [RGB8], r: __m256i, g: __m256i, b: __m256i) {
    use core::arch::x86_64::*;

    // For now, use simple array extraction to debug
    // TODO: Optimize with shuffle-based interleaving once accuracy is verified
    let mut r_arr = [0u8; 32];
    let mut g_arr = [0u8; 32];
    let mut b_arr = [0u8; 32];

    _mm256_storeu_si256(r_arr.as_mut_ptr() as *mut __m256i, r);
    _mm256_storeu_si256(g_arr.as_mut_ptr() as *mut __m256i, g);
    _mm256_storeu_si256(b_arr.as_mut_ptr() as *mut __m256i, b);

    for i in 0..32 {
        out[i] = RGB8 {
            r: r_arr[i],
            g: g_arr[i],
            b: b_arr[i],
        };
    }
}

/// Interleave planar R, G, B into packed RGB using AVX2 shuffles
/// Input: 32 R values, 32 G values, 32 B values (each in a 256-bit register)
/// Output: 3x 256-bit registers containing 96 bytes of interleaved RGBRGBRGB...
///
/// Ported from yuv crate's avx2_interleave_rgb
#[cfg(target_arch = "x86_64")]
#[inline(always)]
#[allow(dead_code)]
unsafe fn interleave_rgb_avx2(r: __m256i, g: __m256i, b: __m256i) -> (__m256i, __m256i, __m256i) {
    use core::arch::x86_64::*;

    // Shuffle masks to rearrange bytes for RGB interleaving
    let sh_b = _mm256_setr_epi8(
        0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14,
        9, 4, 15, 10, 5,
    );
    let sh_g = _mm256_setr_epi8(
        5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3,
        14, 9, 4, 15, 10,
    );
    let sh_r = _mm256_setr_epi8(
        10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8,
        3, 14, 9, 4, 15,
    );

    // Apply shuffles to each color channel. Note the cross-wiring: r runs
    // through the "b" shuffle slot and b through the "r" slot, so the
    // BGR-order interleave of the original emits packed RGB instead
    let b0 = _mm256_shuffle_epi8(r, sh_b);
    let g0 = _mm256_shuffle_epi8(g, sh_g);
    let r0 = _mm256_shuffle_epi8(b, sh_r);

    // Blend masks for selecting bytes from each shuffled vector
    // -1 (0xFF) selects from second source, 0 selects from first source
    let m0 = _mm256_setr_epi8(
        0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1,
        0, 0, -1, 0, 0,
    );
    let m1 = _mm256_setr_epi8(
        0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
        -1, 0, 0, -1, 0,
    );

    // Blend the shuffled values to create RGB pattern
    let p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
    let p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
    let p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);

    // Permute lanes to get final RGB layout
    let rgb0 = _mm256_permute2x128_si256::<0x20>(p0, p1); // 0x20 = 32
    let rgb1 = _mm256_permute2x128_si256::<0x30>(p2, p0); // 0x30 = 48
    let rgb2 = _mm256_permute2x128_si256::<0x31>(p1, p2); // 0x31 = 49

    (rgb0, rgb1, rgb2)
}
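
// A sketch (assumption, untested) of how the interleaver could replace the
// scalar loop in `store_rgb_row` once channel order is verified; the three
// registers hold exactly 96 bytes, i.e. 32 packed `RGB8` pixels:
//
//     let (rgb0, rgb1, rgb2) = interleave_rgb_avx2(r, g, b);
//     let dst = out.as_mut_ptr() as *mut __m256i;
//     _mm256_storeu_si256(dst, rgb0);
//     _mm256_storeu_si256(dst.add(1), rgb1);
//     _mm256_storeu_si256(dst.add(2), rgb2);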

// ============================================================================
// NEON (aarch64) implementation
// ============================================================================

/// Fast YUV420 to RGB8 using integer arithmetic (NEON path)
///
/// Processes 16 pixels at a time using NEON 128-bit registers.
/// Uses the same BT.601 limited-range fixed-point coefficients as the AVX2 path.
#[cfg(target_arch = "aarch64")]
#[arcane]
pub fn yuv420_to_rgb8_fast_neon(
    token: NeonToken,
    y_plane: &[u8],
    y_stride: usize,
    u_plane: &[u8],
    u_stride: usize,
    v_plane: &[u8],
    v_stride: usize,
    width: usize,
    height: usize,
) -> ImgVec<RGB8> {
    let mut out = vec![RGB8::default(); width * height];

    // BT.601 limited-range coefficients in fixed-point (Q13 format: 8192 = 1.0)
    let y_coef: i16 = 9539; // 1.164 * 8192 (255/219 range expansion)
    let cr_coef: i16 = 13075; // 1.596 * 8192
    let cb_coef: i16 = 16525; // 2.017 * 8192
    let g_coef_1: i16 = 6660; // 0.813 * 8192, applied to V (formula subtracts this)
    let g_coef_2: i16 = 3209; // 0.392 * 8192, applied to U (formula subtracts this)

    // Bias values
    let y_bias: i16 = 16;
    let uv_bias: i16 = 128;

    // Process 2 rows at a time for YUV420
    for y in (0..height).step_by(2) {
        let y0_row = y;
        let y1_row = (y + 1).min(height - 1);
        let chroma_row = y / 2;

        // Process 16 pixels at a time with NEON
        for x in (0..width).step_by(16) {
            let pixels_remaining = (width - x).min(16);

            if pixels_remaining < 16 || y0_row == y1_row {
                // Handle the ragged right edge, and the duplicated final row of
                // odd-height images (where the split_at_mut below would produce
                // an out-of-bounds slice), with scalar code
                for i in 0..pixels_remaining {
                    for row in [y0_row, y1_row] {
                        if row >= height {
                            continue;
                        }
                        let px = x + i;
                        let chroma_x = px / 2;

                        let y_val = y_plane[row * y_stride + px] as i32 - y_bias as i32;
                        let u_val =
                            u_plane[chroma_row * u_stride + chroma_x] as i32 - uv_bias as i32;
                        let v_val =
                            v_plane[chroma_row * v_stride + chroma_x] as i32 - uv_bias as i32;

                        let y_scaled = (y_val * y_coef as i32) >> 13;
                        let r = y_scaled + ((v_val * cr_coef as i32) >> 13);
                        let g =
                            y_scaled - ((v_val * g_coef_1 as i32 + u_val * g_coef_2 as i32) >> 13);
                        let b = y_scaled + ((u_val * cb_coef as i32) >> 13);

                        out[row * width + px] = RGB8 {
                            r: r.clamp(0, 255) as u8,
                            g: g.clamp(0, 255) as u8,
                            b: b.clamp(0, 255) as u8,
                        };
                    }
                }
                continue;
            }

            // NEON SIMD path for 16 pixels
            // Split the output buffer between row 0 and row 1
            let split_point = y1_row * width;
            let (top_rows, bottom_rows) = out.split_at_mut(split_point);
            let row0_out = &mut top_rows[y0_row * width + x..];
            let row1_out = &mut bottom_rows[x..];

            process_16_pixels_420_neon(
                token,
                &y_plane[y0_row * y_stride + x..],
                &y_plane[y1_row * y_stride + x..],
                &u_plane[chroma_row * u_stride + x / 2..],
                &v_plane[chroma_row * v_stride + x / 2..],
                row0_out,
                row1_out,
                y_coef,
                cr_coef,
                cb_coef,
                g_coef_1,
                g_coef_2,
                y_bias,
                uv_bias,
            );
        }
    }

    ImgVec::new(out, width, height)
}

/// Process 16 pixels of YUV420 using NEON intrinsics
///
/// Loads 16 Y values per row and 8 chroma values, expands chroma to 16 via
/// duplication, then computes RGB using i16 fixed-point math with saturating
/// narrowing to u8.
#[cfg(target_arch = "aarch64")]
#[rite]
fn process_16_pixels_420_neon(
    _token: NeonToken,
    y0: &[u8],
    y1: &[u8],
    u: &[u8],
    v: &[u8],
    out0: &mut [RGB8],
    out1: &mut [RGB8],
    y_coef: i16,
    cr_coef: i16,
    cb_coef: i16,
    g_coef_1: i16,
    g_coef_2: i16,
    y_bias: i16,
    uv_bias: i16,
) {
    use core::arch::aarch64::*;

    let out0 = &mut out0[..16];
    let out1 = &mut out1[..16];

    // Load 16 Y values for each row (safe via safe_unaligned_simd)
    let y0_vals = safe_unaligned_simd::aarch64::vld1q_u8(y0[..16].try_into().unwrap());
    let y1_vals = safe_unaligned_simd::aarch64::vld1q_u8(y1[..16].try_into().unwrap());

    // Load 8 U and V values
    let u_vals = safe_unaligned_simd::aarch64::vld1_u8(u[..8].try_into().unwrap());
    let v_vals = safe_unaligned_simd::aarch64::vld1_u8(v[..8].try_into().unwrap());

    // Broadcast the Y bias, and the UV bias pre-scaled to the 10-bit domain:
    // (128 << 2) | (128 >> 6) = 514 = (128 * 257) >> 6
    let y_corr = vdupq_n_u8(y_bias as u8);
    let uv_corr = vdupq_n_s16((uv_bias << 2) | (uv_bias >> 6));

    // Broadcast coefficients
    let v_y_coef = vdupq_n_s16(y_coef);
    let v_cr_coef = vdupq_n_s16(cr_coef);
    let v_cb_coef = vdupq_n_s16(cb_coef);
    let v_g_coef_1 = vdupq_n_s16(g_coef_1);
    let v_g_coef_2 = vdupq_n_s16(g_coef_2);

    // Subtract Y bias (saturating)
    let y0_sub = vqsubq_u8(y0_vals, y_corr);
    let y1_sub = vqsubq_u8(y1_vals, y_corr);

    // Expand chroma from 8 to 16 values by duplicating each byte
    let u_expanded_lo = vzip1_u8(u_vals, u_vals);
    let u_expanded_hi = vzip2_u8(u_vals, u_vals);
    let u_expanded = vcombine_u8(u_expanded_lo, u_expanded_hi);
    let v_expanded_lo = vzip1_u8(v_vals, v_vals);
    let v_expanded_hi = vzip2_u8(v_vals, v_vals);
    let v_expanded = vcombine_u8(v_expanded_lo, v_expanded_hi);

    // Expand u8 to i16 by unpacking (self-duplicate + shift right)
    let y0_lo = expand_u8_to_i16_lo_neon(y0_sub);
    let y0_hi = expand_u8_to_i16_hi_neon(y0_sub);
    let y1_lo = expand_u8_to_i16_lo_neon(y1_sub);
    let y1_hi = expand_u8_to_i16_hi_neon(y1_sub);

    let u_lo = expand_u8_to_i16_lo_neon(u_expanded);
    let u_hi = expand_u8_to_i16_hi_neon(u_expanded);
    let v_lo = expand_u8_to_i16_lo_neon(v_expanded);
    let v_hi = expand_u8_to_i16_hi_neon(v_expanded);

    // Subtract UV bias
    let u_lo = vsubq_s16(u_lo, uv_corr);
    let u_hi = vsubq_s16(u_hi, uv_corr);
    let v_lo = vsubq_s16(v_lo, uv_corr);
    let v_hi = vsubq_s16(v_hi, uv_corr);

    // Process all pixels
    let (r0_lo, g0_lo, b0_lo) = yuv_to_rgb_i16_neon(
        y0_lo, u_lo, v_lo, v_y_coef, v_cr_coef, v_cb_coef, v_g_coef_1, v_g_coef_2,
    );
    let (r0_hi, g0_hi, b0_hi) = yuv_to_rgb_i16_neon(
        y0_hi, u_hi, v_hi, v_y_coef, v_cr_coef, v_cb_coef, v_g_coef_1, v_g_coef_2,
    );
    let (r1_lo, g1_lo, b1_lo) = yuv_to_rgb_i16_neon(
        y1_lo, u_lo, v_lo, v_y_coef, v_cr_coef, v_cb_coef, v_g_coef_1, v_g_coef_2,
    );
    let (r1_hi, g1_hi, b1_hi) = yuv_to_rgb_i16_neon(
        y1_hi, u_hi, v_hi, v_y_coef, v_cr_coef, v_cb_coef, v_g_coef_1, v_g_coef_2,
    );

    // Pack i16 -> u8 with saturation
    let r0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
    let g0 = vcombine_u8(vqmovun_s16(g0_lo), vqmovun_s16(g0_hi));
    let b0 = vcombine_u8(vqmovun_s16(b0_lo), vqmovun_s16(b0_hi));

    let r1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
    let g1 = vcombine_u8(vqmovun_s16(g1_lo), vqmovun_s16(g1_hi));
    let b1 = vcombine_u8(vqmovun_s16(b1_lo), vqmovun_s16(b1_hi));

    // Store to output
    store_rgb_row_neon(out0, r0, g0, b0);
    store_rgb_row_neon(out1, r1, g1, b1);
}

/// Expand u8 to i16 by unpacking low half (NEON equivalent of expand_u8_to_i16_lo)
///
/// Duplicates each byte and shifts right by 6, producing a 10-bit representation.
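/// For example, 219 becomes (219 * 257) >> 6 = 879, roughly 4 * 219.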
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
#[inline(always)]
fn expand_u8_to_i16_lo_neon(v: uint8x16_t) -> int16x8_t {
    // Interleave low bytes with themselves: [a0,a0,a1,a1,...,a7,a7]
    let lo = vget_low_u8(v);
    let dup = vzip1q_u8(vcombine_u8(lo, lo), vcombine_u8(lo, lo));
    // Reinterpret as u16 and shift right by 6
    let as_u16 = vreinterpretq_u16_u8(dup);
    vreinterpretq_s16_u16(vshrq_n_u16::<6>(as_u16))
}

/// Expand u8 to i16 by unpacking high half (NEON equivalent of expand_u8_to_i16_hi)
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
#[inline(always)]
fn expand_u8_to_i16_hi_neon(v: uint8x16_t) -> int16x8_t {
    let hi = vget_high_u8(v);
    let dup = vzip1q_u8(vcombine_u8(hi, hi), vcombine_u8(hi, hi));
    let as_u16 = vreinterpretq_u16_u8(dup);
    vreinterpretq_s16_u16(vshrq_n_u16::<6>(as_u16))
}

/// YUV to RGB conversion in i16 fixed-point (NEON)
///
/// Uses vqrdmulhq_s16 (saturating rounding doubling multiply high) as the
/// NEON equivalent of _mm256_mulhrs_epi16.
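///
/// Both compute round(a * b / 2^15); `vqrdmulhq_s16` additionally saturates
/// the one overflowing case, a = b = i16::MIN.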
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
#[inline(always)]
fn yuv_to_rgb_i16_neon(
    y: int16x8_t,
    u: int16x8_t,
    v: int16x8_t,
    y_coef: int16x8_t,
    cr_coef: int16x8_t,
    cb_coef: int16x8_t,
    g_coef_1: int16x8_t,
    g_coef_2: int16x8_t,
) -> (int16x8_t, int16x8_t, int16x8_t) {
    // Scale Y with luma coefficient (mulhrs equivalent)
    let y_scaled = vqrdmulhq_s16(y, y_coef);

    // Compute color components
    let v_cr = vqrdmulhq_s16(v, cr_coef);
    let u_cb = vqrdmulhq_s16(u, cb_coef);
    let v_g = vqrdmulhq_s16(v, g_coef_1);
    let u_g = vqrdmulhq_s16(u, g_coef_2);

    let r = vaddq_s16(y_scaled, v_cr);
    let b = vaddq_s16(y_scaled, u_cb);
    let g = vsubq_s16(y_scaled, vaddq_s16(v_g, u_g));

    (r, g, b)
}

/// Store 16 RGB pixels from planar NEON registers to packed RGB8 output
#[cfg(target_arch = "aarch64")]
#[rite(neon)]
#[inline(always)]
fn store_rgb_row_neon(out: &mut [RGB8], r: uint8x16_t, g: uint8x16_t, b: uint8x16_t) {
    let mut r_arr = [0u8; 16];
    let mut g_arr = [0u8; 16];
    let mut b_arr = [0u8; 16];

    safe_unaligned_simd::aarch64::vst1q_u8(&mut r_arr, r);
    safe_unaligned_simd::aarch64::vst1q_u8(&mut g_arr, g);
    safe_unaligned_simd::aarch64::vst1q_u8(&mut b_arr, b);

    for i in 0..16 {
        out[i] = RGB8 {
            r: r_arr[i],
            g: g_arr[i],
            b: b_arr[i],
        };
    }
}