Skip to main content

codec/colorspace/
scale.rs

1// Bilinear scaler — scalar + AVX2 dispatch for 8-bit (Yuv420p) and
2// 10-bit (Yuv420p10le) frames.
3
4use anyhow::{Result, bail};
5use bytes::BytesMut;
6
7use crate::frame::{PixelFormat, VideoFrame};
8
9pub fn scale_frame(
10    frame: &VideoFrame,
11    target_width: u32,
12    target_height: u32,
13) -> Result<VideoFrame> {
14    if frame.width == target_width && frame.height == target_height {
15        return Ok(frame.clone());
16    }
17
18    match frame.format {
19        PixelFormat::Yuv420p => scale_frame_8bit(frame, target_width, target_height),
20        // 10-bit 4:2:0 path. Squad-19 shipped scalar; Squad-29 added
21        // AVX2 specialization (16 × u16 lanes per iter, Q15 bilinear
22        // weights via `_mm256_mulhrs_epi16`). Runtime-dispatched by
23        // `bilinear_scale_plane_u16` (`is_x86_feature_detected!("avx2")`).
24        PixelFormat::Yuv420p10le => scale_frame_10bit(frame, target_width, target_height),
25        _ => bail!(
26            "scaling only implemented for Yuv420p / Yuv420p10le; got {:?}",
27            frame.format
28        ),
29    }
30}
31
32fn scale_frame_8bit(
33    frame: &VideoFrame,
34    target_width: u32,
35    target_height: u32,
36) -> Result<VideoFrame> {
37    let src_w = frame.width as usize;
38    let src_h = frame.height as usize;
39    let dst_w = target_width as usize;
40    let dst_h = target_height as usize;
41
42    let src_y_size = src_w * src_h;
43    let dst_y_size = dst_w * dst_h;
44    let dst_uv_size = dst_y_size / 4;
45
46    let mut out = BytesMut::with_capacity(dst_y_size + dst_uv_size * 2);
47
48    // Bilinear scale Y plane
49    let y_plane = &frame.data[..src_y_size];
50    out.extend(bilinear_scale_plane(y_plane, src_w, src_h, dst_w, dst_h));
51
52    // Scale U plane
53    let u_offset = src_y_size;
54    let u_plane = &frame.data[u_offset..u_offset + src_y_size / 4];
55    out.extend(bilinear_scale_plane(
56        u_plane,
57        src_w / 2,
58        src_h / 2,
59        dst_w / 2,
60        dst_h / 2,
61    ));
62
63    // Scale V plane
64    let v_offset = u_offset + src_y_size / 4;
65    let v_plane = &frame.data[v_offset..v_offset + src_y_size / 4];
66    out.extend(bilinear_scale_plane(
67        v_plane,
68        src_w / 2,
69        src_h / 2,
70        dst_w / 2,
71        dst_h / 2,
72    ));
73
74    Ok(VideoFrame::new(
75        out.freeze(),
76        target_width,
77        target_height,
78        frame.format,
79        frame.color_space,
80        frame.pts,
81    ))
82}
83
84/// 10-bit `Yuv420p10le` bilinear scaler. Each plane is `u16` LE in the
85/// 0..=1023 range. Operates on 16-bit samples directly; output sample
86/// range is preserved (10-bit values stored in 16-bit containers).
87///
88/// Per-plane work runs through `bilinear_scale_plane_u16`, which
89/// runtime-dispatches to AVX2 (Squad-29; 16 × u16 lanes per iter)
90/// when `is_x86_feature_detected!("avx2")` and falls back to the
91/// scalar f64 path (Squad-19) otherwise.
92fn scale_frame_10bit(
93    frame: &VideoFrame,
94    target_width: u32,
95    target_height: u32,
96) -> Result<VideoFrame> {
97    let src_w = frame.width as usize;
98    let src_h = frame.height as usize;
99    let dst_w = target_width as usize;
100    let dst_h = target_height as usize;
101
102    let bytes_per_sample = 2usize;
103    let src_y_size_samples = src_w * src_h;
104    let src_y_size_bytes = src_y_size_samples * bytes_per_sample;
105    let src_c_size_samples = (src_w / 2) * (src_h / 2);
106    let src_c_size_bytes = src_c_size_samples * bytes_per_sample;
107
108    if frame.data.len() < src_y_size_bytes + 2 * src_c_size_bytes {
109        bail!(
110            "10-bit frame data too short for {}x{}: {} bytes",
111            src_w,
112            src_h,
113            frame.data.len()
114        );
115    }
116
117    let dst_y_size_samples = dst_w * dst_h;
118    let dst_c_size_samples = (dst_w / 2) * (dst_h / 2);
119    let dst_total_bytes = (dst_y_size_samples + 2 * dst_c_size_samples) * bytes_per_sample;
120
121    // Decode planes from LE bytes into u16 buffers.
122    let y_plane = super::read_u16le(&frame.data[..src_y_size_bytes]);
123    let u_plane =
124        super::read_u16le(&frame.data[src_y_size_bytes..src_y_size_bytes + src_c_size_bytes]);
125    let v_plane = super::read_u16le(
126        &frame.data
127            [src_y_size_bytes + src_c_size_bytes..src_y_size_bytes + 2 * src_c_size_bytes],
128    );
129
130    // Squad-29: runtime-dispatched (AVX2 when available, scalar fallback).
131    let y_dst = bilinear_scale_plane_u16(&y_plane, src_w, src_h, dst_w, dst_h);
132    let u_dst =
133        bilinear_scale_plane_u16(&u_plane, src_w / 2, src_h / 2, dst_w / 2, dst_h / 2);
134    let v_dst =
135        bilinear_scale_plane_u16(&v_plane, src_w / 2, src_h / 2, dst_w / 2, dst_h / 2);
136
137    let mut out = BytesMut::with_capacity(dst_total_bytes);
138    super::write_u16le(&mut out, &y_dst);
139    super::write_u16le(&mut out, &u_dst);
140    super::write_u16le(&mut out, &v_dst);
141
142    Ok(VideoFrame::new(
143        out.freeze(),
144        target_width,
145        target_height,
146        frame.format,
147        frame.color_space,
148        frame.pts,
149    ))
150}
151
152/// Scalar bilinear scale on `u16` (10-bit) samples. Mirrors the 8-bit
153/// `bilinear_scale_plane_scalar` algorithm; the only differences are
154/// the wider sample type and the absence of u8 saturation at the
155/// output (10-bit values up to 1023 fit comfortably in u16, no
156/// overflow risk in the f64 intermediate).
157pub fn bilinear_scale_plane_u16_scalar(
158    src: &[u16],
159    src_w: usize,
160    src_h: usize,
161    dst_w: usize,
162    dst_h: usize,
163) -> Vec<u16> {
164    let mut dst = vec![0u16; dst_w * dst_h];
165    let x_ratio = src_w as f64 / dst_w as f64;
166    let y_ratio = src_h as f64 / dst_h as f64;
167
168    for dy in 0..dst_h {
169        let sy = (dy as f64 * y_ratio).min((src_h - 1) as f64);
170        let y0 = sy as usize;
171        let y1 = (y0 + 1).min(src_h - 1);
172        let fy = sy - y0 as f64;
173
174        for dx in 0..dst_w {
175            let sx = (dx as f64 * x_ratio).min((src_w - 1) as f64);
176            let x0 = sx as usize;
177            let x1 = (x0 + 1).min(src_w - 1);
178            let fx = sx - x0 as f64;
179
180            let p00 = src[y0 * src_w + x0] as f64;
181            let p10 = src[y0 * src_w + x1] as f64;
182            let p01 = src[y1 * src_w + x0] as f64;
183            let p11 = src[y1 * src_w + x1] as f64;
184
185            let val = p00 * (1.0 - fx) * (1.0 - fy)
186                + p10 * fx * (1.0 - fy)
187                + p01 * (1.0 - fx) * fy
188                + p11 * fx * fy;
189
190            // Round to nearest, clamp to the 10-bit max (1023). The
191            // input is already in 0..=1023 so an in-range bilinear
192            // combination cannot exceed 1023 in exact arithmetic; the
193            // clamp is defensive against fp rounding pushing 1023.0
194            // → 1024.0.
195            dst[dy * dst_w + dx] = val.round().clamp(0.0, 1023.0) as u16;
196        }
197    }
198    dst
199}
200
201/// Runtime-dispatched 10-bit bilinear scale. AVX2 on x86_64 when
202/// available; falls back to `bilinear_scale_plane_u16_scalar` otherwise.
203///
204/// Squad-29 (2026-04-17) added the AVX2 path. 256-bit registers
205/// process 16 × u16 samples per iteration. Internally the same
206/// Q15 fixed-point math as the 8-bit AVX2 path, but `_mm256_mulhrs_epi16`
207/// expects signed lanes — 10-bit samples (0..=1023) fit comfortably
208/// in i16 with no shift gymnastics. Output is clamped to 10-bit
209/// range (0..=1023). Scalar tail (when `dst_w % 16 != 0` or width
210/// < 16) reuses the scalar f64 math row-by-row.
211///
212/// Performance target on 1080p→720p: ≥3× over scalar (realistic
213/// floor for u16 lanes vs u8). Bench in
214/// `crates/codec/benches/bilinear.rs::bilinear_10bit_avx2_vs_scalar`.
215pub fn bilinear_scale_plane_u16(
216    src: &[u16],
217    src_w: usize,
218    src_h: usize,
219    dst_w: usize,
220    dst_h: usize,
221) -> Vec<u16> {
222    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
223    {
224        // 16-lane AVX2 path; gate on dst_w >= 16 so the main loop
225        // runs at least once per row. Narrower outputs fall back to
226        // scalar (cheap — narrow strips aren't a hotspot).
227        if std::is_x86_feature_detected!("avx2") && dst_w >= 16 {
228            // SAFETY: avx2 runtime-detected.
229            return unsafe {
230                bilinear_scale_plane_u16_avx2(src, src_w, src_h, dst_w, dst_h)
231            };
232        }
233    }
234    bilinear_scale_plane_u16_scalar(src, src_w, src_h, dst_w, dst_h)
235}
236
237/// AVX2 specialization for the 10-bit bilinear scaler. Processes
238/// 16 × u16 destination samples per iteration via 256-bit registers.
239///
240/// Algorithm mirrors `bilinear_scale_plane_avx2` (the 8-bit AVX2
241/// path) with two differences:
242/// 1. Lanes are `u16` (16 per 256-bit reg, vs 32 × `u8` in 8-bit).
243///    No need to widen u8 → i16 inside the kernel — samples are
244///    already 16-bit.
245/// 2. No need to shift sample values up (Q7 trick) before the
246///    Q15 multiply: 10-bit values in 0..=1023 fit in i16 unshifted.
247///    Final output is straight-clamped to [0, 1023] without saturating
248///    pack-down.
249///
250/// Q15 fixed-point weights via `_mm256_mulhrs_epi16` ((a*b+0x4000)>>15).
251/// `mulhrs` operates on signed lanes; weights are in [0, 32767]
252/// (one ULP shy of 32768; matches the 8-bit AVX2 trick to keep
253/// inputs i16-safe).
254#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
255#[target_feature(enable = "avx2")]
256unsafe fn bilinear_scale_plane_u16_avx2(
257    src: &[u16],
258    src_w: usize,
259    src_h: usize,
260    dst_w: usize,
261    dst_h: usize,
262) -> Vec<u16> {
263    unsafe {
264        #[cfg(target_arch = "x86")]
265        use std::arch::x86::*;
266        #[cfg(target_arch = "x86_64")]
267        use std::arch::x86_64::*;
268
269        let mut dst = vec![0u16; dst_w * dst_h];
270
271        // Q32 fixed-point ratios — same precision pattern as the 8-bit
272        // AVX2 path so the rounding edge cases land identically across
273        // the two specializations.
274        let x_step = ((src_w as u64) << 32) / (dst_w as u64);
275        let y_step = ((src_h as u64) << 32) / (dst_h as u64);
276
277        // Precompute per-dst-x source x0 + Q15 fractional weights.
278        let mut x0s: Vec<u32> = vec![0; dst_w];
279        let mut x1s: Vec<u32> = vec![0; dst_w];
280        let mut fxs_q15: Vec<i16> = vec![0; dst_w];
281        let mut one_minus_fxs_q15: Vec<i16> = vec![0; dst_w];
282        for dx in 0..dst_w {
283            let sx_32_32 = (dx as u64) * x_step;
284            let x0_full = (sx_32_32 >> 32) as usize;
285            let x0 = x0_full.min(src_w - 1);
286            let fx_q16 = ((sx_32_32 >> 16) & 0xFFFF) as u32;
287            // Convert Q16 → Q15 in [0, 32767]; same clamp trick as 8-bit.
288            let fx_q15 = ((fx_q16 as i32) >> 1).min(32767) as i16;
289            if x0 >= src_w - 1 {
290                x0s[dx] = (src_w - 1) as u32;
291                x1s[dx] = (src_w - 1) as u32;
292                fxs_q15[dx] = 0;
293                one_minus_fxs_q15[dx] = 32767;
294            } else {
295                x0s[dx] = x0 as u32;
296                x1s[dx] = (x0 + 1) as u32;
297                fxs_q15[dx] = fx_q15;
298                one_minus_fxs_q15[dx] = 32767 - fx_q15;
299            }
300        }
301
302        let v_max = _mm256_set1_epi16(1023);
303        let v_zero = _mm256_setzero_si256();
304
305        for dy in 0..dst_h {
306            let sy_32_32 = (dy as u64) * y_step;
307            let y0_full = (sy_32_32 >> 32) as usize;
308            let y0 = y0_full.min(src_h - 1);
309            let fy_q16 = ((sy_32_32 >> 16) & 0xFFFF) as u32;
310            let y1 = (y0 + 1).min(src_h - 1);
311            let fy_q15 = ((fy_q16 as i32) >> 1).min(32767) as i16;
312            let one_minus_fy_q15 = 32767i16 - fy_q15;
313
314            let row0 = y0 * src_w;
315            let row1 = y1 * src_w;
316            let dst_row = dy * dst_w;
317
318            let v_fy = _mm256_set1_epi16(fy_q15);
319            let v_one_minus_fy = _mm256_set1_epi16(one_minus_fy_q15);
320
321            let mut dx = 0usize;
322            while dx + 16 <= dst_w {
323                // Gather 16 p00 / p10 / p01 / p11 u16 values into stack
324                // scratch buffers, then load as 256-bit registers.
325                // Same approach as the 8-bit AVX2 — bilinear inputs don't
326                // align contiguously (x0/x1 are arbitrary), so a scalar
327                // gather + aligned reload is the cheapest path.
328                let mut p00_buf = [0u16; 16];
329                let mut p10_buf = [0u16; 16];
330                let mut p01_buf = [0u16; 16];
331                let mut p11_buf = [0u16; 16];
332                for i in 0..16 {
333                    let x0 = x0s[dx + i] as usize;
334                    let x1 = x1s[dx + i] as usize;
335                    p00_buf[i] = *src.get_unchecked(row0 + x0);
336                    p10_buf[i] = *src.get_unchecked(row0 + x1);
337                    p01_buf[i] = *src.get_unchecked(row1 + x0);
338                    p11_buf[i] = *src.get_unchecked(row1 + x1);
339                }
340
341                // Load as 256-bit (16 × u16). Treated as i16 for the
342                // signed `mulhrs` — 10-bit samples max=1023, well under
343                // i16_max (32767), so reinterpret is bit-identical and
344                // safe.
345                let p00 = _mm256_loadu_si256(p00_buf.as_ptr() as *const _);
346                let p10 = _mm256_loadu_si256(p10_buf.as_ptr() as *const _);
347                let p01 = _mm256_loadu_si256(p01_buf.as_ptr() as *const _);
348                let p11 = _mm256_loadu_si256(p11_buf.as_ptr() as *const _);
349
350                // Per-lane fx / (1-fx).
351                let v_fx = _mm256_loadu_si256(fxs_q15.as_ptr().add(dx) as *const _);
352                let v_one_minus_fx =
353                    _mm256_loadu_si256(one_minus_fxs_q15.as_ptr().add(dx) as *const _);
354
355                // mulhrs: (a*b + 0x4000) >> 15. Inputs are 10-bit
356                // (0..=1023), weights are Q15 (0..=32767). Product max
357                // ≈ 1023 * 32767 ≈ 33.5M; after >> 15 the value is
358                // ≤ 1023, so signed i16 is plenty.
359                //
360                // Important: because samples are unshifted (unlike the
361                // 8-bit kernel which multiplied by 128 first), the
362                // post-multiply value retains its full sample magnitude.
363                // No final shift-down is needed — `top` and `bottom`
364                // are already in the 0..=1023 range.
365                let top = _mm256_add_epi16(
366                    _mm256_mulhrs_epi16(p00, v_one_minus_fx),
367                    _mm256_mulhrs_epi16(p10, v_fx),
368                );
369                let bottom = _mm256_add_epi16(
370                    _mm256_mulhrs_epi16(p01, v_one_minus_fx),
371                    _mm256_mulhrs_epi16(p11, v_fx),
372                );
373
374                // Vertical interp. Same Q15 → 10-bit scale.
375                let out_i16 = _mm256_add_epi16(
376                    _mm256_mulhrs_epi16(top, v_one_minus_fy),
377                    _mm256_mulhrs_epi16(bottom, v_fy),
378                );
379
380                // Clamp to [0, 1023] — defensive against the Q15 round
381                // trick pushing exactly-1023 inputs to 1024 in extreme
382                // edge cases. Use signed min/max on i16 (values are
383                // non-negative for in-range 10-bit input).
384                let clamped =
385                    _mm256_min_epi16(_mm256_max_epi16(out_i16, v_zero), v_max);
386
387                _mm256_storeu_si256(
388                    dst.as_mut_ptr().add(dst_row + dx) as *mut _,
389                    clamped,
390                );
391
392                dx += 16;
393            }
394
395            // Scalar tail. Mirrors `bilinear_scale_plane_u16_scalar` row
396            // math so parity with the scalar function is byte-exact in
397            // the tail (modulo the ±1-LSB rounding tolerance the SIMD
398            // main loop carries vs scalar f64).
399            while dx < dst_w {
400                let x0 = x0s[dx] as usize;
401                let x1 = x1s[dx] as usize;
402                let fx = fxs_q15[dx] as f64 / 32768.0;
403                let fy = fy_q15 as f64 / 32768.0;
404
405                let p00 = src[row0 + x0] as f64;
406                let p10 = src[row0 + x1] as f64;
407                let p01 = src[row1 + x0] as f64;
408                let p11 = src[row1 + x1] as f64;
409
410                let val = p00 * (1.0 - fx) * (1.0 - fy)
411                    + p10 * fx * (1.0 - fy)
412                    + p01 * (1.0 - fx) * fy
413                    + p11 * fx * fy;
414                dst[dst_row + dx] = val.round().clamp(0.0, 1023.0) as u16;
415                dx += 1;
416            }
417        }
418
419        dst
420    }
421}
422
423/// Runtime-dispatched bilinear scale. AVX2 on x86_64 when available.
424pub fn bilinear_scale_plane(
425    src: &[u8],
426    src_w: usize,
427    src_h: usize,
428    dst_w: usize,
429    dst_h: usize,
430) -> Vec<u8> {
431    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
432    {
433        // perf: AVX2 specialization — Q16 fixed-point 2-tap horiz ×
434        // 2-tap vert, 16 dst pixels / iter. Benched 3-4× faster than
435        // the f64 scalar on 1080p→720p.
436        if std::is_x86_feature_detected!("avx2") && dst_w >= 16 {
437            // SAFETY: avx2 runtime-detected.
438            return unsafe { bilinear_scale_plane_avx2(src, src_w, src_h, dst_w, dst_h) };
439        }
440    }
441    bilinear_scale_plane_scalar(src, src_w, src_h, dst_w, dst_h)
442}
443
444pub fn bilinear_scale_plane_scalar(
445    src: &[u8],
446    src_w: usize,
447    src_h: usize,
448    dst_w: usize,
449    dst_h: usize,
450) -> Vec<u8> {
451    let mut dst = vec![0u8; dst_w * dst_h];
452    let x_ratio = src_w as f64 / dst_w as f64;
453    let y_ratio = src_h as f64 / dst_h as f64;
454
455    for dy in 0..dst_h {
456        let sy = (dy as f64 * y_ratio).min((src_h - 1) as f64);
457        let y0 = sy as usize;
458        let y1 = (y0 + 1).min(src_h - 1);
459        let fy = sy - y0 as f64;
460
461        for dx in 0..dst_w {
462            let sx = (dx as f64 * x_ratio).min((src_w - 1) as f64);
463            let x0 = sx as usize;
464            let x1 = (x0 + 1).min(src_w - 1);
465            let fx = sx - x0 as f64;
466
467            let p00 = src[y0 * src_w + x0] as f64;
468            let p10 = src[y0 * src_w + x1] as f64;
469            let p01 = src[y1 * src_w + x0] as f64;
470            let p11 = src[y1 * src_w + x1] as f64;
471
472            let val = p00 * (1.0 - fx) * (1.0 - fy)
473                + p10 * fx * (1.0 - fy)
474                + p01 * (1.0 - fx) * fy
475                + p11 * fx * fy;
476
477            dst[dy * dst_w + dx] = val.round() as u8;
478        }
479    }
480    dst
481}
482
483#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
484#[target_feature(enable = "avx2")]
485unsafe fn bilinear_scale_plane_avx2(
486    src: &[u8],
487    src_w: usize,
488    src_h: usize,
489    dst_w: usize,
490    dst_h: usize,
491) -> Vec<u8> {
492    unsafe {
493        #[cfg(target_arch = "x86")]
494        use std::arch::x86::*;
495        #[cfg(target_arch = "x86_64")]
496        use std::arch::x86_64::*;
497
498        let mut dst = vec![0u8; dst_w * dst_h];
499
500        // Q16 fixed-point ratios. src_w/dst_w as 16.16.
501        //
502        // sx_q16(dx) = dx * (src_w << 16) / dst_w
503        // Using integer math avoids the f64 hotpath. The sampling
504        // convention matches the scalar (origin-at-pixel-corner); picking
505        // a different convention would change the output but also the
506        // reference, so we mirror scalar precisely.
507        let x_step = ((src_w as u64) << 32) / (dst_w as u64); // 32.32 keeps precision for wide src
508        let y_step = ((src_h as u64) << 32) / (dst_h as u64);
509
510        // Precompute per-dst-x source x0 and fx (Q16 weight for src[x1]).
511        let mut x0s: Vec<u32> = vec![0; dst_w];
512        let mut fxs: Vec<u16> = vec![0; dst_w];
513        for dx in 0..dst_w {
514            let sx_32_32 = (dx as u64) * x_step; // source x in 32.32
515            let x0_full = (sx_32_32 >> 32) as usize;
516            let x0 = x0_full.min(src_w - 1);
517            let fx_q16 = ((sx_32_32 >> 16) & 0xFFFF) as u16; // Q16 fraction
518            // Clamp to src_w-1 if we'd run off the end
519            if x0 >= src_w - 1 {
520                x0s[dx] = (src_w - 1) as u32;
521                fxs[dx] = 0;
522            } else {
523                x0s[dx] = x0 as u32;
524                fxs[dx] = fx_q16;
525            }
526        }
527
528        // Bake (1-fx) into a paired vector for mulhrs-style math. We'll
529        // compute:
530        //   top    = p00 * (1-fx) + p10 * fx
531        //   bottom = p01 * (1-fx) + p11 * fx
532        //   out    = top * (1-fy) + bottom * fy
533        // All in Q15 (mulhrs) with u16 input shifted down by 1 bit, or in
534        // Q14 with straightforward pmulhw.
535        //
536        // Use Q15 + mulhrs_epi16 for the fractional weights. Fractional
537        // weights are in [0, 32768] — 32768 overflows i16, so clamp at
538        // 32767 (error ≤ 1/32768 ≈ 0, sub-LSB for 8-bit output).
539        let mut fx_q15: Vec<i16> = vec![0; dst_w];
540        let mut one_minus_fx_q15: Vec<i16> = vec![0; dst_w];
541        for dx in 0..dst_w {
542            // Convert Q16 weight to Q15 in [0, 32767].
543            let fxq15 = (fxs[dx] as i32 >> 1).min(32767) as i16;
544            fx_q15[dx] = fxq15;
545            one_minus_fx_q15[dx] = 32767 - fxq15;
546        }
547
548        for dy in 0..dst_h {
549            let sy_32_32 = (dy as u64) * y_step;
550            let y0_full = (sy_32_32 >> 32) as usize;
551            let y0 = y0_full.min(src_h - 1);
552            let fy_q16 = ((sy_32_32 >> 16) & 0xFFFF) as u32;
553            let y1 = (y0 + 1).min(src_h - 1);
554            let fy_q15 = ((fy_q16 as i32) >> 1).min(32767) as i16;
555            let one_minus_fy_q15 = 32767i16 - fy_q15;
556
557            let row0 = y0 * src_w;
558            let row1 = y1 * src_w;
559            let dst_row = dy * dst_w;
560
561            // Per-dx we need p00/p10/p01/p11. Bilinear inputs don't align
562            // contiguously (x0/x1 are arbitrary), so we gather by
563            // scalar-load-then-pack for 16 dst-x per iteration.
564            let v_fy = _mm256_set1_epi16(fy_q15);
565            let v_one_minus_fy = _mm256_set1_epi16(one_minus_fy_q15);
566
567            let mut dx = 0usize;
568            while dx + 16 <= dst_w {
569                // Gather 16 p00, p10, p01, p11 into 16-lane u8 buffers.
570                // Use stack scratch and an unaligned load.
571                let mut p00_buf = [0u8; 16];
572                let mut p10_buf = [0u8; 16];
573                let mut p01_buf = [0u8; 16];
574                let mut p11_buf = [0u8; 16];
575                for i in 0..16 {
576                    let x0 = x0s[dx + i] as usize;
577                    let x1 = (x0 + 1).min(src_w - 1);
578                    p00_buf[i] = *src.get_unchecked(row0 + x0);
579                    p10_buf[i] = *src.get_unchecked(row0 + x1);
580                    p01_buf[i] = *src.get_unchecked(row1 + x0);
581                    p11_buf[i] = *src.get_unchecked(row1 + x1);
582                }
583
584                // Widen to i16.
585                let p00 = _mm256_cvtepu8_epi16(
586                    _mm_loadu_si128(p00_buf.as_ptr() as *const _),
587                );
588                let p10 = _mm256_cvtepu8_epi16(
589                    _mm_loadu_si128(p10_buf.as_ptr() as *const _),
590                );
591                let p01 = _mm256_cvtepu8_epi16(
592                    _mm_loadu_si128(p01_buf.as_ptr() as *const _),
593                );
594                let p11 = _mm256_cvtepu8_epi16(
595                    _mm_loadu_si128(p11_buf.as_ptr() as *const _),
596                );
597
598                // Shift u8 (0..255) up to the top of i16's signed range so
599                // mulhrs_epi16 retains precision. Each u8 value × 128 is in
600                // [0, 32640] — safe for signed mul.
601                let p00 = _mm256_slli_epi16::<7>(p00);
602                let p10 = _mm256_slli_epi16::<7>(p10);
603                let p01 = _mm256_slli_epi16::<7>(p01);
604                let p11 = _mm256_slli_epi16::<7>(p11);
605
606                // Load per-lane fx / (1-fx).
607                let v_fx = _mm256_loadu_si256(fx_q15.as_ptr().add(dx) as *const _);
608                let v_one_minus_fx =
609                    _mm256_loadu_si256(one_minus_fx_q15.as_ptr().add(dx) as *const _);
610
611                // Horizontal interp: top = p00*(1-fx) + p10*fx, in Q15.
612                let top = _mm256_add_epi16(
613                    _mm256_mulhrs_epi16(p00, v_one_minus_fx),
614                    _mm256_mulhrs_epi16(p10, v_fx),
615                );
616                let bottom = _mm256_add_epi16(
617                    _mm256_mulhrs_epi16(p01, v_one_minus_fx),
618                    _mm256_mulhrs_epi16(p11, v_fx),
619                );
620
621                // Vertical interp. top/bottom are Q15-scaled of the Q7
622                // u8 → so still in the ~Q7 range (approx 0..254). Apply
623                // (1-fy)/fy via mulhrs to get final Q7, then shift down 7
624                // to recover the u8. mulhrs: (top * (1-fy) + 0x4000) >> 15.
625                let out_q7 = _mm256_add_epi16(
626                    _mm256_mulhrs_epi16(top, v_one_minus_fy),
627                    _mm256_mulhrs_epi16(bottom, v_fy),
628                );
629                // Shift back: (x + 64) >> 7 for round-to-nearest.
630                let rounded = _mm256_add_epi16(out_q7, _mm256_set1_epi16(64));
631                let shifted = _mm256_srai_epi16::<7>(rounded);
632
633                // Saturating pack i16 → u8 (16 lanes).
634                let packed = _mm256_packus_epi16(shifted, shifted);
635                // packus interleaves 128-lane halves — permute so low
636                // 16 bytes of the result are what we want.
637                let packed = _mm256_permute4x64_epi64::<0b00_00_10_00>(packed);
638                _mm_storeu_si128(
639                    dst.as_mut_ptr().add(dst_row + dx) as *mut _,
640                    _mm256_castsi256_si128(packed),
641                );
642
643                dx += 16;
644            }
645
646            // Scalar tail.
647            while dx < dst_w {
648                let x0 = x0s[dx] as usize;
649                let x1 = (x0 + 1).min(src_w - 1);
650                let fx = fxs[dx] as f64 / 65536.0;
651                let fy = fy_q16 as f64 / 65536.0;
652
653                let p00 = src[row0 + x0] as f64;
654                let p10 = src[row0 + x1] as f64;
655                let p01 = src[row1 + x0] as f64;
656                let p11 = src[row1 + x1] as f64;
657
658                let val = p00 * (1.0 - fx) * (1.0 - fy)
659                    + p10 * fx * (1.0 - fy)
660                    + p01 * (1.0 - fx) * fy
661                    + p11 * fx * fy;
662                dst[dst_row + dx] = val.round() as u8;
663                dx += 1;
664            }
665        }
666
667        dst
668    }
669}