mlxrs 0.1.0 - Docs.rs

//! Fully-fallible, PIL-matching RGBA8 image resize (own implementation).
//!
//! This module replaces the third-party `fast_image_resize` crate that
//! [`crate::vlm::image::resize`] previously delegated to. The motivation
//! is allocation safety, not performance parity: `resize`'s target
//! dimensions flow from an UNTRUSTED loaded `preprocessor_config.json`
//! (see [`crate::vlm::load`]), and `fast_image_resize` allocated internal
//! scratch (coefficient tables, per-row work buffers) *infallibly* inside
//! the crate — a hostile-but-under-cap target could `abort()` the process
//! despite our `Result` signature. Owning the whole resize lets EVERY
//! allocation route through `try_reserve_exact`, so `resize` returning
//! `Ok` guarantees no abort path for any (untrusted) target size.
//!
//! ## Correctness reference — PIL `Image.resize`
//! mlx-vlm preprocessing expects **PIL `Image.resize`** semantics (the
//! swift `MediaProcessing.resampleBicubic` mirrors PIL). The convolution
//! filters here reproduce PIL's `src/libImaging/Resample.c` *exactly*,
//! including its fixed-point integer accumulation, so the output is
//! **byte-for-byte identical to PIL** (verified against Pillow 12.2 over
//! bilinear/bicubic/lanczos, upscale + downscale, RGBA — see
//! `tests/vlm_image.rs`). No ±1 LSB tolerance is required for the scalar
//! path; it is bit-exact with PIL.
//!
//! ### Algorithm (matches `Resample.c`)
//! Separable two-pass convolution: a horizontal 1-D pass that emits an
//! 8-bit clamped intermediate image, then a vertical 1-D pass over that
//! intermediate. For each output coordinate the value is a weighted sum
//! of input pixels within the filter's support window, weights from the
//! filter kernel normalized to sum to 1.
//!
//! ### Premultiplied alpha (matches `Image.resize`)
//! PIL's `Image.resize` wrapper converts RGBA -> **premultiplied** `RGBa`
//! *before* any non-NEAREST resample and converts back after
//! (`Image.py`: `if self.mode in ["LA","RGBA"] and resample != NEAREST`).
//! Convolving straight (non-premultiplied) channels is NOT byte-exact for
//! an image with non-opaque alpha — it leaks the colour of
//! fully-transparent pixels into their neighbours. This module mirrors
//! that path exactly: it premultiplies the source colour channels
//! (`MULDIV255`), runs the separable convolution over `RGBa`, then
//! unpremultiplies the destination (`rgba2rgbA`'s `CLIP8(255*c/a)`). For
//! an all-opaque (`A == 255`) image both conversions are the identity, so
//! opaque inputs stay bit-identical to a straight-channel resize.
//! **NEAREST is exempt** (PIL does not premultiply for it — a pure
//! gather): `resize_nearest` keeps straight channels.
//!
//! ### Coordinate mapping + antialiasing (matches `precompute_coeffs`)
//! For output index `xx` along an axis resampled from `in_size` to
//! `out_size`:
//! - `scale = in_size / out_size`
//! - `center = (xx + 0.5) * scale`
//! - `filterscale = max(scale, 1.0)` — the **antialiasing filter-stretch**:
//!   when downscaling (`scale > 1`), the filter support widens by the
//!   scale factor so the kernel averages over the shrinking footprint.
//! - `support = filter_support * filterscale`
//! - window `[floor(center - support + 0.5), floor(center + support + 0.5))`
//!   clamped to `[0, in_size)`
//! - weight for input `x` in the window:
//!   `filter((x - center + 0.5) / filterscale)`, then all weights in the
//!   window normalized so they sum to 1.0.
//!
//! ### Fixed-point accumulation (matches `Resample.c` `clip8`)
//! PIL normalizes the f64 weights to fixed point with
//! `PRECISION_BITS = 22`: `coef_i = round(coef * (1 << 22))` (an `i32`).
//! The per-output accumulator is an `i32` seeded with the rounding bias
//! `1 << (PRECISION_BITS - 1)`, accumulates `pixel * coef_i`, then is
//! finished with an **arithmetic** `>> PRECISION_BITS` (sign-extending,
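//! matching C's signed shift) and clamped to `[0, 255]`. The `i32`
//! accumulator does not overflow: every partial sum is bounded in
//! magnitude by `ROUND_BIAS + 255 * sum(|coef|) * (1 << 22)`, which stays
//! below `i32::MAX ≈ 2.147e9` whenever `sum(|coef|) <= 2.0`
//! (`255 * 2 * (1 << 22) + (1 << 21) ≈ 2.141e9`). At unit scale the
//! normalized `sum(|coef|)` over a window reaches `1.25` for Keys-cubic
//! a=-0.5 (half-pixel phase, taps `-0.0625, 0.5625, 0.5625, -0.0625`) and
//! about `1.55` for Lanczos a=3 (half-pixel phase), i.e. a worst case of
//! `≈ 255 * 1.55 * (1 << 22) ≈ 1.65e9`; the filterscale spreads
//! coefficients but shrinks each, so the bound holds at any scale.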
//!
//! ### Nearest
//! PIL's `NEAREST` resize maps output index `o` to input
//! `min(floor((o + 0.5) * in_size / out_size), in_size - 1)` (verified
//! against Pillow). It is a pure pixel gather — no convolution, no
//! coefficient table.
//!
//! ## SIMD
//! The hot loop is the inner per-output-pixel weighted sum over the
//! support window, per channel. RGBA8 is `[u8; 4]` per pixel, so the NEON
//! kernel vectorizes **across the 4 channels**: widen the 4 source bytes
//! to `int32x4`, fused-multiply-accumulate by the (broadcast) `i32`
//! coefficient into an `int32x4` accumulator, then narrow back to 4 `u8`
//! with the same arithmetic shift + clamp. This produces output
//! bit-identical to the scalar path (same `i32` math, same rounding).
//! The coefficient precomputation (cold, once per resize) stays scalar.
//!
//! Per the project SIMD conventions: NEON is gated on
//! `#[cfg(target_arch = "aarch64")]` + a runtime
//! `is_aarch64_feature_detected!("neon")` check, the scalar fallback is
//! ALWAYS compiled, the `#[target_feature(enable = "neon")] unsafe fn`
//! kernels carry numbered `# Safety` clauses, slice-length preconditions
//! are `assert!`ed unconditionally, and the `--cfg mlxrs_force_scalar`
//! escape forces the scalar path even on aarch64. There is NO cargo
//! feature: the dispatch is always-on. (This is self-contained in `vlm`;
//! it can be refactored into a shared `mlxrs::simd` module later.)

use crate::error::{
  ArithmeticOverflowPayload, CapExceededPayload, Error, LengthMismatchPayload, OutOfRangePayload,
  Result, try_with_capacity,
};

/// Interpolation filter for [`resize_rgba8`], mirroring PIL's resampling
/// filters. The variants line up 1:1 with
/// [`crate::vlm::image::ResizeFilter`].
///
/// [`Filter::Nearest`] is dispatched by [`resize_rgba8`] to a dedicated
/// pixel-gather routine; the remaining variants select the continuous
/// kernel evaluated by [`filter_eval`] and the support radius reported by
/// [`filter_support`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum Filter {
  /// Nearest-neighbor pixel gather (no smoothing). PIL `Image.NEAREST`.
  Nearest,
  /// Triangle / linear kernel, support `1.0`. PIL `Image.BILINEAR`.
  Bilinear,
  /// Keys cubic with `a = -0.5`, support `2.0`. PIL `Image.BICUBIC`.
  Bicubic,
  /// Sinc-windowed sinc with `a = 3`, support `3.0`. PIL `Image.LANCZOS`.
  Lanczos3,
}

/// PIL fixed-point precision: `coef_int = round(coef * (1 << 22))`, and
/// the accumulator is finished with `>> 22`. Matches `Resample.c`'s
/// `#define PRECISION_BITS (32 - 8 - 2)` (the expression evaluates to `22`).
const PRECISION_BITS: u32 = 32 - 8 - 2;

/// Rounding bias added to the fixed-point accumulator before the final
/// shift (`1 << (PRECISION_BITS - 1)`, i.e. `1 << 21`), matching
/// `Resample.c`.
const ROUND_BIAS: i32 = 1 << (PRECISION_BITS - 1);

/// RGBA8 has 4 channels (the only pixel layout this module handles — the
/// caller materializes every source variant to RGBA8 first).
const CHANNELS: usize = 4;

/// Byte ceiling for EVERY allocation in the resize path — the same 512 MiB
/// budget [`crate::vlm::image::MAX_DECODED_IMAGE_BYTES`] caps the
/// RGBA-expanded source and final destination with. The public
/// [`crate::vlm::image::resize`] wrapper guards only those two end buffers;
/// the *internal* scratch this module allocates (the horizontal-pass
/// intermediate, the per-axis coefficient tables, the nearest-resize
/// x-index map) is sized from the SAME untrusted target dimensions and can
/// dwarf both ends — e.g. a `1×131072` source resized to `131072×1` has a
/// 0.5 MiB source and a 0.5 MiB destination but a `131072 * 131072 * 4`
/// ≈ 68 GiB horizontal intermediate. `try_reserve_exact` makes an allocator
/// *refusal* recoverable, but on an overcommitting allocator the reservation
/// succeeds and the subsequent zero-fill faults in all 68 GiB → process
/// death. So every scratch buffer is checked against this ceiling BEFORE its
/// `try_reserve_exact` (see [`checked_buffer_bytes`]).
///
/// Kept in sync with — and equal to — `image::MAX_DECODED_IMAGE_BYTES`
/// (`u64` there; `usize` here because these byte counts are compared
/// against `Vec` capacities). No cast is involved: the constant is written
/// directly as a `usize` expression, and `512 * 1024 * 1024` (`2^29`) fits
/// even where `usize` is only 32 bits wide.
const MAX_DECODED_IMAGE_BYTES: usize = 512 * 1024 * 1024;

/// Compute `elems * elem_size` as a byte count, rejecting BOTH a `usize`
/// overflow and a product exceeding [`MAX_DECODED_IMAGE_BYTES`]. Every
/// `try_with_capacity` / `try_reserve_exact` in the resize path is preceded
/// by this check, so no resize allocation — source, horizontal
/// intermediate, coefficient table, x-index map, destination — can overflow
/// `usize` or exceed the 512 MiB budget.
///
/// `try_reserve_exact` already turns an *allocator refusal* into a
/// recoverable [`Error::OutOfMemory`], but it does not bound the request:
/// an overcommitting allocator hands back a 68 GiB reservation that only
/// faults (and kills the process) when the caller's zero-fill touches the
/// pages. This ceiling check makes the *request itself* recoverable.
///
/// `what` is a static call-site label identifying the buffer (e.g.
/// `"coefficient bounds table"`, `"RGBA8 source"`).
///
/// # Errors
/// - [`Error::ArithmeticOverflow`] if `elems * elem_size` overflows `usize`.
/// - [`Error::CapExceeded`] if `elems * elem_size` exceeds
///   [`MAX_DECODED_IMAGE_BYTES`].
fn checked_buffer_bytes(elems: usize, elem_size: usize, what: &'static str) -> Result<usize> {
  let bytes = elems.checked_mul(elem_size).ok_or_else(|| {
    Error::ArithmeticOverflow(ArithmeticOverflowPayload::with_operands(
      "resize: buffer size (elems * elem_size)",
      "usize",
      [("elems", elems as u64), ("elem_size", elem_size as u64)],
    ))
  })?;
  if bytes > MAX_DECODED_IMAGE_BYTES {
    return Err(Error::CapExceeded(CapExceededPayload::new(
      what,
      "MAX_DECODED_IMAGE_BYTES",
      MAX_DECODED_IMAGE_BYTES as u64,
      bytes as u64,
    )));
  }
  Ok(bytes)
}

/// Support radius of `f`'s continuous kernel, in source pixels, before the
/// antialiasing filterscale stretch is applied by the caller.
///
/// [`Filter::Nearest`] has no continuous kernel and reports `0.0`; the
/// nearest path is handled separately and never needs this value.
fn filter_support(f: Filter) -> f64 {
  // Every radius is a small whole number of pixels; widen it once at the
  // end rather than spelling each arm as a float literal.
  let radius_px: u8 = match f {
    Filter::Lanczos3 => 3,
    Filter::Bicubic => 2,
    Filter::Bilinear => 1,
    Filter::Nearest => 0,
  };
  f64::from(radius_px)
}

/// Value of `f`'s continuous kernel at offset `x`, where the caller has
/// already divided `x` by the filterscale. All kernels are even, so only
/// `|x|` matters, and each is zero at and beyond its support radius:
/// - Bilinear: triangle `1 - |x|` for `|x| < 1`.
/// - Bicubic: Keys cubic with `a = -0.5` for `|x| < 2`.
/// - Lanczos3: `sinc(x) * sinc(x / 3)` for `|x| < 3`.
///
/// [`Filter::Nearest`] has no kernel and always evaluates to `0.0`.
fn filter_eval(f: Filter, x: f64) -> f64 {
  // Keys cubic sharpness parameter used by PIL's bicubic filter.
  const A: f64 = -0.5;

  let t = x.abs();
  match f {
    Filter::Nearest => 0.0,
    Filter::Bilinear if t < 1.0 => 1.0 - t,
    // Inner lobe of the Keys cubic.
    Filter::Bicubic if t < 1.0 => ((A + 2.0) * t - (A + 3.0)) * t * t + 1.0,
    // Outer (negative) lobe of the Keys cubic.
    Filter::Bicubic if t < 2.0 => (((t - 5.0) * t + 8.0) * t - 4.0) * A,
    Filter::Lanczos3 if t < 3.0 => sinc(t) * sinc(t / 3.0),
    // Outside the support window (this also catches a NaN offset, for
    // which every comparison above is false).
    Filter::Bilinear | Filter::Bicubic | Filter::Lanczos3 => 0.0,
  }
}

/// Normalized sinc: `sin(pi * x) / (pi * x)`, defined as `1` at `x = 0`
/// (both signed zeros) so the removable singularity never divides by
/// zero — the same convention as PIL's `sinc_filter`.
fn sinc(x: f64) -> f64 {
  if x != 0.0 {
    let theta = x * std::f64::consts::PI;
    return theta.sin() / theta;
  }
  1.0
}

/// Precomputed per-output-index convolution coefficients for one axis.
///
/// `bounds[o] = (xmin, n)` gives the input window start and length for
/// output index `o`; `weights[o * ksize .. o * ksize + n]` are the
/// fixed-point `i32` taps for that output (the remaining `ksize - n`
/// slots in the row are zero-padded so every row has a uniform stride —
/// this keeps the convolution inner loop branch-free on row stride).
///
/// As built by [`precompute_coeffs`], `bounds` has one entry per output
/// index and `weights.len() == bounds.len() * ksize`.
///
/// Both backing `Vec`s (and the temporary `f64` row scratch
/// [`precompute_coeffs`] uses while filling them) are obtained through the
/// fallible `try_with_capacity` helper; this type is the "coefficient
/// table" `fast_image_resize` allocated infallibly.
struct Coeffs {
  /// `(xmin, n)` per output index: first input index of the window and the
  /// number of taps in it.
  bounds: Vec<(usize, usize)>,
  /// Fixed-point taps, row-major with stride `ksize`; unused trailing
  /// slots of each row stay `0`.
  weights: Vec<i32>,
  /// Per-output row stride (`max` window length across outputs).
  ksize: usize,
}

/// Precompute the convolution coefficients for resampling one axis from
/// `in_size` to `out_size` with `filter` (PIL `precompute_coeffs` +
/// `normalize_coeffs_8bpc`).
///
/// The floating-point operation sequence mirrors PIL step for step so the
/// resulting fixed-point taps are bit-identical:
/// - each raw weight is normalized by **dividing** by the window sum
///   (`k[x] /= ww`), not by multiplying with a precomputed reciprocal —
///   the two can differ in the last bit;
/// - a window whose raw weights sum to exactly `0.0` keeps its raw weights
///   (PIL only skips the division);
/// - the fixed-point conversion is `(int)(0.5 + v)` for `v >= 0` and
///   `(int)(-0.5 + v)` for `v < 0`, i.e. add the bias in `f64` and then
///   truncate toward zero.
///
/// Every buffer is `try_reserve_exact`-backed; an allocator refusal
/// surfaces as [`Error::OutOfMemory`]. A degenerate `in_size`/`out_size`
/// (zero) surfaces as [`Error::OutOfRange`]; a `ksize` overflow surfaces as
/// [`Error::ArithmeticOverflow`]; a coefficient table exceeding
/// [`MAX_DECODED_IMAGE_BYTES`] surfaces as [`Error::CapExceeded`].
///
/// The coefficient table is `out_size * ksize` taps. `ksize` is small for
/// a sane resize (`ceil(filter_support * filterscale) * 2 + 1`, clamped to
/// `in_size`), but a `131072`-wide output combined with a stretched
/// downscale support could still size a multi-hundred-MiB table — so the
/// table's byte size, the bounds vector, and the per-row f64 scratch are
/// each capped against [`MAX_DECODED_IMAGE_BYTES`] via
/// [`checked_buffer_bytes`] BEFORE their `try_reserve_exact`.
fn precompute_coeffs(in_size: usize, out_size: usize, filter: Filter) -> Result<Coeffs> {
  // Caller guarantees non-zero, but guard defensively: a zero `out_size`
  // would divide by zero in `scale`, a zero `in_size` makes the window
  // empty.
  if in_size == 0 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "precompute_coeffs: in_size",
      "must be non-zero",
      format!("{in_size}"),
    )));
  }
  if out_size == 0 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "precompute_coeffs: out_size",
      "must be non-zero",
      format!("{out_size}"),
    )));
  }
  let scale = in_size as f64 / out_size as f64;
  let filterscale = if scale < 1.0 { 1.0 } else { scale };
  let support = filter_support(filter) * filterscale;
  // `ksize` is the max number of taps any output index can reference:
  // `ceil(support) * 2 + 1`, exactly PIL's `ksize = (int)ceil(support) *
  // 2 + 1`. Bounded by `in_size` (a window can never exceed the input).
  let ksize_unclamped = (support.ceil() as usize)
    .checked_mul(2)
    .and_then(|v| v.checked_add(1))
    .ok_or_else(|| {
      Error::ArithmeticOverflow(ArithmeticOverflowPayload::with_operands(
        "precompute_coeffs: ksize (ceil(support) * 2 + 1)",
        "usize",
        [("ceil(support)", support.ceil() as u64)],
      ))
    })?;
  // `in_size` is non-zero here and `ksize_unclamped >= 1`, so `ksize >= 1`
  // (required by `chunks_exact_mut` below).
  let ksize = ksize_unclamped.min(in_size);

  // `bounds` is `out_size` `(usize, usize)` pairs; cap its byte size
  // against the 512 MiB budget before reserving — the same guard applies
  // uniformly to every scratch buffer so no resize allocation bypasses the
  // ceiling.
  checked_buffer_bytes(
    out_size,
    std::mem::size_of::<(usize, usize)>(),
    "coefficient bounds table",
  )?;
  let mut bounds: Vec<(usize, usize)> = try_with_capacity(out_size)?;
  // `out_size * ksize` `i32` weights. `checked_mul` rejects a `usize`
  // overflow of the element count; `checked_buffer_bytes` then rejects a
  // table whose byte size exceeds `MAX_DECODED_IMAGE_BYTES`.
  let weight_len = out_size.checked_mul(ksize).ok_or_else(|| {
    Error::ArithmeticOverflow(ArithmeticOverflowPayload::with_operands(
      "precompute_coeffs: weight_len (out_size * ksize)",
      "usize",
      [("out_size", out_size as u64), ("ksize", ksize as u64)],
    ))
  })?;
  checked_buffer_bytes(
    weight_len,
    std::mem::size_of::<i32>(),
    "coefficient weight table",
  )?;
  let mut weights: Vec<i32> = try_with_capacity(weight_len)?;
  // Zero-fill so the unused tail of every row is already padded.
  weights.resize(weight_len, 0i32);

  // Scratch for one row of f64 weights before fixed-point conversion.
  // Bounded by `ksize`; capped against the budget before reserving (a
  // stretched downscale support can make `ksize` large).
  checked_buffer_bytes(ksize, std::mem::size_of::<f64>(), "coefficient row scratch")?;
  let mut row: Vec<f64> = try_with_capacity(ksize)?;

  // Loop invariants: the fixed-point scale and PIL's `ss = 1 / filterscale`.
  let fixed_one = f64::from(1i32 << PRECISION_BITS);
  let inv_filterscale = 1.0 / filterscale;

  // `weights.len() == out_size * ksize`, so this yields exactly one
  // `ksize`-long row per output index. Writing through the per-row slice
  // means a tap can never land in a neighbouring output's row.
  for (xx, taps) in weights.chunks_exact_mut(ksize).enumerate() {
    let center = (xx as f64 + 0.5) * scale;
    // Window `[xmin, xmax)` clamped to `[0, in_size)`. PIL computes
    // `(int)(center ± support + 0.5)` (truncation toward zero) and then
    // clamps; `floor` agrees with truncation for non-negative values, and
    // every negative value is clamped to 0 either way.
    let xmin = {
      let v = (center - support + 0.5).floor();
      if v < 0.0 { 0 } else { v as usize }
    };
    let xmax = {
      let v = (center + support + 0.5).floor();
      let v = if v < 0.0 { 0usize } else { v as usize };
      v.min(in_size)
    };
    let n = xmax.saturating_sub(xmin);
    // n is bounded by ksize by construction (window <= ceil(support)*2+1
    // and clamped to in_size). Checked BEFORE the row is filled so a
    // violated invariant is reported ahead of any write.
    debug_assert!(
      n <= ksize,
      "precompute_coeffs: window n={n} exceeds ksize={ksize}"
    );

    // Raw kernel weights for the window and their sum.
    row.clear();
    let mut wsum = 0.0f64;
    for i in 0..n {
      let w = filter_eval(
        filter,
        (xmin as f64 + i as f64 - center + 0.5) * inv_filterscale,
      );
      row.push(w);
      wsum += w;
    }

    for (i, &w) in row.iter().enumerate() {
      // PIL: `if (ww != 0.0) k[x] /= ww;` — a true division, and the raw
      // weight is kept when the sum is exactly zero.
      let coef = if wsum != 0.0 { w / wsum } else { w };
      // PIL `normalize_coeffs_8bpc`: bias by ±0.5 in f64, then truncate
      // toward zero (`as i32` truncates, like C's `(int)`).
      let scaled = coef * fixed_one;
      let biased = if scaled < 0.0 { scaled - 0.5 } else { scaled + 0.5 };
      taps[i] = biased as i32;
    }
    bounds.push((xmin, n));
  }
  Ok(Coeffs {
    bounds,
    weights,
    ksize,
  })
}

/// Finish a fixed-point accumulator the way PIL's `clip8` does: shift
/// right by `PRECISION_BITS`, then saturate the result into `[0, 255]`.
#[inline]
fn clip8(acc: i32) -> u8 {
  // `>>` on `i32` is an arithmetic (sign-extending) shift, so negative
  // accumulators stay negative and saturate to 0, as with C's signed shift.
  (acc >> PRECISION_BITS).clamp(0, 255) as u8
}

/// Resize an RGBA8 image from `(src_w, src_h)` to `(dst_w, dst_h)` using
/// `filter`. `src` MUST be exactly `src_w * src_h * 4` bytes; the returned
/// `Vec<u8>` is exactly `dst_w * dst_h * 4` bytes (row-major RGBA8).
///
/// EVERY buffer (coefficient tables for both axes, the horizontal-pass
/// intermediate, the output) is `try_reserve_exact`-backed; an allocator
/// refusal surfaces as [`Error::OutOfMemory`], never a process abort. In
/// addition, every buffer is capped against [`MAX_DECODED_IMAGE_BYTES`]
/// (512 MiB) via [`checked_buffer_bytes`] BEFORE its reservation — the
/// public [`crate::vlm::image::resize`] wrapper only bounds the
/// RGBA-source and the destination, but the horizontal intermediate
/// (`src_h * dst_w * 4`) and the coefficient tables are sized from the
/// SAME untrusted target and can dwarf both ends (a `1×131072` →
/// `131072×1` resize has 0.5 MiB ends but a ~68 GiB intermediate). Capping
/// the request itself — not just relying on `try_reserve_exact` — closes
/// the overcommit zero-fill abort. So `resize_rgba8` is safe to call
/// directly, not only through the public wrapper.
///
/// # Errors
/// - [`Error::OutOfRange`] if any dimension is `0`;
///   [`Error::ArithmeticOverflow`] if a byte/element product overflows
///   `usize`; [`Error::LengthMismatch`] if `src.len() != src_w * src_h * 4`;
///   [`Error::CapExceeded`] if ANY buffer in the resize path (source copy,
///   coefficient tables, horizontal intermediate, destination) would exceed
///   [`MAX_DECODED_IMAGE_BYTES`].
/// - [`Error::OutOfMemory`] if any `try_reserve_exact` fails.
///
/// # Panics
/// Does not panic on valid input: the only `assert!`s are slice-length
/// preconditions inside the SIMD/scalar kernels, which the dimension math
/// in this function makes structurally true.
pub(crate) fn resize_rgba8(
  src: &[u8],
  src_w: usize,
  src_h: usize,
  dst_w: usize,
  dst_h: usize,
  filter: Filter,
) -> Result<Vec<u8>> {
  if src_w == 0 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "resize_rgba8: src_w",
      "must be non-zero",
      format!("{src_w}"),
    )));
  }
  if src_h == 0 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "resize_rgba8: src_h",
      "must be non-zero",
      format!("{src_h}"),
    )));
  }
  if dst_w == 0 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "resize_rgba8: dst_w",
      "must be non-zero",
      format!("{dst_w}"),
    )));
  }
  if dst_h == 0 {
    return Err(Error::OutOfRange(OutOfRangePayload::new(
      "resize_rgba8: dst_h",
      "must be non-zero",
      format!("{dst_h}"),
    )));
  }
  let src_len = src_w
    .checked_mul(src_h)
    .and_then(|v| v.checked_mul(CHANNELS))
    .ok_or_else(|| {
      Error::ArithmeticOverflow(ArithmeticOverflowPayload::with_operands(
        "resize_rgba8: src_len (src_w * src_h * CHANNELS)",
        "usize",
        [
          ("src_w", src_w as u64),
          ("src_h", src_h as u64),
          ("CHANNELS", CHANNELS as u64),
        ],
      ))
    })?;
  if src.len() != src_len {
    return Err(Error::LengthMismatch(LengthMismatchPayload::new(
      "resize_rgba8: src buffer bytes vs src_w * src_h * CHANNELS",
      src_len,
      src.len(),
    )));
  }
  // Cap the source against the 512 MiB budget too: `src` is borrowed (not
  // allocated here), but the premultiplied copy below is `src.len()` bytes
  // — and a direct caller (not the public `resize` wrapper) has no other
  // guard. `src_len` already cleared the overflow check above.
  checked_buffer_bytes(src_len, 1, "resize_rgba8: RGBA8 source")?;
  let dst_len = dst_w
    .checked_mul(dst_h)
    .and_then(|v| v.checked_mul(CHANNELS))
    .ok_or_else(|| {
      Error::ArithmeticOverflow(ArithmeticOverflowPayload::with_operands(
        "resize_rgba8: dst_len (dst_w * dst_h * CHANNELS)",
        "usize",
        [
          ("dst_w", dst_w as u64),
          ("dst_h", dst_h as u64),
          ("CHANNELS", CHANNELS as u64),
        ],
      ))
    })?;
  // Cap the destination against the 512 MiB budget. The public `resize`
  // wrapper already bounds it, but `resize_rgba8` is `pub(crate)` and may
  // be called directly — every entry path is covered here.
  checked_buffer_bytes(dst_len, 1, "resize_rgba8: destination RGBA8")?;

  if filter == Filter::Nearest {
    // PIL exempts `NEAREST` from premultiplication: it is a pure pixel
    // gather, so straight RGBA channels are already byte-exact (see the
    // `premultiply_rgba` doc + `Image.resize`'s `resample != NEAREST`
    // guard).
    return resize_nearest(src, src_w, src_h, dst_w, dst_h, dst_len);
  }

  // --- Premultiplied-alpha staging (PIL parity) ---
  // PIL's `Image.resize` converts RGBA -> premultiplied `RGBa` BEFORE any
  // non-NEAREST resample and converts back after (`Image.py`:
  // `if self.mode in ["LA", "RGBA"] and resample != NEAREST: ...
  // convert("RGBa") ... resize ... convert(self.mode)`). Straight-channel
  // convolution is NOT byte-exact for non-opaque alpha — it bleeds the
  // colour of fully-transparent pixels into their neighbours. We mirror
  // that exact path: premultiply the colour channels into an owned
  // fallible copy, run the existing separable convolution over the
  // premultiplied buffer, then unpremultiply the destination in place.
  // For an all-opaque (`A == 255`) image both passes are the identity
  // (`MULDIV255(c, 255) == c`, and unpremultiply special-cases
  // `alpha == 255`), so opaque inputs are bit-identical to the prior
  // behaviour. (`resize_rgba8` only ever sees RGBA8 — `vlm::image::resize`
  // projects every source variant, including `LumaA8`, to RGBA8 first —
  // so the single RGBA premultiply path also covers PIL's `LA -> La`.)
  let src_pm = premultiply_rgba(src)?;

  // --- Separable convolution ---
  // Horizontal pass: (src_h rows) x (dst_w cols) intermediate, RGBA8.
  // Vertical pass: (dst_h rows) x (dst_w cols) output.
  let hcoeffs = precompute_coeffs(src_w, dst_w, filter)?;
  let vcoeffs = precompute_coeffs(src_h, dst_h, filter)?;

  // Intermediate buffer: src_h * dst_w * 4 bytes, fallible. (PIL emits an
  // 8-bit clamped image between the two passes; the vertical pass reads
  // it back.) CRITICAL: this intermediate's dimensions are `src_h` (input)
  // by `dst_w` (untrusted target) — it is NOT bounded by either the
  // RGBA-source cap (`src_w*src_h*4`) or the destination cap
  // (`dst_w*dst_h*4`) the public `resize` wrapper enforces. A `1×131072`
  // source resized to `131072×1` has a 0.5 MiB source, a 0.5 MiB
  // destination, but a `131072 * 131072 * 4` ≈ 68 GiB intermediate. So
  // cap it explicitly against `MAX_DECODED_IMAGE_BYTES` (overflow →
  // `ArithmeticOverflow`; > 512 MiB → `CapExceeded`) BEFORE the
  // `try_reserve_exact` + zero-fill
  // — `try_reserve_exact` alone cannot stop an overcommitting allocator
  // from handing back 68 GiB that the `resize`/zero-fill then faults in.
  let inter_len = src_h
    .checked_mul(dst_w)
    .and_then(|v| v.checked_mul(CHANNELS))
    .ok_or_else(|| {
      Error::ArithmeticOverflow(ArithmeticOverflowPayload::with_operands(
        "resize_rgba8: inter_len (src_h * dst_w * CHANNELS)",
        "usize",
        [
          ("src_h", src_h as u64),
          ("dst_w", dst_w as u64),
          ("CHANNELS", CHANNELS as u64),
        ],
      ))
    })?;
  checked_buffer_bytes(
    inter_len,
    1,
    "resize_rgba8: horizontal-pass intermediate RGBA8",
  )?;
  let mut inter: Vec<u8> = try_with_capacity(inter_len)?;
  inter.resize(inter_len, 0u8);

  // Output buffer, fallible.
  let mut dst: Vec<u8> = try_with_capacity(dst_len)?;
  dst.resize(dst_len, 0u8);

  // Horizontal pass: for each src row, convolve along x into `inter`.
  // Operates on the PREMULTIPLIED source (`src_pm`).
  convolve_axis(&src_pm, src_w, src_h, &mut inter, dst_w, &hcoeffs);
  // Vertical pass: convolve `inter` along y into `dst`. We transpose the
  // access by treating columns: for each output row `oy`, gather input
  // rows `[ymin, ymin+n)` from `inter`. To reuse `convolve_axis` (which
  // convolves along the contiguous x-axis), the vertical pass is a
  // separate routine because its taps stride by a full row.
  convolve_vertical(&inter, dst_w, src_h, &mut dst, dst_h, &vcoeffs);

  // Convert the premultiplied `dst` back to straight RGBA8 in place (PIL's
  // post-resize `convert(self.mode)`).
  unpremultiply_rgba(&mut dst);

  Ok(dst)
}

/// Rounded `c * a / 255` in integer arithmetic, reproducing `libImaging`'s
/// `MULDIV255` macro: bias the product by `128`, then approximate the
/// division by 255 with PIL's shift-add-shift sequence
/// (`tmp = c * a + 128; ((tmp >> 8) + tmp) >> 8`).
#[inline]
fn muldiv255(c: u8, a: u8) -> u8 {
  // Largest possible value is `255 * 255 + 128 = 65153`, far inside `u32`.
  let biased: u32 = u32::from(c) * u32::from(a) + 128;
  // Folding the high byte back in turns the `/ 256` into a `/ 255`.
  let folded = biased + (biased >> 8);
  // The quotient never exceeds 255, so the narrowing cast loses nothing.
  (folded >> 8) as u8
}

/// Build a premultiplied-alpha copy of an RGBA8 buffer (PIL `rgbA2rgba`,
/// the `RGBA -> RGBa` conversion `Image.resize` performs ahead of a
/// non-NEAREST resample).
///
/// For every pixel the three colour bytes are replaced by
/// `MULDIV255(c, A)` and the alpha byte is copied through. `src` is left
/// untouched; the result is a freshly reserved buffer of `src.len()`
/// bytes obtained through the fallible `try_with_capacity` helper, whose
/// error is propagated.
///
/// `src.len()` must be a multiple of [`CHANNELS`] (guaranteed by
/// [`resize_rgba8`]'s `src.len() == src_w * src_h * 4` check).
fn premultiply_rgba(src: &[u8]) -> Result<Vec<u8>> {
  let mut premul: Vec<u8> = try_with_capacity(src.len())?;
  for pixel in src.chunks_exact(CHANNELS) {
    let alpha = pixel[3];
    // Scale the colour channels by alpha; alpha itself is carried over.
    // Four bytes at a time always fit in the capacity reserved above.
    premul.extend_from_slice(&[
      muldiv255(pixel[0], alpha),
      muldiv255(pixel[1], alpha),
      muldiv255(pixel[2], alpha),
      alpha,
    ]);
  }
  // `chunks_exact` skips a trailing partial pixel. The caller promises a
  // whole number of pixels, so the lengths agree; in debug builds, shout
  // if a future caller breaks that promise instead of quietly truncating.
  debug_assert_eq!(
    premul.len(),
    src.len(),
    "premultiply_rgba: src length must be a multiple of CHANNELS"
  );
  Ok(premul)
}

/// Convert a premultiplied RGBA8 buffer back to straight alpha in place
/// (PIL `rgba2rgbA`, the `RGBa -> RGBA` conversion `Image.resize` performs
/// after the resample). Nothing is allocated.
///
/// Per pixel, following `libImaging`:
/// - alpha `255` or `0`: the colour bytes are left exactly as they are;
/// - any other alpha: each colour byte becomes `CLIP8((255 * c) / alpha)`
///   — truncating integer division, saturated to `[0, 255]`.
///
/// The alpha byte is never modified.
///
/// `buf.len()` must be a multiple of [`CHANNELS`].
fn unpremultiply_rgba(buf: &mut [u8]) {
  for pixel in buf.chunks_exact_mut(CHANNELS) {
    let (colour, rest) = pixel.split_at_mut(3);
    match rest[0] {
      // Opaque pixels need no division and fully transparent ones are
      // passed through untouched, as PIL does.
      0 | 255 => {}
      alpha => {
        // Alpha is in `1..=254` here. The quotient may exceed 255 when a
        // premultiplied colour ended up above its alpha; `clip8_div`
        // saturates it.
        let divisor = u32::from(alpha);
        for channel in colour.iter_mut() {
          *channel = clip8_div(u32::from(*channel), divisor);
        }
      }
    }
  }
}

/// One unpremultiplied channel value: PIL's `CLIP8((255 * c) / a)`.
/// The division truncates and the quotient is saturated at `255`. `a`
/// must be non-zero (the caller handles `a == 0` before getting here).
#[inline]
fn clip8_div(c: u32, a: u32) -> u8 {
  let quotient = (255 * c) / a;
  // Unsigned, so only the upper bound needs clamping.
  quotient.min(255) as u8
}

/// Nearest-neighbour resize (PIL `Image.NEAREST`): every output pixel is a
/// verbatim copy of one source pixel, with no filtering.
///
/// Along each axis, output index `o` reads input index
/// `min(floor((o + 0.5) * in / out), in - 1)`.
///
/// The per-column source-index table (`dst_w` `usize`s) and the
/// destination (`dst_len` bytes) are each validated with
/// [`checked_buffer_bytes`] before being reserved, so the 512 MiB ceiling
/// is enforced here as well as in [`resize_rgba8`].
fn resize_nearest(
  src: &[u8],
  src_w: usize,
  src_h: usize,
  dst_w: usize,
  dst_h: usize,
  dst_len: usize,
) -> Result<Vec<u8>> {
  // Source column for each output column, computed once and shared by all
  // rows. `dst_w` is an untrusted target dimension, so the table's byte
  // size is checked against the budget before it is reserved.
  checked_buffer_bytes(
    dst_w,
    std::mem::size_of::<usize>(),
    "resize_nearest: x-index map",
  )?;
  let mut col_src: Vec<usize> = try_with_capacity(dst_w)?;
  // Exactly `dst_w` items, which is the capacity reserved above.
  col_src.extend((0..dst_w).map(|ox| {
    let sx = ((ox as f64 + 0.5) * src_w as f64 / dst_w as f64).floor() as usize;
    sx.min(src_w - 1)
  }));

  // The destination is re-checked here so this routine does not depend on
  // its caller having validated `dst_len`.
  checked_buffer_bytes(dst_len, 1, "resize_nearest: destination RGBA8")?;
  let mut dst: Vec<u8> = try_with_capacity(dst_len)?;
  dst.resize(dst_len, 0u8);

  for oy in 0..dst_h {
    let sy = (((oy as f64 + 0.5) * src_h as f64 / dst_h as f64).floor() as usize).min(src_h - 1);
    let src_row = &src[sy * src_w * CHANNELS..(sy + 1) * src_w * CHANNELS];
    let dst_row = &mut dst[oy * dst_w * CHANNELS..(oy + 1) * dst_w * CHANNELS];
    // One 4-byte destination pixel per table entry, in column order.
    for (out_px, &sx) in dst_row.chunks_exact_mut(CHANNELS).zip(col_src.iter()) {
      out_px.copy_from_slice(&src_row[sx * CHANNELS..(sx + 1) * CHANNELS]);
    }
  }
  Ok(dst)
}

/// Horizontal convolution entry point. Each of the `rows` source rows
/// (`src_w` RGBA8 pixels wide) is resampled to `out_w` pixels, written into
/// `out` (RGBA8, `rows * out_w * 4` bytes).
///
/// On aarch64 builds without the `mlxrs_force_scalar` cfg, the NEON kernel
/// is used when the CPU reports `neon` at runtime; in every other case the
/// scalar kernel runs.
///
/// # Panics
/// Panics if `src`, `out` or `coeffs.bounds` do not have the lengths implied
/// by `src_w`, `rows` and `out_w`.
fn convolve_axis(
  src: &[u8],
  src_w: usize,
  rows: usize,
  out: &mut [u8],
  out_w: usize,
  coeffs: &Coeffs,
) {
  // Shape preconditions, checked unconditionally (per the SIMD
  // conventions) because both kernels rely on them to keep their indexing
  // in-bounds.
  let src_bytes = src_w * rows * CHANNELS;
  assert_eq!(src.len(), src_bytes, "convolve_axis: src len");
  let out_bytes = out_w * rows * CHANNELS;
  assert_eq!(out.len(), out_bytes, "convolve_axis: out len");
  assert_eq!(coeffs.bounds.len(), out_w, "convolve_axis: bounds len");

  #[cfg(all(target_arch = "aarch64", not(mlxrs_force_scalar)))]
  {
    let has_neon = std::arch::is_aarch64_feature_detected!("neon");
    if has_neon {
      // SAFETY: `has_neon` is the result of the runtime
      // `is_aarch64_feature_detected!("neon")` probe just above, so the
      // `neon` target feature is available; the slice-length clauses of
      // `convolve_axis_neon`'s `# Safety` contract are the asserts at the
      // top of this function.
      unsafe {
        convolve_axis_neon(src, src_w, rows, out, out_w, coeffs);
      }
      return;
    }
  }

  // Portable fallback: non-aarch64 targets, forced-scalar builds, or a CPU
  // without NEON.
  convolve_axis_scalar(src, src_w, rows, out, out_w, coeffs);
}

/// Vertical convolution entry point. Reads the `src_h x out_w` intermediate
/// image `inter` and writes `out_h` rows of `out_w` RGBA8 pixels into `out`.
/// Successive taps of one output pixel are a full intermediate row apart.
///
/// Kernel selection mirrors the horizontal pass: NEON on aarch64 when the
/// `mlxrs_force_scalar` cfg is absent and the CPU reports `neon` at runtime,
/// otherwise the scalar kernel.
///
/// # Panics
/// Panics if `inter`, `out` or `coeffs.bounds` do not have the lengths
/// implied by `out_w`, `src_h` and `out_h`.
fn convolve_vertical(
  inter: &[u8],
  out_w: usize,
  src_h: usize,
  out: &mut [u8],
  out_h: usize,
  coeffs: &Coeffs,
) {
  // Unconditional shape checks; both kernels depend on them for in-bounds
  // indexing.
  let inter_bytes = out_w * src_h * CHANNELS;
  assert_eq!(inter.len(), inter_bytes, "convolve_vertical: inter len");
  let out_bytes = out_w * out_h * CHANNELS;
  assert_eq!(out.len(), out_bytes, "convolve_vertical: out len");
  assert_eq!(coeffs.bounds.len(), out_h, "convolve_vertical: bounds len");

  #[cfg(all(target_arch = "aarch64", not(mlxrs_force_scalar)))]
  {
    let has_neon = std::arch::is_aarch64_feature_detected!("neon");
    if has_neon {
      // SAFETY: `neon` availability was confirmed by the runtime probe just
      // above, and the length clauses of `convolve_vertical_neon`'s
      // `# Safety` contract are the asserts at the top of this function.
      unsafe {
        convolve_vertical_neon(inter, out_w, src_h, out, out_h, coeffs);
      }
      return;
    }
  }

  // Portable fallback.
  convolve_vertical_scalar(inter, out_w, src_h, out, out_h, coeffs);
}

/// Portable horizontal convolution kernel; compiled on every target and
/// bit-exact with PIL.
///
/// For output column `ox`, `coeffs.bounds[ox]` gives the first source
/// column and the tap count, and the matching weights start at
/// `ox * coeffs.ksize` in `coeffs.weights`. Each channel is accumulated in
/// `i32` starting from `ROUND_BIAS`, then reduced to a byte with `clip8`.
fn convolve_axis_scalar(
  src: &[u8],
  src_w: usize,
  rows: usize,
  out: &mut [u8],
  out_w: usize,
  coeffs: &Coeffs,
) {
  let taps_per_px = coeffs.ksize;
  let src_stride = src_w * CHANNELS;
  let out_stride = out_w * CHANNELS;
  for row in 0..rows {
    let src_row = &src[row * src_stride..(row + 1) * src_stride];
    let out_row = &mut out[row * out_stride..(row + 1) * out_stride];
    for col in 0..out_w {
      let (first, count) = coeffs.bounds[col];
      let w_start = col * taps_per_px;
      let weights = &coeffs.weights[w_start..w_start + count];

      // One running sum per channel, seeded with the rounding bias.
      let mut sums = [ROUND_BIAS; CHANNELS];
      for (tap, &weight) in weights.iter().enumerate() {
        let at = (first + tap) * CHANNELS;
        let px = &src_row[at..at + CHANNELS];
        for (sum, &sample) in sums.iter_mut().zip(px) {
          *sum += i32::from(sample) * weight;
        }
      }

      let dst_px = &mut out_row[col * CHANNELS..(col + 1) * CHANNELS];
      for (dst, &sum) in dst_px.iter_mut().zip(&sums) {
        *dst = clip8(sum);
      }
    }
  }
}

/// Portable vertical convolution kernel; compiled on every target and
/// bit-exact with PIL.
///
/// For output row `oy`, `coeffs.bounds[oy]` gives the first intermediate
/// row and the tap count, and the matching weights start at
/// `oy * coeffs.ksize` in `coeffs.weights`. Taps for one output pixel sit
/// one intermediate row (`out_w * CHANNELS` bytes) apart. `_src_h` is
/// unused here; it keeps the signature parallel to the NEON kernel.
fn convolve_vertical_scalar(
  inter: &[u8],
  out_w: usize,
  _src_h: usize,
  out: &mut [u8],
  out_h: usize,
  coeffs: &Coeffs,
) {
  let taps_per_row = coeffs.ksize;
  let stride = out_w * CHANNELS;
  for row in 0..out_h {
    let (first, count) = coeffs.bounds[row];
    let w_start = row * taps_per_row;
    let weights = &coeffs.weights[w_start..w_start + count];
    let out_row = &mut out[row * stride..(row + 1) * stride];
    for col in 0..out_w {
      let col_off = col * CHANNELS;

      // One running sum per channel, seeded with the rounding bias.
      let mut sums = [ROUND_BIAS; CHANNELS];
      for (tap, &weight) in weights.iter().enumerate() {
        let at = (first + tap) * stride + col_off;
        let px = &inter[at..at + CHANNELS];
        for (sum, &sample) in sums.iter_mut().zip(px) {
          *sum += i32::from(sample) * weight;
        }
      }

      let dst_px = &mut out_row[col_off..col_off + CHANNELS];
      for (dst, &sum) in dst_px.iter_mut().zip(&sums) {
        *dst = clip8(sum);
      }
    }
  }
}

/// NEON horizontal convolution kernel. The four RGBA channels of a pixel
/// occupy the four lanes of an `int32x4`: each tap widens the 4 source
/// bytes to `i32` lanes and multiply-accumulates them with the broadcast
/// `i32` weight; the finished sum is shifted, saturated and narrowed back
/// to 4 `u8`. Output is bit-identical to [`convolve_axis_scalar`] (same
/// `i32` arithmetic and rounding).
///
/// # Safety
/// 1. The `neon` target feature must be available at runtime. The sole
///    caller ([`convolve_axis`]) only dispatches here after
///    `is_aarch64_feature_detected!("neon")` succeeds, so the intrinsics
///    used below are legal on the executing CPU.
/// 2. `src.len() == src_w * rows * 4`, `out.len() == out_w * rows * 4` and
///    `coeffs.bounds.len() == out_w`; the caller asserts all three
///    unconditionally before dispatch. Together with the
///    [`precompute_coeffs`] invariant `xmin + n <= src_w` (window clamped
///    to the input), every slice access below is in-bounds.
/// 3. Pixel loads and stores are 4 bytes (one RGBA8 pixel) and go through
///    the `&[u8]`/`&mut [u8]` slices themselves, with no raw-pointer
///    aliasing beyond the borrows already granted.
#[cfg(all(target_arch = "aarch64", not(mlxrs_force_scalar)))]
#[target_feature(enable = "neon")]
unsafe fn convolve_axis_neon(
  src: &[u8],
  src_w: usize,
  rows: usize,
  out: &mut [u8],
  out_w: usize,
  coeffs: &Coeffs,
) {
  use std::arch::aarch64::*;
  let taps_per_px = coeffs.ksize;
  let src_stride = src_w * CHANNELS;
  let out_stride = out_w * CHANNELS;
  for row in 0..rows {
    let src_row = &src[row * src_stride..(row + 1) * src_stride];
    let out_row = &mut out[row * out_stride..(row + 1) * out_stride];
    for col in 0..out_w {
      let (first, count) = coeffs.bounds[col];
      let w_start = col * taps_per_px;
      let weights = &coeffs.weights[w_start..w_start + count];

      // All four lanes start at the rounding bias. Value-only intrinsics
      // need no `unsafe` block inside a `#[target_feature]` fn; only the
      // two pointer-based helper calls below are wrapped.
      let mut sums = vdupq_n_s32(ROUND_BIAS);
      for (tap, &weight) in weights.iter().enumerate() {
        let at = (first + tap) * CHANNELS;
        // In-bounds by the window invariant (Safety clause 2); the slice
        // index is still bounds-checked.
        let mut rgba = [0u8; CHANNELS];
        rgba.copy_from_slice(&src_row[at..at + CHANNELS]);
        // SAFETY: clauses 1+3 — `neon` is confirmed by the dispatch gate,
        // and `neon_load_rgba` only reads its own 8-byte stack array.
        let bytes = unsafe { neon_load_rgba(rgba) };
        // u8x8 -> u16x8 -> low u16x4 (R,G,B,A) -> u32x4 -> s32x4.
        let widened = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(bytes))));
        sums = vmlaq_s32(sums, widened, vdupq_n_s32(weight));
      }

      // Arithmetic right shift by PRECISION_BITS (the scalar path's `>>`),
      // then clamp to [0, 255] like `clip8`: `vqmovun_s32` maps negatives
      // to 0 while narrowing to u16, and `vqmovn_u16` saturates anything
      // above 255 while narrowing to u8.
      let descaled = vshrq_n_s32::<{ PRECISION_BITS as i32 }>(sums);
      let clamped16 = vqmovun_s32(descaled);
      let clamped8 = vqmovn_u16(vcombine_u16(clamped16, vdup_n_u16(0)));

      let dst_px = &mut out_row[col * CHANNELS..(col + 1) * CHANNELS];
      // SAFETY: clauses 1+3 — `neon` is confirmed by the dispatch gate;
      // `neon_store_rgba` writes only to its own 8-byte stack array, and
      // `dst_px` is exactly `CHANNELS` bytes (asserted inside the helper).
      unsafe { neon_store_rgba(clamped8, dst_px) };
    }
  }
}

/// Place one RGBA pixel in the low four lanes of a `uint8x8_t`, with the
/// high four lanes zeroed. The pointer-based NEON load lives here so the
/// kernels do not need one themselves.
///
/// # Safety
/// 1. `neon` must be available at runtime (the kernels are reached only
///    after the dispatch gate's `is_aarch64_feature_detected!("neon")`).
/// 2. The load reads exactly 8 bytes from an 8-byte stack array — fully
///    in-bounds.
#[cfg(all(target_arch = "aarch64", not(mlxrs_force_scalar)))]
#[target_feature(enable = "neon")]
unsafe fn neon_load_rgba(px4: [u8; CHANNELS]) -> std::arch::aarch64::uint8x8_t {
  use std::arch::aarch64::*;
  // Zero-pad the pixel to 8 bytes so the single 8-byte `vld1_u8` touches
  // only initialized stack memory.
  let mut padded = [0u8; 8];
  padded[..CHANNELS].copy_from_slice(&px4);
  // SAFETY: clauses 1+2 — `vld1_u8` reads 8 bytes from `padded`
  // (`[u8; 8]`), all initialized and in-bounds; `neon` is confirmed by the
  // dispatch gate.
  unsafe { vld1_u8(padded.as_ptr()) }
}

/// Write the low four lanes of a `uint8x8_t` to a one-pixel (4-byte) RGBA
/// output slice; the high four lanes are discarded.
///
/// # Safety
/// 1. `neon` must be available at runtime (see [`neon_load_rgba`]).
/// 2. `out.len() == 4` (one RGBA pixel) — the kernels slice exactly
///    `CHANNELS` bytes.
///
/// # Panics
/// Panics if `out.len() != CHANNELS`.
#[cfg(all(target_arch = "aarch64", not(mlxrs_force_scalar)))]
#[target_feature(enable = "neon")]
unsafe fn neon_store_rgba(v: std::arch::aarch64::uint8x8_t, out: &mut [u8]) {
  use std::arch::aarch64::*;
  assert_eq!(
    out.len(),
    CHANNELS,
    "neon_store_rgba: out must be one RGBA pixel"
  );
  // Spill all 8 lanes to a stack buffer, then keep just the pixel bytes.
  let mut lanes = [0u8; 8];
  // SAFETY: clauses 1+2 — `vst1_u8` writes 8 bytes into `lanes`
  // (`[u8; 8]`), in-bounds; `neon` is confirmed by the dispatch gate.
  unsafe { vst1_u8(lanes.as_mut_ptr(), v) };
  let (pixel, _padding) = lanes.split_at(CHANNELS);
  out.copy_from_slice(pixel);
}

/// NEON vertical convolution kernel. Uses the same one-pixel-per-vector
/// scheme as [`convolve_axis_neon`] (four channels in four `i32` lanes),
/// except that successive taps are a full intermediate row apart.
/// Bit-identical to [`convolve_vertical_scalar`].
///
/// # Safety
/// 1. `neon` must be available at runtime — the caller
///    ([`convolve_vertical`]) gates on
///    `is_aarch64_feature_detected!("neon")`.
/// 2. `inter.len() == out_w * src_h * 4`, `out.len() == out_w * out_h * 4`
///    and `coeffs.bounds.len() == out_h` — asserted by the caller.
///    Together with `ymin + n <= src_h` from [`precompute_coeffs`], every
///    4-byte read from `inter` is in-bounds.
/// 3. Same 4-byte load/store contract as [`convolve_axis_neon`].
#[cfg(all(target_arch = "aarch64", not(mlxrs_force_scalar)))]
#[target_feature(enable = "neon")]
unsafe fn convolve_vertical_neon(
  inter: &[u8],
  out_w: usize,
  _src_h: usize,
  out: &mut [u8],
  out_h: usize,
  coeffs: &Coeffs,
) {
  use std::arch::aarch64::*;
  let taps_per_row = coeffs.ksize;
  let stride = out_w * CHANNELS;
  for row in 0..out_h {
    let (first, count) = coeffs.bounds[row];
    let w_start = row * taps_per_row;
    let weights = &coeffs.weights[w_start..w_start + count];
    let out_row = &mut out[row * stride..(row + 1) * stride];
    for col in 0..out_w {
      let col_off = col * CHANNELS;

      // All four lanes start at the rounding bias.
      let mut sums = vdupq_n_s32(ROUND_BIAS);
      for (tap, &weight) in weights.iter().enumerate() {
        let at = (first + tap) * stride + col_off;
        // In-bounds by the window invariant (Safety clause 2); the slice
        // index is still bounds-checked.
        let mut rgba = [0u8; CHANNELS];
        rgba.copy_from_slice(&inter[at..at + CHANNELS]);
        // SAFETY: clauses 1+3 — see `neon_load_rgba`'s contract.
        let bytes = unsafe { neon_load_rgba(rgba) };
        // u8x8 -> u16x8 -> low u16x4 (R,G,B,A) -> u32x4 -> s32x4.
        let widened = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(bytes))));
        sums = vmlaq_s32(sums, widened, vdupq_n_s32(weight));
      }

      // Shift out the fixed-point fraction, then saturate-narrow
      // s32 -> u16 -> u8 (negatives become 0, values above 255 become 255).
      let descaled = vshrq_n_s32::<{ PRECISION_BITS as i32 }>(sums);
      let clamped16 = vqmovun_s32(descaled);
      let clamped8 = vqmovn_u16(vcombine_u16(clamped16, vdup_n_u16(0)));

      let dst_px = &mut out_row[col_off..col_off + CHANNELS];
      // SAFETY: clauses 1+3 — see `neon_store_rgba`'s contract.
      unsafe { neon_store_rgba(clamped8, dst_px) };
    }
  }
}

#[cfg(test)]
mod tests;