fast-ssim2 0.8.1

//! SIMD-optimized recursive Gaussian blur.
//!
//! Uses archmage/magetypes for cross-platform SIMD.
//! The horizontal pass dispatches via `#[autoversion]` so that `mul_add`
//! compiles to FMA. The vertical pass uses `#[magetypes]` with
//! `GenericF32x8<Token>` for unified multi-platform SIMD processing of all
//! column groups per height traversal.
use archmage::autoversion;
use archmage::incant;
use archmage::magetypes;
use magetypes::simd::generic::f32x8 as GenericF32x8;

/// Filter coefficients for the recursive Gaussian.
///
/// The body is textually included from `$OUT_DIR/recursive_gaussian.rs`
/// (presumably emitted by the crate's build script — confirm). The rest of
/// this file reads `consts::RADIUS`, the horizontal `consts::MUL_*`
/// coefficients and the vertical `consts::VERT_MUL_*` coefficients from it.
mod consts {
    // NOTE(review): presumably needed because the generated float literals are
    // long and not digit-grouped — confirm against the generated file.
    #![allow(clippy::unreadable_literal)]
    include!(concat!(env!("OUT_DIR"), "/recursive_gaussian.rs"));
}

/// Reusable scratch state for the two-pass recursive Gaussian blur.
///
/// Owns the intermediate plane written by the horizontal pass and the IIR
/// state consumed by the vertical pass, so repeated blurs through the same
/// context reuse their allocations instead of reallocating per call.
pub struct SimdGaussian {
    /// Output of the horizontal pass and input of the vertical pass. Only the
    /// first `width * height` elements are used by a given blur.
    temp_buffer: Vec<f32>,
    /// Largest `width * height` that `temp_buffer` has been resized to so far.
    max_size: usize,
    /// IIR state for vertical pass: 6 stacked sub-slices of `groups * LANES`
    /// floats each (prev_1, prev_3, prev_5, prev2_1, prev2_3, prev2_5).
    ///
    /// Hoisted out of the per-call SIMD inner function so the 6 allocations
    /// no longer happen on every plane blur. With ssim2's 5 blurs per scale
    /// across 6 scales, that's ~180 small allocations per frame eliminated.
    /// The state is zeroed at the start of every blur because the IIR
    /// initializes to zero — we don't preserve state between calls.
    vert_state: Vec<f32>,
    /// Largest length that `vert_state` has been resized to so far.
    vert_state_size: usize,
}

/// Number of columns handled per SIMD group in the vertical pass; used to size
/// `SimdGaussian::vert_state`. Must stay equal to the `LANES` constant inside
/// `vertical_pass_inner`, which splits the state buffer using the same stride.
const VERT_STATE_LANES: usize = 8;

impl SimdGaussian {
    /// Upper bound, in `f32` elements, on the capacity that [`Self::new`]
    /// reserves eagerly (64 Ki elements = 256 KiB). Anything larger is
    /// allocated lazily the first time a blur actually needs it.
    const MAX_EAGER_CAPACITY: usize = 1 << 16;

    /// Create a new SIMD Gaussian blur context.
    ///
    /// `max_width` is treated as a hint; the temporary buffer grows on demand
    /// in [`Self::shrink_to`] and [`Self::blur_single_plane_into`], so an
    /// underestimate only costs one reallocation. The hint is intentionally
    /// not multiplied by an assumed maximum height: previously this constructor
    /// pre-allocated `max_width * 4096` floats unconditionally, which both
    /// wasted memory for short strips (e.g. a 16384-wide image with a 64-row
    /// working buffer would allocate 256 MiB upfront for nothing) and would
    /// silently overflow `usize` on 32-bit targets when `max_width` exceeded
    /// `usize::MAX / 4096`.
    ///
    /// This constructor never panics on a large hint: the eager reservation is
    /// clamped to [`Self::MAX_EAGER_CAPACITY`] elements.
    pub fn new(max_width: usize) -> Self {
        // Clamp to a genuinely small value. A cap of `usize::MAX / 4` elements
        // is no cap at all: at four bytes per `f32` it requests ~`usize::MAX`
        // bytes, which makes `Vec::with_capacity` panic with "capacity
        // overflow" instead of protecting the caller.
        let initial_capacity = max_width.min(Self::MAX_EAGER_CAPACITY);
        Self {
            temp_buffer: Vec::with_capacity(initial_capacity),
            max_size: 0,
            vert_state: Vec::new(),
            vert_state_size: 0,
        }
    }

    /// Length of the vertical-pass state buffer for a plane `width` columns
    /// wide: 6 IIR state arrays of `(width / 8) * 8` floats each.
    ///
    /// Returns `None` if that length does not fit in `usize`.
    fn vert_state_len(width: usize) -> Option<usize> {
        // Rounding down to a multiple of the lane count cannot overflow.
        let simd_columns = (width / VERT_STATE_LANES) * VERT_STATE_LANES;
        simd_columns.checked_mul(6)
    }

    /// Grow `temp_buffer` to hold at least `needed` floats; never shrinks.
    fn grow_temp_buffer(&mut self, needed: usize) {
        if needed > self.max_size {
            self.temp_buffer.resize(needed, 0.0);
            self.max_size = needed;
        }
    }

    /// Grow `vert_state` to hold at least `needed` floats; never shrinks.
    fn grow_vert_state(&mut self, needed: usize) {
        if needed > self.vert_state_size {
            self.vert_state.resize(needed, 0.0);
            self.vert_state_size = needed;
        }
    }

    /// Ensure the internal buffers are large enough for a `width * height`
    /// plane.
    ///
    /// Despite the name, this only ever grows the buffers; it never releases
    /// memory.
    ///
    /// Returns silently without resizing if the dimensions overflow `usize` or
    /// fit in the existing buffers. The actual blur entry point
    /// ([`Self::blur_single_plane_into`]) re-checks and panics with a clearer
    /// message on overflow, matching the previous (implicit) behavior on
    /// 64-bit targets but making the failure mode explicit on 32-bit.
    pub fn shrink_to(&mut self, width: usize, height: usize) {
        let Some(needed) = width.checked_mul(height) else {
            return;
        };
        self.grow_temp_buffer(needed);
        if let Some(state_len) = Self::vert_state_len(width) {
            self.grow_vert_state(state_len);
        }
    }

    /// Blur `plane` (row-major, `width * height` floats) into a freshly
    /// allocated vector of the same size.
    ///
    /// # Panics
    ///
    /// Panics if `width * height` overflows `usize` or if `plane.len()` is not
    /// `width * height`.
    // NOTE(review): presumably unused in some build configurations — confirm
    // before removing the allow.
    #[allow(dead_code)]
    pub fn blur_single_plane(&mut self, plane: &[f32], width: usize, height: usize) -> Vec<f32> {
        // Checked so that an overflowing product cannot wrap in release builds
        // and allocate a wrongly sized output before the real check runs.
        let size = width
            .checked_mul(height)
            .expect("SimdGaussian: width * height overflows usize");
        let mut out = vec![0.0; size];
        self.blur_single_plane_into(plane, &mut out, width, height);
        out
    }

    /// Blur `plane` into `out`; both are row-major and must hold exactly
    /// `width * height` floats.
    ///
    /// Runs the horizontal pass into the internal temporary buffer, then the
    /// vertical pass from that buffer into `out`. An empty plane (either
    /// dimension zero) is a no-op.
    ///
    /// # Panics
    ///
    /// Panics if `width * height` overflows `usize`, or if `plane.len()` or
    /// `out.len()` differs from `width * height`.
    pub fn blur_single_plane_into(
        &mut self,
        plane: &[f32],
        out: &mut [f32],
        width: usize,
        height: usize,
    ) {
        // checked_mul guards against silent wraparound on 32-bit targets where
        // a malicious caller could otherwise pass dims whose product overflows.
        let size = width
            .checked_mul(height)
            .expect("SimdGaussian: width * height overflows usize");
        // Both passes already require matching lengths; checking up front
        // keeps that contract intact for the empty-plane early return below.
        assert_eq!(
            plane.len(),
            size,
            "SimdGaussian: input plane length must equal width * height"
        );
        assert_eq!(
            size,
            out.len(),
            "SimdGaussian: output length must equal width * height"
        );
        if size == 0 {
            // Nothing to blur. Returning here also avoids splitting the plane
            // into zero-length rows, which panics inside `chunks_exact`.
            return;
        }

        let vert_state_needed = Self::vert_state_len(width)
            .expect("SimdGaussian: vertical state size overflows usize");
        self.grow_temp_buffer(size);
        self.grow_vert_state(vert_state_needed);
        // IIR initialises state to zero on every call.
        self.vert_state[..vert_state_needed].fill(0.0);

        // Horizontal pass: dispatched for FMA
        horizontal_pass(plane, &mut self.temp_buffer[..size], width);

        // Vertical pass: SIMD-dispatched, processes all columns per height traversal
        vertical_pass(
            &self.temp_buffer[..size],
            out,
            &mut self.vert_state[..vert_state_needed],
            width,
            height,
        );
    }
}

// ---------------------------------------------------------------------------
// Horizontal pass — scalar IIR filter, dispatched via #[autoversion] for FMA
// ---------------------------------------------------------------------------

/// Entry point for the horizontal blur pass.
///
/// `input` and `output` are row-major planes of identical length made of
/// `width`-wide rows.
///
/// # Panics
///
/// Panics if the two slices differ in length.
fn horizontal_pass(input: &[f32], output: &mut [f32], width: usize) {
    let (src_len, dst_len) = (input.len(), output.len());
    assert_eq!(src_len, dst_len);
    horizontal_pass_inner(input, output, width);
}

/// Enables FMA on platforms that support it. The body is pure scalar IIR;
/// `#[autoversion]` adds `#[target_feature]` so `mul_add` compiles to FMA.
///
/// This function only forwards to `horizontal_pass_rows`, which is marked
/// `#[inline(always)]` so the row loop is compiled as part of this function.
#[allow(unused_imports)] // archmage dispatch on i686 triggers false positive
#[autoversion]
fn horizontal_pass_inner(input: &[f32], output: &mut [f32], width: usize) {
    horizontal_pass_rows(input, output, width);
}

/// Apply [`horizontal_row`] to every `width`-wide row of `input`, writing the
/// matching row of `output`.
///
/// With the `rayon` feature enabled the rows are processed in parallel;
/// otherwise they are processed sequentially. Rows are independent, so both
/// paths write the same values. Any trailing elements that do not form a full
/// row are left untouched, as `chunks_exact` skips them.
///
/// A `width` of zero describes a plane with no columns and is a no-op.
// `inline(always)` is deliberate: the only caller is the `#[autoversion]`
// wrapper, and the body has to be compiled inside it to pick up its target
// features.
#[inline(always)]
fn horizontal_pass_rows(input: &[f32], output: &mut [f32], width: usize) {
    // `chunks_exact(0)` / `par_chunks_exact(0)` panic, so bail out before
    // splitting into rows when there are no columns at all.
    if width == 0 {
        return;
    }

    #[cfg(feature = "rayon")]
    {
        use rayon::prelude::*;
        input
            .par_chunks_exact(width)
            .zip(output.par_chunks_exact_mut(width))
            .for_each(|(inp, out)| horizontal_row(inp, out, width));
    }

    #[cfg(not(feature = "rayon"))]
    {
        input
            .chunks_exact(width)
            .zip(output.chunks_exact_mut(width))
            .for_each(|(inp, out)| horizontal_row(inp, out, width));
    }
}

/// Run the recursive (IIR) Gaussian along one row.
///
/// Three second-order filters (suffixes 1, 3 and 5) are advanced in lockstep
/// from `1 - RADIUS` up to `width`; each step feeds them the sum of the two
/// samples at `n - RADIUS - 1` and `n + RADIUS - 1`, with positions outside
/// `input` read as zero. For every `n` that lands inside `output`, the sum of
/// the three filter outputs is stored at `output[n]`.
#[inline(always)]
fn horizontal_row(input: &[f32], output: &mut [f32], width: usize) {
    let radius = consts::RADIUS as isize;

    // Samples outside the row contribute zero.
    let tap = |pos: isize| -> f32 {
        if pos < 0 {
            0f32
        } else {
            input.get(pos as usize).copied().unwrap_or(0f32)
        }
    };

    // Per-filter history: `last_*` is y[n-1], `older_*` is y[n-2].
    let (mut last_1, mut last_3, mut last_5) = (0f32, 0f32, 0f32);
    let (mut older_1, mut older_3, mut older_5) = (0f32, 0f32, 0f32);

    // Starting before column 0 lets the filters warm up on the leading taps.
    for n in (1 - radius)..(width as isize) {
        let sum = tap(n - radius - 1) + tap(n + radius - 1);

        // Input term first, then the y[n-2] contribution...
        let acc_1 = consts::MUL_PREV2_1.mul_add(older_1, sum * consts::MUL_IN_1);
        let acc_3 = consts::MUL_PREV2_3.mul_add(older_3, sum * consts::MUL_IN_3);
        let acc_5 = consts::MUL_PREV2_5.mul_add(older_5, sum * consts::MUL_IN_5);

        // ...and finally the y[n-1] contribution.
        let next_1 = consts::MUL_PREV_1.mul_add(last_1, acc_1);
        let next_3 = consts::MUL_PREV_3.mul_add(last_3, acc_3);
        let next_5 = consts::MUL_PREV_5.mul_add(last_5, acc_5);

        // Shift the history window forward by one sample.
        older_1 = last_1;
        older_3 = last_3;
        older_5 = last_5;
        last_1 = next_1;
        last_3 = next_3;
        last_5 = next_5;

        if n >= 0 {
            if let Some(slot) = output.get_mut(n as usize) {
                *slot = next_1 + next_3 + next_5;
            }
        }
    }
}

// ---------------------------------------------------------------------------
// Vertical pass — SIMD IIR filter processing all columns per height traversal
// ---------------------------------------------------------------------------

/// Entry point for the vertical blur pass.
///
/// Checks that `input` and `output` have the same length and then hands off to
/// `vertical_pass_inner` through `incant!` with the listed tokens. `state` is
/// the caller-owned IIR scratch buffer.
///
/// # Panics
///
/// Panics if `input` and `output` differ in length.
fn vertical_pass(
    input: &[f32],
    output: &mut [f32],
    state: &mut [f32],
    width: usize,
    height: usize,
) {
    let (src_len, dst_len) = (input.len(), output.len());
    assert_eq!(src_len, dst_len);
    incant!(
        vertical_pass_inner(input, output, state, width, height),
        [v3, neon, wasm128, scalar]
    )
}

/// Generic vertical pass — processes 8 columns at a time on all platforms.
///
/// Uses flat f32 state arrays so all column groups are processed per row,
/// avoiding repeated height traversals (which kills cache performance).
///
/// `state` is a caller-supplied buffer of length `6 * (width / LANES) * LANES`,
/// zeroed before the call. We split it into six sub-slices to back the IIR
/// state vectors (prev_1, prev_3, prev_5, prev2_1, prev2_3, prev2_5) — owned
/// by `SimdGaussian` so we don't reallocate them on every blur call.
///
/// `input` and `output` are row-major planes with `width`-wide rows and
/// `height` rows. Columns past the last full group of `LANES` are handled by
/// `vertical_pass_scalar_columns` at the end.
///
/// Panics (via slice splitting/indexing) if `state` is shorter than described
/// above or if the planes are shorter than `width * height`.
// NOTE(review): `Token` is not declared in this file; presumably
// `#[magetypes]` substitutes it once per listed target — confirm.
#[magetypes(v3, neon, wasm128, scalar)]
fn vertical_pass_inner(
    token: Token,
    input: &[f32],
    output: &mut [f32],
    state: &mut [f32],
    width: usize,
    height: usize,
) {
    #[allow(non_camel_case_types)]
    type f32x8 = GenericF32x8<Token>;
    const LANES: usize = 8;

    let big_n = consts::RADIUS as isize;
    // Number of full 8-column groups; the remaining columns go to the scalar tail.
    let groups = width / LANES;

    // SIMD constants
    let mul_in_1 = f32x8::splat(token, consts::VERT_MUL_IN_1);
    let mul_in_3 = f32x8::splat(token, consts::VERT_MUL_IN_3);
    let mul_in_5 = f32x8::splat(token, consts::VERT_MUL_IN_5);
    let mul_prev_1 = f32x8::splat(token, consts::VERT_MUL_PREV_1);
    let mul_prev_3 = f32x8::splat(token, consts::VERT_MUL_PREV_3);
    let mul_prev_5 = f32x8::splat(token, consts::VERT_MUL_PREV_5);
    let zeroes = f32x8::zero(token);

    // State arrays: 6 IIR state variables x (groups x LANES) floats each.
    // Caller pre-zeroed and pre-sized — split the flat buffer in place.
    let state_size = groups * LANES;
    let (prev_1, rest) = state.split_at_mut(state_size);
    let (prev_3, rest) = rest.split_at_mut(state_size);
    let (prev_5, rest) = rest.split_at_mut(state_size);
    let (prev2_1, rest) = rest.split_at_mut(state_size);
    let (prev2_3, rest) = rest.split_at_mut(state_size);
    let (prev2_5, _) = rest.split_at_mut(state_size);

    // Start above row 0 so the filters consume the leading taps before the
    // first output row is produced.
    let mut n = (-big_n) + 1;
    while n < height as isize {
        // The two input rows feeding this step.
        let top = n - big_n - 1;
        let bottom = n + big_n - 1;

        // Rows outside the plane contribute zeros; their start offsets are
        // then unused placeholders.
        let top_valid = top >= 0 && (top as usize) < height;
        let bottom_valid = bottom >= 0 && (bottom as usize) < height;
        let top_row_start = if top_valid { top as usize * width } else { 0 };
        let bottom_row_start = if bottom_valid {
            bottom as usize * width
        } else {
            0
        };

        // Advance every column group by one row before moving to the next row.
        for g in 0..groups {
            let col = g * LANES;

            let top_vals = if top_valid {
                let idx = top_row_start + col;
                f32x8::from_array(token, input[idx..][..LANES].try_into().unwrap())
            } else {
                zeroes
            };

            let bottom_vals = if bottom_valid {
                let idx = bottom_row_start + col;
                f32x8::from_array(token, input[idx..][..LANES].try_into().unwrap())
            } else {
                zeroes
            };

            let sum = top_vals + bottom_vals;

            // Load this group's filter history from the flat state arrays.
            let p1 = f32x8::from_array(token, prev_1[col..][..LANES].try_into().unwrap());
            let p3 = f32x8::from_array(token, prev_3[col..][..LANES].try_into().unwrap());
            let p5 = f32x8::from_array(token, prev_5[col..][..LANES].try_into().unwrap());
            let p21 = f32x8::from_array(token, prev2_1[col..][..LANES].try_into().unwrap());
            let p23 = f32x8::from_array(token, prev2_3[col..][..LANES].try_into().unwrap());
            let p25 = f32x8::from_array(token, prev2_5[col..][..LANES].try_into().unwrap());

            // Feedback term: prev * mul_prev + prev2.
            let out1 = p1.mul_add(mul_prev_1, p21);
            let out3 = p3.mul_add(mul_prev_3, p23);
            let out5 = p5.mul_add(mul_prev_5, p25);

            // New output: sum * mul_in - feedback.
            let out1 = sum.mul_add(mul_in_1, -out1);
            let out3 = sum.mul_add(mul_in_3, -out3);
            let out5 = sum.mul_add(mul_in_5, -out5);

            // Update state: prev2 = prev, prev = out
            prev2_1[col..col + LANES].copy_from_slice(&p1.to_array());
            prev2_3[col..col + LANES].copy_from_slice(&p3.to_array());
            prev2_5[col..col + LANES].copy_from_slice(&p5.to_array());
            prev_1[col..col + LANES].copy_from_slice(&out1.to_array());
            prev_3[col..col + LANES].copy_from_slice(&out3.to_array());
            prev_5[col..col + LANES].copy_from_slice(&out5.to_array());

            // Steps with n < 0 only update the state; nothing is written for them.
            if n >= 0 {
                let result = out1 + out3 + out5;
                let out_start = n as usize * width + col;
                output[out_start..out_start + LANES].copy_from_slice(&result.to_array());
            }
        }

        n += 1;
    }

    // Scalar remainder for leftover columns
    vertical_pass_scalar_columns(input, output, width, height, groups * LANES);
}

/// Scalar vertical pass over the columns `start_x..width`, one column at a time.
///
/// Called at the end of `vertical_pass_inner` with `start_x` set to the first
/// column not covered by a full 8-wide group. Each column runs the same three
/// IIR filters as the SIMD path, starting from zeroed state; rows outside
/// `0..height` read as zero and only rows `n >= 0` are written to `output`.
fn vertical_pass_scalar_columns(
    input: &[f32],
    output: &mut [f32],
    width: usize,
    height: usize,
    start_x: usize,
) {
    let radius = consts::RADIUS as isize;

    for x in start_x..width {
        // Fetch column `x` at `row`, or zero when the row is outside the plane.
        let sample = |row: isize| -> f32 {
            if row >= 0 && (row as usize) < height {
                input[row as usize * width + x]
            } else {
                0.0f32
            }
        };

        // Fresh filter history for every column: `last_*` = y[n-1], `older_*` = y[n-2].
        let (mut last_1, mut last_3, mut last_5) = (0.0f32, 0.0f32, 0.0f32);
        let (mut older_1, mut older_3, mut older_5) = (0.0f32, 0.0f32, 0.0f32);

        for n in (1 - radius)..(height as isize) {
            let sum = sample(n - radius - 1) + sample(n + radius - 1);

            // Feedback term: y[n-1] * mul_prev + y[n-2].
            let feedback_1 = last_1.mul_add(consts::VERT_MUL_PREV_1, older_1);
            let feedback_3 = last_3.mul_add(consts::VERT_MUL_PREV_3, older_3);
            let feedback_5 = last_5.mul_add(consts::VERT_MUL_PREV_5, older_5);

            // New output: sum * mul_in - feedback.
            let next_1 = sum.mul_add(consts::VERT_MUL_IN_1, -feedback_1);
            let next_3 = sum.mul_add(consts::VERT_MUL_IN_3, -feedback_3);
            let next_5 = sum.mul_add(consts::VERT_MUL_IN_5, -feedback_5);

            // Shift the history window forward by one row.
            older_1 = last_1;
            older_3 = last_3;
            older_5 = last_5;
            last_1 = next_1;
            last_3 = next_3;
            last_5 = next_5;

            if n >= 0 {
                output[n as usize * width + x] = next_1 + next_3 + next_5;
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Construction must not size the working buffer: an earlier version
    /// allocated `max_width * 4096` floats up front, so a 1024-wide hint
    /// committed 16 MiB before any blur ran.
    #[test]
    fn new_does_not_eagerly_allocate_height_hint() {
        let blur = SimdGaussian::new(1024);
        // Capacity may be reserved up to the hint, but the length stays zero,
        // so no pages are touched until a blur is requested.
        assert_eq!(blur.temp_buffer.len(), 0);
        assert_eq!(blur.max_size, 0);
    }

    /// Dimensions whose product overflows `usize` must be ignored by
    /// `shrink_to` rather than panic; rejecting them is the blur path's job.
    #[test]
    fn shrink_to_ignores_overflowing_dims() {
        let mut blur = SimdGaussian::new(0);
        blur.shrink_to(usize::MAX, 2);
        assert_eq!(blur.max_size, 0);
    }

    /// `shrink_to` enlarges the temporary buffer to the requested plane size.
    #[test]
    fn shrink_to_grows_on_demand() {
        const SIDE: usize = 64;
        let mut blur = SimdGaussian::new(0);
        blur.shrink_to(SIDE, SIDE);
        assert!(blur.max_size >= SIDE * SIDE);
        assert_eq!(blur.temp_buffer.len(), SIDE * SIDE);
    }

    /// End-to-end: a context built with a zero hint still services a small
    /// blur by growing its buffers inside `blur_single_plane_into`.
    #[test]
    fn blur_runs_after_lazy_construction() {
        const SIDE: usize = 16;
        let mut blur = SimdGaussian::new(0);
        let src = vec![0.5f32; SIDE * SIDE];
        let mut dst = vec![0.0f32; SIDE * SIDE];
        blur.blur_single_plane_into(&src, &mut dst, SIDE, SIDE);
        // Only finiteness is asserted; exact values depend on the generated
        // filter coefficients.
        assert!(dst.iter().all(|px| px.is_finite()));
    }

    /// The blur entry point refuses dimensions whose product overflows.
    #[test]
    #[should_panic(expected = "width * height overflows usize")]
    fn blur_panics_on_overflowing_dims() {
        let mut blur = SimdGaussian::new(0);
        let src: [f32; 0] = [];
        let mut dst: [f32; 0] = [];
        blur.blur_single_plane_into(&src, &mut dst, usize::MAX, 2);
    }
}