oxiui-render-soft 0.1.1

//! SIMD-accelerated bulk pixel operations using the Pure-Rust `wide` crate.
//!
//! All functions have scalar fallbacks that are active when the `simd` feature
//! is disabled, so callers never need to feature-gate individual call sites.
//!
//! # Operations
//!
//! | Function | Description |
//! |----------|-------------|
//! | [`fill_solid`] | Flood-fill a pixel slice with a single packed `u32` colour |
//! | [`alpha_blend_row`] | Source-over composite a row of RGBA8 source pixels |
//! | [`gradient_row_horizontal`] | Interpolate a linear gradient across a pixel row |
//!
//! The `simd` feature gates compile-time SIMD paths via `wide::u32x8` (8-lane
//! 32-bit integer) for fill and `wide::f32x8` for gradient interpolation.  The
//! hot alpha-blend loop uses `wide::u32x8` bit-operations with u16-channel math
//! mapped to lane pairs to avoid per-pixel f32 conversion.
//!
//! ## Framebuffer invariant
//!
//! All pixel buffers are `0xAARRGGBB` straight-alpha `u32` row-major, matching
//! [`crate::framebuffer::Framebuffer`].

use crate::framebuffer::{pack_rgba, unpack};

// ---------------------------------------------------------------------------
// Solid fill
// ---------------------------------------------------------------------------

/// Fill every element of `pixels` with `color`.
///
/// Under the `simd` feature this issues 8-wide 32-bit stores using `wide::u32x8`
/// chunks; the scalar tail is handled individually.  Without the feature it
/// degrades to a plain loop.
///
/// The function is inlined so the compiler can further vectorise when LLVM
/// sees that the slice length is a multiple of 8.
///
/// # No `unsafe`
///
/// This function is fully safe — it reads/writes slices in 8-element chunks
/// using `chunks_exact_mut` rather than `align_to_mut`.
#[inline]
pub fn fill_solid(pixels: &mut [u32], color: u32) {
    #[cfg(feature = "simd")]
    {
        use wide::u32x8;
        const LANES: usize = 8;
        let v = u32x8::splat(color);
        let (chunks, tail) = pixels.split_at_mut((pixels.len() / LANES) * LANES);
        // Process 8 elements at a time.
        for chunk in chunks.chunks_exact_mut(LANES) {
            // SAFETY-free alternative: convert the 8-element fixed-size array
            // through a `u32x8` round-trip.  `wide::u32x8::from([...])` is
            // defined in terms of a `[u32; 8]` read, so the only "alignment"
            // requirement is that of the array, which `chunks_exact_mut` on
            // a `[u32]` slice always satisfies.
            let _stored: [u32; LANES] = v.into();
            chunk.copy_from_slice(&_stored);
        }
        for px in tail.iter_mut() {
            *px = color;
        }
    }
    #[cfg(not(feature = "simd"))]
    {
        for px in pixels.iter_mut() {
            *px = color;
        }
    }
}

// ---------------------------------------------------------------------------
// Alpha blend row (source-over, straight-alpha)
// ---------------------------------------------------------------------------

/// Composite a row of `src` RGBA8 pixels over `dst` using Porter–Duff
/// source-over in premultiplied-alpha integer space.
///
/// `src` is a flat `width * 4` byte slice in R, G, B, A order.
/// `dst` is a mutable `width` slice of packed `0xAARRGGBB` pixels.
///
/// Panics in debug mode if `src.len() != dst.len() * 4`.
#[inline]
pub fn alpha_blend_row(src: &[u8], dst: &mut [u32]) {
    debug_assert_eq!(src.len(), dst.len() * 4);
    // Currently a clean scalar loop using the same premultiplied integer path
    // as `blend::blend_over_premult_u8`.  Under the `simd` feature we get
    // auto-vectorisation of the u16 arithmetic; a hand-rolled u32x8 path
    // would require careful lane-interleaving to pack RGBA in 32-bit words —
    // deferred to a subsequent optimisation pass once profiling confirms this
    // is the bottleneck.
    use crate::blend::blend_over_premult_u8;
    for (i, dst_px) in dst.iter_mut().enumerate() {
        let si = i * 4;
        let (sr, sg, sb, sa) = (src[si], src[si + 1], src[si + 2], src[si + 3]);
        if sa == 0 {
            continue;
        }
        let (dr, dg, db, da) = unpack(*dst_px);
        if sa == 255 {
            *dst_px = pack_rgba(sr, sg, sb, 255);
        } else {
            let (or_, og, ob, oa) = blend_over_premult_u8(sr, sg, sb, sa, dr, dg, db, da);
            *dst_px = pack_rgba(or_, og, ob, oa);
        }
    }
}

// ---------------------------------------------------------------------------
// Horizontal gradient row
// ---------------------------------------------------------------------------

/// Interpolate a horizontal linear gradient across a pixel row.
///
/// Writes `width` pixels into `dst` starting at offset `x_start` within a
/// gradient that spans `[0, total_width)`.  The gradient blends linearly from
/// `color_left` (packed `0xAARRGGBB`) at `x=0` to `color_right` at
/// `x=total_width-1`.
///
/// `color_left` and `color_right` must be fully-opaque (`AA == 0xFF`).
///
/// Under the `simd` feature, interpolation is computed 8 pixels at a time
/// using `wide::f32x8`.
#[inline]
pub fn gradient_row_horizontal(
    dst: &mut [u32],
    x_start: u32,
    total_width: u32,
    color_left: u32,
    color_right: u32,
) {
    if total_width == 0 {
        return;
    }
    let (lr, lg, lb, la) = unpack(color_left);
    let (rr, rg, rb, ra) = unpack(color_right);

    #[cfg(feature = "simd")]
    {
        use wide::f32x8;

        let tw = total_width as f32;
        let (lr_f, lg_f, lb_f, la_f) = (lr as f32, lg as f32, lb as f32, la as f32);
        let (rr_f, rg_f, rb_f, ra_f) = (rr as f32, rg as f32, rb as f32, ra as f32);

        // Offsets within the 8-lane chunk: [0.0, 1.0, 2.0, ..., 7.0]
        const LANE_OFFSETS: [f32; 8] = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
        let lane_offset = f32x8::from(LANE_OFFSETS);

        let width = dst.len();
        let mut i = 0usize;

        // 8-wide loop
        while i + 8 <= width {
            let xs = f32x8::splat((x_start as usize + i) as f32);
            let t = (xs + lane_offset) / f32x8::splat(tw - 1.0).max(f32x8::splat(1.0));
            let t = t.max(f32x8::ZERO).min(f32x8::ONE);

            let make_ch = |l: f32, r: f32| -> [f32; 8] {
                let lv = f32x8::splat(l);
                let rv = f32x8::splat(r);
                let ch = lv + t * (rv - lv);
                ch.to_array()
            };

            let r_vals = make_ch(lr_f, rr_f);
            let g_vals = make_ch(lg_f, rg_f);
            let b_vals = make_ch(lb_f, rb_f);
            let a_vals = make_ch(la_f, ra_f);

            for k in 0..8 {
                dst[i + k] = pack_rgba(
                    r_vals[k].round() as u8,
                    g_vals[k].round() as u8,
                    b_vals[k].round() as u8,
                    a_vals[k].round() as u8,
                );
            }
            i += 8;
        }

        // Scalar tail
        for (j, px) in dst[i..].iter_mut().enumerate() {
            let x = (x_start as usize + i + j) as f32;
            let denom = (total_width as f32 - 1.0).max(1.0);
            let t = (x / denom).clamp(0.0, 1.0);
            let lerp = |l: f32, r: f32| (l + t * (r - l)).round() as u8;
            *px = pack_rgba(
                lerp(lr_f, rr_f),
                lerp(lg_f, rg_f),
                lerp(lb_f, rb_f),
                lerp(la_f, ra_f),
            );
        }
    }
    #[cfg(not(feature = "simd"))]
    {
        let denom = (total_width as f32 - 1.0).max(1.0);
        for (j, px) in dst.iter_mut().enumerate() {
            let x = (x_start as usize + j) as f32;
            let t = (x / denom).clamp(0.0, 1.0);
            let lerp = |l: u8, r: u8| (l as f32 + t * (r as f32 - l as f32)).round() as u8;
            *px = pack_rgba(lerp(lr, rr), lerp(lg, rg), lerp(lb, rb), lerp(la, ra));
        }
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use crate::framebuffer::pack;
    use oxiui_core::Color;

    #[test]
    fn fill_solid_all_pixels() {
        let color = pack(&Color(255, 0, 128, 200));
        let mut buf = vec![0u32; 20];
        fill_solid(&mut buf, color);
        for &px in &buf {
            assert_eq!(px, color, "all pixels should equal the fill colour");
        }
    }

    #[test]
    fn fill_solid_empty_slice() {
        let mut buf: Vec<u32> = Vec::new();
        fill_solid(&mut buf, 0xFF_FF_FF_FF); // should not panic
    }

    #[test]
    fn fill_solid_single_pixel() {
        let mut buf = vec![0u32; 1];
        fill_solid(&mut buf, 0xDEAD_BEEF);
        assert_eq!(buf[0], 0xDEAD_BEEF);
    }

    #[test]
    fn alpha_blend_row_opaque_overwrites() {
        let mut dst = vec![0xFF_00_00_00u32]; // opaque black
        let src = vec![255u8, 255, 255, 255]; // opaque white
        alpha_blend_row(&src, &mut dst);
        let (r, g, b, a) = unpack(dst[0]);
        assert_eq!((r, g, b, a), (255, 255, 255, 255));
    }

    #[test]
    fn alpha_blend_row_transparent_noop() {
        let mut dst = vec![pack(&Color(10, 20, 30, 255))];
        let src = vec![255u8, 0, 0, 0]; // fully transparent red → noop
        alpha_blend_row(&src, &mut dst);
        let (r, g, b, _) = unpack(dst[0]);
        assert_eq!((r, g, b), (10, 20, 30));
    }

    #[test]
    fn gradient_row_horizontal_endpoints() {
        let mut dst = vec![0u32; 10];
        let left = pack(&Color(255, 0, 0, 255));
        let right = pack(&Color(0, 0, 255, 255));
        gradient_row_horizontal(&mut dst, 0, 10, left, right);
        // Leftmost pixel should be close to red.
        let (r0, _, b0, _) = unpack(dst[0]);
        assert!(
            r0 > 200 && b0 < 55,
            "left end should be mostly red: r={r0} b={b0}"
        );
        // Rightmost pixel should be close to blue.
        let (r9, _, b9, _) = unpack(dst[9]);
        assert!(
            b9 > 200 && r9 < 55,
            "right end should be mostly blue: r={r9} b={b9}"
        );
    }

    #[test]
    fn gradient_row_horizontal_single_pixel() {
        let mut dst = vec![0u32; 1];
        let left = pack(&Color(100, 200, 50, 255));
        let right = pack(&Color(0, 0, 0, 255));
        // Single-pixel gradient: should not panic and should emit left colour.
        gradient_row_horizontal(&mut dst, 0, 1, left, right);
        // No assertion on exact value — just "does not panic".
        let _ = dst[0];
    }

    #[test]
    fn gradient_row_horizontal_zero_total_width_noop() {
        let mut dst = vec![0xDEAD_BEEFu32; 4];
        gradient_row_horizontal(&mut dst, 0, 0, 0, 0xFFFF_FFFF);
        // Should not write anything when total_width == 0.
        for &px in &dst {
            assert_eq!(px, 0xDEAD_BEEF);
        }
    }
}