rasterrocket_render/simd/
composite.rs

//! AA compositing: Porter-Duff source-over with per-pixel coverage.
//!
//! Single public entry: [`composite_aa_rgb8_opaque`] — RGB pixels with no
//! alpha plane (opaque destination).  The compositing formula simplifies to
//! `c = div255((255-a_src)*c_dst + a_src*c_src)` per channel.  The arithmetic
//! is expressed on `[u16; LANE]` lanes so that LLVM can auto-vectorize it into
//! AVX2/AVX-512 when the binary is compiled with `-C target-cpu=native`.
//!
//! # Why `[u16; LANE]` instead of explicit intrinsics
//!
//! tiny-skia's lowp pipeline uses the same technique (see `src/wide/u16x16_t.rs`
//! and its comment: "No need for explicit AVX2 SIMD; `-C target-cpu=native` will
//! autovectorize better than us").  A plain `[u16; 16]` array with straight
//! arithmetic on `u16` gives LLVM the freedom to choose the best instruction
//! width (128/256/512-bit) for the target, without us hard-coding a specific ISA.
//!
//! # div255 approximation
//!
//! `(v + 255) >> 8` approximates `v / 255` for `v` in `[0, 255²]` with at most
//! ±1 LSB of error, and is exact whenever `v` is a multiple of 255.
//! This matches the tiny-skia lowp `div255` and is cheaper to auto-vectorize
//! than the higher-precision `(v + (v>>8) + 0x80) >> 8` form.
22
// Number of pixels per SIMD-style lane chunk.  16 × u16 = 256 bits — one AVX2
// vector per colour channel.  The value is a chunking hint only: LLVM remains
// free to pick a narrower or wider vector width (e.g. 512-bit AVX-512) for the
// actual target.
const LANE: usize = 16;
27
/// Approximate `v / 255` for `v` in `[0, 255²]`.  Maximum error: ±1 LSB.
///
/// Computes `(v + 255) >> 8`, i.e. `ceil(v / 256)`.  Within the stated range:
///
/// - the intermediate `v + 255` is at most `255² + 255 = 65 280`, so it cannot
///   overflow `u16`;
/// - the result is at most 255;
/// - the result is exact whenever `v` is a multiple of 255, which is why
///   blending with an alpha of 0 or 255 reproduces the destination or the
///   source byte-for-byte.
///
/// Callers must keep `v` within `[0, 255²]`; larger inputs are outside the
/// contract (the addition may overflow).
#[inline]
const fn div255_u16(v: u16) -> u16 {
    (v + 255) >> 8
}
33
34/// Composite a solid RGB source over an opaque destination (no alpha plane).
35///
36/// The destination is assumed fully opaque (`a_dst = 255`).  The compositing
37/// formula simplifies to:
38/// ```text
39/// c_result = div255((255 - a_src) * c_dst + a_src * c_src)
40/// ```
41///
42/// Processes pixels in chunks of `LANE` (16) using `[u16; LANE]` lanes.
43/// LLVM auto-vectorizes into 256/512-bit SIMD when compiled with
44/// `-C target-cpu=native`.
45///
46/// # Arguments
47///
48/// - `dst`: packed RGB bytes, length must be `shape.len() * 3`.
49/// - `src`: constant source colour applied to every pixel.
50/// - `a_input`: source opacity (0 = transparent, 255 = opaque).
51/// - `shape`: per-pixel AA coverage, one byte per pixel.
52///
53/// # Panics (debug only)
54///
55/// Panics if `dst.len() != shape.len() * 3`.
56pub fn composite_aa_rgb8_opaque(dst: &mut [u8], src: [u8; 3], a_input: u8, shape: &[u8]) {
57    let count = shape.len();
58    debug_assert_eq!(
59        dst.len(),
60        count * 3,
61        "composite_aa_rgb8_opaque: dst length mismatch (got {}, expected {})",
62        dst.len(),
63        count * 3,
64    );
65
66    let a_in = u16::from(a_input);
67    let [sr, sg, sb] = [u16::from(src[0]), u16::from(src[1]), u16::from(src[2])];
68
69    let full_chunks = count / LANE;
70    let remainder = count % LANE;
71
72    // ── Full LANE-wide chunks ──────────────────────────────────────────────────
73    //
74    // Both loops are structured so LLVM sees LANE independent iterations over
75    // arrays of length LANE — the pattern that triggers 256/512-bit vectorization.
76    for chunk in 0..full_chunks {
77        let px_base = chunk * LANE;
78        let byte_base = px_base * 3;
79
80        // Step 1: compute a_src for each pixel in the chunk.
81        let mut a_src_lane = [0u16; LANE];
82        for (k, a) in a_src_lane.iter_mut().enumerate() {
83            *a = div255_u16(a_in * u16::from(shape[px_base + k]));
84        }
85
86        // Step 2: composite each pixel using its a_src.
87        for (k, &a_src) in a_src_lane.iter().enumerate() {
88            let inv = 255 - a_src;
89            let b = byte_base + k * 3;
90            // Result of div255_u16 is ≤ 255, so truncation to u8 is safe.
91            #[expect(clippy::cast_possible_truncation, reason = "div255_u16 result ≤ 255")]
92            {
93                dst[b] = div255_u16(inv * u16::from(dst[b]) + a_src * sr) as u8;
94                dst[b + 1] = div255_u16(inv * u16::from(dst[b + 1]) + a_src * sg) as u8;
95                dst[b + 2] = div255_u16(inv * u16::from(dst[b + 2]) + a_src * sb) as u8;
96            }
97        }
98    }
99
100    // ── Scalar tail ───────────────────────────────────────────────────────────
101    let tail_px = full_chunks * LANE;
102    let tail_byte = tail_px * 3;
103    for k in 0..remainder {
104        let a_src = div255_u16(a_in * u16::from(shape[tail_px + k]));
105        let inv = 255 - a_src;
106        let b = tail_byte + k * 3;
107        #[expect(clippy::cast_possible_truncation, reason = "div255_u16 result ≤ 255")]
108        {
109            dst[b] = div255_u16(inv * u16::from(dst[b]) + a_src * sr) as u8;
110            dst[b + 1] = div255_u16(inv * u16::from(dst[b + 1]) + a_src * sg) as u8;
111            dst[b + 2] = div255_u16(inv * u16::from(dst[b + 2]) + a_src * sb) as u8;
112        }
113    }
114}
115
116// ── Tests ─────────────────────────────────────────────────────────────────────
117
#[cfg(test)]
mod tests {
    use super::*;

    // ── composite_aa_rgb8_opaque ─────────────────────────────────────────────

    /// Run `composite_aa_rgb8_opaque` on a deterministic `count`-pixel span and
    /// compare it byte-for-byte with a plain pixel-by-pixel loop that uses the
    /// same `div255_u16` arithmetic.
    // i * 7 / i * 3 are mod-256 reduced before the cast, so the truncation
    // is intentional and exact.
    #[expect(
        clippy::cast_possible_truncation,
        reason = "mod-256 result fits in u8 by construction; div255_u16 result ≤ 255"
    )]
    fn assert_matches_scalar(count: usize) {
        let src = [100u8, 150, 200];
        let a_input = 200u8;
        let shape: Vec<u8> = (0..count).map(|i| (i * 7 % 256) as u8).collect();
        let initial: Vec<u8> = (0..count * 3).map(|i| (i * 3 % 256) as u8).collect();

        // Scalar reference
        let mut ref_dst = initial.clone();
        let a_in = u16::from(a_input);
        let [sr, sg, sb] = [u16::from(src[0]), u16::from(src[1]), u16::from(src[2])];
        for (shape_v, ref_chunk) in shape.iter().zip(ref_dst.chunks_exact_mut(3)) {
            let a_src = div255_u16(a_in * u16::from(*shape_v));
            let inv = 255 - a_src;
            ref_chunk[0] = div255_u16(inv * u16::from(ref_chunk[0]) + a_src * sr) as u8;
            ref_chunk[1] = div255_u16(inv * u16::from(ref_chunk[1]) + a_src * sg) as u8;
            ref_chunk[2] = div255_u16(inv * u16::from(ref_chunk[2]) + a_src * sb) as u8;
        }

        let mut got = initial;
        composite_aa_rgb8_opaque(&mut got, src, a_input, &shape);

        assert_eq!(
            got, ref_dst,
            "chunked path mismatch vs scalar reference (count = {count})"
        );
    }

    #[test]
    fn opaque_full_coverage_writes_src() {
        let src = [200u8, 100, 50];
        let shape = [255u8; 4];
        let mut dst = vec![10u8; 12]; // 4 pixels

        composite_aa_rgb8_opaque(&mut dst, src, 255, &shape);

        for i in 0..4 {
            assert_eq!(&dst[i * 3..i * 3 + 3], &[200, 100, 50], "pixel {i}");
        }
    }

    #[test]
    fn opaque_zero_coverage_leaves_dst() {
        let src = [200u8, 100, 50];
        let shape = [0u8; 4];
        let original: Vec<u8> = (0..12).map(|i: u8| i * 10).collect();
        let mut dst = original.clone();

        composite_aa_rgb8_opaque(&mut dst, src, 255, &shape);

        assert_eq!(dst, original);
    }

    #[test]
    fn opaque_half_coverage_blends() {
        let src = [255u8, 255, 255];
        let shape = [128u8];
        let mut dst = vec![0u8; 3]; // black dst

        composite_aa_rgb8_opaque(&mut dst, src, 255, &shape);

        // div255_u16(128 * 255) ≈ 128; blend: div255_u16((255-128)*0 + 128*255) ≈ 128.
        // Every channel gets the same source value, so check all three.
        for (ch, &v) in dst.iter().enumerate() {
            assert!(
                (125..=131).contains(&v),
                "channel {ch}: expected ~128, got {v}"
            );
        }
    }

    #[test]
    fn opaque_matches_scalar_for_large_span() {
        // Crosses chunk boundaries: 2 full chunks + 5 tail pixels.
        assert_matches_scalar(37);
    }

    #[test]
    fn opaque_matches_scalar_at_chunk_boundaries() {
        // Tail only, exactly one chunk, one chunk + 1, just under/at two
        // chunks, and an exact multiple with no tail.
        for count in [1, LANE - 1, LANE, LANE + 1, 2 * LANE - 1, 2 * LANE, 3 * LANE] {
            assert_matches_scalar(count);
        }
    }

    #[test]
    fn opaque_empty_is_noop() {
        let mut dst: Vec<u8> = vec![];
        composite_aa_rgb8_opaque(&mut dst, [1, 2, 3], 255, &[]);
        assert!(dst.is_empty());
    }
}
rasterrocket_render/simd/composite.rs

rasterrocket_render/simd/
composite.rs