Skip to main content

rasterrocket_render/simd/
blend.rs

1//! SIMD-accelerated solid-colour fill paths.
2//!
3//! `blend_solid_rgb8` — fill RGB pixels with a constant colour.
4//! `blend_solid_gray8` — fill grayscale pixels with a constant value.
5//!
6//! # Acceleration tiers (x86-64, most to least preferred for large spans)
7//!
8//! 1. **`movdir64b`** (> 256 px): non-temporal 64-byte atomic stores that bypass
9//!    all cache levels.  Used for large write-only fills so the L3 V-Cache is not
10//!    evicted of the edge table that the scanner keeps hot between page renders.
11//! 2. **AVX2** (≥ 32 px): 256-bit stores; fast for medium spans where the data
12//!    will be read back shortly after writing.
13//! 3. **Scalar**: `copy_from_slice` / `fill` per pixel.
14//!
15//! The `movdir64b` path requires a 64-byte-aligned destination address.  A scalar
16//! preamble advances the write pointer to the next alignment boundary; a scalar
17//! tail handles any remaining bytes.  Because `movdir64b` is not yet exposed in
18//! `std::arch::x86_64`, runtime detection uses a `std::sync::OnceLock` that
19//! queries CPUID leaf 7 subleaf 0 ECX bit 28 exactly once per process.
20//!
21//! # Acceleration tiers (aarch64)
22//!
23//! 1. **NEON `vst3q_u8`** (≥ 16 px): 48-byte interleaved RGB stores via three-channel
24//!    scatter, 16 pixels per iteration.
25//! 2. **NEON `vst1q_u8`** (≥ 16 px, gray): 16-byte stores, 16 pixels per iteration.
26//! 3. **Scalar**: `copy_from_slice` / `fill` per pixel.
27//!
28//! NEON is mandatory on all ARMv8-A targets; no runtime detection is needed.
29
30/// Fill `count` RGB pixels in `dst` with `color` using a scalar loop.
31#[inline]
32pub(super) fn blend_solid_rgb8_scalar(dst: &mut [u8], color: [u8; 3], count: usize) {
33    debug_assert!(
34        dst.len() >= count * 3,
35        "dst too short: {} < {}",
36        dst.len(),
37        count * 3
38    );
39    for chunk in dst[..count * 3].chunks_exact_mut(3) {
40        chunk.copy_from_slice(&color);
41    }
42}
43
44/// Fill `count` grayscale pixels in `dst` with `color`.
45#[inline]
46pub(super) fn blend_solid_gray8_scalar(dst: &mut [u8], color: u8, count: usize) {
47    debug_assert!(
48        dst.len() >= count,
49        "dst too short: {} < {}",
50        dst.len(),
51        count
52    );
53    dst[..count].fill(color);
54}
55
56// ── AVX2 paths ────────────────────────────────────────────────────────────────
57
58// AVX2 functions use unsafe SIMD intrinsics — required, not lazy.
#[cfg(all(target_arch = "x86_64", feature = "simd-avx2"))]
/// Solid-fill the first `count` RGB pixels of `dst` with `color` using AVX2.
///
/// Work is done in 96-byte tiles (32 pixels), each written with three
/// unaligned 32-byte stores; pixels that do not fill a whole tile are handed to
/// [`blend_solid_rgb8_scalar`].
///
/// # Safety
///
/// - AVX2 must be available (`is_x86_feature_detected!("avx2")`).
/// - `dst.len() >= count * 3` must hold.
#[target_feature(enable = "avx2")]
unsafe fn blend_solid_rgb8_avx2(dst: &mut [u8], color: [u8; 3], count: usize) {
    use std::arch::x86_64::{__m256i, _mm256_loadu_si256, _mm256_storeu_si256};

    /// Pixels per tile: 32 px × 3 B = 96 B = LCM(3, 32), so no pixel straddles
    /// a 32-byte store boundary.
    const TILE_PX: usize = 32;
    const TILE_BYTES: usize = TILE_PX * 3;

    debug_assert!(
        dst.len() >= count * 3,
        "dst too short for AVX2 RGB fill: {} < {}",
        dst.len(),
        count * 3
    );

    // Repeat the colour 32 times to form one tile's worth of source bytes.
    let mut pattern = [0u8; TILE_BYTES];
    for px in pattern.chunks_exact_mut(3) {
        px.copy_from_slice(&color);
    }

    // SAFETY: `pattern` is 96 bytes long, so the unaligned 32-byte loads at
    // offsets 0, 32 and 64 all stay inside it.
    let lanes: [__m256i; 3] = unsafe {
        let src = pattern.as_ptr();
        [
            _mm256_loadu_si256(src.cast()),
            _mm256_loadu_si256(src.add(32).cast()),
            _mm256_loadu_si256(src.add(64).cast()),
        ]
    };

    let full_tiles = count / TILE_PX;
    let mut cursor = dst.as_mut_ptr();
    for _ in 0..full_tiles {
        // SAFETY: `cursor` starts at `dst` and advances 96 bytes per tile, so
        // each tile ends at most `full_tiles * 96 <= count * 3` bytes in, which
        // the caller guarantees is within `dst`.  The stores are unaligned.
        unsafe {
            _mm256_storeu_si256(cursor.cast(), lanes[0]);
            _mm256_storeu_si256(cursor.add(32).cast(), lanes[1]);
            _mm256_storeu_si256(cursor.add(64).cast(), lanes[2]);
            cursor = cursor.add(TILE_BYTES);
        }
    }

    // Fewer than 32 pixels remain; finish them with the scalar loop.
    let filled_px = full_tiles * TILE_PX;
    blend_solid_rgb8_scalar(&mut dst[filled_px * 3..], color, count - filled_px);
}
119
#[cfg(all(target_arch = "x86_64", feature = "simd-avx2"))]
/// Solid-fill the first `count` grayscale pixels of `dst` with `color` using
/// unaligned 32-byte AVX2 stores, then finish the remainder with `fill`.
///
/// # Safety
///
/// - AVX2 must be available (`is_x86_feature_detected!("avx2")`).
/// - `dst.len() >= count` must hold.
#[target_feature(enable = "avx2")]
unsafe fn blend_solid_gray8_avx2(dst: &mut [u8], color: u8, count: usize) {
    use std::arch::x86_64::{_mm256_set1_epi8, _mm256_storeu_si256};

    /// Bytes (= pixels) written by one 256-bit store.
    const LANE_BYTES: usize = 32;

    debug_assert!(
        dst.len() >= count,
        "dst too short for AVX2 gray fill: {} < {}",
        dst.len(),
        count
    );

    #[expect(
        clippy::cast_possible_wrap,
        reason = "reinterpreting byte as i8 for SIMD; bit pattern preserved"
    )]
    let splat = _mm256_set1_epi8(color as i8);

    // Largest multiple of 32 that does not exceed `count`.
    let vector_bytes = (count / LANE_BYTES) * LANE_BYTES;
    let base = dst.as_mut_ptr();
    for offset in (0..vector_bytes).step_by(LANE_BYTES) {
        // SAFETY: `offset + 32 <= vector_bytes <= count`, and the caller
        // guarantees `count <= dst.len()`.  The store is unaligned.
        unsafe { _mm256_storeu_si256(base.add(offset).cast(), splat) };
    }

    // Fewer than 32 bytes remain.
    dst[vector_bytes..count].fill(color);
}
154
155// ── movdir64b non-temporal fill ───────────────────────────────────────────────
156//
157// `movdir64b` is not yet exposed in `std::arch::x86_64`; we use inline asm.
158// Detection uses CPUID leaf 7, subleaf 0, ECX bit 28, cached in a OnceLock.
159
#[cfg(target_arch = "x86_64")]
/// Span length, in pixels, that a solid fill must exceed before the
/// `movdir64b` path is chosen ahead of AVX2.
///
/// Rationale: shorter spans are expected to be read back soon (e.g.
/// compositing, AA blending), so leaving them in cache helps.  Longer spans
/// are treated as write-only until the next page, so non-temporal stores avoid
/// displacing the edge table from the V-Cache.
const MOVDIR64B_THRESHOLD_PX: usize = 256;
168
169#[cfg(target_arch = "x86_64")]
170/// Query CPUID leaf 7, subleaf 0, ECX bit 28 to detect `movdir64b`.
171///
172/// Uses `std::arch::x86_64::__cpuid_count` (stable since 1.27) which handles
173/// the rbx save/restore internally on all supported platforms.
174/// Result is cached in a `OnceLock` so CPUID is executed at most once per process.
175fn has_movdir64b() -> bool {
176    use std::sync::OnceLock;
177    static CACHE: OnceLock<bool> = OnceLock::new();
178    *CACHE.get_or_init(|| {
179        // leaf 7 / subleaf 0: on CPUs that don't support leaf 7 this returns
180        // all-zeros, which correctly produces `false`.
181        let result = std::arch::x86_64::__cpuid_count(7, 0);
182        // ECX bit 28 = MOVDIR64B (Intel SDM vol. 2A, CPUID.07H.00H:ECX[28])
183        (result.ecx >> 28) & 1 != 0
184    })
185}
186
/// Number of leading bytes to skip so that `ptr` becomes `align`-byte
/// aligned, never more than `limit`.
///
/// `ptr::align_offset` reports "cannot be aligned" as `usize::MAX`.  Taking the
/// minimum with `limit` maps that sentinel to `limit`, which makes the caller
/// treat the whole span as preamble and stay on its scalar path.
#[cfg(target_arch = "x86_64")]
#[inline]
fn preamble_len(ptr: *const u8, limit: usize, align: usize) -> usize {
    ptr.align_offset(align).min(limit)
}
203
204#[cfg(target_arch = "x86_64")]
205/// Fill `count` RGB pixels in `dst` using `movdir64b` non-temporal 64-byte stores.
206///
207/// The tile is 192 bytes (`LCM(3, 64) = 192`): three consecutive `movdir64b`
208/// stores advance the destination by 192 bytes (64 pixels) while reading from
209/// three disjoint 64-byte sections of the tile.  This ensures every store
210/// writes exactly one full RGB pixel at every byte boundary within the tile.
211///
212/// A scalar preamble aligns the destination pointer to 64 bytes; a scalar tail
213/// handles any remaining bytes after the last aligned block.
214///
215/// # Safety
216///
217/// - `movdir64b` must be available (CPUID.07H.00H:ECX[28] = 1).
218/// - `dst.len() >= count * 3` must hold.
219unsafe fn blend_solid_rgb8_movdir64b(dst: &mut [u8], color: [u8; 3], count: usize) {
220    // 192 = LCM(3, 64): three 64-byte stores cover exactly 64 RGB pixels
221    // with no partial pixel at any 64-byte boundary.
222    #[repr(align(64))]
223    struct Tile([u8; 192]);
224
225    // count * 3 cannot overflow: caller asserts dst.len() >= count * 3,
226    // and dst.len() is bounded by isize::MAX.
227    let byte_count = count * 3;
228    debug_assert!(
229        dst.len() >= byte_count,
230        "dst too short for movdir64b RGB fill: {} < {}",
231        dst.len(),
232        byte_count,
233    );
234    let dst_ptr = dst.as_mut_ptr();
235
236    // ── Preamble ──────────────────────────────────────────────────────────────
237    // Fill bytes [0, preamble) so that dst_ptr + preamble is 64-byte aligned.
238    // Each byte is indexed by its absolute position mod 3 so the RGB pattern
239    // remains continuous across the preamble / block boundary.
240    let preamble = preamble_len(dst_ptr.cast_const(), byte_count, 64);
241    for i in 0..preamble {
242        dst[i] = color[i % 3];
243    }
244
245    // ── Aligned blocks ────────────────────────────────────────────────────────
246    // Build a 192-byte tile phase-shifted by `preamble % 3` so that
247    // tile[k] == color[(preamble + k) % 3], i.e. the correct channel for
248    // absolute destination byte (preamble + k).
249    let phase = preamble % 3;
250    let mut tile = Tile([0u8; 192]);
251    for (k, t) in tile.0.iter_mut().enumerate() {
252        *t = color[(phase + k) % 3];
253    }
254
255    let blocks_start = preamble;
256    debug_assert!(
257        blocks_start <= byte_count,
258        "preamble_len exceeded byte_count"
259    );
260    let remaining = byte_count - blocks_start;
261    let blocks = remaining / 192;
262
263    for blk in 0..blocks {
264        // SAFETY:
265        // - blocks_start + blk*192 + 192 ≤ blocks_start + remaining ≤ byte_count ≤ dst.len().
266        // - dst_ptr + blocks_start is 64-byte aligned: preamble_len guarantees this
267        //   (or preamble == byte_count making blocks == 0, so the loop never runs).
268        // - blocks_start + blk*192 is also 64-byte aligned (192 is a multiple of 64).
269        // - tile.0 is 64-byte aligned by #[repr(align(64))]; sub-tiles at +64 and
270        //   +128 are therefore also 64-byte aligned.
271        // - MOVDIR64B destination operand is a register holding the aligned address;
272        //   source operand is a memory reference [reg].
273        unsafe {
274            let dst_base = dst_ptr.add(blocks_start + blk * 192);
275            let src0 = tile.0.as_ptr();
276            let src1 = src0.add(64);
277            let src2 = src0.add(128);
278            std::arch::asm!(
279                "movdir64b {d0}, [{s0}]",
280                "movdir64b {d1}, [{s1}]",
281                "movdir64b {d2}, [{s2}]",
282                d0 = in(reg) dst_base,
283                d1 = in(reg) dst_base.add(64),
284                d2 = in(reg) dst_base.add(128),
285                s0 = in(reg) src0,
286                s1 = in(reg) src1,
287                s2 = in(reg) src2,
288                options(nostack, preserves_flags),
289            );
290        }
291    }
292
293    // ── Tail ──────────────────────────────────────────────────────────────────
294    // Bytes after the last complete 192-byte block.  `off - blocks_start` is
295    // the offset from the first block byte, so adding `phase` gives the correct
296    // channel index consistent with the tile.
297    let tail_start = blocks_start + blocks * 192;
298    for off in tail_start..byte_count {
299        dst[off] = color[(phase + (off - blocks_start)) % 3];
300    }
301}
302
303#[cfg(target_arch = "x86_64")]
304/// Fill `count` grayscale pixels in `dst` using `movdir64b` non-temporal stores.
305///
306/// Each `movdir64b` writes 64 bytes = 64 pixels.  A scalar preamble aligns the
307/// destination to 64 bytes; a scalar tail handles the remainder.
308///
309/// # Safety
310///
311/// - `movdir64b` must be available (CPUID.07H.00H:ECX[28] = 1).
312/// - `dst.len() >= count` must hold.
313unsafe fn blend_solid_gray8_movdir64b(dst: &mut [u8], color: u8, count: usize) {
314    #[repr(align(64))]
315    struct Tile([u8; 64]);
316
317    debug_assert!(
318        dst.len() >= count,
319        "dst too short for movdir64b gray fill: {} < {}",
320        dst.len(),
321        count,
322    );
323
324    let tile = Tile([color; 64]);
325    let dst_ptr = dst.as_mut_ptr();
326
327    // ── Preamble ──────────────────────────────────────────────────────────────
328    let preamble = preamble_len(dst_ptr.cast_const(), count, 64);
329    dst[..preamble].fill(color);
330
331    // ── Aligned blocks ────────────────────────────────────────────────────────
332    debug_assert!(preamble <= count, "preamble_len exceeded count");
333    let blocks = (count - preamble) / 64;
334    for blk in 0..blocks {
335        // SAFETY:
336        // - preamble + blk*64 + 64 ≤ preamble + (count - preamble) = count ≤ dst.len().
337        // - dst_ptr + preamble is 64-byte aligned (preamble_len guarantees this,
338        //   or preamble == count making blocks == 0).
339        // - tile.0 is 64-byte aligned by #[repr(align(64))].
340        // - MOVDIR64B destination operand is a register holding the aligned address.
341        unsafe {
342            let dst_blk = dst_ptr.add(preamble + blk * 64);
343            let src = tile.0.as_ptr();
344            std::arch::asm!(
345                "movdir64b {dst}, [{src}]",
346                dst = in(reg) dst_blk,
347                src = in(reg) src,
348                options(nostack, preserves_flags),
349            );
350        }
351    }
352
353    // ── Tail ──────────────────────────────────────────────────────────────────
354    let tail_start = preamble + blocks * 64;
355    dst[tail_start..count].fill(color);
356}
357
358// ── aarch64 NEON solid fill ───────────────────────────────────────────────────
359
/// Solid-fill the first `count` RGB pixels of `dst` with NEON, 16 pixels per
/// store.
///
/// Each colour component is broadcast into its own `uint8x16_t`; `vst3q_u8`
/// then interleaves the three vectors into 48 bytes of packed RGB.  Pixels
/// that do not fill a whole 16-pixel group go through
/// [`blend_solid_rgb8_scalar`].
///
/// # Safety
///
/// NEON is mandatory on all ARMv8-A targets.  `dst.len() >= count * 3` must hold.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn blend_solid_rgb8_neon(dst: &mut [u8], color: [u8; 3], count: usize) {
    use std::arch::aarch64::{uint8x16x3_t, vdupq_n_u8, vst3q_u8};

    /// Pixels written by one `vst3q_u8`.
    const GROUP_PX: usize = 16;

    debug_assert!(
        dst.len() >= count * 3,
        "dst too short for NEON RGB fill: {} < {}",
        dst.len(),
        count * 3
    );

    // One broadcast vector per channel, in R, G, B order.
    let planes = uint8x16x3_t(
        vdupq_n_u8(color[0]),
        vdupq_n_u8(color[1]),
        vdupq_n_u8(color[2]),
    );

    // Largest multiple of 16 that does not exceed `count`.
    let vector_px = count - count % GROUP_PX;
    let base = dst.as_mut_ptr();
    for px in (0..vector_px).step_by(GROUP_PX) {
        // SAFETY: `px + 16 <= vector_px <= count`, so the 48 bytes starting at
        // `px * 3` end at or before `count * 3`, which the caller guarantees is
        // within `dst`.  `vst3q_u8` only needs 1-byte alignment.
        unsafe { vst3q_u8(base.add(px * 3), planes) };
    }

    // Fewer than 16 pixels remain.
    blend_solid_rgb8_scalar(&mut dst[vector_px * 3..], color, count - vector_px);
}
397
/// Solid-fill the first `count` grayscale pixels of `dst` with NEON, 16 pixels
/// per `vst1q_u8` store; the remainder goes through
/// [`blend_solid_gray8_scalar`].
///
/// # Safety
///
/// NEON is mandatory on all ARMv8-A targets.  `dst.len() >= count` must hold.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn blend_solid_gray8_neon(dst: &mut [u8], color: u8, count: usize) {
    use std::arch::aarch64::{vdupq_n_u8, vst1q_u8};

    /// Pixels written by one `vst1q_u8`.
    const GROUP_PX: usize = 16;

    debug_assert!(
        dst.len() >= count,
        "dst too short for NEON gray fill: {} < {}",
        dst.len(),
        count
    );

    let splat = vdupq_n_u8(color);

    // Largest multiple of 16 that does not exceed `count`.
    let vector_px = count - count % GROUP_PX;
    let base = dst.as_mut_ptr();
    for px in (0..vector_px).step_by(GROUP_PX) {
        // SAFETY: `px + 16 <= vector_px <= count`, and the caller guarantees
        // `count <= dst.len()`.  `vst1q_u8` only needs 1-byte alignment.
        unsafe { vst1q_u8(base.add(px), splat) };
    }

    // Fewer than 16 pixels remain.
    blend_solid_gray8_scalar(&mut dst[vector_px..], color, count - vector_px);
}
426
427// ── Per-arch dispatch helpers ─────────────────────────────────────────────────
428
429#[cfg(target_arch = "x86_64")]
430#[inline]
431fn dispatch_blend_rgb8(dst: &mut [u8], color: [u8; 3], count: usize) {
432    if count > MOVDIR64B_THRESHOLD_PX && has_movdir64b() {
433        // SAFETY: movdir64b confirmed; dst.len() >= count*3 asserted by caller.
434        unsafe { blend_solid_rgb8_movdir64b(dst, color, count) };
435        return;
436    }
437    #[cfg(feature = "simd-avx2")]
438    if count >= 32 && is_x86_feature_detected!("avx2") {
439        // SAFETY: AVX2 confirmed; dst.len() >= count*3 asserted by caller.
440        unsafe { blend_solid_rgb8_avx2(dst, color, count) };
441        return;
442    }
443    blend_solid_rgb8_scalar(dst, color, count);
444}
445
446#[cfg(target_arch = "aarch64")]
447#[inline]
448fn dispatch_blend_rgb8(dst: &mut [u8], color: [u8; 3], count: usize) {
449    if count >= 16 {
450        // SAFETY: NEON mandatory on aarch64; dst.len() >= count*3 asserted by caller.
451        unsafe { blend_solid_rgb8_neon(dst, color, count) };
452    } else {
453        blend_solid_rgb8_scalar(dst, color, count);
454    }
455}
456
457#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
458#[inline]
459fn dispatch_blend_rgb8(dst: &mut [u8], color: [u8; 3], count: usize) {
460    blend_solid_rgb8_scalar(dst, color, count);
461}
462
463#[cfg(target_arch = "x86_64")]
464#[inline]
465fn dispatch_blend_gray8(dst: &mut [u8], color: u8, count: usize) {
466    if count > MOVDIR64B_THRESHOLD_PX && has_movdir64b() {
467        // SAFETY: movdir64b confirmed; dst.len() >= count asserted by caller.
468        unsafe { blend_solid_gray8_movdir64b(dst, color, count) };
469        return;
470    }
471    #[cfg(feature = "simd-avx2")]
472    if count >= 32 && is_x86_feature_detected!("avx2") {
473        // SAFETY: AVX2 confirmed; dst.len() >= count asserted by caller.
474        unsafe { blend_solid_gray8_avx2(dst, color, count) };
475        return;
476    }
477    blend_solid_gray8_scalar(dst, color, count);
478}
479
480#[cfg(target_arch = "aarch64")]
481#[inline]
482fn dispatch_blend_gray8(dst: &mut [u8], color: u8, count: usize) {
483    if count >= 16 {
484        // SAFETY: NEON mandatory on aarch64; dst.len() >= count asserted by caller.
485        unsafe { blend_solid_gray8_neon(dst, color, count) };
486    } else {
487        blend_solid_gray8_scalar(dst, color, count);
488    }
489}
490
491#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
492#[inline]
493fn dispatch_blend_gray8(dst: &mut [u8], color: u8, count: usize) {
494    blend_solid_gray8_scalar(dst, color, count);
495}
496
497// ── Public API ────────────────────────────────────────────────────────────────
498
499/// Fill `count` RGB pixels in `dst` (starting at byte 0) with `color`.
500///
501/// # Panics
502///
503/// Panics if `dst.len() < count * 3`.
504///
505/// # Dispatch order
506///
507/// - **x86-64**: `movdir64b` (> 256 px, non-temporal) → AVX2 (≥ 32 px) → scalar
508/// - **aarch64**: NEON `vst3q_u8` (≥ 16 px) → scalar
509/// - **other**: scalar
510pub fn blend_solid_rgb8(dst: &mut [u8], color: [u8; 3], count: usize) {
511    assert!(
512        dst.len() >= count * 3,
513        "blend_solid_rgb8: dst too short ({} < {})",
514        dst.len(),
515        count * 3,
516    );
517    dispatch_blend_rgb8(dst, color, count);
518}
519
520/// Fill `count` grayscale pixels in `dst` (starting at byte 0) with `color`.
521///
522/// # Panics
523///
524/// Panics if `dst.len() < count`.
525///
526/// # Dispatch order
527///
528/// - **x86-64**: `movdir64b` (> 256 px, non-temporal) → AVX2 (≥ 32 px) → scalar
529/// - **aarch64**: NEON `vst1q_u8` (≥ 16 px) → scalar
530/// - **other**: scalar
531pub fn blend_solid_gray8(dst: &mut [u8], color: u8, count: usize) {
532    assert!(
533        dst.len() >= count,
534        "blend_solid_gray8: dst too short ({} < {})",
535        dst.len(),
536        count,
537    );
538    dispatch_blend_gray8(dst, color, count);
539}
540
541// ── Tests ─────────────────────────────────────────────────────────────────────
542
#[cfg(test)]
mod tests {
    use super::*;

    // ── helpers ───────────────────────────────────────────────────────────────

    /// Sentinel written around a span to detect writes outside it.
    const GUARD: u8 = 0xA5;

    /// Reference output: `count` RGB pixels written by the scalar fill.
    fn scalar_rgb(color: [u8; 3], count: usize) -> Vec<u8> {
        let mut out = vec![0u8; count * 3];
        blend_solid_rgb8_scalar(&mut out, color, count);
        out
    }

    /// Reference output: `count` grayscale pixels written by the scalar fill.
    fn scalar_gray(value: u8, count: usize) -> Vec<u8> {
        let mut out = vec![0u8; count];
        blend_solid_gray8_scalar(&mut out, value, count);
        out
    }

    /// Output of the public RGB entry point on a fresh zeroed buffer.
    fn dispatched_rgb(color: [u8; 3], count: usize) -> Vec<u8> {
        let mut out = vec![0u8; count * 3];
        blend_solid_rgb8(&mut out, color, count);
        out
    }

    /// Output of the public grayscale entry point on a fresh zeroed buffer.
    fn dispatched_gray(value: u8, count: usize) -> Vec<u8> {
        let mut out = vec![0u8; count];
        blend_solid_gray8(&mut out, value, count);
        out
    }

    /// Offset into `buf` whose address lies exactly `misalign` bytes past a
    /// 64-byte boundary.  The result is always below 64.
    #[cfg(target_arch = "x86_64")]
    fn start_with_misalignment(buf: &[u8], misalign: usize) -> usize {
        (buf.as_ptr().align_offset(64) + misalign) % 64
    }

    // ── scalar ────────────────────────────────────────────────────────────────

    #[test]
    fn scalar_rgb8_small() {
        let color = [10u8, 20, 30];
        let mut dst = vec![0u8; 9];
        blend_solid_rgb8_scalar(&mut dst, color, 3);
        assert_eq!(dst, [10, 20, 30, 10, 20, 30, 10, 20, 30]);
    }

    #[test]
    fn scalar_rgb8_zero_count() {
        let color = [1u8, 2, 3];
        let mut dst = vec![0u8; 3];
        blend_solid_rgb8_scalar(&mut dst, color, 0);
        assert_eq!(dst, [0, 0, 0]);
    }

    #[test]
    fn scalar_gray8() {
        let mut dst = vec![0u8; 5];
        blend_solid_gray8_scalar(&mut dst, 42, 5);
        assert!(dst.iter().all(|&b| b == 42));
    }

    // ── dispatch (which path runs depends on the host CPU and features) ───────

    #[test]
    fn dispatch_rgb8_matches_scalar() {
        let color = [100u8, 150, 200];
        // 64 px: two full 32-pixel AVX2 tiles / four NEON groups, no tail.
        let count = 64usize;
        assert_eq!(
            dispatched_rgb(color, count),
            scalar_rgb(color, count),
            "dispatch_rgb8 mismatch"
        );
    }

    #[test]
    fn dispatch_gray8_matches_scalar() {
        let count = 128usize;
        assert_eq!(
            dispatched_gray(77, count),
            scalar_gray(77, count),
            "dispatch_gray8 mismatch"
        );
    }

    #[test]
    fn dispatch_rgb8_tail_handled() {
        // count = 35: one 32-pixel AVX2 tile + 3-pixel scalar tail.
        let color = [7u8, 8, 9];
        let count = 35usize;
        assert_eq!(
            dispatched_rgb(color, count),
            scalar_rgb(color, count),
            "tail mismatch"
        );
    }

    #[test]
    fn dispatch_rgb8_exact_32_pixels() {
        let color = [255u8, 0, 128];
        let count = 32usize;
        assert_eq!(
            dispatched_rgb(color, count),
            scalar_rgb(color, count),
            "exact 32-pixel mismatch"
        );
    }

    /// Verify AVX2 path directly (skipped when AVX2 is unavailable).
    #[cfg(all(target_arch = "x86_64", feature = "simd-avx2"))]
    #[test]
    fn avx2_rgb8_matches_scalar_direct() {
        if !is_x86_feature_detected!("avx2") {
            return;
        }
        let color = [11u8, 22, 33];
        let count = 96usize;
        let mut got = vec![0u8; count * 3];
        // SAFETY: is_x86_feature_detected! confirmed AVX2 is available;
        // got.len() == count * 3.
        unsafe { blend_solid_rgb8_avx2(&mut got, color, count) };
        assert_eq!(got, scalar_rgb(color, count), "AVX2 RGB path mismatch");
    }

    #[cfg(all(target_arch = "x86_64", feature = "simd-avx2"))]
    #[test]
    fn avx2_gray8_matches_scalar_direct() {
        if !is_x86_feature_detected!("avx2") {
            return;
        }
        let count = 128usize;
        let mut got = vec![0u8; count];
        // SAFETY: is_x86_feature_detected! confirmed AVX2 is available;
        // got.len() == count.
        unsafe { blend_solid_gray8_avx2(&mut got, 200, count) };
        assert_eq!(got, scalar_gray(200, count), "AVX2 gray path mismatch");
    }

    // ── movdir64b tests ───────────────────────────────────────────────────────

    /// Exercises count > `MOVDIR64B_THRESHOLD_PX`, so the movdir64b path is
    /// selected on capable machines (other machines fall through to AVX2 or
    /// scalar).
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn dispatch_rgb8_large_matches_scalar() {
        let color = [77u8, 133, 211];
        // 384 px = 1152 bytes: above the 256-px threshold, several 192-byte blocks.
        let count = 384usize;
        assert_eq!(
            dispatched_rgb(color, count),
            scalar_rgb(color, count),
            "large RGB dispatch mismatch"
        );
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn dispatch_gray8_large_matches_scalar() {
        let count = 512usize;
        assert_eq!(
            dispatched_gray(99, count),
            scalar_gray(99, count),
            "large gray dispatch mismatch"
        );
    }

    /// Exercise the movdir64b RGB path directly on capable machines.
    ///
    /// The buffer's alignment is whatever the allocator returned, so this test
    /// may or may not hit the preamble; `movdir64b_rgb8_misaligned_starts`
    /// covers the preamble deterministically.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn movdir64b_rgb8_matches_scalar() {
        if !has_movdir64b() {
            return;
        }
        let color = [11u8, 22, 33];
        // 512 px = 1536 bytes: several 192-byte blocks.
        let count = 512usize;
        let mut got = vec![0u8; count * 3];
        // SAFETY: has_movdir64b() confirmed CPUID.07H.00H:ECX[28] = 1;
        // got.len() == count * 3.
        unsafe { blend_solid_rgb8_movdir64b(&mut got, color, count) };
        assert_eq!(got, scalar_rgb(color, count), "movdir64b RGB mismatch");
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn movdir64b_gray8_matches_scalar() {
        if !has_movdir64b() {
            return;
        }
        let count = 512usize;
        let mut got = vec![0u8; count];
        // SAFETY: has_movdir64b() confirmed CPUID.07H.00H:ECX[28] = 1;
        // got.len() == count.
        unsafe { blend_solid_gray8_movdir64b(&mut got, 200, count) };
        assert_eq!(got, scalar_gray(200, count), "movdir64b gray mismatch");
    }

    /// Byte length that is not a multiple of 192, so the scalar tail runs.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn movdir64b_rgb8_odd_count() {
        if !has_movdir64b() {
            return;
        }
        let color = [3u8, 7, 11];
        // 257 px = 771 bytes: at most four 192-byte blocks, plus preamble and/or tail.
        let count = 257usize;
        let mut got = vec![0u8; count * 3];
        // SAFETY: has_movdir64b() confirmed CPUID.07H.00H:ECX[28] = 1;
        // got.len() == count * 3.
        unsafe { blend_solid_rgb8_movdir64b(&mut got, color, count) };
        assert_eq!(
            got,
            scalar_rgb(color, count),
            "movdir64b RGB odd-count mismatch"
        );
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn movdir64b_gray8_odd_count() {
        if !has_movdir64b() {
            return;
        }
        let count = 259usize;
        let mut got = vec![0u8; count];
        // SAFETY: has_movdir64b() confirmed CPUID.07H.00H:ECX[28] = 1;
        // got.len() == count.
        unsafe { blend_solid_gray8_movdir64b(&mut got, 17, count) };
        assert_eq!(
            got,
            scalar_gray(17, count),
            "movdir64b gray odd-count mismatch"
        );
    }

    /// Start the span at chosen offsets from a 64-byte boundary so that the
    /// preamble has every length class (0, and lengths ≡ 0, 1, 2 mod 3), and
    /// check that nothing outside the span is written.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn movdir64b_rgb8_misaligned_starts() {
        if !has_movdir64b() {
            return;
        }
        let color = [11u8, 22, 33];
        let count = 300usize;
        let expected = scalar_rgb(color, count);
        // Preamble lengths are 0, 63, 62, 61, 3, 2, 1 respectively.
        for misalign in [0usize, 1, 2, 3, 61, 62, 63] {
            let mut buf = vec![GUARD; count * 3 + 128];
            let start = start_with_misalignment(&buf, misalign);
            let end = start + count * 3;
            // SAFETY: has_movdir64b() confirmed CPUID.07H.00H:ECX[28] = 1;
            // start < 64, so buf[start..] holds at least count * 3 bytes.
            unsafe { blend_solid_rgb8_movdir64b(&mut buf[start..], color, count) };
            assert_eq!(
                &buf[start..end],
                &expected[..],
                "movdir64b RGB mismatch at misalignment {}",
                misalign
            );
            assert!(
                buf[..start].iter().chain(&buf[end..]).all(|&b| b == GUARD),
                "movdir64b RGB wrote outside the span at misalignment {}",
                misalign
            );
        }
    }

    /// Grayscale counterpart of `movdir64b_rgb8_misaligned_starts`.
    #[cfg(target_arch = "x86_64")]
    #[test]
    fn movdir64b_gray8_misaligned_starts() {
        if !has_movdir64b() {
            return;
        }
        let count = 300usize;
        let expected = scalar_gray(17, count);
        for misalign in [0usize, 1, 31, 63] {
            let mut buf = vec![GUARD; count + 128];
            let start = start_with_misalignment(&buf, misalign);
            let end = start + count;
            // SAFETY: has_movdir64b() confirmed CPUID.07H.00H:ECX[28] = 1;
            // start < 64, so buf[start..] holds at least count bytes.
            unsafe { blend_solid_gray8_movdir64b(&mut buf[start..], 17, count) };
            assert_eq!(
                &buf[start..end],
                &expected[..],
                "movdir64b gray mismatch at misalignment {}",
                misalign
            );
            assert!(
                buf[..start].iter().chain(&buf[end..]).all(|&b| b == GUARD),
                "movdir64b gray wrote outside the span at misalignment {}",
                misalign
            );
        }
    }

    // ── NEON tests (aarch64 only) ─────────────────────────────────────────────

    /// NEON RGB: 16 pixels exactly (one full chunk, no tail).
    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_rgb8_exact_16_pixels() {
        let color = [11u8, 22, 33];
        let count = 16usize;
        let mut got = vec![0u8; count * 3];
        // SAFETY: NEON mandatory on aarch64; got.len() == count * 3.
        unsafe { blend_solid_rgb8_neon(&mut got, color, count) };
        assert_eq!(got, scalar_rgb(color, count), "NEON RGB 16-pixel mismatch");
    }

    /// NEON RGB: 35 pixels (2 full chunks + 3-pixel scalar tail).
    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_rgb8_with_tail() {
        let color = [100u8, 150, 200];
        let count = 35usize;
        let mut got = vec![0u8; count * 3];
        // SAFETY: NEON mandatory on aarch64; got.len() == count * 3.
        unsafe { blend_solid_rgb8_neon(&mut got, color, count) };
        assert_eq!(got, scalar_rgb(color, count), "NEON RGB tail mismatch");
    }

    /// NEON RGB: count < 16 falls straight to scalar (no NEON store).
    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_rgb8_small_count() {
        let color = [7u8, 8, 9];
        let count = 5usize;
        let mut got = vec![0u8; count * 3];
        // SAFETY: NEON mandatory on aarch64; got.len() == count * 3.
        unsafe { blend_solid_rgb8_neon(&mut got, color, count) };
        assert_eq!(
            got,
            scalar_rgb(color, count),
            "NEON RGB small count mismatch"
        );
    }

    /// NEON gray: 32 pixels (2 full chunks, no tail).
    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_gray8_exact_32_pixels() {
        let count = 32usize;
        let mut got = vec![0u8; count];
        // SAFETY: NEON mandatory on aarch64; got.len() == count.
        unsafe { blend_solid_gray8_neon(&mut got, 42, count) };
        assert_eq!(got, scalar_gray(42, count), "NEON gray 32-pixel mismatch");
    }

    /// NEON gray: 19 pixels (1 full chunk + 3-pixel scalar tail).
    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_gray8_with_tail() {
        let count = 19usize;
        let mut got = vec![0u8; count];
        // SAFETY: NEON mandatory on aarch64; got.len() == count.
        unsafe { blend_solid_gray8_neon(&mut got, 77, count) };
        assert_eq!(got, scalar_gray(77, count), "NEON gray tail mismatch");
    }

    // ── public API boundary checks ────────────────────────────────────────────

    #[test]
    fn public_rgb8_zero_count() {
        let mut dst = vec![0xFFu8; 6];
        blend_solid_rgb8(&mut dst, [1, 2, 3], 0);
        assert!(dst.iter().all(|&b| b == 0xFF), "zero-count must not write");
    }

    #[test]
    fn public_gray8_zero_count() {
        let mut dst = vec![0xFFu8; 4];
        blend_solid_gray8(&mut dst, 42, 0);
        assert!(dst.iter().all(|&b| b == 0xFF), "zero-count must not write");
    }

    #[test]
    #[should_panic(expected = "blend_solid_rgb8: dst too short")]
    fn rgb8_panics_on_short_dst() {
        let mut dst = vec![0u8; 5];
        blend_solid_rgb8(&mut dst, [1, 2, 3], 10);
    }

    #[test]
    #[should_panic(expected = "blend_solid_gray8: dst too short")]
    fn gray8_panics_on_short_dst() {
        let mut dst = vec![0u8; 5];
        blend_solid_gray8(&mut dst, 42, 10);
    }
}