Skip to main content

rasterrocket_render/simd/
aa_coverage.rs

1//! Per-pixel AA coverage counts for `AaBuf` rows.
2//!
3//! Single public entry point:
4//!
5//! - `aa_coverage_span(rows, x0, shape)` — fills a `shape` buffer with
6//!   per-pixel AA coverage counts for output pixels `x0 .. x0+shape.len()`.
7//!   This is the hot path called from `draw_aa_line` in `fill/mod.rs`.
8//!   Each output pixel maps to 4 bits (one nibble) in each of the 4 `AaBuf`
9//!   rows; `aa_coverage_span` sums those nibbles across rows for every pixel
10//!   in the span in one vectorised pass.
11//!
12//! # `AaBuf` nibble layout
13//!
14//! For `AA_SIZE = 4`, output pixel `x` occupies the nibble at byte `x/2` of
15//! each row: the **high** nibble if `x` is even, the **low** nibble if `x` is
16//! odd.  Each nibble holds 0–4 set bits (one per AA sub-sample).  Summing the
17//! four rows gives a coverage count in 0..=16.
18//!
19//! # Acceleration tiers for `aa_coverage_span`
20//!
21//! ## x86-64 (most to least preferred)
22//! 1. **AVX-512 BITALG** (`avx512bitalg` + `avx512bw`): `_mm512_popcnt_epi8` on
23//!    nibble-isolated bytes, 128 output pixels per 64-byte iteration.
24//! 2. **AVX2** (`avx2`): VPSHUFB nibble lookup, 64 output pixels per 32-byte iteration.
25//! 3. **Scalar**: byte-by-byte nibble lookup via `NIBBLE_POP` table.
26//!
27//! ## aarch64 (most to least preferred)
28//! 1. **SVE2** (`nightly-sve2` feature + `sve2` target feature): nibble-isolated `svcnt_u8_z`,
29//!    `svcntb()*2` output pixels per iteration. Requires nightly Rust and `sve2` CPU feature.
30//! 2. **NEON**: nibble-isolated `vcntq_u8`, 32 output pixels per 16-byte iteration.
31//!    High/low nibbles extracted with `vshrq_n_u8` + `vandq_u8`; four-row
32//!    accumulation in u8 (max 16 ≤ 255); interleaved into `shape` via `vst2q_u8`.
33//!    NEON is mandatory on all ARMv8-A cores; no runtime detection is needed.
34
35// ── Scalar helpers ────────────────────────────────────────────────────────────
36
/// Popcount lookup for a 4-bit value: `NIBBLE_POP[n]` is the number of set
/// bits in `n` for `n` in `0..16`, so every entry lies in `0..=4`.
///
/// The table is built at compile time from the recurrence
/// `popcount(n) = popcount(n >> 1) + (n & 1)`.
const NIBBLE_POP: [u8; 16] = {
    let mut table = [0u8; 16];
    let mut n: u8 = 1;
    while n < 16 {
        table[n as usize] = table[(n >> 1) as usize] + (n & 1);
        n += 1;
    }
    table
};
39
40/// Scalar fallback for `aa_coverage_span`.
41///
42/// Writes the coverage count (0..=16) for output pixel `x0 + i` into `shape[i]`
43/// by looking up the two nibbles (high = even pixel, low = odd pixel) of each
44/// packed row byte in `NIBBLE_POP` and summing across the four rows.
45///
46/// # Panics
47///
48/// Panics if any byte index derived from `x0 + i` is out of bounds for a row
49/// slice — i.e. if the caller's precondition `x0 + shape.len() ≤ bitmap_width`
50/// is violated.
51pub(super) fn aa_coverage_span_scalar(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
52    for (i, out) in shape.iter_mut().enumerate() {
53        let x = x0 + i;
54        let byte_idx = x >> 1;
55        let is_odd = (x & 1) != 0;
56        let mut count = 0u8;
57        for row in rows {
58            debug_assert!(
59                byte_idx < row.len(),
60                "aa_coverage_span_scalar: byte_idx={byte_idx} out of bounds (row.len={})",
61                row.len()
62            );
63            let byte = row[byte_idx];
64            let nibble = if is_odd { byte & 0x0f } else { byte >> 4 };
65            count += NIBBLE_POP[nibble as usize];
66        }
67        *out = count;
68    }
69}
70
71// ── Shared chunk-parameter computation ───────────────────────────────────────
72
/// Shared span arithmetic for the SIMD coverage kernels.
///
/// Returns `(byte_x0, n_chunks)` where
///
/// - `byte_x0 = x0 / 2` is the index of the row byte that holds pixel `x0`;
/// - `n_chunks` is how many whole chunks of `chunk_bytes` row bytes fit into
///   the `n.div_ceil(2)` row bytes touched by a span of `n` output pixels.
///
/// Each chunk corresponds to `chunk_bytes * 2` output pixels.  Because the
/// byte count is rounded **up**, an odd `n` can make the last chunk cover one
/// pixel more than the span holds (e.g. `n = 63`, `chunk_bytes = 32` yields
/// one chunk); kernels must clip their final store to `n`.
///
/// Keeping this in one place stops the per-tier kernels from drifting apart.
///
/// # Panics
///
/// Panics if `chunk_bytes` is zero: through the assertion below in debug
/// builds, and through the integer division otherwise.
#[inline]
fn coverage_chunk_params(x0: usize, n: usize, chunk_bytes: usize) -> (usize, usize) {
    debug_assert!(
        chunk_bytes > 0,
        "coverage_chunk_params: chunk_bytes must be > 0"
    );
    // Two pixels per row byte; an odd pixel count still touches the last byte.
    let span_bytes = n.div_ceil(2);
    (x0 / 2, span_bytes / chunk_bytes)
}
95
96// ── aarch64 NEON tier ─────────────────────────────────────────────────────────
97
#[cfg(target_arch = "aarch64")]
/// NEON tier for `aa_coverage_span`: one iteration consumes 16 bytes from each
/// row and produces 32 output pixels.
///
/// A row byte packs two neighbouring output pixels:
/// - bits 7–4 (high nibble) belong to the even pixel `2k`,
/// - bits 3–0 (low  nibble) belong to the odd  pixel `2k+1`.
///
/// Both nibbles are moved into the low four bits of their own byte lane
/// (`vshrq_n_u8(v, 4)` resp. `v`, each masked with `0x0F`), popcounted with
/// `vcntq_u8`, and summed over the four rows in u8 lanes (at most
/// 4 rows × 4 bits = 16, far below 255).  `vst2q_u8` then writes the even and
/// odd accumulators interleaved, which is exactly pixel order.
///
/// Pixels beyond the last whole 16-byte chunk are delegated to
/// `aa_coverage_span_scalar`.
///
/// This kernel only works for even `x0` (a span starting on an odd pixel does
/// not start on a byte boundary); the dispatcher sends odd `x0` to the scalar
/// tier.
///
/// # Panics
///
/// Panics if a row holds fewer than 16 bytes at the offset of any chunk.
///
/// # Safety
///
/// Must be compiled for `target_arch = "aarch64"`.  NEON is mandatory on all
/// ARMv8-A targets covered by this cfg.  Caller must ensure:
/// - `x0` is even (enforced at call site; odd `x0` is redirected to scalar).
/// - `x0 + shape.len() ≤ bitmap_width` (precondition of `aa_coverage_span`).
#[target_feature(enable = "neon")]
unsafe fn aa_coverage_span_neon(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
    use std::arch::aarch64::{
        uint8x16x2_t, vaddq_u8, vandq_u8, vcntq_u8, vdupq_n_u8, vld1q_u8, vshrq_n_u8, vst2q_u8,
    };

    /// Row bytes consumed per iteration.
    const CHUNK_BYTES: usize = 16;
    /// Output pixels produced per iteration (two per row byte).
    const CHUNK_PIXELS: usize = 32;

    debug_assert!(x0 & 1 == 0, "aa_coverage_span_neon: x0={x0} must be even");

    let n = shape.len();
    let (byte_x0, n_chunks) = coverage_chunk_params(x0, n, CHUNK_BYTES);

    // SAFETY: NEON intrinsics are valid on every aarch64 target (cfg gate).
    // Each 16-byte load is preceded by an explicit bounds assertion, and each
    // 32-byte store targets either a full 32-byte output chunk or a local
    // 32-byte staging array.
    unsafe {
        let nibble_mask = vdupq_n_u8(0x0F);

        // `chunks_mut` hands out 32-pixel windows; only the final one can be
        // short (31 pixels, when `n` is odd).
        for (chunk_idx, out) in shape.chunks_mut(CHUNK_PIXELS).take(n_chunks).enumerate() {
            let byte_off = byte_x0 + chunk_idx * CHUNK_BYTES;

            let mut even_acc = vdupq_n_u8(0);
            let mut odd_acc = vdupq_n_u8(0);

            for row in rows {
                assert!(
                    byte_off + 16 <= row.len(),
                    "aa_coverage_span_neon: row too short: \
                     need {} bytes at offset {byte_off}, have {}",
                    byte_off + 16,
                    row.len(),
                );
                let packed = vld1q_u8(row[byte_off..].as_ptr());
                // Logical right-shift brings the high nibble down to bits 3–0.
                let even_nibbles = vandq_u8(vshrq_n_u8(packed, 4), nibble_mask);
                // The low nibble is already in place; just drop the high one.
                let odd_nibbles = vandq_u8(packed, nibble_mask);
                even_acc = vaddq_u8(even_acc, vcntq_u8(even_nibbles));
                odd_acc = vaddq_u8(odd_acc, vcntq_u8(odd_nibbles));
            }

            // vst2q_u8 emits [even[0], odd[0], even[1], odd[1], …], i.e. the
            // pixels in ascending order.
            let interleaved = uint8x16x2_t(even_acc, odd_acc);
            if out.len() == CHUNK_PIXELS {
                vst2q_u8(out.as_mut_ptr(), interleaved);
            } else {
                // Short final window: bounce through a staging array so the
                // 32-byte store cannot run past the end of `shape`.
                let mut staged = [0u8; CHUNK_PIXELS];
                vst2q_u8(staged.as_mut_ptr(), interleaved);
                out.copy_from_slice(&staged[..out.len()]);
            }
        }
    }

    // Whatever the whole chunks did not reach is finished by the scalar tier.
    let scalar_start = n_chunks * CHUNK_PIXELS;
    if scalar_start < n {
        aa_coverage_span_scalar(rows, x0 + scalar_start, &mut shape[scalar_start..]);
    }
}
179
// ── aarch64 SVE2 tier ────────────────────────────────────────────────────────
//
// Requires:
//   - Cargo feature `nightly-sve2`
//   - Rust nightly (stdarch_aarch64_sve is not yet stable)
//   - CPU `sve2` target feature at runtime
//
// With 128-bit SVE2 vectors (e.g. Neoverse N2, Neoverse V2 / Graviton4)
// svcntb() == 16, so one iteration covers the same 32 output pixels as the
// NEON tier. The kernel is vector-length agnostic: on an implementation with
// wider vectors (the architecture allows up to 2048 bits, i.e. svcntb() == 256)
// each iteration covers proportionally more pixels.
190
#[cfg(all(target_arch = "aarch64", feature = "nightly-sve2"))]
/// SVE2 tier for `aa_coverage_span`.
///
/// Each SVE2 vector of `vl` row bytes encodes `vl * 2` output pixels as
/// nibble pairs. High nibbles (even pixels) and low nibbles (odd pixels) are
/// extracted and popcounted independently with `svcnt_u8_z`. The two result
/// vectors are then stored to stack staging buffers and interleaved into
/// `shape` by a scalar loop.
///
/// Pixels not covered by whole `vl`-byte chunks are handled by
/// `aa_coverage_span_scalar`.
///
/// Odd `x0` is redirected to scalar by the caller (nibble boundaries are
/// byte-aligned); this function only checks that with a `debug_assert!`.
///
/// # Panics
///
/// Panics if any row holds fewer than `vl` bytes at the offset of a chunk.
///
/// # Safety
///
/// Caller must ensure the `sve2` CPU feature is available.
#[target_feature(enable = "sve2")]
unsafe fn aa_coverage_span_sve2(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
    use std::arch::aarch64::{
        svadd_u8_z, svand_u8_z, svcnt_u8_z, svcntb, svdup_n_u8, svld1_u8, svlsr_u8_z, svptrue_b8,
        svst1_u8,
    };

    debug_assert!(x0 & 1 == 0, "aa_coverage_span_sve2: x0={x0} must be even");

    // `vl` = bytes per SVE vector on this CPU = row bytes consumed per chunk.
    #[expect(
        clippy::cast_possible_truncation,
        reason = "aarch64 is 64-bit; svcntb() ≤ 256 fits in usize"
    )]
    let vl = svcntb() as usize;
    // AArch64 SVE architecture caps vector length at 2048 bits = 256 bytes,
    // so a fixed 256-byte stack buffer always fits `vl`. Using `Vec::with_capacity`
    // here would heap-allocate per call; the kernel is invoked per scanline.
    debug_assert!(
        vl <= 256,
        "aa_coverage_span_sve2: svcntb()={vl} exceeds SVE max of 256"
    );
    // All-true predicate: every byte lane participates in each operation.
    let pg = svptrue_b8();
    let mask_lo = svdup_n_u8(0x0F);
    let shift4 = svdup_n_u8(4u8);

    let n = shape.len();
    // `n_chunks` is derived from the rounded-up byte count, so for odd `n` the
    // last chunk can cover one pixel past the end of `shape`; the `< n` guards
    // in the interleave loop below drop that pixel.
    let (byte_x0, n_chunks) = coverage_chunk_params(x0, n, vl);

    // Staging buffers for svst1_u8 output — stack-allocated to the SVE max
    // (256 bytes); only the first `vl` bytes are written/read per chunk.
    let mut hi_buf = [0u8; 256];
    let mut lo_buf = [0u8; 256];

    for chunk_idx in 0..n_chunks {
        let byte_off = byte_x0 + chunk_idx * vl;

        let mut acc_hi = svdup_n_u8(0u8);
        let mut acc_lo = svdup_n_u8(0u8);

        for row in rows {
            // Hard assert (not debug-only): it is what makes the raw-pointer
            // load below in-bounds.
            assert!(
                byte_off + vl <= row.len(),
                "aa_coverage_span_sve2: row too short: \
                 need {} bytes at offset {byte_off}, have {}",
                byte_off + vl,
                row.len(),
            );
            // SAFETY: caller guarantees sve2 is available; ptr is in-bounds (assert above).
            let v = unsafe { svld1_u8(pg, row.as_ptr().add(byte_off)) };
            // High nibble: shift right 4, mask to low 4 bits.
            let hi = svand_u8_z(pg, svlsr_u8_z(pg, v, shift4), mask_lo);
            // Low nibble: mask directly.
            let lo = svand_u8_z(pg, v, mask_lo);
            // Accumulate per-nibble popcount across rows (max 4 rows × 4 bits = 16 ≤ u8::MAX).
            acc_hi = svadd_u8_z(pg, acc_hi, svcnt_u8_z(pg, hi));
            acc_lo = svadd_u8_z(pg, acc_lo, svcnt_u8_z(pg, lo));
        }

        // Store to staging buffers and interleave into shape:
        // shape[out_base + k*2] = hi_buf[k] (even pixel), shape[out_base + k*2+1] = lo_buf[k] (odd).
        // SAFETY: caller guarantees sve2 is available; bufs are vl bytes, matching pg width.
        unsafe {
            svst1_u8(pg, hi_buf.as_mut_ptr(), acc_hi);
            svst1_u8(pg, lo_buf.as_mut_ptr(), acc_lo);
        }

        // Each chunk produces `vl * 2` output pixels.
        let out_base = chunk_idx * vl * 2;
        for k in 0..vl {
            let even_px = out_base + k * 2;
            let odd_px = even_px + 1;
            if even_px < n {
                shape[even_px] = hi_buf[k];
            }
            if odd_px < n {
                shape[odd_px] = lo_buf[k];
            }
        }
    }

    // Scalar remainder for pixels not covered by complete SVE2 chunks.
    let scalar_start = n_chunks * vl * 2;
    if scalar_start < n {
        aa_coverage_span_scalar(rows, x0 + scalar_start, &mut shape[scalar_start..]);
    }
}
290
291// ── x86-64 AVX2 tier ──────────────────────────────────────────────────────────
292
293#[cfg(target_arch = "x86_64")]
294/// AVX2 tier for `aa_coverage_span`: 64 output pixels (32 row bytes) per iteration.
295///
296/// Each row byte encodes two consecutive pixels as nibbles.  VPSHUFB is used
297/// as a 16-entry parallel lookup table for nibble popcounts. Accumulation
298/// across the four rows stays in u8 (max 4 rows × 4 bits/nibble = 16 ≤ 255).
299///
300/// After the four-row accumulation, the 32-element `hi` (even pixels) and
301/// `lo` (odd pixels) u8 vectors are interleaved into `shape` by extracting
302/// 128-bit lanes and storing via a staging buffer.
303///
304/// Odd `x0` is redirected to scalar by the caller.
305///
306/// # Safety
307///
308/// Caller must ensure the `avx2` CPU feature is available.
309#[target_feature(enable = "avx2")]
310unsafe fn aa_coverage_span_avx2(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
311    use std::arch::x86_64::{
312        _mm256_add_epi8, _mm256_and_si256, _mm256_loadu_si256, _mm256_set_epi8, _mm256_set1_epi8,
313        _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi16, _mm256_storeu_si256,
314    };
315
316    debug_assert!(x0 & 1 == 0, "aa_coverage_span_avx2: x0={x0} must be even");
317
318    let n = shape.len();
319    let (byte_x0, n_chunks) = coverage_chunk_params(x0, n, 32);
320
321    // Nibble popcount LUT broadcast to both 128-bit lanes.
322    let lut = _mm256_set_epi8(
323        4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1,
324        1, 0,
325    );
326    let mask_lo = _mm256_set1_epi8(0x0F_u8.cast_signed());
327
328    for chunk_idx in 0..n_chunks {
329        let byte_off = byte_x0 + chunk_idx * 32;
330
331        let (mut acc_hi, mut acc_lo) = (_mm256_setzero_si256(), _mm256_setzero_si256());
332
333        for row in rows {
334            assert!(
335                byte_off + 32 <= row.len(),
336                "aa_coverage_span_avx2: row too short: \
337                 need {} bytes at offset {byte_off}, have {}",
338                byte_off + 32,
339                row.len(),
340            );
341            // SAFETY: byte_off + 32 ≤ row.len() asserted above.
342            let v = unsafe { _mm256_loadu_si256(row[byte_off..].as_ptr().cast()) };
343            // High nibble → bits 3:0 via right-shift, then mask.
344            let hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), mask_lo);
345            // Low nibble → bits 3:0 directly.
346            let lo = _mm256_and_si256(v, mask_lo);
347            // VPSHUFB lookup: nibble → popcount(nibble).
348            acc_hi = _mm256_add_epi8(acc_hi, _mm256_shuffle_epi8(lut, hi));
349            acc_lo = _mm256_add_epi8(acc_lo, _mm256_shuffle_epi8(lut, lo));
350        }
351
352        // Write the 32-element hi and lo vectors to a staging buffer, then
353        // interleave into shape: shape[out_base + 2k] = hi[k], shape[out_base + 2k+1] = lo[k].
354        let mut hi_buf = [0u8; 32];
355        let mut lo_buf = [0u8; 32];
356        // SAFETY: hi_buf / lo_buf are exactly 32 bytes.
357        unsafe {
358            _mm256_storeu_si256(hi_buf.as_mut_ptr().cast(), acc_hi);
359            _mm256_storeu_si256(lo_buf.as_mut_ptr().cast(), acc_lo);
360        }
361
362        let out_base = chunk_idx * 64;
363        for k in 0..32 {
364            let even_px = out_base + k * 2;
365            let odd_px = even_px + 1;
366            if even_px < n {
367                shape[even_px] = hi_buf[k];
368            }
369            if odd_px < n {
370                shape[odd_px] = lo_buf[k];
371            }
372        }
373    }
374
375    // Scalar remainder for any output pixels not covered by complete 32-byte chunks.
376    let scalar_start = n_chunks * 64;
377    if scalar_start < n {
378        aa_coverage_span_scalar(rows, x0 + scalar_start, &mut shape[scalar_start..]);
379    }
380}
381
382// ── aa_coverage_span AVX-512 BITALG tier ─────────────────────────────────────
383
384#[cfg(target_arch = "x86_64")]
385/// AVX-512 BITALG tier for `aa_coverage_span`.
386///
387/// Processes 128 output pixels (= 64 row bytes) per loop iteration.
388/// Each byte encodes two pixels as nibbles:
389/// - bits 7–4 (high nibble): even pixel `2k`
390/// - bits 3–0 (low  nibble): odd  pixel `2k+1`
391///
392/// Each nibble is isolated into its own byte lane before `_mm512_popcnt_epi8`,
393/// so the per-lane result equals the per-pixel coverage count (0..=4).  After
394/// accumulating across all four rows the two 64-lane buffers are interleaved
395/// into `shape`.
396///
397/// Odd `x0` is redirected to scalar by the caller.
398///
399/// # Safety
400///
401/// Caller must ensure `avx512bitalg` and `avx512bw` CPU features are available.
402#[target_feature(enable = "avx512bitalg,avx512bw")]
403unsafe fn aa_coverage_span_avx512(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
404    use std::arch::x86_64::{
405        _mm512_add_epi8, _mm512_and_si512, _mm512_loadu_si512, _mm512_popcnt_epi8,
406        _mm512_set1_epi8, _mm512_setzero_si512, _mm512_srli_epi16, _mm512_storeu_si512,
407    };
408
409    debug_assert!(x0 & 1 == 0, "aa_coverage_span_avx512: x0={x0} must be even");
410
411    let n = shape.len();
412    let (byte_x0, n_chunks) = coverage_chunk_params(x0, n, 64);
413
414    // 0x0F mask: after shifting or masking, isolates the low 4 bits of each byte.
415    // AVX-512 intrinsics are safe to call within a `#[target_feature]` function —
416    // the feature guarantee makes them non-unsafe in this context.
417    let mask_lo = _mm512_set1_epi8(0x0F_u8.cast_signed());
418
419    for chunk_idx in 0..n_chunks {
420        let byte_off = byte_x0 + chunk_idx * 64;
421
422        let (mut acc_hi, mut acc_lo) = (_mm512_setzero_si512(), _mm512_setzero_si512());
423
424        for row in rows {
425            assert!(
426                byte_off + 64 <= row.len(),
427                "aa_coverage_span_avx512: row too short: \
428                 need {} bytes at offset {byte_off}, have {}",
429                byte_off + 64,
430                row.len(),
431            );
432            // SAFETY: byte_off + 64 ≤ row.len() asserted above.
433            unsafe {
434                let v = _mm512_loadu_si512(row[byte_off..].as_ptr().cast());
435                // High nibble: arithmetic right-shift by 4, then mask off upper bits.
436                let hi = _mm512_and_si512(_mm512_srli_epi16(v, 4), mask_lo);
437                // Low nibble: mask directly.
438                let lo = _mm512_and_si512(v, mask_lo);
439                acc_hi = _mm512_add_epi8(acc_hi, _mm512_popcnt_epi8(hi));
440                acc_lo = _mm512_add_epi8(acc_lo, _mm512_popcnt_epi8(lo));
441            }
442        }
443
444        let mut hi_buf = [0u8; 64];
445        let mut lo_buf = [0u8; 64];
446        // SAFETY: buffers are exactly 64 bytes; unaligned stores are always valid.
447        unsafe {
448            _mm512_storeu_si512(hi_buf.as_mut_ptr().cast(), acc_hi);
449            _mm512_storeu_si512(lo_buf.as_mut_ptr().cast(), acc_lo);
450        }
451
452        // Interleave: even pixel k*2 ← hi_buf[k], odd pixel k*2+1 ← lo_buf[k].
453        let out_base = chunk_idx * 128;
454        for k in 0..64 {
455            let even_px = out_base + k * 2;
456            let odd_px = even_px + 1;
457            if even_px < n {
458                shape[even_px] = hi_buf[k];
459            }
460            if odd_px < n {
461                shape[odd_px] = lo_buf[k];
462            }
463        }
464    }
465
466    // Scalar remainder (< 64 row bytes = < 128 output pixels).
467    let scalar_start = n_chunks * 128;
468    if scalar_start < n {
469        aa_coverage_span_scalar(rows, x0 + scalar_start, &mut shape[scalar_start..]);
470    }
471}
472
473// ── Public dispatch ───────────────────────────────────────────────────────────
474
475/// Fill `shape[i]` with the AA coverage count (0..=16) for output pixel `x0 + i`.
476///
477/// `rows` are the four `AaBuf` sub-row byte slices, each of length
478/// `(bitmap_width * AA_SIZE + 7) / 8`.
479///
480/// # Preconditions
481///
482/// - `x0 + shape.len() ≤ bitmap_width` — span must not exceed the bitmap row.
483/// - `x0` should be **even** for the SIMD tiers to be used.  An odd `x0` is
484///   correctly handled by the scalar tier (full precision, lower throughput).
485///
486/// # Panics
487///
488/// Panics if any row slice is shorter than required by the span — i.e. if the
489/// `x0 + shape.len() ≤ bitmap_width` precondition is violated.
490pub fn aa_coverage_span(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
491    if shape.is_empty() {
492        return;
493    }
494    dispatch_coverage(rows, x0, shape);
495}
496
497#[cfg(target_arch = "x86_64")]
498#[inline]
499fn dispatch_coverage(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
500    if x0 & 1 != 0 {
501        // Odd x0: nibble boundaries are byte-aligned; SIMD paths require even x0.
502        aa_coverage_span_scalar(rows, x0, shape);
503        return;
504    }
505    if is_x86_feature_detected!("avx512bitalg") && is_x86_feature_detected!("avx512bw") {
506        // SAFETY: both features confirmed present.
507        unsafe { aa_coverage_span_avx512(rows, x0, shape) };
508    } else if is_x86_feature_detected!("avx2") {
509        // SAFETY: avx2 confirmed present.
510        unsafe { aa_coverage_span_avx2(rows, x0, shape) };
511    } else {
512        aa_coverage_span_scalar(rows, x0, shape);
513    }
514}
515
#[cfg(target_arch = "aarch64")]
/// aarch64 tier selection: SVE2 (only with the `nightly-sve2` feature and
/// runtime support), then NEON; scalar for odd `x0`.
///
/// The SIMD kernels need an even `x0` (a span starting on an odd pixel does
/// not start on a byte boundary), so an odd `x0` always takes the scalar tier.
#[inline]
fn dispatch_coverage(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
    let x0_is_odd = x0 % 2 == 1;
    if x0_is_odd {
        aa_coverage_span_scalar(rows, x0, shape);
        return;
    }

    #[cfg(feature = "nightly-sve2")]
    if std::arch::is_aarch64_feature_detected!("sve2") {
        // SAFETY: sve2 feature confirmed present.
        unsafe { aa_coverage_span_sve2(rows, x0, shape) };
        return;
    }

    // No runtime check needed here: NEON is mandatory on all ARMv8-A targets.
    // SAFETY: aarch64 always has NEON.
    unsafe { aa_coverage_span_neon(rows, x0, shape) };
}
534
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
/// Tier selection for every other architecture: there is no SIMD kernel, so
/// the span always goes to the scalar implementation (even and odd `x0` alike).
#[inline]
fn dispatch_coverage(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
    aa_coverage_span_scalar(rows, x0, shape);
}
540
541// ── Tests ─────────────────────────────────────────────────────────────────────
542
#[cfg(test)]
mod tests {
    use super::*;

    // ── Helpers ───────────────────────────────────────────────────────────────

    /// Build four row buffers from a `[[u8; N]; 4]` literal.
    fn make_rows<const N: usize>(data: [[u8; N]; 4]) -> [Vec<u8>; 4] {
        data.map(|r| r.to_vec())
    }

    /// Build four deterministic pseudo-random rows of `row_bytes` bytes for
    /// dispatch-vs-scalar cross-check tests. Three rows use distinct hash
    /// schedules `(mul, add)`; the fourth row is `!i`. Stays in `u8` arithmetic
    /// throughout — no truncating cast from `usize`.
    ///
    /// The byte index wraps every 256 bytes, so any `row_bytes` is accepted.
    /// (An open-ended `(0u8..)` range would overflow on its 256th element and
    /// panic in debug builds.)
    fn dispatch_test_rows(row_bytes: usize, schedules: [(u8, u8); 3]) -> [Vec<u8>; 4] {
        let indices = || (0..=u8::MAX).cycle().take(row_bytes);
        let mk = |mul: u8, add: u8| -> Vec<u8> {
            indices()
                .map(|i| i.wrapping_mul(mul).wrapping_add(add))
                .collect()
        };
        let [(m0, a0), (m1, a1), (m2, a2)] = schedules;
        let r3: Vec<u8> = indices().map(|i| !i).collect();
        [mk(m0, a0), mk(m1, a1), mk(m2, a2), r3]
    }

    /// Schedule used by all cross-checks below — three distinct `(mul, add)`
    /// pairs feed the [`dispatch_test_rows`] helper.
    const TIER_SCHEDULES: [(u8, u8); 3] = [(37, 11), (53, 7), (17, 3)];

    /// Baseline output-pixel span for the cross-checks. 300 leaves a
    /// non-aligned tail in every tier (AVX-512: 2 × 64-byte chunks + 22-byte
    /// scalar; AVX2: 4 × 32-byte chunks + 44-px scalar; NEON: 9 × 16-byte
    /// chunks + 6-byte scalar).
    const TIER_TEST_N: usize = 300;

    /// `(x0, n)` spans every kernel is checked against (`x0` always even):
    ///
    /// - `(0, 300)`  — whole chunks plus a scalar tail;
    /// - `(0, 255)`  — odd length whose byte count (128) is a whole number of
    ///   16/32/64-byte chunks, so the last chunk is one pixel short;
    /// - `(6, 129)`  — non-zero even start plus a one-pixel scalar tail;
    /// - `(0, 1031)` — rows longer than 256 bytes.
    const TIER_CASES: [(usize, usize); 4] = [(0, TIER_TEST_N), (0, 255), (6, 129), (0, 1031)];

    /// Run `kernel` over every span in [`TIER_CASES`] and compare the result
    /// with `aa_coverage_span_scalar`.
    fn assert_kernel_matches_scalar(name: &str, kernel: impl Fn([&[u8]; 4], usize, &mut [u8])) {
        for (x0, n) in TIER_CASES {
            let rows = dispatch_test_rows((x0 + n).div_ceil(2), TIER_SCHEDULES);
            let row_refs: [&[u8]; 4] = [&rows[0], &rows[1], &rows[2], &rows[3]];

            let mut expected = vec![0u8; n];
            aa_coverage_span_scalar(row_refs, x0, &mut expected);

            // Pre-fill with an impossible count so an unwritten pixel is caught.
            let mut got = vec![0xFFu8; n];
            kernel(row_refs, x0, &mut got);

            assert_eq!(
                got, expected,
                "{name} coverage mismatch vs scalar (x0={x0}, n={n})"
            );
        }
    }

    // ── coverage_chunk_params ─────────────────────────────────────────────────

    #[test]
    fn chunk_params_round_odd_spans_up() {
        assert_eq!(coverage_chunk_params(0, 300, 64), (0, 2));
        assert_eq!(coverage_chunk_params(6, 129, 32), (3, 2));
        // 63 pixels touch 32 row bytes: one chunk, whose last pixel is clipped.
        assert_eq!(coverage_chunk_params(0, 63, 32), (0, 1));
        assert_eq!(coverage_chunk_params(0, 62, 32), (0, 0));
        assert_eq!(coverage_chunk_params(0, 0, 16), (0, 0));
    }

    // ── aa_coverage_span ──────────────────────────────────────────────────────

    #[test]
    fn coverage_span_all_zero() {
        let rows = make_rows([[0u8; 4]; 4]);
        let mut shape = [0xFFu8; 8];
        aa_coverage_span([&rows[0], &rows[1], &rows[2], &rows[3]], 0, &mut shape);
        assert_eq!(shape, [0u8; 8]);
    }

    #[test]
    fn coverage_span_all_ones() {
        // 0xFF: high nibble 0xF (4 bits) + low nibble 0xF (4 bits) → 8 output pixels.
        // 4 rows × 4 bits/nibble = 16 per pixel.
        let rows = make_rows([[0xFFu8; 4]; 4]);
        let mut shape = [0u8; 8];
        aa_coverage_span([&rows[0], &rows[1], &rows[2], &rows[3]], 0, &mut shape);
        assert_eq!(shape, [16u8; 8]);
    }

    #[test]
    fn coverage_span_single_pixel_even() {
        // Pixel 0 = high nibble of byte 0.  Set only that nibble in row 0.
        let rows = [
            vec![0xF0u8, 0, 0, 0],
            vec![0u8; 4],
            vec![0u8; 4],
            vec![0u8; 4],
        ];
        let mut shape = [0u8; 2];
        aa_coverage_span([&rows[0], &rows[1], &rows[2], &rows[3]], 0, &mut shape);
        assert_eq!(shape, [4, 0]);
    }

    #[test]
    fn coverage_span_single_pixel_odd() {
        // Pixel 1 = low nibble of byte 0.  Set only that nibble in row 0.
        let rows = [
            vec![0x0Fu8, 0, 0, 0],
            vec![0u8; 4],
            vec![0u8; 4],
            vec![0u8; 4],
        ];
        let mut shape = [0u8; 2];
        aa_coverage_span([&rows[0], &rows[1], &rows[2], &rows[3]], 0, &mut shape);
        assert_eq!(shape, [0, 4]);
    }

    #[test]
    fn coverage_span_x0_offset() {
        // x0=2: pixel 2 = high nibble of byte 1.
        // Row 0 byte 1 high nibble = 0xA = 0b1010 = 2 bits.
        // Row 1 byte 1 high nibble = 0x5 = 0b0101 = 2 bits.  Total = 4.
        let rows = [
            vec![0u8, 0xA0, 0, 0],
            vec![0u8, 0x50, 0, 0],
            vec![0u8; 4],
            vec![0u8; 4],
        ];
        let mut shape = [0u8; 1];
        aa_coverage_span([&rows[0], &rows[1], &rows[2], &rows[3]], 2, &mut shape);
        assert_eq!(shape[0], 4);
    }

    #[test]
    fn coverage_span_odd_x0_matches_scalar() {
        // Odd x0=1: SIMD tiers must fall back to scalar; result must still be correct.
        const N: usize = 10;
        let row_bytes = (1 + N).div_ceil(2); // bytes needed for pixels 1..=10
        let rows = dispatch_test_rows(row_bytes, [(0x37, 0), (0x53, 0), (0x17, 0)]);

        let mut expected = vec![0u8; N];
        aa_coverage_span_scalar([&rows[0], &rows[1], &rows[2], &rows[3]], 1, &mut expected);

        let mut got = vec![0u8; N];
        aa_coverage_span([&rows[0], &rows[1], &rows[2], &rows[3]], 1, &mut got);

        assert_eq!(got, expected, "odd x0 result mismatch");
    }

    #[test]
    fn coverage_span_dispatch_matches_scalar() {
        // Whatever tier the dispatcher picks on this machine must agree with
        // the scalar reference on every span in `TIER_CASES`.
        assert_kernel_matches_scalar("dispatch", aa_coverage_span);
    }

    #[test]
    fn coverage_span_empty_is_noop() {
        let row = vec![0xFFu8; 4];
        let mut shape: [u8; 0] = [];
        aa_coverage_span([&row, &row, &row, &row], 0, &mut shape); // must not panic
    }

    // ── Per-tier cross-checks ─────────────────────────────────────────────────

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn avx512_coverage_matches_scalar() {
        if !is_x86_feature_detected!("avx512bitalg") || !is_x86_feature_detected!("avx512bw") {
            return;
        }
        assert_kernel_matches_scalar("AVX-512", |rows, x0, shape| {
            // SAFETY: both features confirmed present above.
            unsafe { aa_coverage_span_avx512(rows, x0, shape) }
        });
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn avx2_coverage_matches_scalar() {
        if !is_x86_feature_detected!("avx2") {
            return;
        }
        assert_kernel_matches_scalar("AVX2", |rows, x0, shape| {
            // SAFETY: avx2 confirmed present above.
            unsafe { aa_coverage_span_avx2(rows, x0, shape) }
        });
    }

    #[cfg(all(target_arch = "aarch64", feature = "nightly-sve2"))]
    #[test]
    fn sve2_coverage_matches_scalar() {
        if !std::arch::is_aarch64_feature_detected!("sve2") {
            return;
        }
        assert_kernel_matches_scalar("SVE2", |rows, x0, shape| {
            // SAFETY: sve2 confirmed present above.
            unsafe { aa_coverage_span_sve2(rows, x0, shape) }
        });
    }

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_coverage_matches_scalar() {
        assert_kernel_matches_scalar("NEON", |rows, x0, shape| {
            // SAFETY: aarch64 always has NEON.
            unsafe { aa_coverage_span_neon(rows, x0, shape) }
        });
    }
}