rasterrocket_render/simd/aa_coverage.rs
1//! Per-pixel AA coverage counts for `AaBuf` rows.
2//!
3//! Single public entry point:
4//!
5//! - `aa_coverage_span(rows, x0, shape)` — fills a `shape` buffer with
6//! per-pixel AA coverage counts for output pixels `x0 .. x0+shape.len()`.
7//! This is the hot path called from `draw_aa_line` in `fill/mod.rs`.
8//! Each output pixel maps to 4 bits (one nibble) in each of the 4 `AaBuf`
9//! rows; `aa_coverage_span` sums those nibbles across rows for every pixel
10//! in the span in one vectorised pass.
11//!
12//! # `AaBuf` nibble layout
13//!
14//! For `AA_SIZE = 4`, output pixel `x` occupies the nibble at byte `x/2` of
15//! each row: the **high** nibble if `x` is even, the **low** nibble if `x` is
16//! odd. Each nibble holds 0–4 set bits (one per AA sub-sample). Summing the
17//! four rows gives a coverage count in 0..=16.
18//!
19//! # Acceleration tiers for `aa_coverage_span`
20//!
21//! ## x86-64 (most to least preferred)
22//! 1. **AVX-512 BITALG** (`avx512bitalg` + `avx512bw`): `_mm512_popcnt_epi8` on
23//! nibble-isolated bytes, 128 output pixels per 64-byte iteration.
24//! 2. **AVX2** (`avx2`): VPSHUFB nibble lookup, 64 output pixels per 32-byte iteration.
25//! 3. **Scalar**: byte-by-byte nibble lookup via `NIBBLE_POP` table.
26//!
27//! ## aarch64 (most to least preferred)
28//! 1. **SVE2** (`nightly-sve2` feature + `sve2` target feature): nibble-isolated `svcnt_u8_z`,
29//! `svcntb()*2` output pixels per iteration. Requires nightly Rust and `sve2` CPU feature.
30//! 2. **NEON**: nibble-isolated `vcntq_u8`, 32 output pixels per 16-byte iteration.
31//! High/low nibbles extracted with `vshrq_n_u8` + `vandq_u8`; four-row
32//! accumulation in u8 (max 16 ≤ 255); interleaved into `shape` via `vst2q_u8`.
33//! NEON is mandatory on all ARMv8-A cores; no runtime detection is needed.
34
35// ── Scalar helpers ────────────────────────────────────────────────────────────
36
37/// Nibble popcount table: `NIBBLE_POP[n]` = number of set bits in `n` (0..=4).
38const NIBBLE_POP: [u8; 16] = [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4];
39
40/// Scalar fallback for `aa_coverage_span`.
41///
42/// Writes the coverage count (0..=16) for output pixel `x0 + i` into `shape[i]`
43/// by looking up the two nibbles (high = even pixel, low = odd pixel) of each
44/// packed row byte in `NIBBLE_POP` and summing across the four rows.
45///
46/// # Panics
47///
48/// Panics if any byte index derived from `x0 + i` is out of bounds for a row
49/// slice — i.e. if the caller's precondition `x0 + shape.len() ≤ bitmap_width`
50/// is violated.
51pub(super) fn aa_coverage_span_scalar(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
52 for (i, out) in shape.iter_mut().enumerate() {
53 let x = x0 + i;
54 let byte_idx = x >> 1;
55 let is_odd = (x & 1) != 0;
56 let mut count = 0u8;
57 for row in rows {
58 debug_assert!(
59 byte_idx < row.len(),
60 "aa_coverage_span_scalar: byte_idx={byte_idx} out of bounds (row.len={})",
61 row.len()
62 );
63 let byte = row[byte_idx];
64 let nibble = if is_odd { byte & 0x0f } else { byte >> 4 };
65 count += NIBBLE_POP[nibble as usize];
66 }
67 *out = count;
68 }
69}
70
71// ── Shared chunk-parameter computation ───────────────────────────────────────
72
/// Shared span arithmetic for the SIMD coverage kernels.
///
/// Returns `(byte_x0, n_chunks)` where
/// - `byte_x0` is the index of the row byte holding pixel `x0` (`x0 / 2`), and
/// - `n_chunks` is how many whole groups of `chunk_bytes` row bytes fit in the
///   `ceil(n / 2)` row bytes touched by a span of `n` output pixels.
///
/// Each chunk corresponds to `chunk_bytes * 2` output pixels. Because the byte
/// count is rounded up, an odd `n` can make the final chunk cover one pixel
/// more than the span; kernels must clamp their stores accordingly.
///
/// Keeping this in one place stops the per-architecture kernels from drifting
/// apart.
///
/// # Panics
///
/// Panics if `chunk_bytes` is zero: via the debug assertion in debug builds,
/// and via integer division by zero otherwise.
#[inline]
fn coverage_chunk_params(x0: usize, n: usize, chunk_bytes: usize) -> (usize, usize) {
    debug_assert!(
        chunk_bytes > 0,
        "coverage_chunk_params: chunk_bytes must be > 0"
    );
    // Two pixels per row byte; an odd pixel count still touches a final,
    // half-used byte, hence the rounding up.
    let span_bytes = n.div_ceil(2);
    // Truncating division keeps complete chunks only.
    (x0 / 2, span_bytes / chunk_bytes)
}
95
96// ── aarch64 NEON tier ─────────────────────────────────────────────────────────
97
#[cfg(target_arch = "aarch64")]
/// NEON kernel for `aa_coverage_span`: consumes 16 row bytes (32 output
/// pixels) per iteration.
///
/// A row byte packs two neighbouring output pixels:
/// - bits 7–4 hold even pixel `2k`, isolated with `vshrq_n_u8(v, 4)` and a
///   `0x0F` mask;
/// - bits 3–0 hold odd pixel `2k+1`, isolated with the `0x0F` mask alone.
///
/// `vcntq_u8` popcounts each isolated nibble. The per-row counts are added in
/// u8 lanes, which cannot overflow (4 rows × 4 bits = 16). `vst2q_u8` then
/// writes the even and odd accumulators interleaved, i.e. in pixel order.
///
/// When the span length is odd the last chunk may cover one pixel more than
/// `shape` holds; that chunk is stored through a 32-byte staging array and
/// only the in-range prefix is copied out. Pixels beyond the last complete
/// chunk are delegated to `aa_coverage_span_scalar`.
///
/// This kernel only works for even `x0`, because it consumes whole row bytes;
/// the dispatcher sends odd `x0` to the scalar path.
///
/// # Safety
///
/// Only compiled for `target_arch = "aarch64"`, where NEON is always present.
/// The caller must additionally guarantee:
/// - `x0` is even;
/// - `x0 + shape.len() ≤ bitmap_width` (the `aa_coverage_span` precondition).
///
/// # Panics
///
/// Panics if a row is shorter than a 16-byte chunk load requires.
#[target_feature(enable = "neon")]
unsafe fn aa_coverage_span_neon(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
    use std::arch::aarch64::{
        uint8x16x2_t, vaddq_u8, vandq_u8, vcntq_u8, vdupq_n_u8, vld1q_u8, vshrq_n_u8, vst2q_u8,
    };

    debug_assert!(x0 & 1 == 0, "aa_coverage_span_neon: x0={x0} must be even");

    let n = shape.len();
    let (byte_x0, n_chunks) = coverage_chunk_params(x0, n, 16);

    // SAFETY: NEON is available on every aarch64 target (cfg gate above).
    // Each 16-byte load is preceded by an assertion that the row holds those
    // bytes, and each 32-byte store targets either a destination slice checked
    // to be at least 32 bytes long or a local 32-byte staging array.
    unsafe {
        let nibble_mask = vdupq_n_u8(0x0F);

        for chunk_idx in 0..n_chunks {
            let byte_off = byte_x0 + chunk_idx * 16;

            let mut even_acc = vdupq_n_u8(0);
            let mut odd_acc = vdupq_n_u8(0);

            for row in rows {
                assert!(
                    byte_off + 16 <= row.len(),
                    "aa_coverage_span_neon: row too short: \
                     need {} bytes at offset {byte_off}, have {}",
                    byte_off + 16,
                    row.len(),
                );
                let packed = vld1q_u8(row[byte_off..].as_ptr());
                // Logical right shift moves the high nibble down to bits 3–0.
                let even_nibbles = vandq_u8(vshrq_n_u8(packed, 4), nibble_mask);
                // The low nibble only needs masking.
                let odd_nibbles = vandq_u8(packed, nibble_mask);
                even_acc = vaddq_u8(even_acc, vcntq_u8(even_nibbles));
                odd_acc = vaddq_u8(odd_acc, vcntq_u8(odd_nibbles));
            }

            // A two-vector interleaving store emits
            // [even[0], odd[0], even[1], odd[1], …] = [px0, px1, px2, px3, …].
            let interleaved = uint8x16x2_t(even_acc, odd_acc);
            let dst = &mut shape[chunk_idx * 32..];
            if dst.len() < 32 {
                // Final chunk of an odd-length span: stage the full 32 bytes
                // locally so nothing is written past the end of `shape`.
                let mut staging = [0u8; 32];
                vst2q_u8(staging.as_mut_ptr(), interleaved);
                let tail_len = dst.len();
                dst.copy_from_slice(&staging[..tail_len]);
            } else {
                vst2q_u8(dst.as_mut_ptr(), interleaved);
            }
        }
    }

    // Whatever the complete chunks did not reach is finished by the scalar path.
    let scalar_start = n_chunks * 32;
    if scalar_start < n {
        aa_coverage_span_scalar(rows, x0 + scalar_start, &mut shape[scalar_start..]);
    }
}
179
180// ── aarch64 SVE2 tiers ───────────────────────────────────────────────────────
181//
182// Requires:
183// - Cargo feature `nightly-sve2`
184// - Rust nightly (stdarch_aarch64_sve is not yet stable)
185// - CPU `sve2` target feature at runtime
186//
// Throughput scales with the hardware vector length: on 128-bit SVE2
// implementations svcntb() == 16, i.e. the same 16 row bytes per iteration as
// the NEON tier; implementations with wider vectors report a larger svcntb()
// (architectural maximum 256) and process proportionally more pixels per
// iteration.
190
#[cfg(all(target_arch = "aarch64", feature = "nightly-sve2"))]
/// SVE2 kernel for `aa_coverage_span`.
///
/// One vector of `vl = svcntb()` row bytes covers `vl * 2` output pixels. The
/// high nibbles (even pixels) and low nibbles (odd pixels) are isolated into
/// separate vectors, popcounted with `svcnt_u8_z`, and accumulated over the
/// four rows. Both accumulators are spilled to stack buffers and interleaved
/// into `shape` in pixel order; an odd-length span simply drops the final odd
/// lane. Pixels beyond the last complete chunk go to
/// `aa_coverage_span_scalar`.
///
/// Only even `x0` is supported because whole row bytes are consumed; the
/// dispatcher routes odd `x0` to the scalar path.
///
/// # Safety
///
/// Caller must ensure the `sve2` CPU feature is available.
///
/// # Panics
///
/// Panics if a row is shorter than a `vl`-byte chunk load requires.
#[target_feature(enable = "sve2")]
unsafe fn aa_coverage_span_sve2(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
    use std::arch::aarch64::{
        svadd_u8_z, svand_u8_z, svcnt_u8_z, svcntb, svdup_n_u8, svld1_u8, svlsr_u8_z, svptrue_b8,
        svst1_u8,
    };

    debug_assert!(x0 & 1 == 0, "aa_coverage_span_sve2: x0={x0} must be even");

    #[expect(
        clippy::cast_possible_truncation,
        reason = "aarch64 is 64-bit; svcntb() ≤ 256 fits in usize"
    )]
    let vl = svcntb() as usize;
    // The SVE architecture caps the vector length at 2048 bits = 256 bytes, so
    // the fixed 256-byte stack buffers below always hold one full vector and
    // no per-call heap allocation is needed.
    debug_assert!(
        vl <= 256,
        "aa_coverage_span_sve2: svcntb()={vl} exceeds SVE max of 256"
    );

    let n = shape.len();
    let (byte_x0, n_chunks) = coverage_chunk_params(x0, n, vl);

    let pg = svptrue_b8();
    let nibble_mask = svdup_n_u8(0x0F);
    let shift4 = svdup_n_u8(4u8);

    // Spill targets for the two accumulators; only the first `vl` bytes of
    // each are written and read per chunk.
    let mut even_counts = [0u8; 256];
    let mut odd_counts = [0u8; 256];

    for chunk_idx in 0..n_chunks {
        let byte_off = byte_x0 + chunk_idx * vl;

        let mut even_acc = svdup_n_u8(0u8);
        let mut odd_acc = svdup_n_u8(0u8);

        for row in rows {
            assert!(
                byte_off + vl <= row.len(),
                "aa_coverage_span_sve2: row too short: \
                 need {} bytes at offset {byte_off}, have {}",
                byte_off + vl,
                row.len(),
            );
            // SAFETY: caller guarantees sve2 is available; the assertion above
            // keeps the `vl`-byte load inside the row.
            let packed = unsafe { svld1_u8(pg, row.as_ptr().add(byte_off)) };
            // High nibble: logical shift right by 4, then keep the low 4 bits.
            let even_nibbles = svand_u8_z(pg, svlsr_u8_z(pg, packed, shift4), nibble_mask);
            // Low nibble: mask only.
            let odd_nibbles = svand_u8_z(pg, packed, nibble_mask);
            // u8 lanes cannot overflow: at most 4 rows × 4 bits = 16.
            even_acc = svadd_u8_z(pg, even_acc, svcnt_u8_z(pg, even_nibbles));
            odd_acc = svadd_u8_z(pg, odd_acc, svcnt_u8_z(pg, odd_nibbles));
        }

        // SAFETY: caller guarantees sve2 is available; each store writes `vl`
        // bytes into a 256-byte buffer and `vl` never exceeds 256.
        unsafe {
            svst1_u8(pg, even_counts.as_mut_ptr(), even_acc);
            svst1_u8(pg, odd_counts.as_mut_ptr(), odd_acc);
        }

        // Interleave lane k into pixels 2k (even) and 2k+1 (odd). The last
        // pair is a single element when the span length is odd, in which case
        // the odd lane is dropped.
        let lanes = even_counts[..vl].iter().zip(&odd_counts[..vl]);
        for (pair, (&even, &odd)) in shape[chunk_idx * vl * 2..].chunks_mut(2).zip(lanes) {
            pair[0] = even;
            if let Some(odd_px) = pair.get_mut(1) {
                *odd_px = odd;
            }
        }
    }

    // Whatever the complete chunks did not reach is finished by the scalar path.
    let scalar_start = n_chunks * vl * 2;
    if scalar_start < n {
        aa_coverage_span_scalar(rows, x0 + scalar_start, &mut shape[scalar_start..]);
    }
}
290
291// ── x86-64 AVX2 tier ──────────────────────────────────────────────────────────
292
293#[cfg(target_arch = "x86_64")]
294/// AVX2 tier for `aa_coverage_span`: 64 output pixels (32 row bytes) per iteration.
295///
296/// Each row byte encodes two consecutive pixels as nibbles. VPSHUFB is used
297/// as a 16-entry parallel lookup table for nibble popcounts. Accumulation
298/// across the four rows stays in u8 (max 4 rows × 4 bits/nibble = 16 ≤ 255).
299///
300/// After the four-row accumulation, the 32-element `hi` (even pixels) and
301/// `lo` (odd pixels) u8 vectors are interleaved into `shape` by extracting
302/// 128-bit lanes and storing via a staging buffer.
303///
304/// Odd `x0` is redirected to scalar by the caller.
305///
306/// # Safety
307///
308/// Caller must ensure the `avx2` CPU feature is available.
309#[target_feature(enable = "avx2")]
310unsafe fn aa_coverage_span_avx2(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
311 use std::arch::x86_64::{
312 _mm256_add_epi8, _mm256_and_si256, _mm256_loadu_si256, _mm256_set_epi8, _mm256_set1_epi8,
313 _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi16, _mm256_storeu_si256,
314 };
315
316 debug_assert!(x0 & 1 == 0, "aa_coverage_span_avx2: x0={x0} must be even");
317
318 let n = shape.len();
319 let (byte_x0, n_chunks) = coverage_chunk_params(x0, n, 32);
320
321 // Nibble popcount LUT broadcast to both 128-bit lanes.
322 let lut = _mm256_set_epi8(
323 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1,
324 1, 0,
325 );
326 let mask_lo = _mm256_set1_epi8(0x0F_u8.cast_signed());
327
328 for chunk_idx in 0..n_chunks {
329 let byte_off = byte_x0 + chunk_idx * 32;
330
331 let (mut acc_hi, mut acc_lo) = (_mm256_setzero_si256(), _mm256_setzero_si256());
332
333 for row in rows {
334 assert!(
335 byte_off + 32 <= row.len(),
336 "aa_coverage_span_avx2: row too short: \
337 need {} bytes at offset {byte_off}, have {}",
338 byte_off + 32,
339 row.len(),
340 );
341 // SAFETY: byte_off + 32 ≤ row.len() asserted above.
342 let v = unsafe { _mm256_loadu_si256(row[byte_off..].as_ptr().cast()) };
343 // High nibble → bits 3:0 via right-shift, then mask.
344 let hi = _mm256_and_si256(_mm256_srli_epi16(v, 4), mask_lo);
345 // Low nibble → bits 3:0 directly.
346 let lo = _mm256_and_si256(v, mask_lo);
347 // VPSHUFB lookup: nibble → popcount(nibble).
348 acc_hi = _mm256_add_epi8(acc_hi, _mm256_shuffle_epi8(lut, hi));
349 acc_lo = _mm256_add_epi8(acc_lo, _mm256_shuffle_epi8(lut, lo));
350 }
351
352 // Write the 32-element hi and lo vectors to a staging buffer, then
353 // interleave into shape: shape[out_base + 2k] = hi[k], shape[out_base + 2k+1] = lo[k].
354 let mut hi_buf = [0u8; 32];
355 let mut lo_buf = [0u8; 32];
356 // SAFETY: hi_buf / lo_buf are exactly 32 bytes.
357 unsafe {
358 _mm256_storeu_si256(hi_buf.as_mut_ptr().cast(), acc_hi);
359 _mm256_storeu_si256(lo_buf.as_mut_ptr().cast(), acc_lo);
360 }
361
362 let out_base = chunk_idx * 64;
363 for k in 0..32 {
364 let even_px = out_base + k * 2;
365 let odd_px = even_px + 1;
366 if even_px < n {
367 shape[even_px] = hi_buf[k];
368 }
369 if odd_px < n {
370 shape[odd_px] = lo_buf[k];
371 }
372 }
373 }
374
375 // Scalar remainder for any output pixels not covered by complete 32-byte chunks.
376 let scalar_start = n_chunks * 64;
377 if scalar_start < n {
378 aa_coverage_span_scalar(rows, x0 + scalar_start, &mut shape[scalar_start..]);
379 }
380}
381
382// ── aa_coverage_span AVX-512 BITALG tier ─────────────────────────────────────
383
384#[cfg(target_arch = "x86_64")]
385/// AVX-512 BITALG tier for `aa_coverage_span`.
386///
387/// Processes 128 output pixels (= 64 row bytes) per loop iteration.
388/// Each byte encodes two pixels as nibbles:
389/// - bits 7–4 (high nibble): even pixel `2k`
390/// - bits 3–0 (low nibble): odd pixel `2k+1`
391///
392/// Each nibble is isolated into its own byte lane before `_mm512_popcnt_epi8`,
393/// so the per-lane result equals the per-pixel coverage count (0..=4). After
394/// accumulating across all four rows the two 64-lane buffers are interleaved
395/// into `shape`.
396///
397/// Odd `x0` is redirected to scalar by the caller.
398///
399/// # Safety
400///
401/// Caller must ensure `avx512bitalg` and `avx512bw` CPU features are available.
402#[target_feature(enable = "avx512bitalg,avx512bw")]
403unsafe fn aa_coverage_span_avx512(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
404 use std::arch::x86_64::{
405 _mm512_add_epi8, _mm512_and_si512, _mm512_loadu_si512, _mm512_popcnt_epi8,
406 _mm512_set1_epi8, _mm512_setzero_si512, _mm512_srli_epi16, _mm512_storeu_si512,
407 };
408
409 debug_assert!(x0 & 1 == 0, "aa_coverage_span_avx512: x0={x0} must be even");
410
411 let n = shape.len();
412 let (byte_x0, n_chunks) = coverage_chunk_params(x0, n, 64);
413
414 // 0x0F mask: after shifting or masking, isolates the low 4 bits of each byte.
415 // AVX-512 intrinsics are safe to call within a `#[target_feature]` function —
416 // the feature guarantee makes them non-unsafe in this context.
417 let mask_lo = _mm512_set1_epi8(0x0F_u8.cast_signed());
418
419 for chunk_idx in 0..n_chunks {
420 let byte_off = byte_x0 + chunk_idx * 64;
421
422 let (mut acc_hi, mut acc_lo) = (_mm512_setzero_si512(), _mm512_setzero_si512());
423
424 for row in rows {
425 assert!(
426 byte_off + 64 <= row.len(),
427 "aa_coverage_span_avx512: row too short: \
428 need {} bytes at offset {byte_off}, have {}",
429 byte_off + 64,
430 row.len(),
431 );
432 // SAFETY: byte_off + 64 ≤ row.len() asserted above.
433 unsafe {
434 let v = _mm512_loadu_si512(row[byte_off..].as_ptr().cast());
435 // High nibble: arithmetic right-shift by 4, then mask off upper bits.
436 let hi = _mm512_and_si512(_mm512_srli_epi16(v, 4), mask_lo);
437 // Low nibble: mask directly.
438 let lo = _mm512_and_si512(v, mask_lo);
439 acc_hi = _mm512_add_epi8(acc_hi, _mm512_popcnt_epi8(hi));
440 acc_lo = _mm512_add_epi8(acc_lo, _mm512_popcnt_epi8(lo));
441 }
442 }
443
444 let mut hi_buf = [0u8; 64];
445 let mut lo_buf = [0u8; 64];
446 // SAFETY: buffers are exactly 64 bytes; unaligned stores are always valid.
447 unsafe {
448 _mm512_storeu_si512(hi_buf.as_mut_ptr().cast(), acc_hi);
449 _mm512_storeu_si512(lo_buf.as_mut_ptr().cast(), acc_lo);
450 }
451
452 // Interleave: even pixel k*2 ← hi_buf[k], odd pixel k*2+1 ← lo_buf[k].
453 let out_base = chunk_idx * 128;
454 for k in 0..64 {
455 let even_px = out_base + k * 2;
456 let odd_px = even_px + 1;
457 if even_px < n {
458 shape[even_px] = hi_buf[k];
459 }
460 if odd_px < n {
461 shape[odd_px] = lo_buf[k];
462 }
463 }
464 }
465
466 // Scalar remainder (< 64 row bytes = < 128 output pixels).
467 let scalar_start = n_chunks * 128;
468 if scalar_start < n {
469 aa_coverage_span_scalar(rows, x0 + scalar_start, &mut shape[scalar_start..]);
470 }
471}
472
473// ── Public dispatch ───────────────────────────────────────────────────────────
474
475/// Fill `shape[i]` with the AA coverage count (0..=16) for output pixel `x0 + i`.
476///
477/// `rows` are the four `AaBuf` sub-row byte slices, each of length
478/// `(bitmap_width * AA_SIZE + 7) / 8`.
479///
480/// # Preconditions
481///
482/// - `x0 + shape.len() ≤ bitmap_width` — span must not exceed the bitmap row.
483/// - `x0` should be **even** for the SIMD tiers to be used. An odd `x0` is
484/// correctly handled by the scalar tier (full precision, lower throughput).
485///
486/// # Panics
487///
488/// Panics if any row slice is shorter than required by the span — i.e. if the
489/// `x0 + shape.len() ≤ bitmap_width` precondition is violated.
490pub fn aa_coverage_span(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
491 if shape.is_empty() {
492 return;
493 }
494 dispatch_coverage(rows, x0, shape);
495}
496
497#[cfg(target_arch = "x86_64")]
498#[inline]
499fn dispatch_coverage(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
500 if x0 & 1 != 0 {
501 // Odd x0: nibble boundaries are byte-aligned; SIMD paths require even x0.
502 aa_coverage_span_scalar(rows, x0, shape);
503 return;
504 }
505 if is_x86_feature_detected!("avx512bitalg") && is_x86_feature_detected!("avx512bw") {
506 // SAFETY: both features confirmed present.
507 unsafe { aa_coverage_span_avx512(rows, x0, shape) };
508 } else if is_x86_feature_detected!("avx2") {
509 // SAFETY: avx2 confirmed present.
510 unsafe { aa_coverage_span_avx2(rows, x0, shape) };
511 } else {
512 aa_coverage_span_scalar(rows, x0, shape);
513 }
514}
515
/// aarch64 tier selection for `aa_coverage_span`.
///
/// Odd `x0` always takes the scalar path, because the SIMD kernels consume
/// whole row bytes and therefore need a span that starts on an even pixel.
/// Otherwise SVE2 is used when the `nightly-sve2` feature is compiled in and
/// the CPU reports `sve2`; NEON handles every remaining case.
#[cfg(target_arch = "aarch64")]
#[inline]
fn dispatch_coverage(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
    let even_start = x0 & 1 == 0;
    if !even_start {
        return aa_coverage_span_scalar(rows, x0, shape);
    }

    #[cfg(feature = "nightly-sve2")]
    {
        if std::arch::is_aarch64_feature_detected!("sve2") {
            // SAFETY: sve2 was detected just above.
            unsafe { aa_coverage_span_sve2(rows, x0, shape) };
            return;
        }
    }

    // SAFETY: NEON is mandatory on all ARMv8-A targets, so it is always
    // available here; `x0` was checked to be even above.
    unsafe { aa_coverage_span_neon(rows, x0, shape) }
}
534
/// Tier selection for architectures without a SIMD kernel: the scalar
/// implementation handles every span, whether `x0` is even or odd.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
#[inline]
fn dispatch_coverage(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
    aa_coverage_span_scalar(rows, x0, shape)
}
540
541// ── Tests ─────────────────────────────────────────────────────────────────────
542
#[cfg(test)]
mod tests {
    use super::*;

    // ── aa_coverage_span ──────────────────────────────────────────────────────

    /// Build four row buffers from a `[[u8; N]; 4]` literal.
    fn make_rows<const N: usize>(data: [[u8; N]; 4]) -> [Vec<u8>; 4] {
        data.map(|r| r.to_vec())
    }

    /// Build four deterministic pseudo-random rows of `row_bytes` bytes for
    /// dispatch-vs-scalar cross-check tests. Three rows use distinct hash
    /// schedules `(mul, add)` applied to the byte index modulo 256; the fourth
    /// row is the bitwise complement of that index. Stays in `u8` arithmetic
    /// throughout — no truncating cast from `usize`.
    ///
    /// The index sequence is a cycling `0..=255` range rather than an
    /// open-ended `0u8..`, so `row_bytes` of 256 or more does not overflow the
    /// `u8` counter.
    fn dispatch_test_rows(row_bytes: usize, schedules: [(u8, u8); 3]) -> [Vec<u8>; 4] {
        let indices = || (0u8..=u8::MAX).cycle().take(row_bytes);
        let mk = |mul: u8, add: u8| -> Vec<u8> {
            indices()
                .map(|i| i.wrapping_mul(mul).wrapping_add(add))
                .collect()
        };
        let [(m0, a0), (m1, a1), (m2, a2)] = schedules;
        let r3: Vec<u8> = indices().map(|i| !i).collect();
        [mk(m0, a0), mk(m1, a1), mk(m2, a2), r3]
    }

    /// Schedule used by all cross-checks driven through [`check_tier`] — three
    /// distinct `(mul, add)` pairs feed the [`dispatch_test_rows`] helper.
    const TIER_SCHEDULES: [(u8, u8); 3] = [(37, 11), (53, 7), (17, 3)];

    /// `(x0, n)` spans exercised by every cross-check driven through
    /// [`check_tier`]:
    ///
    /// - `(0, 300)`: 150 row bytes — complete chunks plus a scalar tail in
    ///   every fixed-width tier (AVX-512: 2 × 64-byte chunks + 22-byte scalar;
    ///   AVX2: 4 × 32-byte chunks + 44-px scalar; NEON: 9 × 16-byte chunks +
    ///   6-byte scalar).
    /// - `(0, 255)`: odd length whose 128 row bytes are a whole number of
    ///   16/32/64-byte chunks, so the final chunk covers one pixel more than
    ///   the span and each kernel's partial-store path runs.
    /// - `(2, 300)`: non-zero even start, so chunk loads begin at a non-zero
    ///   row byte.
    const TIER_TEST_SPANS: [(usize, usize); 3] = [(0, 300), (0, 255), (2, 300)];

    /// Run `kernel` over every span in [`TIER_TEST_SPANS`] and compare its
    /// output against `aa_coverage_span_scalar` on the same rows.
    fn check_tier(label: &str, kernel: impl Fn([&[u8]; 4], usize, &mut [u8])) {
        for (x0, n) in TIER_TEST_SPANS {
            let rows = dispatch_test_rows((x0 + n).div_ceil(2), TIER_SCHEDULES);
            let views = [&rows[0][..], &rows[1][..], &rows[2][..], &rows[3][..]];

            let mut expected = vec![0u8; n];
            aa_coverage_span_scalar(views, x0, &mut expected);

            // Pre-fill with a value no kernel can produce (max coverage is 16)
            // so that an unwritten pixel is detected.
            let mut got = vec![0xFFu8; n];
            kernel(views, x0, &mut got);

            assert_eq!(
                got, expected,
                "{label} coverage mismatch vs scalar (x0={x0}, n={n})"
            );
        }
    }

    #[test]
    fn coverage_span_all_zero() {
        let rows = make_rows([[0u8; 4]; 4]);
        let mut shape = [0xFFu8; 8];
        aa_coverage_span([&rows[0], &rows[1], &rows[2], &rows[3]], 0, &mut shape);
        assert_eq!(shape, [0u8; 8]);
    }

    #[test]
    fn coverage_span_all_ones() {
        // 0xFF: high nibble 0xF (4 bits) + low nibble 0xF (4 bits) → 8 output pixels.
        // 4 rows × 4 bits/nibble = 16 per pixel.
        let rows = make_rows([[0xFFu8; 4]; 4]);
        let mut shape = [0u8; 8];
        aa_coverage_span([&rows[0], &rows[1], &rows[2], &rows[3]], 0, &mut shape);
        assert_eq!(shape, [16u8; 8]);
    }

    #[test]
    fn coverage_span_single_pixel_even() {
        // Pixel 0 = high nibble of byte 0. Set only that nibble in row 0.
        let rows = [
            vec![0xF0u8, 0, 0, 0],
            vec![0u8; 4],
            vec![0u8; 4],
            vec![0u8; 4],
        ];
        let mut shape = [0u8; 2];
        aa_coverage_span([&rows[0], &rows[1], &rows[2], &rows[3]], 0, &mut shape);
        assert_eq!(shape, [4, 0]);
    }

    #[test]
    fn coverage_span_single_pixel_odd() {
        // Pixel 1 = low nibble of byte 0. Set only that nibble in row 0.
        let rows = [
            vec![0x0Fu8, 0, 0, 0],
            vec![0u8; 4],
            vec![0u8; 4],
            vec![0u8; 4],
        ];
        let mut shape = [0u8; 2];
        aa_coverage_span([&rows[0], &rows[1], &rows[2], &rows[3]], 0, &mut shape);
        assert_eq!(shape, [0, 4]);
    }

    #[test]
    fn coverage_span_x0_offset() {
        // x0=2: pixel 2 = high nibble of byte 1.
        // Row 0 byte 1 high nibble = 0xA = 0b1010 = 2 bits.
        // Row 1 byte 1 high nibble = 0x5 = 0b0101 = 2 bits. Total = 4.
        let rows = [
            vec![0u8, 0xA0, 0, 0],
            vec![0u8, 0x50, 0, 0],
            vec![0u8; 4],
            vec![0u8; 4],
        ];
        let mut shape = [0u8; 1];
        aa_coverage_span([&rows[0], &rows[1], &rows[2], &rows[3]], 2, &mut shape);
        assert_eq!(shape[0], 4);
    }

    #[test]
    fn coverage_span_odd_x0_matches_scalar() {
        // Odd x0=1: SIMD tiers must fall back to scalar; result must still be correct.
        const N: usize = 10;
        let row_bytes = (1 + N).div_ceil(2); // bytes needed for pixels 1..=10
        let rows = dispatch_test_rows(row_bytes, [(0x37, 0), (0x53, 0), (0x17, 0)]);

        let mut expected = vec![0u8; N];
        aa_coverage_span_scalar([&rows[0], &rows[1], &rows[2], &rows[3]], 1, &mut expected);

        let mut got = vec![0u8; N];
        aa_coverage_span([&rows[0], &rows[1], &rows[2], &rows[3]], 1, &mut got);

        assert_eq!(got, expected, "odd x0 result mismatch");
    }

    #[test]
    fn coverage_span_dispatch_matches_scalar() {
        // 300 output pixels = 150 row bytes.
        // Exercises: AVX-512 (2 × 64-byte chunks + 22-byte scalar remainder),
        //            AVX2    (4 × 32-byte chunks + 22-byte scalar remainder),
        //            NEON    (9 × 16-byte chunks + 6-byte scalar remainder),
        //            scalar  (full path).
        const N: usize = 300;
        let row_bytes = N.div_ceil(2);
        let rows = dispatch_test_rows(row_bytes, [(0x37, 0), (0x53, 0), (0x17, 0)]);

        let mut expected = vec![0u8; N];
        aa_coverage_span_scalar([&rows[0], &rows[1], &rows[2], &rows[3]], 0, &mut expected);

        let mut got = vec![0u8; N];
        aa_coverage_span([&rows[0], &rows[1], &rows[2], &rows[3]], 0, &mut got);

        assert_eq!(got, expected, "dispatch mismatch on N={N}");
    }

    #[test]
    fn coverage_span_dispatch_matches_scalar_on_all_spans() {
        // Runs whichever tier the dispatcher selects on this machine over the
        // shared spans, including the odd-length partial-chunk case and the
        // non-zero even start.
        check_tier("dispatch", aa_coverage_span);
    }

    #[test]
    fn dispatch_test_rows_supports_long_rows() {
        // 600 bytes wraps the u8 index twice; the sequence must repeat rather
        // than overflow.
        let rows = dispatch_test_rows(600, [(1, 0), (1, 1), (3, 0)]);
        for row in &rows {
            assert_eq!(row.len(), 600);
            assert_eq!(row[..256], row[256..512]);
        }
        assert_eq!(rows[0][255], 255);
        assert_eq!(rows[0][256], 0);
        assert_eq!(rows[3][0], 0xFF);
    }

    #[test]
    fn coverage_span_empty_is_noop() {
        let row = vec![0xFFu8; 4];
        let mut shape: [u8; 0] = [];
        aa_coverage_span([&row, &row, &row, &row], 0, &mut shape); // must not panic
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn avx512_coverage_matches_scalar() {
        if !is_x86_feature_detected!("avx512bitalg") || !is_x86_feature_detected!("avx512bw") {
            return;
        }
        check_tier("AVX-512", |rows, x0, shape| {
            // SAFETY: both features confirmed present above; every span in
            // `TIER_TEST_SPANS` starts at an even `x0`.
            unsafe { aa_coverage_span_avx512(rows, x0, shape) }
        });
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn avx2_coverage_matches_scalar() {
        if !is_x86_feature_detected!("avx2") {
            return;
        }
        check_tier("AVX2", |rows, x0, shape| {
            // SAFETY: avx2 confirmed present above; every span in
            // `TIER_TEST_SPANS` starts at an even `x0`.
            unsafe { aa_coverage_span_avx2(rows, x0, shape) }
        });
    }

    #[cfg(all(target_arch = "aarch64", feature = "nightly-sve2"))]
    #[test]
    fn sve2_coverage_matches_scalar() {
        if !std::arch::is_aarch64_feature_detected!("sve2") {
            return;
        }
        check_tier("SVE2", |rows, x0, shape| {
            // SAFETY: sve2 confirmed present above; every span in
            // `TIER_TEST_SPANS` starts at an even `x0`.
            unsafe { aa_coverage_span_sve2(rows, x0, shape) }
        });
    }

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_coverage_matches_scalar() {
        check_tier("NEON", |rows, x0, shape| {
            // SAFETY: aarch64 always has NEON; every span in
            // `TIER_TEST_SPANS` starts at an even `x0`.
            unsafe { aa_coverage_span_neon(rows, x0, shape) }
        });
    }
}