/// Number of set bits in every 4-bit value: `NIBBLE_POP[v]` is the popcount of `v`.
const NIBBLE_POP: [u8; 16] = [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4];

/// Portable reference kernel: per-pixel coverage summed over four nibble-packed rows.
///
/// Pixel `x` lives in byte `x >> 1` of each row: in the high nibble when `x` is even and in
/// the low nibble when `x` is odd. `shape[i]` receives the total number of set bits found in
/// the nibbles of pixel `x0 + i` across all four rows, so every output lies in `0..=16`.
///
/// # Panics
///
/// Panics if any row is too short to contain the byte of a requested pixel. Outputs for the
/// pixels preceding the offending one have already been written when that happens.
pub(super) fn aa_coverage_span_scalar(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
    for (offset, slot) in shape.iter_mut().enumerate() {
        let pixel = x0 + offset;
        let byte_idx = pixel >> 1;
        // Even pixels sit in the high nibble, odd pixels in the low one.
        let shift: u32 = if pixel & 1 == 0 { 4 } else { 0 };
        *slot = rows
            .iter()
            .map(|row| {
                debug_assert!(
                    byte_idx < row.len(),
                    "aa_coverage_span_scalar: byte_idx={byte_idx} out of bounds (row.len={})",
                    row.len()
                );
                NIBBLE_POP[usize::from((row[byte_idx] >> shift) & 0x0f)]
            })
            .sum();
    }
}
/// Converts a pixel span into the byte-level chunking parameters used by the SIMD kernels.
///
/// Returns `(byte_x0, n_chunks)` where `byte_x0` is the index of the row byte holding pixel
/// `x0` and `n_chunks` is the number of whole `chunk_bytes`-sized groups that fit into the
/// `n.div_ceil(2)` bytes needed for `n` pixels.
///
/// Because the byte count is rounded up, an odd `n` can make the last whole chunk cover one
/// pixel more than the span contains; callers must not store that extra pixel.
#[inline]
fn coverage_chunk_params(x0: usize, n: usize, chunk_bytes: usize) -> (usize, usize) {
    debug_assert!(
        chunk_bytes > 0,
        "coverage_chunk_params: chunk_bytes must be > 0"
    );
    let span_bytes = n.div_ceil(2);
    (x0 / 2, span_bytes / chunk_bytes)
}
/// NEON kernel: coverage for 32 pixels (16 row bytes) per iteration.
///
/// Each iteration loads 16 bytes from every row, splits them into high (even-pixel) and low
/// (odd-pixel) nibbles, accumulates the per-byte popcounts over the four rows and stores the
/// two accumulators interleaved so the output order is even, odd, even, odd, … Pixels not
/// covered by a whole 16-byte chunk are handled by [`aa_coverage_span_scalar`].
///
/// `x0` must be even; this is only checked by a debug assertion.
///
/// # Safety
///
/// The executing CPU must support NEON.
///
/// # Panics
///
/// Panics if a row is shorter than the bytes required for the span.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn aa_coverage_span_neon(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
    use std::arch::aarch64::{
        uint8x16x2_t, vaddq_u8, vandq_u8, vcntq_u8, vdupq_n_u8, vld1q_u8, vshrq_n_u8, vst2q_u8,
    };

    /// Row bytes consumed per vector iteration.
    const CHUNK_BYTES: usize = 16;
    /// Pixels produced per vector iteration (two per byte).
    const CHUNK_PIXELS: usize = 2 * CHUNK_BYTES;

    debug_assert!(x0 & 1 == 0, "aa_coverage_span_neon: x0={x0} must be even");
    let n = shape.len();
    let (byte_x0, n_chunks) = coverage_chunk_params(x0, n, CHUNK_BYTES);

    // SAFETY: NEON availability is the caller's obligation. Every `vld1q_u8` reads 16 bytes
    // from a sub-slice the preceding `assert!` proved to be at least 16 bytes long, and every
    // `vst2q_u8` writes 32 bytes either into a destination checked to hold at least 32 bytes
    // or into a 32-byte stack buffer.
    unsafe {
        let nibble_mask = vdupq_n_u8(0x0F);
        for chunk_idx in 0..n_chunks {
            let byte_off = byte_x0 + chunk_idx * CHUNK_BYTES;
            let mut sum_even = vdupq_n_u8(0);
            let mut sum_odd = vdupq_n_u8(0);
            for row in rows {
                assert!(
                    byte_off + 16 <= row.len(),
                    "aa_coverage_span_neon: row too short: \
                     need {} bytes at offset {byte_off}, have {}",
                    byte_off + 16,
                    row.len(),
                );
                let packed = vld1q_u8(row[byte_off..].as_ptr());
                let even = vandq_u8(vshrq_n_u8(packed, 4), nibble_mask);
                let odd = vandq_u8(packed, nibble_mask);
                sum_even = vaddq_u8(sum_even, vcntq_u8(even));
                sum_odd = vaddq_u8(sum_odd, vcntq_u8(odd));
            }

            // `vst2q_u8` interleaves the two vectors on store: even0, odd0, even1, odd1, …
            let interleaved = uint8x16x2_t(sum_even, sum_odd);
            let dst = &mut shape[chunk_idx * CHUNK_PIXELS..];
            if dst.len() < CHUNK_PIXELS {
                // An odd-length span ends one pixel short of a full chunk: stage the store
                // on the stack and copy only the pixels that exist.
                let mut staging = [0u8; CHUNK_PIXELS];
                vst2q_u8(staging.as_mut_ptr(), interleaved);
                let keep = dst.len();
                dst.copy_from_slice(&staging[..keep]);
            } else {
                vst2q_u8(dst.as_mut_ptr(), interleaved);
            }
        }
    }

    // Whatever the whole chunks did not cover goes through the scalar kernel.
    let vector_pixels = n_chunks * CHUNK_PIXELS;
    if let Some(tail) = shape.get_mut(vector_pixels..).filter(|t| !t.is_empty()) {
        aa_coverage_span_scalar(rows, x0 + vector_pixels, tail);
    }
}
/// SVE2 kernel: coverage for `2 * svcntb()` pixels per iteration.
///
/// The vector length `vl` (in bytes) is queried at run time. Each iteration loads `vl` bytes
/// from every row, accumulates per-byte popcounts of the high (even-pixel) and low (odd-pixel)
/// nibbles over the four rows, spills both accumulators to stack buffers and interleaves them
/// into `shape`. Pixels not covered by a whole chunk go through [`aa_coverage_span_scalar`].
///
/// `x0` must be even; this is only checked by a debug assertion.
///
/// # Safety
///
/// The executing CPU must support SVE2.
///
/// # Panics
///
/// Panics if a row is shorter than the bytes required for the span.
#[cfg(all(target_arch = "aarch64", feature = "nightly-sve2"))]
#[target_feature(enable = "sve2")]
unsafe fn aa_coverage_span_sve2(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
    use std::arch::aarch64::{
        svadd_u8_z, svand_u8_z, svcnt_u8_z, svcntb, svdup_n_u8, svld1_u8, svlsr_u8_z, svptrue_b8,
        svst1_u8,
    };

    debug_assert!(x0 & 1 == 0, "aa_coverage_span_sve2: x0={x0} must be even");
    #[expect(
        clippy::cast_possible_truncation,
        reason = "aarch64 is 64-bit; svcntb() ≤ 256 fits in usize"
    )]
    let vl = svcntb() as usize;
    debug_assert!(
        vl <= 256,
        "aa_coverage_span_sve2: svcntb()={vl} exceeds SVE max of 256"
    );

    let pg = svptrue_b8();
    let mask_lo = svdup_n_u8(0x0F);
    let shift4 = svdup_n_u8(4u8);
    let n = shape.len();
    let (byte_x0, n_chunks) = coverage_chunk_params(x0, n, vl);

    // Spill buffers sized for the largest vector length accepted above; only the first `vl`
    // bytes are meaningful after each store.
    let mut even_counts = [0u8; 256];
    let mut odd_counts = [0u8; 256];

    for chunk_idx in 0..n_chunks {
        let byte_off = byte_x0 + chunk_idx * vl;
        let mut acc_hi = svdup_n_u8(0u8);
        let mut acc_lo = svdup_n_u8(0u8);
        for row in rows {
            assert!(
                byte_off + vl <= row.len(),
                "aa_coverage_span_sve2: row too short: \
                 need {} bytes at offset {byte_off}, have {}",
                byte_off + vl,
                row.len(),
            );
            // SAFETY: the assertion above guarantees `vl` readable bytes at `byte_off`, which
            // is what a load under the all-true predicate `pg` touches.
            let v = unsafe { svld1_u8(pg, row.as_ptr().add(byte_off)) };
            let hi = svand_u8_z(pg, svlsr_u8_z(pg, v, shift4), mask_lo);
            let lo = svand_u8_z(pg, v, mask_lo);
            acc_hi = svadd_u8_z(pg, acc_hi, svcnt_u8_z(pg, hi));
            acc_lo = svadd_u8_z(pg, acc_lo, svcnt_u8_z(pg, lo));
        }
        // SAFETY: both buffers hold 256 bytes and `vl <= 256`, so a full-vector store fits.
        unsafe {
            svst1_u8(pg, even_counts.as_mut_ptr(), acc_hi);
            svst1_u8(pg, odd_counts.as_mut_ptr(), acc_lo);
        }

        // Interleave even/odd counts into the output. The window is clamped to `n` because an
        // odd-length span ends one pixel short of the final whole chunk.
        let out_base = chunk_idx * vl * 2;
        let out_end = n.min(out_base + vl * 2);
        let pairs = shape[out_base..out_end].chunks_mut(2);
        for (pair, (&even, &odd)) in pairs.zip(even_counts.iter().zip(&odd_counts)) {
            pair[0] = even;
            if let Some(odd_slot) = pair.get_mut(1) {
                *odd_slot = odd;
            }
        }
    }

    // Whatever the whole chunks did not cover goes through the scalar kernel.
    let vector_pixels = n_chunks * vl * 2;
    if vector_pixels < n {
        aa_coverage_span_scalar(rows, x0 + vector_pixels, &mut shape[vector_pixels..]);
    }
}
/// AVX2 kernel: coverage for 64 pixels (32 row bytes) per iteration.
///
/// Each iteration loads 32 bytes from every row, splits them into high (even-pixel) and low
/// (odd-pixel) nibbles, looks up each nibble's popcount with a byte shuffle and accumulates
/// the counts over the four rows. The two accumulators are spilled to the stack and
/// interleaved into `shape`. Pixels not covered by a whole 32-byte chunk are handled by
/// [`aa_coverage_span_scalar`].
///
/// `x0` must be even; this is only checked by a debug assertion.
///
/// # Safety
///
/// The executing CPU must support AVX2.
///
/// # Panics
///
/// Panics if a row is shorter than the bytes required for the span.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn aa_coverage_span_avx2(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
    use std::arch::x86_64::{
        _mm256_add_epi8, _mm256_and_si256, _mm256_loadu_si256, _mm256_set_epi8, _mm256_set1_epi8,
        _mm256_setzero_si256, _mm256_shuffle_epi8, _mm256_srli_epi16, _mm256_storeu_si256,
    };

    /// Row bytes consumed per vector iteration.
    const CHUNK_BYTES: usize = 32;
    /// Pixels produced per vector iteration (two per byte).
    const CHUNK_PIXELS: usize = 2 * CHUNK_BYTES;

    debug_assert!(x0 & 1 == 0, "aa_coverage_span_avx2: x0={x0} must be even");
    let n = shape.len();
    let (byte_x0, n_chunks) = coverage_chunk_params(x0, n, CHUNK_BYTES);

    // `_mm256_shuffle_epi8` looks bytes up independently in each 128-bit lane, so the 16-entry
    // popcount table is repeated in both lanes. `_mm256_set_epi8` takes its arguments from the
    // highest byte down, hence the reversed order.
    let popcount_lut = _mm256_set_epi8(
        4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1,
        1, 0,
    );
    let nibble_mask = _mm256_set1_epi8(0x0F_u8.cast_signed());

    for chunk_idx in 0..n_chunks {
        let byte_off = byte_x0 + chunk_idx * CHUNK_BYTES;
        let mut sum_even = _mm256_setzero_si256();
        let mut sum_odd = _mm256_setzero_si256();
        for row in rows {
            assert!(
                byte_off + 32 <= row.len(),
                "aa_coverage_span_avx2: row too short: \
                 need {} bytes at offset {byte_off}, have {}",
                byte_off + 32,
                row.len(),
            );
            // SAFETY: the assertion above guarantees 32 readable bytes at `byte_off`, and the
            // unaligned load has no alignment requirement.
            let packed = unsafe { _mm256_loadu_si256(row[byte_off..].as_ptr().cast()) };
            // The shift works on 16-bit lanes, so bits from the neighbouring byte leak into
            // the top of each byte and must be masked away.
            let even = _mm256_and_si256(_mm256_srli_epi16(packed, 4), nibble_mask);
            let odd = _mm256_and_si256(packed, nibble_mask);
            sum_even = _mm256_add_epi8(sum_even, _mm256_shuffle_epi8(popcount_lut, even));
            sum_odd = _mm256_add_epi8(sum_odd, _mm256_shuffle_epi8(popcount_lut, odd));
        }

        let mut even_counts = [0u8; CHUNK_BYTES];
        let mut odd_counts = [0u8; CHUNK_BYTES];
        // SAFETY: both destinations are 32-byte arrays and the stores are unaligned.
        unsafe {
            _mm256_storeu_si256(even_counts.as_mut_ptr().cast(), sum_even);
            _mm256_storeu_si256(odd_counts.as_mut_ptr().cast(), sum_odd);
        }

        // Interleave even/odd counts into the output. The window is clamped to `n` because an
        // odd-length span ends one pixel short of the final whole chunk.
        let out_base = chunk_idx * CHUNK_PIXELS;
        let out_end = n.min(out_base + CHUNK_PIXELS);
        let pairs = shape[out_base..out_end].chunks_mut(2);
        for (pair, (&even, &odd)) in pairs.zip(even_counts.iter().zip(&odd_counts)) {
            pair[0] = even;
            if let Some(odd_slot) = pair.get_mut(1) {
                *odd_slot = odd;
            }
        }
    }

    // Whatever the whole chunks did not cover goes through the scalar kernel.
    let vector_pixels = n_chunks * CHUNK_PIXELS;
    if vector_pixels < n {
        aa_coverage_span_scalar(rows, x0 + vector_pixels, &mut shape[vector_pixels..]);
    }
}
/// AVX-512 kernel: coverage for 128 pixels (64 row bytes) per iteration.
///
/// Each iteration loads 64 bytes from every row, splits them into high (even-pixel) and low
/// (odd-pixel) nibbles, takes the per-byte popcount and accumulates it over the four rows.
/// The two accumulators are spilled to the stack and interleaved into `shape`. Pixels not
/// covered by a whole 64-byte chunk are handled by [`aa_coverage_span_scalar`].
///
/// `x0` must be even; this is only checked by a debug assertion.
///
/// # Safety
///
/// The executing CPU must support the `avx512bitalg` and `avx512bw` features.
///
/// # Panics
///
/// Panics if a row is shorter than the bytes required for the span.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512bitalg,avx512bw")]
unsafe fn aa_coverage_span_avx512(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
    use std::arch::x86_64::{
        _mm512_add_epi8, _mm512_and_si512, _mm512_loadu_si512, _mm512_popcnt_epi8,
        _mm512_set1_epi8, _mm512_setzero_si512, _mm512_srli_epi16, _mm512_storeu_si512,
    };

    /// Row bytes consumed per vector iteration.
    const CHUNK_BYTES: usize = 64;
    /// Pixels produced per vector iteration (two per byte).
    const CHUNK_PIXELS: usize = 2 * CHUNK_BYTES;

    debug_assert!(x0 & 1 == 0, "aa_coverage_span_avx512: x0={x0} must be even");
    let n = shape.len();
    let (byte_x0, n_chunks) = coverage_chunk_params(x0, n, CHUNK_BYTES);
    let nibble_mask = _mm512_set1_epi8(0x0F_u8.cast_signed());

    for chunk_idx in 0..n_chunks {
        let byte_off = byte_x0 + chunk_idx * CHUNK_BYTES;
        let mut sum_even = _mm512_setzero_si512();
        let mut sum_odd = _mm512_setzero_si512();
        for row in rows {
            assert!(
                byte_off + 64 <= row.len(),
                "aa_coverage_span_avx512: row too short: \
                 need {} bytes at offset {byte_off}, have {}",
                byte_off + 64,
                row.len(),
            );
            // SAFETY: the assertion above guarantees 64 readable bytes at `byte_off`, and the
            // unaligned load has no alignment requirement.
            unsafe {
                let packed = _mm512_loadu_si512(row[byte_off..].as_ptr().cast());
                // The shift works on 16-bit lanes, so bits from the neighbouring byte leak
                // into the top of each byte and must be masked away.
                let even = _mm512_and_si512(_mm512_srli_epi16(packed, 4), nibble_mask);
                let odd = _mm512_and_si512(packed, nibble_mask);
                sum_even = _mm512_add_epi8(sum_even, _mm512_popcnt_epi8(even));
                sum_odd = _mm512_add_epi8(sum_odd, _mm512_popcnt_epi8(odd));
            }
        }

        let mut even_counts = [0u8; CHUNK_BYTES];
        let mut odd_counts = [0u8; CHUNK_BYTES];
        // SAFETY: both destinations are 64-byte arrays and the stores are unaligned.
        unsafe {
            _mm512_storeu_si512(even_counts.as_mut_ptr().cast(), sum_even);
            _mm512_storeu_si512(odd_counts.as_mut_ptr().cast(), sum_odd);
        }

        // Interleave even/odd counts into the output. The window is clamped to `n` because an
        // odd-length span ends one pixel short of the final whole chunk.
        let out_base = chunk_idx * CHUNK_PIXELS;
        let out_end = n.min(out_base + CHUNK_PIXELS);
        let pairs = shape[out_base..out_end].chunks_mut(2);
        for (pair, (&even, &odd)) in pairs.zip(even_counts.iter().zip(&odd_counts)) {
            pair[0] = even;
            if let Some(odd_slot) = pair.get_mut(1) {
                *odd_slot = odd;
            }
        }
    }

    // Whatever the whole chunks did not cover goes through the scalar kernel.
    let vector_pixels = n_chunks * CHUNK_PIXELS;
    if vector_pixels < n {
        aa_coverage_span_scalar(rows, x0 + vector_pixels, &mut shape[vector_pixels..]);
    }
}
/// Computes per-pixel coverage for a horizontal span of four nibble-packed rows.
///
/// Each row byte packs two pixels: the even pixel in the high nibble and the odd pixel in the
/// low nibble. For every `i`, `shape[i]` is set to the number of set bits in the nibbles of
/// pixel `x0 + i` summed over the four `rows` (a value in `0..=16`). The work is forwarded to
/// the fastest kernel available for the current target; an empty `shape` is a no-op.
///
/// # Panics
///
/// Panics if any row is too short to cover pixels `x0 .. x0 + shape.len()`.
pub fn aa_coverage_span(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
    if !shape.is_empty() {
        dispatch_coverage(rows, x0, shape);
    }
}
/// x86-64 dispatcher: picks the widest kernel the running CPU supports.
///
/// Spans starting on an odd pixel always use the scalar kernel, because the SIMD kernels
/// require an even `x0`. Otherwise AVX-512 (BITALG + BW) is preferred, then AVX2, then scalar.
#[cfg(target_arch = "x86_64")]
#[inline]
fn dispatch_coverage(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
    if x0 & 1 == 1 {
        aa_coverage_span_scalar(rows, x0, shape);
    } else if is_x86_feature_detected!("avx512bitalg") && is_x86_feature_detected!("avx512bw") {
        // SAFETY: both target features the kernel is compiled with were just detected.
        unsafe { aa_coverage_span_avx512(rows, x0, shape) };
    } else if is_x86_feature_detected!("avx2") {
        // SAFETY: AVX2 support was just detected.
        unsafe { aa_coverage_span_avx2(rows, x0, shape) };
    } else {
        aa_coverage_span_scalar(rows, x0, shape);
    }
}
/// AArch64 dispatcher: SVE2 when compiled in and detected at run time, otherwise NEON.
///
/// Spans starting on an odd pixel always use the scalar kernel, because the SIMD kernels
/// require an even `x0`.
#[cfg(target_arch = "aarch64")]
#[inline]
fn dispatch_coverage(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
    if x0 & 1 == 1 {
        aa_coverage_span_scalar(rows, x0, shape);
    } else {
        #[cfg(feature = "nightly-sve2")]
        if std::arch::is_aarch64_feature_detected!("sve2") {
            // SAFETY: SVE2 support was just detected.
            unsafe { aa_coverage_span_sve2(rows, x0, shape) };
            return;
        }
        // SAFETY: no runtime check is made here; this relies on NEON being part of the
        // baseline feature set of the AArch64 target being built.
        // NOTE(review): confirm this holds for every AArch64 target the crate supports.
        unsafe { aa_coverage_span_neon(rows, x0, shape) };
    }
}
/// Fallback dispatcher for targets other than x86-64 and AArch64.
///
/// No SIMD kernel is compiled for these architectures, so every span is forwarded
/// unchanged to the scalar kernel.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
#[inline]
fn dispatch_coverage(rows: [&[u8]; 4], x0: usize, shape: &mut [u8]) {
aa_coverage_span_scalar(rows, x0, shape);
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `(mul, add)` schedules used to fill the rows of the per-kernel comparison tests.
    const TIER_SCHEDULES: [(u8, u8); 3] = [(37, 11), (53, 7), (17, 3)];
    /// Span length of the per-kernel comparison tests; spans several chunks plus a scalar tail.
    const TIER_TEST_N: usize = 300;

    /// Copies four fixed-size rows into heap-allocated vectors.
    fn make_rows<const N: usize>(data: [[u8; N]; 4]) -> [Vec<u8>; 4] {
        data.map(|r| r.to_vec())
    }

    /// Borrows four owned rows in the form the kernels take.
    fn as_slices(rows: &[Vec<u8>; 4]) -> [&[u8]; 4] {
        [&rows[0], &rows[1], &rows[2], &rows[3]]
    }

    /// Builds four deterministic rows of `row_bytes` bytes each.
    ///
    /// Rows 0–2 hold `i * mul + add` (wrapping) for the matching `(mul, add)` schedule and
    /// row 3 holds `!i`, where `i` is the byte index reduced modulo 256.
    fn dispatch_test_rows(row_bytes: usize, schedules: [(u8, u8); 3]) -> [Vec<u8>; 4] {
        // A cycled `0..=u8::MAX` wraps cleanly for any length; an open-ended `0u8..` overflows
        // (and panics in debug builds) as soon as 256 bytes are requested.
        let indices = || (0..=u8::MAX).cycle().take(row_bytes);
        let mk = |mul: u8, add: u8| -> Vec<u8> {
            indices()
                .map(|i| i.wrapping_mul(mul).wrapping_add(add))
                .collect()
        };
        let [(m0, a0), (m1, a1), (m2, a2)] = schedules;
        let r3: Vec<u8> = indices().map(|i| !i).collect();
        [mk(m0, a0), mk(m1, a1), mk(m2, a2), r3]
    }

    #[test]
    fn coverage_span_all_zero() {
        let rows = make_rows([[0u8; 4]; 4]);
        let mut shape = [0xFFu8; 8];
        aa_coverage_span(as_slices(&rows), 0, &mut shape);
        assert_eq!(shape, [0u8; 8]);
    }

    #[test]
    fn coverage_span_all_ones() {
        let rows = make_rows([[0xFFu8; 4]; 4]);
        let mut shape = [0u8; 8];
        aa_coverage_span(as_slices(&rows), 0, &mut shape);
        assert_eq!(shape, [16u8; 8]);
    }

    #[test]
    fn coverage_span_single_pixel_even() {
        let rows = [
            vec![0xF0u8, 0, 0, 0],
            vec![0u8; 4],
            vec![0u8; 4],
            vec![0u8; 4],
        ];
        let mut shape = [0u8; 2];
        aa_coverage_span(as_slices(&rows), 0, &mut shape);
        assert_eq!(shape, [4, 0]);
    }

    #[test]
    fn coverage_span_single_pixel_odd() {
        let rows = [
            vec![0x0Fu8, 0, 0, 0],
            vec![0u8; 4],
            vec![0u8; 4],
            vec![0u8; 4],
        ];
        let mut shape = [0u8; 2];
        aa_coverage_span(as_slices(&rows), 0, &mut shape);
        assert_eq!(shape, [0, 4]);
    }

    #[test]
    fn coverage_span_x0_offset() {
        let rows = [
            vec![0u8, 0xA0, 0, 0],
            vec![0u8, 0x50, 0, 0],
            vec![0u8; 4],
            vec![0u8; 4],
        ];
        let mut shape = [0u8; 1];
        aa_coverage_span(as_slices(&rows), 2, &mut shape);
        assert_eq!(shape[0], 4);
    }

    #[test]
    fn coverage_span_odd_x0_matches_scalar() {
        const N: usize = 10;
        let row_bytes = (1 + N).div_ceil(2);
        let rows = dispatch_test_rows(row_bytes, [(0x37, 0), (0x53, 0), (0x17, 0)]);
        let mut expected = vec![0u8; N];
        aa_coverage_span_scalar(as_slices(&rows), 1, &mut expected);
        let mut got = vec![0u8; N];
        aa_coverage_span(as_slices(&rows), 1, &mut got);
        assert_eq!(got, expected, "odd x0 result mismatch");
    }

    #[test]
    fn coverage_span_dispatch_matches_scalar() {
        const N: usize = 300;
        let row_bytes = N.div_ceil(2);
        let rows = dispatch_test_rows(row_bytes, [(0x37, 0), (0x53, 0), (0x17, 0)]);
        let mut expected = vec![0u8; N];
        aa_coverage_span_scalar(as_slices(&rows), 0, &mut expected);
        let mut got = vec![0u8; N];
        aa_coverage_span(as_slices(&rows), 0, &mut got);
        assert_eq!(got, expected, "dispatch mismatch on N={N}");
    }

    /// Sweeps span lengths straddling the 32-, 64- and 128-pixel chunk sizes at several even
    /// origins, so whole chunks, the odd-length span that ends one pixel short of a full
    /// chunk, and the scalar tail are all compared against the scalar kernel.
    #[test]
    fn coverage_span_chunk_boundaries_match_scalar() {
        const LENGTHS: [usize; 13] = [1, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255, 256, 257];
        const ORIGINS: [usize; 3] = [0, 2, 64];
        for x0 in ORIGINS {
            for n in LENGTHS {
                let rows = dispatch_test_rows((x0 + n).div_ceil(2), TIER_SCHEDULES);
                let mut expected = vec![0u8; n];
                aa_coverage_span_scalar(as_slices(&rows), x0, &mut expected);
                // Pre-fill with a value no coverage count can take so stale bytes are caught.
                let mut got = vec![0xAAu8; n];
                aa_coverage_span(as_slices(&rows), x0, &mut got);
                assert_eq!(got, expected, "dispatch mismatch for x0={x0}, n={n}");
            }
        }
    }

    #[test]
    fn coverage_span_empty_is_noop() {
        let row = vec![0xFFu8; 4];
        let mut shape: [u8; 0] = [];
        aa_coverage_span([&row, &row, &row, &row], 0, &mut shape);
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn avx512_coverage_matches_scalar() {
        if !is_x86_feature_detected!("avx512bitalg") || !is_x86_feature_detected!("avx512bw") {
            return;
        }
        let rows = dispatch_test_rows(TIER_TEST_N.div_ceil(2), TIER_SCHEDULES);
        let mut expected = vec![0u8; TIER_TEST_N];
        aa_coverage_span_scalar(as_slices(&rows), 0, &mut expected);
        let mut got = vec![0u8; TIER_TEST_N];
        // SAFETY: both required target features were detected above.
        unsafe { aa_coverage_span_avx512(as_slices(&rows), 0, &mut got) };
        assert_eq!(got, expected, "AVX-512 coverage mismatch vs scalar");
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn avx2_coverage_matches_scalar() {
        if !is_x86_feature_detected!("avx2") {
            return;
        }
        let rows = dispatch_test_rows(TIER_TEST_N.div_ceil(2), TIER_SCHEDULES);
        let mut expected = vec![0u8; TIER_TEST_N];
        aa_coverage_span_scalar(as_slices(&rows), 0, &mut expected);
        let mut got = vec![0u8; TIER_TEST_N];
        // SAFETY: AVX2 support was detected above.
        unsafe { aa_coverage_span_avx2(as_slices(&rows), 0, &mut got) };
        assert_eq!(got, expected, "AVX2 coverage mismatch vs scalar");
    }

    #[cfg(all(target_arch = "aarch64", feature = "nightly-sve2"))]
    #[test]
    fn sve2_coverage_matches_scalar() {
        if !std::arch::is_aarch64_feature_detected!("sve2") {
            return;
        }
        let rows = dispatch_test_rows(TIER_TEST_N.div_ceil(2), TIER_SCHEDULES);
        let mut expected = vec![0u8; TIER_TEST_N];
        aa_coverage_span_scalar(as_slices(&rows), 0, &mut expected);
        let mut got = vec![0u8; TIER_TEST_N];
        // SAFETY: SVE2 support was detected above.
        unsafe { aa_coverage_span_sve2(as_slices(&rows), 0, &mut got) };
        assert_eq!(got, expected, "SVE2 coverage mismatch vs scalar");
    }

    #[cfg(target_arch = "aarch64")]
    #[test]
    fn neon_coverage_matches_scalar() {
        let rows = dispatch_test_rows(TIER_TEST_N.div_ceil(2), TIER_SCHEDULES);
        let mut expected = vec![0u8; TIER_TEST_N];
        aa_coverage_span_scalar(as_slices(&rows), 0, &mut expected);
        let mut got = vec![0u8; TIER_TEST_N];
        // SAFETY: no runtime check is made; this relies on NEON being part of the baseline
        // feature set of the AArch64 target the tests are built for.
        unsafe { aa_coverage_span_neon(as_slices(&rows), 0, &mut got) };
        assert_eq!(got, expected, "NEON coverage mismatch vs scalar");
    }
}