#![allow(clippy::missing_panics_doc)]
use wide::u8x16;
#[must_use]
pub fn count_bases_ge_q(qual: &[u8], threshold: u8) -> u64 {
let cutoff = u8x16::splat(threshold);
let mut count = 0u64;
let chunks = qual.chunks_exact(16);
let tail = chunks.remainder();
for chunk in chunks {
let v = u8x16::new(chunk.try_into().unwrap());
count += u64::from(v.simd_ge(cutoff).to_bitmask().count_ones());
}
for &q in tail {
if q >= threshold {
count += 1;
}
}
count
}
#[must_use]
#[inline]
pub fn count_bases_lt_q(qual: &[u8], threshold: u8) -> u64 {
qual.len() as u64 - count_bases_ge_q(qual, threshold)
}
#[must_use]
pub fn count_gc_case_insensitive(seq: &[u8]) -> u64 {
let case = u8x16::splat(0x20);
let g_lower = u8x16::splat(b'g');
let c_lower = u8x16::splat(b'c');
let mut count = 0u64;
let chunks = seq.chunks_exact(16);
let tail = chunks.remainder();
for chunk in chunks {
let v = u8x16::new(chunk.try_into().unwrap()) | case;
let gc_mask = v.simd_eq(g_lower).to_bitmask() | v.simd_eq(c_lower).to_bitmask();
count += u64::from(gc_mask.count_ones());
}
for &b in tail {
let folded = b | 0x20;
if folded == b'g' || folded == b'c' {
count += 1;
}
}
count
}
#[must_use]
pub fn count_n_case_insensitive(seq: &[u8]) -> u64 {
let case = u8x16::splat(0x20);
let n_lower = u8x16::splat(b'n');
let mut count = 0u64;
let chunks = seq.chunks_exact(16);
let tail = chunks.remainder();
for chunk in chunks {
let v = u8x16::new(chunk.try_into().unwrap()) | case;
count += u64::from(v.simd_eq(n_lower).to_bitmask().count_ones());
}
for &b in tail {
if b | 0x20 == b'n' {
count += 1;
}
}
count
}
#[cfg(test)]
mod tests {
use super::*;
fn scalar_count_ge_q(qual: &[u8], threshold: u8) -> u64 {
qual.iter().filter(|&&q| q >= threshold).count() as u64
}
fn scalar_count_lt_q(qual: &[u8], threshold: u8) -> u64 {
qual.iter().filter(|&&q| q < threshold).count() as u64
}
fn scalar_count_gc(seq: &[u8]) -> u64 {
seq.iter().filter(|&&b| matches!(b, b'G' | b'C' | b'g' | b'c')).count() as u64
}
fn scalar_count_n(seq: &[u8]) -> u64 {
seq.iter().filter(|&&b| matches!(b, b'N' | b'n')).count() as u64
}
struct XorShift(u64);
impl XorShift {
fn new(seed: u64) -> Self {
Self(seed.max(1))
}
fn next_u64(&mut self) -> u64 {
let mut x = self.0;
x ^= x >> 12;
x ^= x << 25;
x ^= x >> 27;
self.0 = x;
x.wrapping_mul(0x2545_F491_4F6C_DD1D)
}
fn fill_byte(&mut self, buf: &mut [u8], alphabet: &[u8]) {
for b in buf {
let r = self.next_u64();
#[allow(clippy::cast_possible_truncation)]
let idx = (r as usize) % alphabet.len();
*b = alphabet[idx];
}
}
}
#[test]
fn count_bases_ge_q_empty() {
assert_eq!(count_bases_ge_q(&[], 20), 0);
}
#[test]
fn count_bases_ge_q_all_pass() {
let qual = vec![40u8; 100];
assert_eq!(count_bases_ge_q(&qual, 20), 100);
}
#[test]
fn count_bases_ge_q_all_fail() {
let qual = vec![10u8; 100];
assert_eq!(count_bases_ge_q(&qual, 20), 0);
}
#[test]
fn count_bases_ge_q_boundary_values() {
let qual: Vec<u8> = (0..64).map(|i| if i % 2 == 0 { 19 } else { 20 }).collect();
assert_eq!(count_bases_ge_q(&qual, 20), 32);
}
#[test]
fn count_bases_ge_q_matches_scalar_on_chunk_boundaries() {
let mut rng = XorShift::new(0xCAFE_F00D);
for len in [0usize, 1, 15, 16, 17, 31, 32, 33, 63, 64, 65, 150, 300] {
let mut qual = vec![0u8; len];
rng.fill_byte(&mut qual, &(0u8..=60).collect::<Vec<u8>>());
for t in [0u8, 1, 10, 20, 30, 40, 60, 255] {
assert_eq!(
count_bases_ge_q(&qual, t),
scalar_count_ge_q(&qual, t),
"len={len} t={t}",
);
}
}
}
#[test]
fn count_bases_lt_q_empty() {
assert_eq!(count_bases_lt_q(&[], 20), 0);
}
#[test]
fn count_bases_lt_q_matches_scalar_on_chunk_boundaries() {
let mut rng = XorShift::new(0xDEAD_BEEF);
for len in [0usize, 1, 15, 16, 17, 31, 32, 33, 150] {
let mut qual = vec![0u8; len];
rng.fill_byte(&mut qual, &(0u8..=60).collect::<Vec<u8>>());
for t in [0u8, 1, 20, 30, 255] {
assert_eq!(
count_bases_lt_q(&qual, t),
scalar_count_lt_q(&qual, t),
"len={len} t={t}",
);
}
}
}
#[test]
fn count_bases_ge_q_and_lt_q_partition_the_slice() {
let mut rng = XorShift::new(0x1234_5678);
let mut qual = vec![0u8; 200];
rng.fill_byte(&mut qual, &(0u8..=60).collect::<Vec<u8>>());
for t in [0u8, 1, 20, 30, 40] {
assert_eq!(count_bases_ge_q(&qual, t) + count_bases_lt_q(&qual, t), qual.len() as u64,);
}
}
#[test]
fn count_gc_empty() {
assert_eq!(count_gc_case_insensitive(&[]), 0);
}
#[test]
fn count_gc_case_insensitive_basic() {
assert_eq!(count_gc_case_insensitive(b"ACGT"), 2);
assert_eq!(count_gc_case_insensitive(b"acgt"), 2);
assert_eq!(count_gc_case_insensitive(b"AaCcGgTt"), 4);
assert_eq!(count_gc_case_insensitive(b"NNNNNN"), 0);
assert_eq!(count_gc_case_insensitive(b""), 0);
}
#[test]
fn count_gc_case_insensitive_rejects_ambiguity_codes() {
assert_eq!(count_gc_case_insensitive(b"SSSS"), 0);
assert_eq!(count_gc_case_insensitive(b"KKKK"), 0);
}
#[test]
fn count_gc_all_gc_boundary_lengths() {
for len in [15usize, 16, 17] {
let seq: Vec<u8> = (0..len).map(|i| if i % 2 == 0 { b'G' } else { b'C' }).collect();
assert_eq!(count_gc_case_insensitive(&seq), len as u64, "len={len}");
}
for len in [15usize, 16, 17] {
let seq = vec![b'g'; len];
assert_eq!(count_gc_case_insensitive(&seq), len as u64, "len={len} (all g)");
}
}
#[test]
fn count_gc_matches_scalar_on_chunk_boundaries() {
let mut rng = XorShift::new(0xAA55_AA55);
let alphabet: &[u8] = b"ACGTNacgtnSKYMRW\x00\x80\xFF";
for len in [0usize, 1, 15, 16, 17, 31, 32, 33, 64, 150, 300] {
let mut seq = vec![0u8; len];
rng.fill_byte(&mut seq, alphabet);
assert_eq!(count_gc_case_insensitive(&seq), scalar_count_gc(&seq), "len={len}");
}
}
#[test]
fn count_n_empty() {
assert_eq!(count_n_case_insensitive(&[]), 0);
}
#[test]
fn count_n_case_insensitive_basic() {
assert_eq!(count_n_case_insensitive(b"NNNN"), 4);
assert_eq!(count_n_case_insensitive(b"nnnn"), 4);
assert_eq!(count_n_case_insensitive(b"NnNn"), 4);
assert_eq!(count_n_case_insensitive(b"ACGT"), 0);
}
#[test]
fn count_n_matches_scalar_on_chunk_boundaries() {
let mut rng = XorShift::new(0x3141_5926);
let alphabet: &[u8] = b"ACGTNacgtn\x00\x80\xFF";
for len in [0usize, 1, 15, 16, 17, 31, 32, 33, 64, 150] {
let mut seq = vec![0u8; len];
rng.fill_byte(&mut seq, alphabet);
assert_eq!(count_n_case_insensitive(&seq), scalar_count_n(&seq), "len={len}");
}
}
}