use rustc_hash::FxHashMap;
use super::base_content::BaseContent;
use super::duplication::DuplicationStats;
use super::gc::GcStats;
use super::kmer::KmerStats;
use super::length::LengthStats;
use super::quality::QualityStats;
use super::stats::{FilterFailures, QcStats, QcSummary};
use super::Mode;
// Indices into each per-cycle `[u64; 5]` base-count bucket.
const A_IDX: usize = 0;
const T_IDX: usize = 1;
const G_IDX: usize = 2;
const C_IDX: usize = 3;
// Any byte that is not A/C/G/T (either case) falls into the N bucket.
const N_IDX: usize = 4;
// Byte -> base-bucket lookup, built at compile time; case-insensitive,
// everything unrecognized defaults to `N_IDX`.
const BASE_TO_IDX: [usize; 256] = {
let mut table = [N_IDX; 256];
table[b'A' as usize] = A_IDX;
table[b'a' as usize] = A_IDX;
table[b'T' as usize] = T_IDX;
table[b't' as usize] = T_IDX;
table[b'G' as usize] = G_IDX;
table[b'g' as usize] = G_IDX;
table[b'C' as usize] = C_IDX;
table[b'c' as usize] = C_IDX;
table
};
// Byte -> 2-bit base encoding (A=0, C=1, G=2, T=3) used to pack 5-mers;
// 255 marks an ambiguous base that resets the rolling k-mer window.
const BASE_TO_BITS: [u8; 256] = {
let mut table = [255u8; 256];
table[b'A' as usize] = 0;
table[b'a' as usize] = 0;
table[b'C' as usize] = 1;
table[b'c' as usize] = 1;
table[b'G' as usize] = 2;
table[b'g' as usize] = 2;
table[b'T' as usize] = 3;
table[b't' as usize] = 3;
table
};
// 4^5 = 1024 possible 5-mers over the 2-bit alphabet (10 bits).
const FIVEMER_ARRAY_SIZE: usize = 1024;
// Number of leading reads hashed for the duplication-rate estimate.
const DEFAULT_DUP_SAMPLE_SIZE: u64 = 100_000;
/// FNV-1a hash over the sequence bytes, uppercased first so that
/// `acgt` and `ACGT` hash identically.
#[inline]
fn fnv_hash(seq: &[u8]) -> u64 {
    // Standard 64-bit FNV-1a offset basis and prime.
    const FNV_OFFSET: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;
    seq.iter().fold(FNV_OFFSET, |acc, &byte| {
        (acc ^ byte.to_ascii_uppercase() as u64).wrapping_mul(FNV_PRIME)
    })
}
/// Streaming QC accumulator: every counter is updated in a single pass per
/// read by `update_fast`, and partial accumulators can be combined with
/// `merge`/`merge_ref` before conversion to a `QcStats` report.
#[derive(Debug, Clone)]
pub struct FastQcStats {
// Per-cycle A/T/G/C/N counts, indexed by the *_IDX constants.
base_counts: Vec<[u64; 5]>,
// Per-cycle sum of (clamped) Phred scores.
qual_sums: Vec<u64>,
// Per-cycle number of bases observed.
qual_counts: Vec<u64>,
// Per-cycle count of bases with quality >= 20 (includes >= 30).
q20_counts: Vec<u64>,
// Per-cycle count of bases with quality >= 30.
q30_counts: Vec<u64>,
// Global histogram of clamped Phred scores, 94 bins (0..=93).
qual_histogram: Vec<u64>,
// Per-read GC% histogram, one bin per integer percent 0..=100.
gc_histogram: [u64; 101],
// Read-length -> read-count map.
length_distribution: std::collections::HashMap<usize, u64>,
total_reads: u64,
total_bases: u64,
// Total G/C bases across all reads.
total_gc: u64,
// Sum of all clamped per-base quality scores.
total_qual_sum: u64,
// usize::MAX sentinel until the first read is seen.
min_length: usize,
max_length: usize,
// Counts of each packed 5-mer (see BASE_TO_BITS).
kmer_counts: [u64; FIVEMER_ARRAY_SIZE],
total_kmers: u64,
// FNV hash of each sampled read -> occurrence count, for duplication rate.
dup_hashes: FxHashMap<u64, u64>,
// True while still within the duplication sampling window.
dup_sampling_active: bool,
// Number of leading reads to sample for duplication (0 when disabled).
dup_sample_size: u64,
// Master switch for duplication estimation.
dup_eval_enabled: bool,
filter_failures: FilterFailures,
// Overrepresented-sequence / k-mer tracker (may be disabled).
kmer_stats: KmerStats,
mode: Mode,
}
impl Default for FastQcStats {
fn default() -> Self {
Self::new(Mode::Short)
}
}
impl FastQcStats {
/// Creates an empty collector with every feature enabled, sized for the
/// typical read length of `mode`.
pub fn new(mode: Mode) -> Self {
    let cap = mode.default_capacity();
    Self {
        // Per-cycle accumulators pre-sized to the expected read length.
        base_counts: Vec::with_capacity(cap),
        qual_sums: Vec::with_capacity(cap),
        qual_counts: Vec::with_capacity(cap),
        q20_counts: Vec::with_capacity(cap),
        q30_counts: Vec::with_capacity(cap),
        // Phred scores are clamped to 0..=93, hence 94 bins.
        qual_histogram: vec![0u64; 94],
        gc_histogram: [0u64; 101],
        length_distribution: std::collections::HashMap::new(),
        total_reads: 0,
        total_bases: 0,
        total_gc: 0,
        total_qual_sum: 0,
        // Sentinel: the first read always lowers this.
        min_length: usize::MAX,
        max_length: 0,
        kmer_counts: [0u64; FIVEMER_ARRAY_SIZE],
        total_kmers: 0,
        dup_hashes: FxHashMap::default(),
        dup_sampling_active: true,
        dup_sample_size: DEFAULT_DUP_SAMPLE_SIZE,
        dup_eval_enabled: true,
        filter_failures: FilterFailures::default(),
        kmer_stats: KmerStats::new(),
        mode,
    }
}
pub fn with_duplication_disabled(mode: Mode) -> Self {
let capacity = mode.default_capacity();
Self {
base_counts: Vec::with_capacity(capacity),
qual_sums: Vec::with_capacity(capacity),
qual_counts: Vec::with_capacity(capacity),
q20_counts: Vec::with_capacity(capacity),
q30_counts: Vec::with_capacity(capacity),
qual_histogram: vec![0u64; 94],
gc_histogram: [0u64; 101],
length_distribution: std::collections::HashMap::new(),
total_reads: 0,
total_bases: 0,
total_gc: 0,
total_qual_sum: 0,
min_length: usize::MAX,
max_length: 0,
kmer_counts: [0u64; FIVEMER_ARRAY_SIZE],
total_kmers: 0,
dup_hashes: FxHashMap::default(),
dup_sampling_active: false,
dup_sample_size: 0,
dup_eval_enabled: false,
filter_failures: FilterFailures::default(),
kmer_stats: KmerStats::new(),
mode,
}
}
pub fn with_full_config(
mode: Mode,
dup_enabled: bool,
overrep_enabled: bool,
overrep_sampling: u32,
) -> Self {
let capacity = mode.default_capacity();
let kmer_stats = if overrep_enabled {
KmerStats::with_sampling_rate(overrep_sampling)
} else {
KmerStats::disabled()
};
Self {
base_counts: Vec::with_capacity(capacity),
qual_sums: Vec::with_capacity(capacity),
qual_counts: Vec::with_capacity(capacity),
q20_counts: Vec::with_capacity(capacity),
q30_counts: Vec::with_capacity(capacity),
qual_histogram: vec![0u64; 94],
gc_histogram: [0u64; 101],
length_distribution: std::collections::HashMap::new(),
total_reads: 0,
total_bases: 0,
total_gc: 0,
total_qual_sum: 0,
min_length: usize::MAX,
max_length: 0,
kmer_counts: [0u64; FIVEMER_ARRAY_SIZE],
total_kmers: 0,
dup_hashes: FxHashMap::default(),
dup_sampling_active: dup_enabled,
dup_sample_size: if dup_enabled { DEFAULT_DUP_SAMPLE_SIZE } else { 0 },
dup_eval_enabled: dup_enabled,
filter_failures: FilterFailures::default(),
kmer_stats,
mode,
}
}
/// Convenience constructor for short-read mode.
pub fn new_short() -> Self {
Self::new(Mode::Short)
}
/// Convenience constructor for long-read mode.
pub fn new_long() -> Self {
Self::new(Mode::Long)
}
/// Grows every per-cycle vector to at least `len` entries; never shrinks.
/// All five vectors are kept at the same length.
#[inline]
fn ensure_capacity(&mut self, len: usize) {
    if self.base_counts.len() >= len {
        return;
    }
    self.base_counts.resize(len, [0u64; 5]);
    self.qual_sums.resize(len, 0);
    self.qual_counts.resize(len, 0);
    self.q20_counts.resize(len, 0);
    self.q30_counts.resize(len, 0);
}
/// Ingests one read's sequence and quality bytes, updating every
/// accumulator in a single pass over the bases.
///
/// NOTE(review): `qual[i]` is indexed in lock-step with `seq`, so this
/// assumes `qual.len() >= seq.len()` and panics otherwise — confirm the
/// upstream FASTQ parser guarantees equal lengths.
#[inline]
pub fn update_fast(&mut self, seq: &[u8], qual: &[u8]) {
    let len = seq.len();
    // Empty reads contribute nothing (and would break the GC division below).
    if len == 0 {
        return;
    }
    self.ensure_capacity(len);
    self.total_reads += 1;
    self.total_bases += len as u64;
    self.min_length = self.min_length.min(len);
    self.max_length = self.max_length.max(len);
    *self.length_distribution.entry(len).or_insert(0) += 1;
    let mut gc_count = 0u64;
    // Rolling 2-bit-per-base 5-mer; `kmer_len` counts consecutive valid
    // (A/C/G/T) bases since the last ambiguous base.
    let mut kmer: u32 = 0;
    let mut kmer_len = 0u8;
    for i in 0..len {
        let base = seq[i];
        let q = qual[i];
        let b_idx = BASE_TO_IDX[base as usize];
        self.base_counts[i][b_idx] += 1;
        // Phred+33 decode, clamped into the histogram's 0..=93 range.
        let q_val = q.saturating_sub(33) as u64;
        let clamped_q = q_val.min(93);
        self.qual_sums[i] += clamped_q;
        self.qual_counts[i] += 1;
        self.qual_histogram[clamped_q as usize] += 1;
        self.total_qual_sum += clamped_q;
        if clamped_q >= 30 {
            // Q30 bases also count toward the Q20 tally (Q20 means >= 20).
            self.q30_counts[i] += 1;
            self.q20_counts[i] += 1;
        } else if clamped_q >= 20 {
            self.q20_counts[i] += 1;
        }
        if base == b'G' || base == b'g' || base == b'C' || base == b'c' {
            gc_count += 1;
        }
        let base_bits = BASE_TO_BITS[base as usize];
        if base_bits == 255 {
            // Ambiguous base: restart the 5-mer window. Stale bits left in
            // `kmer` are fully shifted out before `kmer_len` reaches 5 again.
            kmer_len = 0;
        } else {
            // Shift in 2 bits, keep only the low 10 bits (five bases).
            kmer = ((kmer << 2) & 0x3FF) | (base_bits as u32);
            kmer_len = (kmer_len + 1).min(5);
            if kmer_len >= 5 {
                self.kmer_counts[kmer as usize] += 1;
                self.total_kmers += 1;
            }
        }
    }
    // Integer division truncates the per-read GC percentage toward zero.
    let gc_pct = ((gc_count * 100) / len as u64).min(100) as usize;
    self.gc_histogram[gc_pct] += 1;
    self.total_gc += gc_count;
    if self.dup_eval_enabled {
        // Only the first `dup_sample_size` reads are hashed, bounding memory.
        if self.dup_sampling_active && self.total_reads <= self.dup_sample_size {
            let hash = fnv_hash(seq);
            *self.dup_hashes.entry(hash).or_insert(0) += 1;
        }
        if self.total_reads >= self.dup_sample_size {
            self.dup_sampling_active = false;
        }
    }
    self.kmer_stats.update(seq, self.total_reads);
}
/// Merges `other` into `self`, consuming it.
///
/// The body previously duplicated `merge_ref` line-for-line; delegating
/// removes the duplication so the two merge paths cannot drift apart.
pub fn merge(&mut self, other: FastQcStats) {
    self.merge_ref(&other);
}
/// Adds every accumulator from `other` into `self` without consuming it.
pub fn merge_ref(&mut self, other: &FastQcStats) {
    // Grow our per-cycle vectors first so zipping covers every cycle that
    // `other` has seen (self's vectors end up at least as long as other's).
    let max_len = self.base_counts.len().max(other.base_counts.len());
    self.ensure_capacity(max_len);
    for (dst, src) in self.base_counts.iter_mut().zip(&other.base_counts) {
        for (d, s) in dst.iter_mut().zip(src.iter()) {
            *d += *s;
        }
    }
    for (dst, src) in self.qual_sums.iter_mut().zip(&other.qual_sums) {
        *dst += *src;
    }
    for (dst, src) in self.qual_counts.iter_mut().zip(&other.qual_counts) {
        *dst += *src;
    }
    for (dst, src) in self.q20_counts.iter_mut().zip(&other.q20_counts) {
        *dst += *src;
    }
    for (dst, src) in self.q30_counts.iter_mut().zip(&other.q30_counts) {
        *dst += *src;
    }
    // Histograms and the k-mer table have fixed, matching sizes.
    for (dst, src) in self.qual_histogram.iter_mut().zip(&other.qual_histogram) {
        *dst += *src;
    }
    for (dst, src) in self.gc_histogram.iter_mut().zip(&other.gc_histogram) {
        *dst += *src;
    }
    for (dst, src) in self.kmer_counts.iter_mut().zip(&other.kmer_counts) {
        *dst += *src;
    }
    for (&len, &count) in &other.length_distribution {
        *self.length_distribution.entry(len).or_default() += count;
    }
    for (&hash, &count) in &other.dup_hashes {
        *self.dup_hashes.entry(hash).or_default() += count;
    }
    self.total_reads += other.total_reads;
    self.total_bases += other.total_bases;
    self.total_gc += other.total_gc;
    self.total_qual_sum += other.total_qual_sum;
    self.total_kmers += other.total_kmers;
    // Length extremes only move if `other` actually saw reads; otherwise
    // its min is the usize::MAX sentinel and its max is 0.
    if other.total_reads > 0 {
        self.min_length = self.min_length.min(other.min_length);
        self.max_length = self.max_length.max(other.max_length);
    }
    self.filter_failures.merge(&other.filter_failures);
    self.kmer_stats.merge(&other.kmer_stats);
}
/// Number of reads ingested so far.
pub fn total_reads(&self) -> u64 {
    self.total_reads
}
/// Number of bases ingested so far.
pub fn total_bases(&self) -> u64 {
    self.total_bases
}
/// Mean read length in bases; 0.0 when no reads have been seen.
pub fn mean_length(&self) -> f64 {
    match self.total_reads {
        0 => 0.0,
        reads => self.total_bases as f64 / reads as f64,
    }
}
/// Mean per-base quality score; 0.0 when no bases have been seen.
pub fn mean_quality(&self) -> f64 {
    match self.total_bases {
        0 => 0.0,
        bases => self.total_qual_sum as f64 / bases as f64,
    }
}
/// Overall GC content as a percentage; 0.0 when no bases have been seen.
pub fn mean_gc(&self) -> f64 {
    match self.total_bases {
        0 => 0.0,
        bases => (self.total_gc as f64 / bases as f64) * 100.0,
    }
}
/// Percent of all bases whose quality is at or above `threshold`
/// (tail sum of the global quality histogram).
fn phred_tail_percent(&self, threshold: usize) -> f64 {
    if self.total_bases == 0 {
        return 0.0;
    }
    let tail: u64 = self.qual_histogram[threshold..].iter().sum();
    (tail as f64 / self.total_bases as f64) * 100.0
}
/// Percent of bases with quality >= 20.
pub fn q20_percent(&self) -> f64 {
    self.phred_tail_percent(20)
}
/// Percent of bases with quality >= 30.
pub fn q30_percent(&self) -> f64 {
    self.phred_tail_percent(30)
}
/// Read-length N50: the largest length L such that reads of length >= L
/// together contain at least half of all bases. Returns 0 when empty.
pub fn n50(&self) -> usize {
    if self.total_reads == 0 {
        return 0;
    }
    let mut by_len: Vec<(usize, u64)> = self
        .length_distribution
        .iter()
        .map(|(&len, &count)| (len, count))
        .collect();
    // Longest first; lengths are unique map keys, so unstable sort is safe.
    by_len.sort_unstable_by_key(|&(len, _)| std::cmp::Reverse(len));
    let target = self.total_bases / 2;
    let mut covered: u64 = 0;
    for (len, count) in by_len {
        covered += (len as u64) * count;
        if covered >= target {
            return len;
        }
    }
    0
}
/// Estimated duplicate-read rate, in percent, over the sampled reads:
/// (sampled - unique) / sampled * 100.
pub fn duplication_rate(&self) -> f64 {
    let sampled: u64 = self.dup_hashes.values().sum();
    if sampled == 0 {
        return 0.0;
    }
    let unique = self.dup_hashes.len() as u64;
    let dupes = sampled.saturating_sub(unique);
    (dupes as f64 / sampled as f64) * 100.0
}
/// The analysis mode this collector was created with.
pub fn mode(&self) -> Mode {
self.mode
}
/// True when no reads have been ingested yet.
pub fn is_empty(&self) -> bool {
self.total_reads == 0
}
/// Mutable access to the filter-failure counters.
pub fn filter_failures_mut(&mut self) -> &mut FilterFailures {
&mut self.filter_failures
}
/// Read-only access to the filter-failure counters.
pub fn filter_failures(&self) -> &FilterFailures {
&self.filter_failures
}
/// Builds a snapshot of the headline metrics as a `QcSummary`.
pub fn summary(&self) -> QcSummary {
    let mean_length = self.mean_length();
    let mean_quality = self.mean_quality();
    let mean_gc = self.mean_gc();
    let q20_percent = self.q20_percent();
    let q30_percent = self.q30_percent();
    let n50 = self.n50();
    let duplication_rate = self.duplication_rate();
    QcSummary {
        total_reads: self.total_reads,
        total_bases: self.total_bases,
        mean_length,
        mean_quality,
        mean_gc,
        q20_percent,
        q30_percent,
        n50,
        duplication_rate,
    }
}
/// Per-cycle A/T/G/C/N counts (indexed by the *_IDX constants).
pub fn base_counts(&self) -> &[[u64; 5]] {
&self.base_counts
}
/// Global quality-score histogram (94 bins, clamped Phred 0..=93).
pub fn qual_histogram(&self) -> &[u64] {
&self.qual_histogram
}
/// Per-read GC% histogram, one bin per integer percent.
pub fn gc_histogram(&self) -> &[u64; 101] {
&self.gc_histogram
}
/// Read-length -> read-count distribution.
pub fn length_distribution(&self) -> &std::collections::HashMap<usize, u64> {
&self.length_distribution
}
/// Counts of each packed 5-mer.
pub fn kmer_counts(&self) -> &[u64; FIVEMER_ARRAY_SIZE] {
&self.kmer_counts
}
/// Total number of 5-mers counted.
pub fn total_kmers(&self) -> u64 {
self.total_kmers
}
/// Converts the raw accumulators into the full `QcStats` report type,
/// cloning the internal state so `self` remains usable afterwards.
pub fn to_qc_stats(&self) -> QcStats {
let base_content = BaseContent::from_raw_counts(self.base_counts.clone());
let quality = QualityStats::from_raw(
&self.qual_sums,
&self.qual_counts,
&self.qual_histogram,
self.total_qual_sum,
self.total_bases,
);
let gc = GcStats::from_raw(
&self.gc_histogram,
self.total_gc,
self.total_bases,
self.total_reads,
);
let length = LengthStats::from_raw(
self.length_distribution.clone(),
// Replace the usize::MAX sentinel with 0 when no reads were seen.
if self.total_reads > 0 { self.min_length } else { 0 },
self.max_length,
self.total_bases,
self.total_reads,
);
let kmer = self.kmer_stats.clone();
let duplication = DuplicationStats::from_raw(
self.dup_hashes.clone(),
self.dup_sample_size,
);
let mut qc = QcStats::new(self.mode);
qc.total_reads = self.total_reads;
qc.total_bases = self.total_bases;
qc.base_content = base_content;
qc.quality = quality;
qc.gc = gc;
qc.length = length;
qc.kmer = kmer;
qc.duplication = duplication;
qc.filter_failures = self.filter_failures.clone();
qc
}
}
impl From<FastQcStats> for QcStats {
/// Finalizes the accumulator into a full report via `to_qc_stats`.
fn from(fast: FastQcStats) -> Self {
fast.to_qc_stats()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // Encodes raw Phred scores as Phred+33 ASCII quality bytes.
    fn make_qual(scores: &[u8]) -> Vec<u8> {
        scores.iter().map(|&s| s + 33).collect()
    }

    #[test]
    fn test_fast_qc_stats_new() {
        let stats = FastQcStats::new(Mode::Short);
        assert_eq!(stats.total_reads(), 0);
        assert_eq!(stats.total_bases(), 0);
        assert!(stats.is_empty());
    }

    #[test]
    fn test_fast_qc_stats_update_simple() {
        let mut stats = FastQcStats::new(Mode::Short);
        let seq = b"ATGC";
        let qual = make_qual(&[40, 40, 40, 40]);
        stats.update_fast(seq, &qual);
        assert_eq!(stats.total_reads(), 1);
        assert_eq!(stats.total_bases(), 4);
        assert!(!stats.is_empty());
    }

    #[test]
    fn test_fast_qc_stats_base_content() {
        let mut stats = FastQcStats::new(Mode::Short);
        stats.update_fast(b"ATGC", &make_qual(&[40; 4]));
        let counts = stats.base_counts();
        // One base observed per cycle, each in its own bucket.
        assert_eq!(counts[0][A_IDX], 1);
        assert_eq!(counts[1][T_IDX], 1);
        assert_eq!(counts[2][G_IDX], 1);
        assert_eq!(counts[3][C_IDX], 1);
    }

    #[test]
    fn test_fast_qc_stats_quality() {
        let mut stats = FastQcStats::new(Mode::Short);
        let qual = make_qual(&[40, 40, 40, 40]);
        stats.update_fast(b"ATGC", &qual);
        assert!((stats.mean_quality() - 40.0).abs() < 0.001);
    }

    #[test]
    fn test_fast_qc_stats_gc() {
        let mut stats = FastQcStats::new(Mode::Short);
        stats.update_fast(b"GGCC", &make_qual(&[40; 4]));
        assert!((stats.mean_gc() - 100.0).abs() < 0.001);
    }

    #[test]
    fn test_fast_qc_stats_gc_50_percent() {
        let mut stats = FastQcStats::new(Mode::Short);
        stats.update_fast(b"ATGC", &make_qual(&[40; 4]));
        assert!((stats.mean_gc() - 50.0).abs() < 0.001);
    }

    #[test]
    fn test_fast_qc_stats_kmer() {
        let mut stats = FastQcStats::new(Mode::Short);
        // Exactly one complete 5-mer in a 5-base read.
        stats.update_fast(b"ACGTA", &make_qual(&[40; 5]));
        assert_eq!(stats.total_kmers(), 1);
    }

    #[test]
    fn test_fast_qc_stats_kmer_sliding() {
        let mut stats = FastQcStats::new(Mode::Short);
        // Sliding window: 6 bases yield two overlapping 5-mers.
        stats.update_fast(b"ACGTAC", &make_qual(&[40; 6]));
        assert_eq!(stats.total_kmers(), 2);
    }

    #[test]
    fn test_fast_qc_stats_kmer_with_n() {
        let mut stats = FastQcStats::new(Mode::Short);
        // The N at position 2 resets the window; only the 6-base tail counts.
        stats.update_fast(b"ACNTACGTA", &make_qual(&[40; 9]));
        assert_eq!(stats.total_kmers(), 2);
    }

    #[test]
    fn test_fast_qc_stats_length() {
        let mut stats = FastQcStats::new(Mode::Short);
        stats.update_fast(b"ATGC", &make_qual(&[40; 4]));
        stats.update_fast(b"ATGCATGC", &make_qual(&[40; 8]));
        assert!((stats.mean_length() - 6.0).abs() < 0.001);
    }

    #[test]
    fn test_fast_qc_stats_q20_q30() {
        let mut stats = FastQcStats::new(Mode::Short);
        // Qualities 10, 25, 35: one base >= 30, two bases >= 20.
        let qual = vec![10 + 33, 25 + 33, 35 + 33];
        stats.update_fast(b"ATG", &qual);
        assert!((stats.q20_percent() - 66.666).abs() < 0.1);
        assert!((stats.q30_percent() - 33.333).abs() < 0.1);
    }

    #[test]
    fn test_fast_qc_stats_merge() {
        let mut stats1 = FastQcStats::new(Mode::Short);
        stats1.update_fast(b"ATGC", &make_qual(&[40; 4]));
        let mut stats2 = FastQcStats::new(Mode::Short);
        stats2.update_fast(b"GCTA", &make_qual(&[30; 4]));
        stats1.merge(stats2);
        assert_eq!(stats1.total_reads(), 2);
        assert_eq!(stats1.total_bases(), 8);
    }

    #[test]
    fn test_fast_qc_stats_merge_different_lengths() {
        let mut stats1 = FastQcStats::new(Mode::Short);
        stats1.update_fast(b"AT", &make_qual(&[40; 2]));
        let mut stats2 = FastQcStats::new(Mode::Short);
        stats2.update_fast(b"GCTA", &make_qual(&[30; 4]));
        stats1.merge(stats2);
        assert_eq!(stats1.total_reads(), 2);
        // Merge must grow the per-cycle vectors to the longer read length.
        assert_eq!(stats1.base_counts().len(), 4);
    }

    #[test]
    fn test_fast_qc_stats_duplication() {
        let mut stats = FastQcStats::new(Mode::Short);
        // Three identical reads: 1 unique of 3 sampled => 2/3 duplicates.
        stats.update_fast(b"ATGC", &make_qual(&[40; 4]));
        stats.update_fast(b"ATGC", &make_qual(&[40; 4]));
        stats.update_fast(b"ATGC", &make_qual(&[40; 4]));
        assert!((stats.duplication_rate() - 66.666).abs() < 0.1);
    }

    #[test]
    fn test_fast_qc_stats_duplication_sampling() {
        let mut stats = FastQcStats::new(Mode::Short);
        // Only the first DEFAULT_DUP_SAMPLE_SIZE reads should be hashed.
        for i in 0..150_000 {
            stats.update_fast(format!("SEQ{:06}", i).as_bytes(), &make_qual(&[40; 9]));
        }
        assert!(stats.dup_hashes.len() <= 100_000);
    }

    #[test]
    fn test_fast_qc_stats_empty_sequence() {
        let mut stats = FastQcStats::new(Mode::Short);
        // Empty reads are ignored entirely.
        stats.update_fast(b"", &[]);
        assert_eq!(stats.total_reads(), 0);
        assert!(stats.is_empty());
    }

    #[test]
    fn test_fast_qc_stats_n50() {
        let mut stats = FastQcStats::new(Mode::Long);
        // Total 1500 bases; 500 + 400 = 900 >= 750, so N50 is 400.
        stats.update_fast(&vec![b'A'; 100], &make_qual(&vec![40; 100]));
        stats.update_fast(&vec![b'A'; 200], &make_qual(&vec![40; 200]));
        stats.update_fast(&vec![b'A'; 300], &make_qual(&vec![40; 300]));
        stats.update_fast(&vec![b'A'; 400], &make_qual(&vec![40; 400]));
        stats.update_fast(&vec![b'A'; 500], &make_qual(&vec![40; 500]));
        assert_eq!(stats.n50(), 400);
    }

    #[test]
    fn test_fast_qc_stats_summary() {
        let mut stats = FastQcStats::new(Mode::Short);
        stats.update_fast(b"ATGC", &make_qual(&[40; 4]));
        let summary = stats.summary();
        assert_eq!(summary.total_reads, 1);
        assert_eq!(summary.total_bases, 4);
    }

    #[test]
    fn test_fast_qc_stats_case_insensitive() {
        let mut stats = FastQcStats::new(Mode::Short);
        // Lowercase bases map to the same buckets as uppercase.
        stats.update_fast(b"atgc", &make_qual(&[40; 4]));
        let counts = stats.base_counts();
        assert_eq!(counts[0][A_IDX], 1);
        assert_eq!(counts[1][T_IDX], 1);
        assert_eq!(counts[2][G_IDX], 1);
        assert_eq!(counts[3][C_IDX], 1);
    }
}