use serde::{Deserialize, Serialize};
use super::base_content::BaseContent;
use super::duplication::DuplicationStats;
use super::gc::GcStats;
use super::insert_size::InsertSizeStats;
use super::kmer::KmerStats;
use super::length::LengthStats;
use super::quality::QualityStats;
use super::Mode;
use crate::io::OwnedRecord;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QcStats {
pub total_reads: u64,
pub total_bases: u64,
pub base_content: BaseContent,
pub quality: QualityStats,
pub gc: GcStats,
pub length: LengthStats,
pub kmer: KmerStats,
pub duplication: DuplicationStats,
#[serde(default)]
pub insert_size: Option<InsertSizeStats>,
pub filter_failures: FilterFailures,
mode: Mode,
}
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct FilterFailures {
pub failed_quality: u64,
pub failed_length: u64,
pub failed_complexity: u64,
pub failed_n_rate: u64,
}
impl FilterFailures {
pub fn merge(&mut self, other: &FilterFailures) {
self.failed_quality += other.failed_quality;
self.failed_length += other.failed_length;
self.failed_complexity += other.failed_complexity;
self.failed_n_rate += other.failed_n_rate;
}
pub fn total_failed(&self) -> u64 {
self.failed_quality + self.failed_length + self.failed_complexity + self.failed_n_rate
}
}
impl Default for QcStats {
fn default() -> Self {
Self::new(Mode::Short)
}
}
impl QcStats {
pub fn new(mode: Mode) -> Self {
let capacity = mode.default_capacity();
Self {
total_reads: 0,
total_bases: 0,
base_content: BaseContent::with_capacity(capacity),
quality: QualityStats::with_capacity(capacity),
gc: GcStats::new(),
length: LengthStats::new(),
kmer: KmerStats::new(),
duplication: DuplicationStats::new(),
insert_size: None,
filter_failures: FilterFailures::default(),
mode,
}
}
pub fn new_short() -> Self {
Self::new(Mode::Short)
}
pub fn new_long() -> Self {
Self::new(Mode::Long)
}
pub fn with_duplication_disabled(mode: Mode) -> Self {
let capacity = mode.default_capacity();
Self {
total_reads: 0,
total_bases: 0,
base_content: BaseContent::with_capacity(capacity),
quality: QualityStats::with_capacity(capacity),
gc: GcStats::new(),
length: LengthStats::new(),
kmer: KmerStats::new(),
duplication: DuplicationStats::disabled(),
insert_size: None,
filter_failures: FilterFailures::default(),
mode,
}
}
pub fn with_overrepresentation_config(mode: Mode, enabled: bool, sampling_rate: u32) -> Self {
let capacity = mode.default_capacity();
let kmer = if enabled {
KmerStats::with_sampling_rate(sampling_rate)
} else {
KmerStats::disabled()
};
Self {
total_reads: 0,
total_bases: 0,
base_content: BaseContent::with_capacity(capacity),
quality: QualityStats::with_capacity(capacity),
gc: GcStats::new(),
length: LengthStats::new(),
kmer,
duplication: DuplicationStats::new(),
insert_size: None,
filter_failures: FilterFailures::default(),
mode,
}
}
pub fn with_full_config(
mode: Mode,
duplication_enabled: bool,
overrep_enabled: bool,
overrep_sampling: u32,
) -> Self {
let capacity = mode.default_capacity();
let kmer = if overrep_enabled {
KmerStats::with_sampling_rate(overrep_sampling)
} else {
KmerStats::disabled()
};
let duplication = if duplication_enabled {
DuplicationStats::new()
} else {
DuplicationStats::disabled()
};
Self {
total_reads: 0,
total_bases: 0,
base_content: BaseContent::with_capacity(capacity),
quality: QualityStats::with_capacity(capacity),
gc: GcStats::new(),
length: LengthStats::new(),
kmer,
duplication,
insert_size: None,
filter_failures: FilterFailures::default(),
mode,
}
}
#[inline]
pub fn update(&mut self, record: &OwnedRecord) {
let seq = record.seq();
let qual = record.qual();
let len = seq.len();
if len == 0 {
return;
}
self.total_reads += 1;
self.total_bases += len as u64;
self.base_content.update(seq);
self.quality.update(qual);
self.gc.update(seq);
self.length.update(len);
self.kmer.update(seq, self.total_reads);
self.duplication.update(seq, self.total_reads);
}
#[inline]
pub fn update_raw(&mut self, seq: &[u8], qual: &[u8]) {
let len = seq.len();
if len == 0 {
return;
}
self.total_reads += 1;
self.total_bases += len as u64;
self.base_content.update(seq);
self.quality.update(qual);
self.gc.update(seq);
self.length.update(len);
self.kmer.update(seq, self.total_reads);
self.duplication.update(seq, self.total_reads);
}
pub fn merge(&mut self, other: QcStats) {
self.total_reads += other.total_reads;
self.total_bases += other.total_bases;
self.base_content.merge(&other.base_content);
self.quality.merge(&other.quality);
self.gc.merge(&other.gc);
self.length.merge(&other.length);
self.kmer.merge(&other.kmer);
self.duplication.merge(&other.duplication);
match (&mut self.insert_size, &other.insert_size) {
(Some(ref mut s), Some(ref o)) => s.merge(o),
(None, Some(o)) => self.insert_size = Some(o.clone()),
_ => {}
}
self.filter_failures.merge(&other.filter_failures);
}
pub fn merge_ref(&mut self, other: &QcStats) {
self.total_reads += other.total_reads;
self.total_bases += other.total_bases;
self.base_content.merge(&other.base_content);
self.quality.merge(&other.quality);
self.gc.merge(&other.gc);
self.length.merge(&other.length);
self.kmer.merge(&other.kmer);
self.duplication.merge(&other.duplication);
match (&mut self.insert_size, &other.insert_size) {
(Some(ref mut s), Some(ref o)) => s.merge(o),
(None, Some(o)) => self.insert_size = Some(o.clone()),
_ => {}
}
self.filter_failures.merge(&other.filter_failures);
}
pub fn mode(&self) -> Mode {
self.mode
}
pub fn mean_length(&self) -> f64 {
self.length.mean_length()
}
pub fn mean_quality(&self) -> f64 {
self.quality.mean_quality()
}
pub fn mean_gc(&self) -> f64 {
self.gc.mean_gc()
}
pub fn q20_percent(&self) -> f64 {
self.quality.q20_percent()
}
pub fn q30_percent(&self) -> f64 {
self.quality.q30_percent()
}
pub fn n50(&self) -> usize {
self.length.n50()
}
pub fn duplication_rate(&self) -> f64 {
self.duplication.duplication_rate()
}
pub fn set_insert_size(&mut self, stats: InsertSizeStats) {
self.insert_size = Some(stats);
}
pub fn insert_size(&self) -> Option<&InsertSizeStats> {
self.insert_size.as_ref()
}
pub fn is_empty(&self) -> bool {
self.total_reads == 0
}
pub fn summary(&self) -> QcSummary {
QcSummary {
total_reads: self.total_reads,
total_bases: self.total_bases,
mean_length: self.mean_length(),
mean_quality: self.mean_quality(),
mean_gc: self.mean_gc(),
q20_percent: self.q20_percent(),
q30_percent: self.q30_percent(),
n50: self.n50(),
duplication_rate: self.duplication_rate(),
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QcSummary {
pub total_reads: u64,
pub total_bases: u64,
pub mean_length: f64,
pub mean_quality: f64,
pub mean_gc: f64,
pub q20_percent: f64,
pub q30_percent: f64,
pub n50: usize,
pub duplication_rate: f64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FilteringStats {
pub before: QcStats,
pub after: QcStats,
}
impl FilteringStats {
pub fn new(mode: Mode) -> Self {
Self {
before: QcStats::new(mode),
after: QcStats::new(mode),
}
}
pub fn reads_filtered(&self) -> u64 {
self.before.total_reads.saturating_sub(self.after.total_reads)
}
pub fn pass_rate(&self) -> f64 {
if self.before.total_reads == 0 {
0.0
} else {
(self.after.total_reads as f64 / self.before.total_reads as f64) * 100.0
}
}
pub fn bases_filtered(&self) -> u64 {
self.before.total_bases.saturating_sub(self.after.total_bases)
}
}
#[cfg(test)]
mod tests {
use super::*;
fn make_record(seq: &[u8], qual: &[u8]) -> OwnedRecord {
OwnedRecord::new(b"test".to_vec(), seq.to_vec(), qual.to_vec())
}
#[test]
fn test_qc_stats_new() {
let stats = QcStats::new(Mode::Short);
assert_eq!(stats.total_reads, 0);
assert_eq!(stats.total_bases, 0);
assert!(stats.is_empty());
}
#[test]
fn test_qc_stats_new_short() {
let stats = QcStats::new_short();
assert_eq!(stats.mode(), Mode::Short);
}
#[test]
fn test_qc_stats_new_long() {
let stats = QcStats::new_long();
assert_eq!(stats.mode(), Mode::Long);
}
#[test]
fn test_qc_stats_update() {
let mut stats = QcStats::new(Mode::Short);
let record = make_record(b"ATGC", b"IIII");
stats.update(&record);
assert_eq!(stats.total_reads, 1);
assert_eq!(stats.total_bases, 4);
assert!(!stats.is_empty());
}
#[test]
fn test_qc_stats_update_raw() {
let mut stats = QcStats::new(Mode::Short);
stats.update_raw(b"ATGC", b"IIII");
assert_eq!(stats.total_reads, 1);
assert_eq!(stats.total_bases, 4);
}
#[test]
fn test_qc_stats_update_empty() {
let mut stats = QcStats::new(Mode::Short);
let record = make_record(b"", b"");
stats.update(&record);
assert_eq!(stats.total_reads, 0);
assert!(stats.is_empty());
}
#[test]
fn test_qc_stats_update_multiple() {
let mut stats = QcStats::new(Mode::Short);
stats.update(&make_record(b"ATGC", b"IIII"));
stats.update(&make_record(b"GCTA", b"HHHH"));
stats.update(&make_record(b"TTAA", b"GGGG"));
assert_eq!(stats.total_reads, 3);
assert_eq!(stats.total_bases, 12);
}
#[test]
fn test_qc_stats_merge() {
let mut stats1 = QcStats::new(Mode::Short);
stats1.update(&make_record(b"ATGC", b"IIII"));
let mut stats2 = QcStats::new(Mode::Short);
stats2.update(&make_record(b"GCTA", b"HHHH"));
stats1.merge(stats2);
assert_eq!(stats1.total_reads, 2);
assert_eq!(stats1.total_bases, 8);
}
#[test]
fn test_qc_stats_merge_ref() {
let mut stats1 = QcStats::new(Mode::Short);
stats1.update(&make_record(b"ATGC", b"IIII"));
let mut stats2 = QcStats::new(Mode::Short);
stats2.update(&make_record(b"GCTA", b"HHHH"));
stats1.merge_ref(&stats2);
assert_eq!(stats1.total_reads, 2);
assert_eq!(stats2.total_reads, 1); }
#[test]
fn test_qc_stats_mean_length() {
let mut stats = QcStats::new(Mode::Short);
stats.update(&make_record(b"ATGC", b"IIII")); stats.update(&make_record(b"ATGCATGC", b"IIIIIIII"));
assert!((stats.mean_length() - 6.0).abs() < 0.001);
}
#[test]
fn test_qc_stats_mean_quality() {
let mut stats = QcStats::new(Mode::Short);
stats.update(&make_record(b"ATGC", b"IIII"));
assert!((stats.mean_quality() - 40.0).abs() < 0.001);
}
#[test]
fn test_qc_stats_mean_gc() {
let mut stats = QcStats::new(Mode::Short);
stats.update(&make_record(b"GGCC", b"IIII")); stats.update(&make_record(b"AATT", b"IIII"));
assert!((stats.mean_gc() - 50.0).abs() < 0.001);
}
#[test]
fn test_qc_stats_n50() {
let mut stats = QcStats::new(Mode::Long);
stats.update(&make_record(&vec![b'A'; 100], &vec![b'I'; 100]));
stats.update(&make_record(&vec![b'A'; 200], &vec![b'I'; 200]));
stats.update(&make_record(&vec![b'A'; 300], &vec![b'I'; 300]));
stats.update(&make_record(&vec![b'A'; 400], &vec![b'I'; 400]));
stats.update(&make_record(&vec![b'A'; 500], &vec![b'I'; 500]));
assert_eq!(stats.n50(), 400);
}
#[test]
fn test_qc_stats_summary() {
let mut stats = QcStats::new(Mode::Short);
stats.update(&make_record(b"ATGC", b"IIII"));
let summary = stats.summary();
assert_eq!(summary.total_reads, 1);
assert_eq!(summary.total_bases, 4);
}
#[test]
fn test_qc_stats_serialize() {
let mut stats = QcStats::new(Mode::Short);
stats.update(&make_record(b"ATGC", b"IIII"));
let json = serde_json::to_string(&stats).unwrap();
let stats2: QcStats = serde_json::from_str(&json).unwrap();
assert_eq!(stats.total_reads, stats2.total_reads);
assert_eq!(stats.total_bases, stats2.total_bases);
}
#[test]
fn test_filtering_stats_new() {
let fs = FilteringStats::new(Mode::Short);
assert_eq!(fs.before.total_reads, 0);
assert_eq!(fs.after.total_reads, 0);
}
#[test]
fn test_filtering_stats_reads_filtered() {
let mut fs = FilteringStats::new(Mode::Short);
fs.before.update(&make_record(b"ATGC", b"IIII"));
fs.before.update(&make_record(b"GCTA", b"!!!!")); fs.after.update(&make_record(b"ATGC", b"IIII"));
assert_eq!(fs.reads_filtered(), 1);
}
#[test]
fn test_filtering_stats_pass_rate() {
let mut fs = FilteringStats::new(Mode::Short);
for _ in 0..100 {
fs.before.update(&make_record(b"ATGC", b"IIII"));
}
for _ in 0..90 {
fs.after.update(&make_record(b"ATGC", b"IIII"));
}
assert!((fs.pass_rate() - 90.0).abs() < 0.001);
}
#[test]
fn test_filtering_stats_bases_filtered() {
let mut fs = FilteringStats::new(Mode::Short);
fs.before.update(&make_record(b"ATGCATGC", b"IIIIIIII")); fs.after.update(&make_record(b"ATGC", b"IIII"));
assert_eq!(fs.bases_filtered(), 4);
}
#[test]
fn test_qc_stats_default() {
let stats = QcStats::default();
assert_eq!(stats.mode(), Mode::Short);
}
}