use crate::core::GenomicRecordIterator;
use crate::error::Result;
use crate::formats::bam::CigarOp;
use crate::formats::sam::{SamReader, SamRecord};
use crate::parallel::Mergeable;
use crate::stats::{CategoryCounter, RunningStats};
#[derive(Debug, Clone)]
pub struct SamStats {
pub total_reads: usize,
pub mapped_reads: usize,
pub unmapped_reads: usize,
pub properly_paired: usize,
pub duplicates: usize,
pub secondary_alignments: usize,
pub supplementary_alignments: usize,
pub qc_failed: usize,
pub mapq_stats: RunningStats,
pub mapq_distribution: CategoryCounter<u8>,
pub length_stats: RunningStats,
pub length_distribution: CategoryCounter<usize>,
pub reads_per_chrom: CategoryCounter<String>,
pub high_quality_mapq10: usize,
pub high_quality_mapq20: usize,
pub high_quality_mapq30: usize,
pub first_in_pair: usize,
pub second_in_pair: usize,
pub reverse_strand: usize,
pub total_matches: u64,
pub total_insertions: u64,
pub total_deletions: u64,
pub total_soft_clips: u64,
pub total_hard_clips: u64,
}
impl Default for SamStats {
fn default() -> Self {
Self::new()
}
}
impl SamStats {
pub fn new() -> Self {
Self {
total_reads: 0,
mapped_reads: 0,
unmapped_reads: 0,
properly_paired: 0,
duplicates: 0,
secondary_alignments: 0,
supplementary_alignments: 0,
qc_failed: 0,
mapq_stats: RunningStats::new(),
mapq_distribution: CategoryCounter::new(),
length_stats: RunningStats::new(),
length_distribution: CategoryCounter::new(),
reads_per_chrom: CategoryCounter::new(),
high_quality_mapq10: 0,
high_quality_mapq20: 0,
high_quality_mapq30: 0,
first_in_pair: 0,
second_in_pair: 0,
reverse_strand: 0,
total_matches: 0,
total_insertions: 0,
total_deletions: 0,
total_soft_clips: 0,
total_hard_clips: 0,
}
}
pub fn update(&mut self, record: &SamRecord) {
self.total_reads += 1;
let flag = record.flag;
let is_unmapped = (flag & 0x4) != 0;
if is_unmapped {
self.unmapped_reads += 1;
} else {
self.mapped_reads += 1;
let mapq = record.mapq;
self.mapq_stats.push(mapq as f64);
self.mapq_distribution.increment(mapq);
self.high_quality_mapq10 += (mapq >= 10) as usize;
self.high_quality_mapq20 += (mapq >= 20) as usize;
self.high_quality_mapq30 += (mapq >= 30) as usize;
if record.rname != "*" {
self.reads_per_chrom.increment_str(&record.rname);
}
}
self.properly_paired += ((flag & 0x2) != 0) as usize;
self.duplicates += ((flag & 0x400) != 0) as usize;
self.secondary_alignments += ((flag & 0x100) != 0) as usize;
self.supplementary_alignments += ((flag & 0x800) != 0) as usize;
self.qc_failed += ((flag & 0x200) != 0) as usize;
self.first_in_pair += ((flag & 0x40) != 0) as usize;
self.second_in_pair += ((flag & 0x80) != 0) as usize;
self.reverse_strand += ((flag & 0x10) != 0) as usize;
let length = record.seq.len();
self.length_stats.push(length as f64);
self.length_distribution.increment(length);
for op in &record.cigar {
match op {
CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => {
self.total_matches += *len as u64;
}
CigarOp::Ins(len) => {
self.total_insertions += *len as u64;
}
CigarOp::Del(len) => {
self.total_deletions += *len as u64;
}
CigarOp::SoftClip(len) => {
self.total_soft_clips += *len as u64;
}
CigarOp::HardClip(len) => {
self.total_hard_clips += *len as u64;
}
_ => {}
}
}
}
pub fn par_compute(_reader: &mut SamReader, _threads: usize) -> Result<Self> {
Err(crate::error::Error::InvalidInput(
"Parallel statistics computation not yet implemented for SAM format. Use compute() instead.".to_string()
))
}
pub fn compute(reader: &mut SamReader) -> Result<Self> {
let mut stats = Self::new();
let header = reader.header().ok_or_else(|| {
crate::error::Error::InvalidInput("SAM header not read. Call read_header() first.".to_string())
})?;
let _header = header.clone();
while let Some(record) = reader.next_record()? {
stats.update(&record);
}
Ok(stats)
}
pub fn mean_length(&self) -> Option<f64> {
self.length_stats.mean()
}
pub fn std_length(&self) -> Option<f64> {
self.length_stats.std_dev()
}
pub fn min_length(&self) -> Option<f64> {
self.length_stats.min()
}
pub fn max_length(&self) -> Option<f64> {
self.length_stats.max()
}
pub fn mean_mapq(&self) -> Option<f64> {
self.mapq_stats.mean()
}
pub fn std_mapq(&self) -> Option<f64> {
self.mapq_stats.std_dev()
}
pub fn mapping_rate(&self) -> f64 {
if self.total_reads == 0 {
0.0
} else {
(self.mapped_reads as f64 / self.total_reads as f64) * 100.0
}
}
pub fn properly_paired_rate(&self) -> f64 {
if self.total_reads == 0 {
0.0
} else {
(self.properly_paired as f64 / self.total_reads as f64) * 100.0
}
}
pub fn duplicate_rate(&self) -> f64 {
if self.total_reads == 0 {
0.0
} else {
(self.duplicates as f64 / self.total_reads as f64) * 100.0
}
}
pub fn percent_mapq10(&self) -> f64 {
if self.total_reads == 0 {
0.0
} else {
(self.high_quality_mapq10 as f64 / self.total_reads as f64) * 100.0
}
}
pub fn percent_mapq20(&self) -> f64 {
if self.total_reads == 0 {
0.0
} else {
(self.high_quality_mapq20 as f64 / self.total_reads as f64) * 100.0
}
}
pub fn percent_mapq30(&self) -> f64 {
if self.total_reads == 0 {
0.0
} else {
(self.high_quality_mapq30 as f64 / self.total_reads as f64) * 100.0
}
}
pub fn print_summary(&self) {
println!("=== SAM Statistics ===\n");
println!("Basic Counts:");
println!(" Total reads: {:>12}", self.total_reads);
println!(" Mapped reads: {:>12}", self.mapped_reads);
println!(" Unmapped reads: {:>12}", self.unmapped_reads);
println!();
println!("Mapping Metrics:");
println!(" Mapping rate: {:>11.1}%", self.mapping_rate());
println!(
" Mean MAPQ: {:>12.2}",
self.mean_mapq().unwrap_or(0.0)
);
println!(
" Std Dev MAPQ: {:>12.2}",
self.std_mapq().unwrap_or(0.0)
);
println!(" MAPQ >= 10: {:>11.1}%", self.percent_mapq10());
println!(" MAPQ >= 20: {:>11.1}%", self.percent_mapq20());
println!(" MAPQ >= 30: {:>11.1}%", self.percent_mapq30());
println!();
println!("Paired-End Metrics:");
println!(" Properly paired: {:>11.1}%", self.properly_paired_rate());
println!(" First in pair: {:>12}", self.first_in_pair);
println!(" Second in pair: {:>12}", self.second_in_pair);
println!();
println!("Quality Flags:");
println!(" Duplicates: {:>11.1}%", self.duplicate_rate());
println!(" Secondary: {:>12}", self.secondary_alignments);
println!(" Supplementary: {:>12}", self.supplementary_alignments);
println!(" QC failed: {:>12}", self.qc_failed);
println!();
println!("Read Length:");
println!(
" Mean: {:>12.2}",
self.mean_length().unwrap_or(0.0)
);
println!(
" Std Dev: {:>12.2}",
self.std_length().unwrap_or(0.0)
);
println!(
" Min: {:>12.0}",
self.min_length().unwrap_or(0.0)
);
println!(
" Max: {:>12.0}",
self.max_length().unwrap_or(0.0)
);
println!();
println!("CIGAR Operations:");
println!(" Matches/Mismatches: {:>11}", self.total_matches);
println!(" Insertions: {:>11}", self.total_insertions);
println!(" Deletions: {:>11}", self.total_deletions);
println!(" Soft clips: {:>11}", self.total_soft_clips);
println!(" Hard clips: {:>11}", self.total_hard_clips);
println!();
if self.mapped_reads > 0 {
println!("Top Chromosomes:");
let mut chrom_vec: Vec<_> = self.reads_per_chrom.iter().collect();
chrom_vec.sort_by(|a, b| b.1.cmp(a.1)); for (chrom, count) in chrom_vec.iter().take(10) {
let percent = (**count as f64 / self.mapped_reads as f64) * 100.0;
println!(" {:>12}: {:>10} ({:>5.1}%)", chrom, count, percent);
}
println!();
}
if self.mapped_reads > 0 {
println!("MAPQ Distribution:");
let mut mapq_vec: Vec<_> = self.mapq_distribution.iter().collect();
mapq_vec.sort_by_key(|a| a.0); for (mapq, count) in mapq_vec.iter().take(15) {
let percent = (**count as f64 / self.mapped_reads as f64) * 100.0;
println!(" MAPQ {:>3}: {:>10} ({:>5.1}%)", mapq, count, percent);
}
}
}
}
impl Mergeable for SamStats {
fn merge(&mut self, other: Self) {
self.total_reads += other.total_reads;
self.mapped_reads += other.mapped_reads;
self.unmapped_reads += other.unmapped_reads;
self.properly_paired += other.properly_paired;
self.duplicates += other.duplicates;
self.secondary_alignments += other.secondary_alignments;
self.supplementary_alignments += other.supplementary_alignments;
self.qc_failed += other.qc_failed;
self.high_quality_mapq10 += other.high_quality_mapq10;
self.high_quality_mapq20 += other.high_quality_mapq20;
self.high_quality_mapq30 += other.high_quality_mapq30;
self.first_in_pair += other.first_in_pair;
self.second_in_pair += other.second_in_pair;
self.reverse_strand += other.reverse_strand;
self.total_matches += other.total_matches;
self.total_insertions += other.total_insertions;
self.total_deletions += other.total_deletions;
self.total_soft_clips += other.total_soft_clips;
self.total_hard_clips += other.total_hard_clips;
self.mapq_stats.merge(other.mapq_stats);
self.length_stats.merge(other.length_stats);
self.mapq_distribution.merge(other.mapq_distribution);
self.length_distribution.merge(other.length_distribution);
self.reads_per_chrom.merge(other.reads_per_chrom);
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_sam_stats_new() {
let stats = SamStats::new();
assert_eq!(stats.total_reads, 0);
assert_eq!(stats.mapped_reads, 0);
assert_eq!(stats.unmapped_reads, 0);
}
#[test]
fn test_sam_stats_merge() {
let mut stats1 = SamStats::new();
stats1.total_reads = 100;
stats1.mapped_reads = 80;
stats1.unmapped_reads = 20;
let mut stats2 = SamStats::new();
stats2.total_reads = 50;
stats2.mapped_reads = 45;
stats2.unmapped_reads = 5;
stats1.merge(stats2);
assert_eq!(stats1.total_reads, 150);
assert_eq!(stats1.mapped_reads, 125);
assert_eq!(stats1.unmapped_reads, 25);
}
#[test]
fn test_sam_stats_rates() {
let mut stats = SamStats::new();
stats.total_reads = 100;
stats.mapped_reads = 90;
stats.properly_paired = 80;
stats.duplicates = 10;
assert!((stats.mapping_rate() - 90.0).abs() < 0.1);
assert!((stats.properly_paired_rate() - 80.0).abs() < 0.1);
assert!((stats.duplicate_rate() - 10.0).abs() < 0.1);
}
#[test]
fn test_sam_stats_empty() {
let stats = SamStats::new();
assert_eq!(stats.mapping_rate(), 0.0);
assert_eq!(stats.properly_paired_rate(), 0.0);
assert_eq!(stats.duplicate_rate(), 0.0);
}
}