Skip to main content

genomicframe_core/formats/sam/
stats.rs

1//! SAM statistics computation
2//!
3//! Streaming statistics collection for SAM alignment files.
4//! Memory-efficient: processes millions of alignments with O(1) memory for basic stats.
5
6use crate::core::GenomicRecordIterator;
7use crate::error::Result;
8use crate::formats::bam::CigarOp;
9use crate::formats::sam::{SamReader, SamRecord};
10use crate::parallel::Mergeable;
11use crate::stats::{CategoryCounter, RunningStats};
12
13/// Comprehensive statistics for SAM files
14#[derive(Debug, Clone)]
15pub struct SamStats {
16    /// Total number of reads
17    pub total_reads: usize,
18
19    /// Number of mapped reads
20    pub mapped_reads: usize,
21
22    /// Number of unmapped reads
23    pub unmapped_reads: usize,
24
25    /// Number of reads mapped in proper pairs
26    pub properly_paired: usize,
27
28    /// Number of duplicate reads
29    pub duplicates: usize,
30
31    /// Number of secondary alignments
32    pub secondary_alignments: usize,
33
34    /// Number of supplementary alignments
35    pub supplementary_alignments: usize,
36
37    /// Number of QC-failed reads
38    pub qc_failed: usize,
39
40    /// Mapping quality statistics (for mapped reads only)
41    pub mapq_stats: RunningStats,
42
43    /// Distribution of mapping qualities
44    pub mapq_distribution: CategoryCounter<u8>,
45
46    /// Read length statistics
47    pub length_stats: RunningStats,
48
49    /// Distribution of read lengths
50    pub length_distribution: CategoryCounter<usize>,
51
52    /// Reads per chromosome
53    pub reads_per_chrom: CategoryCounter<String>,
54
55    /// Number of reads with MAPQ >= 10
56    pub high_quality_mapq10: usize,
57
58    /// Number of reads with MAPQ >= 20
59    pub high_quality_mapq20: usize,
60
61    /// Number of reads with MAPQ >= 30
62    pub high_quality_mapq30: usize,
63
64    /// Paired-end statistics
65    pub first_in_pair: usize,
66    pub second_in_pair: usize,
67    pub reverse_strand: usize,
68
69    /// CIGAR operation statistics
70    pub total_matches: u64,
71    pub total_insertions: u64,
72    pub total_deletions: u64,
73    pub total_soft_clips: u64,
74    pub total_hard_clips: u64,
75}
76
77impl Default for SamStats {
78    fn default() -> Self {
79        Self::new()
80    }
81}
82
83impl SamStats {
84    /// Create a new statistics collector
85    pub fn new() -> Self {
86        Self {
87            total_reads: 0,
88            mapped_reads: 0,
89            unmapped_reads: 0,
90            properly_paired: 0,
91            duplicates: 0,
92            secondary_alignments: 0,
93            supplementary_alignments: 0,
94            qc_failed: 0,
95            mapq_stats: RunningStats::new(),
96            mapq_distribution: CategoryCounter::new(),
97            length_stats: RunningStats::new(),
98            length_distribution: CategoryCounter::new(),
99            reads_per_chrom: CategoryCounter::new(),
100            high_quality_mapq10: 0,
101            high_quality_mapq20: 0,
102            high_quality_mapq30: 0,
103            first_in_pair: 0,
104            second_in_pair: 0,
105            reverse_strand: 0,
106            total_matches: 0,
107            total_insertions: 0,
108            total_deletions: 0,
109            total_soft_clips: 0,
110            total_hard_clips: 0,
111        }
112    }
113
114    /// Update statistics with a single record (streaming)
115    ///
116    /// Optimized for single-pass processing: computes all statistics in one iteration
117    /// over alignment data, avoiding repeated allocations and iterations.
118    pub fn update(&mut self, record: &SamRecord) {
119        self.total_reads += 1;
120
121        // Extract flag once to avoid repeated method calls
122        let flag = record.flag;
123
124        // Flag-based counts (optimized with bitwise operations)
125        let is_unmapped = (flag & 0x4) != 0;
126        if is_unmapped {
127            self.unmapped_reads += 1;
128        } else {
129            self.mapped_reads += 1;
130
131            // Only track MAPQ for mapped reads
132            let mapq = record.mapq;
133            self.mapq_stats.push(mapq as f64);
134            self.mapq_distribution.increment(mapq);
135
136            // Branchless MAPQ quality counting (avoids multiple if statements)
137            self.high_quality_mapq10 += (mapq >= 10) as usize;
138            self.high_quality_mapq20 += (mapq >= 20) as usize;
139            self.high_quality_mapq30 += (mapq >= 30) as usize;
140
141            // Track reads per chromosome (avoid String allocation)
142            if record.rname != "*" {
143                self.reads_per_chrom.increment_str(&record.rname);
144            }
145        }
146
147        // Use bitwise operations directly instead of method calls
148        // This avoids function call overhead
149        self.properly_paired += ((flag & 0x2) != 0) as usize;
150        self.duplicates += ((flag & 0x400) != 0) as usize;
151        self.secondary_alignments += ((flag & 0x100) != 0) as usize;
152        self.supplementary_alignments += ((flag & 0x800) != 0) as usize;
153        self.qc_failed += ((flag & 0x200) != 0) as usize;
154        self.first_in_pair += ((flag & 0x40) != 0) as usize;
155        self.second_in_pair += ((flag & 0x80) != 0) as usize;
156        self.reverse_strand += ((flag & 0x10) != 0) as usize;
157
158        // Read length (from sequence)
159        let length = record.seq.len();
160        self.length_stats.push(length as f64);
161        self.length_distribution.increment(length);
162
163        // CIGAR operation statistics
164        for op in &record.cigar {
165            match op {
166                CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => {
167                    self.total_matches += *len as u64;
168                }
169                CigarOp::Ins(len) => {
170                    self.total_insertions += *len as u64;
171                }
172                CigarOp::Del(len) => {
173                    self.total_deletions += *len as u64;
174                }
175                CigarOp::SoftClip(len) => {
176                    self.total_soft_clips += *len as u64;
177                }
178                CigarOp::HardClip(len) => {
179                    self.total_hard_clips += *len as u64;
180                }
181                _ => {}
182            }
183        }
184    }
185
186    /// Compute statistics in parallel (not yet implemented for SAM)
187    ///
188    /// SAM files are text-based and harder to parallelize efficiently than binary BAM files.
189    /// For now, use `compute()` for sequential processing, which is still fast.
190    ///
191    /// Future implementation may support:
192    /// - Chunked parallel processing with line-boundary detection
193    /// - Memory-mapped file access with parallel scanning
194    ///
195    /// For parallel statistics on alignment files, consider using BAM format instead.
196    pub fn par_compute(_reader: &mut SamReader, _threads: usize) -> Result<Self> {
197        Err(crate::error::Error::InvalidInput(
198            "Parallel statistics computation not yet implemented for SAM format. Use compute() instead.".to_string()
199        ))
200    }
201
202    /// Compute statistics from a SAM reader (sequential processing)
203    ///
204    /// SAM files are text-based, so parallel processing is more complex than BAM.
205    /// For now, we use sequential processing which is still very fast for text parsing.
206    ///
207    /// # Example
208    /// ```ignore
209    /// use genomicframe_core::formats::sam::{SamReader, SamStats};
210    ///
211    /// let mut reader = SamReader::from_path("alignments.sam")?;
212    /// reader.read_header()?;
213    /// let stats = SamStats::compute(&mut reader)?;
214    /// stats.print_summary();
215    /// ```
216    pub fn compute(reader: &mut SamReader) -> Result<Self> {
217        let mut stats = Self::new();
218
219        // Get the header to lookup reference names
220        let header = reader.header().ok_or_else(|| {
221            crate::error::Error::InvalidInput("SAM header not read. Call read_header() first.".to_string())
222        })?;
223
224        // Clone header to avoid borrow conflicts (SAM header is small)
225        let _header = header.clone();
226
227        while let Some(record) = reader.next_record()? {
228            stats.update(&record);
229        }
230
231        Ok(stats)
232    }
233
234    /// Mean read length
235    pub fn mean_length(&self) -> Option<f64> {
236        self.length_stats.mean()
237    }
238
239    /// Standard deviation of read length
240    pub fn std_length(&self) -> Option<f64> {
241        self.length_stats.std_dev()
242    }
243
244    /// Min read length
245    pub fn min_length(&self) -> Option<f64> {
246        self.length_stats.min()
247    }
248
249    /// Max read length
250    pub fn max_length(&self) -> Option<f64> {
251        self.length_stats.max()
252    }
253
254    /// Mean mapping quality (for mapped reads)
255    pub fn mean_mapq(&self) -> Option<f64> {
256        self.mapq_stats.mean()
257    }
258
259    /// Standard deviation of mapping quality
260    pub fn std_mapq(&self) -> Option<f64> {
261        self.mapq_stats.std_dev()
262    }
263
264    /// Mapping rate (percentage of mapped reads)
265    pub fn mapping_rate(&self) -> f64 {
266        if self.total_reads == 0 {
267            0.0
268        } else {
269            (self.mapped_reads as f64 / self.total_reads as f64) * 100.0
270        }
271    }
272
273    /// Properly paired rate (percentage of properly paired reads)
274    pub fn properly_paired_rate(&self) -> f64 {
275        if self.total_reads == 0 {
276            0.0
277        } else {
278            (self.properly_paired as f64 / self.total_reads as f64) * 100.0
279        }
280    }
281
282    /// Duplicate rate (percentage of duplicate reads)
283    pub fn duplicate_rate(&self) -> f64 {
284        if self.total_reads == 0 {
285            0.0
286        } else {
287            (self.duplicates as f64 / self.total_reads as f64) * 100.0
288        }
289    }
290
291    /// Percentage of reads with MAPQ >= 10
292    pub fn percent_mapq10(&self) -> f64 {
293        if self.total_reads == 0 {
294            0.0
295        } else {
296            (self.high_quality_mapq10 as f64 / self.total_reads as f64) * 100.0
297        }
298    }
299
300    /// Percentage of reads with MAPQ >= 20
301    pub fn percent_mapq20(&self) -> f64 {
302        if self.total_reads == 0 {
303            0.0
304        } else {
305            (self.high_quality_mapq20 as f64 / self.total_reads as f64) * 100.0
306        }
307    }
308
309    /// Percentage of reads with MAPQ >= 30
310    pub fn percent_mapq30(&self) -> f64 {
311        if self.total_reads == 0 {
312            0.0
313        } else {
314            (self.high_quality_mapq30 as f64 / self.total_reads as f64) * 100.0
315        }
316    }
317
318    /// Print a human-readable summary of statistics
319    pub fn print_summary(&self) {
320        println!("=== SAM Statistics ===\n");
321
322        println!("Basic Counts:");
323        println!("  Total reads:       {:>12}", self.total_reads);
324        println!("  Mapped reads:      {:>12}", self.mapped_reads);
325        println!("  Unmapped reads:    {:>12}", self.unmapped_reads);
326        println!();
327
328        println!("Mapping Metrics:");
329        println!("  Mapping rate:      {:>11.1}%", self.mapping_rate());
330        println!(
331            "  Mean MAPQ:         {:>12.2}",
332            self.mean_mapq().unwrap_or(0.0)
333        );
334        println!(
335            "  Std Dev MAPQ:      {:>12.2}",
336            self.std_mapq().unwrap_or(0.0)
337        );
338        println!("  MAPQ >= 10:        {:>11.1}%", self.percent_mapq10());
339        println!("  MAPQ >= 20:        {:>11.1}%", self.percent_mapq20());
340        println!("  MAPQ >= 30:        {:>11.1}%", self.percent_mapq30());
341        println!();
342
343        println!("Paired-End Metrics:");
344        println!("  Properly paired:   {:>11.1}%", self.properly_paired_rate());
345        println!("  First in pair:     {:>12}", self.first_in_pair);
346        println!("  Second in pair:    {:>12}", self.second_in_pair);
347        println!();
348
349        println!("Quality Flags:");
350        println!("  Duplicates:        {:>11.1}%", self.duplicate_rate());
351        println!("  Secondary:         {:>12}", self.secondary_alignments);
352        println!("  Supplementary:     {:>12}", self.supplementary_alignments);
353        println!("  QC failed:         {:>12}", self.qc_failed);
354        println!();
355
356        println!("Read Length:");
357        println!(
358            "  Mean:              {:>12.2}",
359            self.mean_length().unwrap_or(0.0)
360        );
361        println!(
362            "  Std Dev:           {:>12.2}",
363            self.std_length().unwrap_or(0.0)
364        );
365        println!(
366            "  Min:               {:>12.0}",
367            self.min_length().unwrap_or(0.0)
368        );
369        println!(
370            "  Max:               {:>12.0}",
371            self.max_length().unwrap_or(0.0)
372        );
373        println!();
374
375        println!("CIGAR Operations:");
376        println!("  Matches/Mismatches: {:>11}", self.total_matches);
377        println!("  Insertions:         {:>11}", self.total_insertions);
378        println!("  Deletions:          {:>11}", self.total_deletions);
379        println!("  Soft clips:         {:>11}", self.total_soft_clips);
380        println!("  Hard clips:         {:>11}", self.total_hard_clips);
381        println!();
382
383        // Top chromosomes (sorted by count descending)
384        if self.mapped_reads > 0 {
385            println!("Top Chromosomes:");
386            let mut chrom_vec: Vec<_> = self.reads_per_chrom.iter().collect();
387            chrom_vec.sort_by(|a, b| b.1.cmp(a.1)); // Sort by count descending
388            for (chrom, count) in chrom_vec.iter().take(10) {
389                let percent = (**count as f64 / self.mapped_reads as f64) * 100.0;
390                println!("  {:>12}: {:>10} ({:>5.1}%)", chrom, count, percent);
391            }
392            println!();
393        }
394
395        // MAPQ distribution (binned, sorted by MAPQ)
396        if self.mapped_reads > 0 {
397            println!("MAPQ Distribution:");
398            let mut mapq_vec: Vec<_> = self.mapq_distribution.iter().collect();
399            mapq_vec.sort_by_key(|a| a.0); // Sort by MAPQ ascending
400            for (mapq, count) in mapq_vec.iter().take(15) {
401                let percent = (**count as f64 / self.mapped_reads as f64) * 100.0;
402                println!("  MAPQ {:>3}: {:>10} ({:>5.1}%)", mapq, count, percent);
403            }
404        }
405    }
406}
407
408/// Implementation of Mergeable for parallel statistics computation
409///
410/// Merges SAM statistics from multiple threads/chunks.
411/// All statistics are correctly combined using the underlying
412/// Mergeable implementations for RunningStats and CategoryCounter.
413impl Mergeable for SamStats {
414    fn merge(&mut self, other: Self) {
415        // Merge simple counters
416        self.total_reads += other.total_reads;
417        self.mapped_reads += other.mapped_reads;
418        self.unmapped_reads += other.unmapped_reads;
419        self.properly_paired += other.properly_paired;
420        self.duplicates += other.duplicates;
421        self.secondary_alignments += other.secondary_alignments;
422        self.supplementary_alignments += other.supplementary_alignments;
423        self.qc_failed += other.qc_failed;
424        self.high_quality_mapq10 += other.high_quality_mapq10;
425        self.high_quality_mapq20 += other.high_quality_mapq20;
426        self.high_quality_mapq30 += other.high_quality_mapq30;
427        self.first_in_pair += other.first_in_pair;
428        self.second_in_pair += other.second_in_pair;
429        self.reverse_strand += other.reverse_strand;
430        self.total_matches += other.total_matches;
431        self.total_insertions += other.total_insertions;
432        self.total_deletions += other.total_deletions;
433        self.total_soft_clips += other.total_soft_clips;
434        self.total_hard_clips += other.total_hard_clips;
435
436        // Merge running statistics
437        self.mapq_stats.merge(other.mapq_stats);
438        self.length_stats.merge(other.length_stats);
439
440        // Merge categorical data
441        self.mapq_distribution.merge(other.mapq_distribution);
442        self.length_distribution.merge(other.length_distribution);
443        self.reads_per_chrom.merge(other.reads_per_chrom);
444    }
445}
446
447#[cfg(test)]
448mod tests {
449    use super::*;
450
451    #[test]
452    fn test_sam_stats_new() {
453        let stats = SamStats::new();
454        assert_eq!(stats.total_reads, 0);
455        assert_eq!(stats.mapped_reads, 0);
456        assert_eq!(stats.unmapped_reads, 0);
457    }
458
459    #[test]
460    fn test_sam_stats_merge() {
461        let mut stats1 = SamStats::new();
462        stats1.total_reads = 100;
463        stats1.mapped_reads = 80;
464        stats1.unmapped_reads = 20;
465
466        let mut stats2 = SamStats::new();
467        stats2.total_reads = 50;
468        stats2.mapped_reads = 45;
469        stats2.unmapped_reads = 5;
470
471        stats1.merge(stats2);
472
473        assert_eq!(stats1.total_reads, 150);
474        assert_eq!(stats1.mapped_reads, 125);
475        assert_eq!(stats1.unmapped_reads, 25);
476    }
477
478    #[test]
479    fn test_sam_stats_rates() {
480        let mut stats = SamStats::new();
481        stats.total_reads = 100;
482        stats.mapped_reads = 90;
483        stats.properly_paired = 80;
484        stats.duplicates = 10;
485
486        assert!((stats.mapping_rate() - 90.0).abs() < 0.1);
487        assert!((stats.properly_paired_rate() - 80.0).abs() < 0.1);
488        assert!((stats.duplicate_rate() - 10.0).abs() < 0.1);
489    }
490
491    #[test]
492    fn test_sam_stats_empty() {
493        let stats = SamStats::new();
494        assert_eq!(stats.mapping_rate(), 0.0);
495        assert_eq!(stats.properly_paired_rate(), 0.0);
496        assert_eq!(stats.duplicate_rate(), 0.0);
497    }
498}