genomicframe_core/formats/sam/
stats.rs1use crate::core::GenomicRecordIterator;
7use crate::error::Result;
8use crate::formats::bam::CigarOp;
9use crate::formats::sam::{SamReader, SamRecord};
10use crate::parallel::Mergeable;
11use crate::stats::{CategoryCounter, RunningStats};
12
13#[derive(Debug, Clone)]
15pub struct SamStats {
16 pub total_reads: usize,
18
19 pub mapped_reads: usize,
21
22 pub unmapped_reads: usize,
24
25 pub properly_paired: usize,
27
28 pub duplicates: usize,
30
31 pub secondary_alignments: usize,
33
34 pub supplementary_alignments: usize,
36
37 pub qc_failed: usize,
39
40 pub mapq_stats: RunningStats,
42
43 pub mapq_distribution: CategoryCounter<u8>,
45
46 pub length_stats: RunningStats,
48
49 pub length_distribution: CategoryCounter<usize>,
51
52 pub reads_per_chrom: CategoryCounter<String>,
54
55 pub high_quality_mapq10: usize,
57
58 pub high_quality_mapq20: usize,
60
61 pub high_quality_mapq30: usize,
63
64 pub first_in_pair: usize,
66 pub second_in_pair: usize,
67 pub reverse_strand: usize,
68
69 pub total_matches: u64,
71 pub total_insertions: u64,
72 pub total_deletions: u64,
73 pub total_soft_clips: u64,
74 pub total_hard_clips: u64,
75}
76
77impl Default for SamStats {
78 fn default() -> Self {
79 Self::new()
80 }
81}
82
83impl SamStats {
84 pub fn new() -> Self {
86 Self {
87 total_reads: 0,
88 mapped_reads: 0,
89 unmapped_reads: 0,
90 properly_paired: 0,
91 duplicates: 0,
92 secondary_alignments: 0,
93 supplementary_alignments: 0,
94 qc_failed: 0,
95 mapq_stats: RunningStats::new(),
96 mapq_distribution: CategoryCounter::new(),
97 length_stats: RunningStats::new(),
98 length_distribution: CategoryCounter::new(),
99 reads_per_chrom: CategoryCounter::new(),
100 high_quality_mapq10: 0,
101 high_quality_mapq20: 0,
102 high_quality_mapq30: 0,
103 first_in_pair: 0,
104 second_in_pair: 0,
105 reverse_strand: 0,
106 total_matches: 0,
107 total_insertions: 0,
108 total_deletions: 0,
109 total_soft_clips: 0,
110 total_hard_clips: 0,
111 }
112 }
113
114 pub fn update(&mut self, record: &SamRecord) {
119 self.total_reads += 1;
120
121 let flag = record.flag;
123
124 let is_unmapped = (flag & 0x4) != 0;
126 if is_unmapped {
127 self.unmapped_reads += 1;
128 } else {
129 self.mapped_reads += 1;
130
131 let mapq = record.mapq;
133 self.mapq_stats.push(mapq as f64);
134 self.mapq_distribution.increment(mapq);
135
136 self.high_quality_mapq10 += (mapq >= 10) as usize;
138 self.high_quality_mapq20 += (mapq >= 20) as usize;
139 self.high_quality_mapq30 += (mapq >= 30) as usize;
140
141 if record.rname != "*" {
143 self.reads_per_chrom.increment_str(&record.rname);
144 }
145 }
146
147 self.properly_paired += ((flag & 0x2) != 0) as usize;
150 self.duplicates += ((flag & 0x400) != 0) as usize;
151 self.secondary_alignments += ((flag & 0x100) != 0) as usize;
152 self.supplementary_alignments += ((flag & 0x800) != 0) as usize;
153 self.qc_failed += ((flag & 0x200) != 0) as usize;
154 self.first_in_pair += ((flag & 0x40) != 0) as usize;
155 self.second_in_pair += ((flag & 0x80) != 0) as usize;
156 self.reverse_strand += ((flag & 0x10) != 0) as usize;
157
158 let length = record.seq.len();
160 self.length_stats.push(length as f64);
161 self.length_distribution.increment(length);
162
163 for op in &record.cigar {
165 match op {
166 CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => {
167 self.total_matches += *len as u64;
168 }
169 CigarOp::Ins(len) => {
170 self.total_insertions += *len as u64;
171 }
172 CigarOp::Del(len) => {
173 self.total_deletions += *len as u64;
174 }
175 CigarOp::SoftClip(len) => {
176 self.total_soft_clips += *len as u64;
177 }
178 CigarOp::HardClip(len) => {
179 self.total_hard_clips += *len as u64;
180 }
181 _ => {}
182 }
183 }
184 }
185
186 pub fn par_compute(_reader: &mut SamReader, _threads: usize) -> Result<Self> {
197 Err(crate::error::Error::InvalidInput(
198 "Parallel statistics computation not yet implemented for SAM format. Use compute() instead.".to_string()
199 ))
200 }
201
202 pub fn compute(reader: &mut SamReader) -> Result<Self> {
217 let mut stats = Self::new();
218
219 let header = reader.header().ok_or_else(|| {
221 crate::error::Error::InvalidInput("SAM header not read. Call read_header() first.".to_string())
222 })?;
223
224 let _header = header.clone();
226
227 while let Some(record) = reader.next_record()? {
228 stats.update(&record);
229 }
230
231 Ok(stats)
232 }
233
234 pub fn mean_length(&self) -> Option<f64> {
236 self.length_stats.mean()
237 }
238
239 pub fn std_length(&self) -> Option<f64> {
241 self.length_stats.std_dev()
242 }
243
244 pub fn min_length(&self) -> Option<f64> {
246 self.length_stats.min()
247 }
248
249 pub fn max_length(&self) -> Option<f64> {
251 self.length_stats.max()
252 }
253
254 pub fn mean_mapq(&self) -> Option<f64> {
256 self.mapq_stats.mean()
257 }
258
259 pub fn std_mapq(&self) -> Option<f64> {
261 self.mapq_stats.std_dev()
262 }
263
264 pub fn mapping_rate(&self) -> f64 {
266 if self.total_reads == 0 {
267 0.0
268 } else {
269 (self.mapped_reads as f64 / self.total_reads as f64) * 100.0
270 }
271 }
272
273 pub fn properly_paired_rate(&self) -> f64 {
275 if self.total_reads == 0 {
276 0.0
277 } else {
278 (self.properly_paired as f64 / self.total_reads as f64) * 100.0
279 }
280 }
281
282 pub fn duplicate_rate(&self) -> f64 {
284 if self.total_reads == 0 {
285 0.0
286 } else {
287 (self.duplicates as f64 / self.total_reads as f64) * 100.0
288 }
289 }
290
291 pub fn percent_mapq10(&self) -> f64 {
293 if self.total_reads == 0 {
294 0.0
295 } else {
296 (self.high_quality_mapq10 as f64 / self.total_reads as f64) * 100.0
297 }
298 }
299
300 pub fn percent_mapq20(&self) -> f64 {
302 if self.total_reads == 0 {
303 0.0
304 } else {
305 (self.high_quality_mapq20 as f64 / self.total_reads as f64) * 100.0
306 }
307 }
308
309 pub fn percent_mapq30(&self) -> f64 {
311 if self.total_reads == 0 {
312 0.0
313 } else {
314 (self.high_quality_mapq30 as f64 / self.total_reads as f64) * 100.0
315 }
316 }
317
318 pub fn print_summary(&self) {
320 println!("=== SAM Statistics ===\n");
321
322 println!("Basic Counts:");
323 println!(" Total reads: {:>12}", self.total_reads);
324 println!(" Mapped reads: {:>12}", self.mapped_reads);
325 println!(" Unmapped reads: {:>12}", self.unmapped_reads);
326 println!();
327
328 println!("Mapping Metrics:");
329 println!(" Mapping rate: {:>11.1}%", self.mapping_rate());
330 println!(
331 " Mean MAPQ: {:>12.2}",
332 self.mean_mapq().unwrap_or(0.0)
333 );
334 println!(
335 " Std Dev MAPQ: {:>12.2}",
336 self.std_mapq().unwrap_or(0.0)
337 );
338 println!(" MAPQ >= 10: {:>11.1}%", self.percent_mapq10());
339 println!(" MAPQ >= 20: {:>11.1}%", self.percent_mapq20());
340 println!(" MAPQ >= 30: {:>11.1}%", self.percent_mapq30());
341 println!();
342
343 println!("Paired-End Metrics:");
344 println!(" Properly paired: {:>11.1}%", self.properly_paired_rate());
345 println!(" First in pair: {:>12}", self.first_in_pair);
346 println!(" Second in pair: {:>12}", self.second_in_pair);
347 println!();
348
349 println!("Quality Flags:");
350 println!(" Duplicates: {:>11.1}%", self.duplicate_rate());
351 println!(" Secondary: {:>12}", self.secondary_alignments);
352 println!(" Supplementary: {:>12}", self.supplementary_alignments);
353 println!(" QC failed: {:>12}", self.qc_failed);
354 println!();
355
356 println!("Read Length:");
357 println!(
358 " Mean: {:>12.2}",
359 self.mean_length().unwrap_or(0.0)
360 );
361 println!(
362 " Std Dev: {:>12.2}",
363 self.std_length().unwrap_or(0.0)
364 );
365 println!(
366 " Min: {:>12.0}",
367 self.min_length().unwrap_or(0.0)
368 );
369 println!(
370 " Max: {:>12.0}",
371 self.max_length().unwrap_or(0.0)
372 );
373 println!();
374
375 println!("CIGAR Operations:");
376 println!(" Matches/Mismatches: {:>11}", self.total_matches);
377 println!(" Insertions: {:>11}", self.total_insertions);
378 println!(" Deletions: {:>11}", self.total_deletions);
379 println!(" Soft clips: {:>11}", self.total_soft_clips);
380 println!(" Hard clips: {:>11}", self.total_hard_clips);
381 println!();
382
383 if self.mapped_reads > 0 {
385 println!("Top Chromosomes:");
386 let mut chrom_vec: Vec<_> = self.reads_per_chrom.iter().collect();
387 chrom_vec.sort_by(|a, b| b.1.cmp(a.1)); for (chrom, count) in chrom_vec.iter().take(10) {
389 let percent = (**count as f64 / self.mapped_reads as f64) * 100.0;
390 println!(" {:>12}: {:>10} ({:>5.1}%)", chrom, count, percent);
391 }
392 println!();
393 }
394
395 if self.mapped_reads > 0 {
397 println!("MAPQ Distribution:");
398 let mut mapq_vec: Vec<_> = self.mapq_distribution.iter().collect();
399 mapq_vec.sort_by_key(|a| a.0); for (mapq, count) in mapq_vec.iter().take(15) {
401 let percent = (**count as f64 / self.mapped_reads as f64) * 100.0;
402 println!(" MAPQ {:>3}: {:>10} ({:>5.1}%)", mapq, count, percent);
403 }
404 }
405 }
406}
407
408impl Mergeable for SamStats {
414 fn merge(&mut self, other: Self) {
415 self.total_reads += other.total_reads;
417 self.mapped_reads += other.mapped_reads;
418 self.unmapped_reads += other.unmapped_reads;
419 self.properly_paired += other.properly_paired;
420 self.duplicates += other.duplicates;
421 self.secondary_alignments += other.secondary_alignments;
422 self.supplementary_alignments += other.supplementary_alignments;
423 self.qc_failed += other.qc_failed;
424 self.high_quality_mapq10 += other.high_quality_mapq10;
425 self.high_quality_mapq20 += other.high_quality_mapq20;
426 self.high_quality_mapq30 += other.high_quality_mapq30;
427 self.first_in_pair += other.first_in_pair;
428 self.second_in_pair += other.second_in_pair;
429 self.reverse_strand += other.reverse_strand;
430 self.total_matches += other.total_matches;
431 self.total_insertions += other.total_insertions;
432 self.total_deletions += other.total_deletions;
433 self.total_soft_clips += other.total_soft_clips;
434 self.total_hard_clips += other.total_hard_clips;
435
436 self.mapq_stats.merge(other.mapq_stats);
438 self.length_stats.merge(other.length_stats);
439
440 self.mapq_distribution.merge(other.mapq_distribution);
442 self.length_distribution.merge(other.length_distribution);
443 self.reads_per_chrom.merge(other.reads_per_chrom);
444 }
445}
446
447#[cfg(test)]
448mod tests {
449 use super::*;
450
451 #[test]
452 fn test_sam_stats_new() {
453 let stats = SamStats::new();
454 assert_eq!(stats.total_reads, 0);
455 assert_eq!(stats.mapped_reads, 0);
456 assert_eq!(stats.unmapped_reads, 0);
457 }
458
459 #[test]
460 fn test_sam_stats_merge() {
461 let mut stats1 = SamStats::new();
462 stats1.total_reads = 100;
463 stats1.mapped_reads = 80;
464 stats1.unmapped_reads = 20;
465
466 let mut stats2 = SamStats::new();
467 stats2.total_reads = 50;
468 stats2.mapped_reads = 45;
469 stats2.unmapped_reads = 5;
470
471 stats1.merge(stats2);
472
473 assert_eq!(stats1.total_reads, 150);
474 assert_eq!(stats1.mapped_reads, 125);
475 assert_eq!(stats1.unmapped_reads, 25);
476 }
477
478 #[test]
479 fn test_sam_stats_rates() {
480 let mut stats = SamStats::new();
481 stats.total_reads = 100;
482 stats.mapped_reads = 90;
483 stats.properly_paired = 80;
484 stats.duplicates = 10;
485
486 assert!((stats.mapping_rate() - 90.0).abs() < 0.1);
487 assert!((stats.properly_paired_rate() - 80.0).abs() < 0.1);
488 assert!((stats.duplicate_rate() - 10.0).abs() < 0.1);
489 }
490
491 #[test]
492 fn test_sam_stats_empty() {
493 let stats = SamStats::new();
494 assert_eq!(stats.mapping_rate(), 0.0);
495 assert_eq!(stats.properly_paired_rate(), 0.0);
496 assert_eq!(stats.duplicate_rate(), 0.0);
497 }
498}