genomicframe_core/formats/bam/
stats.rs1use crate::core::GenomicRecordIterator;
7use crate::error::Result;
8use crate::parallel::Mergeable;
9use crate::stats::{CategoryCounter, RunningStats};
10
11use super::reader::{BamReader, BamRecord};
12
13#[derive(Debug, Clone)]
15pub struct BamStats {
16 pub total_reads: usize,
18
19 pub mapped_reads: usize,
21
22 pub unmapped_reads: usize,
24
25 pub properly_paired: usize,
27
28 pub duplicates: usize,
30
31 pub secondary_alignments: usize,
33
34 pub supplementary_alignments: usize,
36
37 pub qc_failed: usize,
39
40 pub mapq_stats: RunningStats,
42
43 pub mapq_distribution: CategoryCounter<u8>,
45
46 pub length_stats: RunningStats,
48
49 pub length_distribution: CategoryCounter<usize>,
51
52 pub reads_per_chrom: CategoryCounter<String>,
54
55 pub high_quality_mapq10: usize,
57
58 pub high_quality_mapq20: usize,
60
61 pub high_quality_mapq30: usize,
63
64 pub first_in_pair: usize,
66 pub second_in_pair: usize,
67 pub reverse_strand: usize,
68
69 pub total_matches: u64,
71 pub total_insertions: u64,
72 pub total_deletions: u64,
73 pub total_soft_clips: u64,
74 pub total_hard_clips: u64,
75}
76
77impl Default for BamStats {
78 fn default() -> Self {
79 Self::new()
80 }
81}
82
83impl BamStats {
84 pub fn new() -> Self {
86 Self {
87 total_reads: 0,
88 mapped_reads: 0,
89 unmapped_reads: 0,
90 properly_paired: 0,
91 duplicates: 0,
92 secondary_alignments: 0,
93 supplementary_alignments: 0,
94 qc_failed: 0,
95 mapq_stats: RunningStats::new(),
96 mapq_distribution: CategoryCounter::new(),
97 length_stats: RunningStats::new(),
98 length_distribution: CategoryCounter::new(),
99 reads_per_chrom: CategoryCounter::new(),
100 high_quality_mapq10: 0,
101 high_quality_mapq20: 0,
102 high_quality_mapq30: 0,
103 first_in_pair: 0,
104 second_in_pair: 0,
105 reverse_strand: 0,
106 total_matches: 0,
107 total_insertions: 0,
108 total_deletions: 0,
109 total_soft_clips: 0,
110 total_hard_clips: 0,
111 }
112 }
113
114 pub fn update(&mut self, record: &BamRecord, ref_name: Option<&str>) {
119 self.total_reads += 1;
120
121 let flag = record.flag;
123
124 let is_unmapped = (flag & 0x4) != 0;
126 if is_unmapped {
127 self.unmapped_reads += 1;
128 } else {
129 self.mapped_reads += 1;
130
131 let mapq = record.mapq;
133 self.mapq_stats.push(mapq as f64);
134 self.mapq_distribution.increment(mapq);
135
136 self.high_quality_mapq10 += (mapq >= 10) as usize;
138 self.high_quality_mapq20 += (mapq >= 20) as usize;
139 self.high_quality_mapq30 += (mapq >= 30) as usize;
140
141 if let Some(chrom) = ref_name {
143 self.reads_per_chrom.increment_str(chrom);
144 }
145 }
146
147 self.properly_paired += ((flag & 0x2) != 0) as usize;
150 self.duplicates += ((flag & 0x400) != 0) as usize;
151 self.secondary_alignments += ((flag & 0x100) != 0) as usize;
152 self.supplementary_alignments += ((flag & 0x800) != 0) as usize;
153 self.qc_failed += ((flag & 0x200) != 0) as usize;
154 self.first_in_pair += ((flag & 0x40) != 0) as usize;
155 self.second_in_pair += ((flag & 0x80) != 0) as usize;
156 self.reverse_strand += ((flag & 0x10) != 0) as usize;
157
158 let length = record.seq.len();
160 self.length_stats.push(length as f64);
161 self.length_distribution.increment(length);
162
163 for op in &record.cigar {
166 use super::reader::CigarOp;
167 match op {
168 CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => {
169 self.total_matches += *len as u64;
170 }
171 CigarOp::Ins(len) => {
172 self.total_insertions += *len as u64;
173 }
174 CigarOp::Del(len) => {
175 self.total_deletions += *len as u64;
176 }
177 CigarOp::SoftClip(len) => {
178 self.total_soft_clips += *len as u64;
179 }
180 CigarOp::HardClip(len) => {
181 self.total_hard_clips += *len as u64;
182 }
183 _ => {}
184 }
185 }
186 }
187
188 pub fn par_compute(
223 reader: BamReader<std::fs::File>,
224 config: Option<crate::parallel::ParallelConfig>,
225 ) -> Result<Self> {
226 let path = reader.file_path()
228 .ok_or_else(|| {
229 crate::error::Error::InvalidInput(
230 "No file path available. Use BamReader::from_path() to open the file.".to_string()
231 )
232 })?
233 .to_path_buf();
234
235 Self::par_compute_from_path(path, config)
237 }
238
239 pub fn par_compute_from_path<P: AsRef<std::path::Path>>(
251 path: P,
252 config: Option<crate::parallel::ParallelConfig>,
253 ) -> Result<Self> {
254 use rayon::prelude::*;
255 use std::fs::File;
256 use std::io::{BufReader, Seek, SeekFrom};
257
258 let config = config.unwrap_or_default();
259 let path = path.as_ref();
260
261 let file_size = std::fs::metadata(path)?.len();
263
264 if file_size < 100_000_000 {
266 let mut reader = BamReader::from_path(path)?;
268 reader.read_header()?;
269 return Self::compute_sequential(&mut reader);
270 }
271
272 let start_scan = std::time::Instant::now();
273
274 let block_offsets = crate::formats::bam::block_parsing::scan_bgzf_blocks(path)?;
276 let num_blocks = block_offsets.len();
277
278 println!(" Found {} BGZF blocks in {:?}", num_blocks, start_scan.elapsed());
279
280 let mut header_reader = BamReader::from_path(path)?;
282 let header = header_reader.read_header()?.clone();
283
284 let header_end_block = header_reader.blocks_decompressed();
287
288
289 let alignment_blocks: Vec<u64> = block_offsets.iter()
291 .skip(header_end_block)
292 .copied()
293 .collect();
294 let num_alignment_blocks = alignment_blocks.len();
295
296 let num_threads = config.threads();
297 let blocks_per_thread = (num_alignment_blocks + num_threads - 1) / num_threads; let pool = crate::parallel::get_thread_pool(num_threads)?;
301
302 let partial_stats: Vec<Self> = pool.install(|| {
304 (0..num_threads)
305 .into_par_iter()
306 .map(|thread_id| -> Result<Self> {
307 let start_block_idx = thread_id * blocks_per_thread;
308 if start_block_idx >= num_alignment_blocks {
309 return Ok(Self::new());
311 }
312
313 let end_block_idx = ((thread_id + 1) * blocks_per_thread).min(num_alignment_blocks);
314 let blocks_to_process = end_block_idx - start_block_idx;
315
316 let file = File::open(path)?;
318 let start_offset = alignment_blocks[start_block_idx];
319
320 let mut buf_reader = BufReader::new(file);
321 buf_reader.seek(SeekFrom::Start(start_offset))?;
322
323 let mut reader = BamReader::new(buf_reader);
325 reader.set_header(header.clone());
326
327 let mut stats = Self::new();
328 let mut records_processed = 0;
329
330 while reader.blocks_decompressed() < blocks_to_process {
332 match reader.next_record()? {
333 Some(record) => {
334 let ref_name = if record.refid >= 0 {
335 header.ref_name(record.refid)
336 } else {
337 None
338 };
339 stats.update(&record, ref_name);
340 records_processed += 1;
341 }
342 None => break, }
344 }
345
346 if records_processed > 0 {
348 println!(" Thread {}: processed {} records from {} blocks",
349 thread_id, records_processed, blocks_to_process);
350 }
351
352 Ok(stats)
353 })
354 .collect::<Result<Vec<_>>>()
355 })?;
356
357 println!("✓ Computed partial statistics");
358
359 let merged = Self::merge_all(partial_stats)
361 .ok_or_else(|| crate::error::Error::InvalidInput("No statistics computed".to_string()))?;
362
363 println!("✓ Merged result has {} total reads", merged.total_reads);
364
365 Ok(merged)
366 }
367
368
369 pub fn compute_sequential<R: std::io::Read>(reader: &mut BamReader<R>) -> Result<Self> {
371 let mut stats = Self::new();
372
373 let header = reader.header().ok_or_else(|| {
375 crate::error::Error::InvalidInput("BAM header not read. Call read_header() first.".to_string())
376 })?.clone();
377
378 while let Some(record) = reader.next_record()? {
379 let ref_name = if record.refid >= 0 {
380 header.ref_name(record.refid)
381 } else {
382 None
383 };
384 stats.update(&record, ref_name);
385 }
386
387 Ok(stats)
388 }
389
390 pub fn mean_length(&self) -> Option<f64> {
392 self.length_stats.mean()
393 }
394
395 pub fn std_length(&self) -> Option<f64> {
397 self.length_stats.std_dev()
398 }
399
400 pub fn min_length(&self) -> Option<f64> {
402 self.length_stats.min()
403 }
404
405 pub fn max_length(&self) -> Option<f64> {
407 self.length_stats.max()
408 }
409
410 pub fn mean_mapq(&self) -> Option<f64> {
412 self.mapq_stats.mean()
413 }
414
415 pub fn std_mapq(&self) -> Option<f64> {
417 self.mapq_stats.std_dev()
418 }
419
420 pub fn mapping_rate(&self) -> f64 {
422 if self.total_reads == 0 {
423 0.0
424 } else {
425 (self.mapped_reads as f64 / self.total_reads as f64) * 100.0
426 }
427 }
428
429 pub fn properly_paired_rate(&self) -> f64 {
431 if self.total_reads == 0 {
432 0.0
433 } else {
434 (self.properly_paired as f64 / self.total_reads as f64) * 100.0
435 }
436 }
437
438 pub fn duplicate_rate(&self) -> f64 {
440 if self.total_reads == 0 {
441 0.0
442 } else {
443 (self.duplicates as f64 / self.total_reads as f64) * 100.0
444 }
445 }
446
447 pub fn percent_mapq10(&self) -> f64 {
449 if self.total_reads == 0 {
450 0.0
451 } else {
452 (self.high_quality_mapq10 as f64 / self.total_reads as f64) * 100.0
453 }
454 }
455
456 pub fn percent_mapq20(&self) -> f64 {
458 if self.total_reads == 0 {
459 0.0
460 } else {
461 (self.high_quality_mapq20 as f64 / self.total_reads as f64) * 100.0
462 }
463 }
464
465 pub fn percent_mapq30(&self) -> f64 {
467 if self.total_reads == 0 {
468 0.0
469 } else {
470 (self.high_quality_mapq30 as f64 / self.total_reads as f64) * 100.0
471 }
472 }
473
474 pub fn print_summary(&self) {
476 println!("=== BAM Statistics ===\n");
477
478 println!("Basic Counts:");
479 println!(" Total reads: {:>12}", self.total_reads);
480 println!(" Mapped reads: {:>12}", self.mapped_reads);
481 println!(" Unmapped reads: {:>12}", self.unmapped_reads);
482 println!();
483
484 println!("Mapping Metrics:");
485 println!(" Mapping rate: {:>11.1}%", self.mapping_rate());
486 println!(
487 " Mean MAPQ: {:>12.2}",
488 self.mean_mapq().unwrap_or(0.0)
489 );
490 println!(
491 " Std Dev MAPQ: {:>12.2}",
492 self.std_mapq().unwrap_or(0.0)
493 );
494 println!(" MAPQ >= 10: {:>11.1}%", self.percent_mapq10());
495 println!(" MAPQ >= 20: {:>11.1}%", self.percent_mapq20());
496 println!(" MAPQ >= 30: {:>11.1}%", self.percent_mapq30());
497 println!();
498
499 println!("Paired-End Metrics:");
500 println!(" Properly paired: {:>11.1}%", self.properly_paired_rate());
501 println!(" First in pair: {:>12}", self.first_in_pair);
502 println!(" Second in pair: {:>12}", self.second_in_pair);
503 println!();
504
505 println!("Quality Flags:");
506 println!(" Duplicates: {:>11.1}%", self.duplicate_rate());
507 println!(" Secondary: {:>12}", self.secondary_alignments);
508 println!(" Supplementary: {:>12}", self.supplementary_alignments);
509 println!(" QC failed: {:>12}", self.qc_failed);
510 println!();
511
512 println!("Read Length:");
513 println!(
514 " Mean: {:>12.2}",
515 self.mean_length().unwrap_or(0.0)
516 );
517 println!(
518 " Std Dev: {:>12.2}",
519 self.std_length().unwrap_or(0.0)
520 );
521 println!(
522 " Min: {:>12.0}",
523 self.min_length().unwrap_or(0.0)
524 );
525 println!(
526 " Max: {:>12.0}",
527 self.max_length().unwrap_or(0.0)
528 );
529 println!();
530
531 println!("CIGAR Operations:");
532 println!(" Matches/Mismatches: {:>11}", self.total_matches);
533 println!(" Insertions: {:>11}", self.total_insertions);
534 println!(" Deletions: {:>11}", self.total_deletions);
535 println!(" Soft clips: {:>11}", self.total_soft_clips);
536 println!(" Hard clips: {:>11}", self.total_hard_clips);
537 println!();
538
539 println!("Top Chromosomes:");
541 let mut chrom_vec: Vec<_> = self.reads_per_chrom.iter().collect();
542 chrom_vec.sort_by(|a, b| b.1.cmp(a.1)); for (chrom, count) in chrom_vec.iter().take(10) {
544 let percent = (**count as f64 / self.mapped_reads as f64) * 100.0;
545 println!(" {:>12}: {:>10} ({:>5.1}%)", chrom, count, percent);
546 }
547 println!();
548
549 println!("MAPQ Distribution:");
551 let mut mapq_vec: Vec<_> = self.mapq_distribution.iter().collect();
552 mapq_vec.sort_by_key(|a| a.0); for (mapq, count) in mapq_vec.iter().take(15) {
554 let percent = (**count as f64 / self.mapped_reads as f64) * 100.0;
555 println!(" MAPQ {:>3}: {:>10} ({:>5.1}%)", mapq, count, percent);
556 }
557 }
558}
559
560impl Mergeable for BamStats {
566 fn merge(&mut self, other: Self) {
567 self.total_reads += other.total_reads;
569 self.mapped_reads += other.mapped_reads;
570 self.unmapped_reads += other.unmapped_reads;
571 self.properly_paired += other.properly_paired;
572 self.duplicates += other.duplicates;
573 self.secondary_alignments += other.secondary_alignments;
574 self.supplementary_alignments += other.supplementary_alignments;
575 self.qc_failed += other.qc_failed;
576 self.high_quality_mapq10 += other.high_quality_mapq10;
577 self.high_quality_mapq20 += other.high_quality_mapq20;
578 self.high_quality_mapq30 += other.high_quality_mapq30;
579 self.first_in_pair += other.first_in_pair;
580 self.second_in_pair += other.second_in_pair;
581 self.reverse_strand += other.reverse_strand;
582 self.total_matches += other.total_matches;
583 self.total_insertions += other.total_insertions;
584 self.total_deletions += other.total_deletions;
585 self.total_soft_clips += other.total_soft_clips;
586 self.total_hard_clips += other.total_hard_clips;
587
588 self.mapq_stats.merge(other.mapq_stats);
590 self.length_stats.merge(other.length_stats);
591
592 self.mapq_distribution.merge(other.mapq_distribution);
594 self.length_distribution.merge(other.length_distribution);
595 self.reads_per_chrom.merge(other.reads_per_chrom);
596 }
597}
598
599#[cfg(test)]
600mod tests {
601 use super::*;
602
603 #[test]
604 fn test_bam_stats_new() {
605 let stats = BamStats::new();
606 assert_eq!(stats.total_reads, 0);
607 assert_eq!(stats.mapped_reads, 0);
608 assert_eq!(stats.unmapped_reads, 0);
609 }
610
611 #[test]
612 fn test_bam_stats_merge() {
613 let mut stats1 = BamStats::new();
614 stats1.total_reads = 100;
615 stats1.mapped_reads = 80;
616 stats1.unmapped_reads = 20;
617
618 let mut stats2 = BamStats::new();
619 stats2.total_reads = 50;
620 stats2.mapped_reads = 45;
621 stats2.unmapped_reads = 5;
622
623 stats1.merge(stats2);
624
625 assert_eq!(stats1.total_reads, 150);
626 assert_eq!(stats1.mapped_reads, 125);
627 assert_eq!(stats1.unmapped_reads, 25);
628 }
629
630 #[test]
631 fn test_bam_stats_rates() {
632 let mut stats = BamStats::new();
633 stats.total_reads = 100;
634 stats.mapped_reads = 90;
635 stats.properly_paired = 80;
636 stats.duplicates = 10;
637
638 assert!((stats.mapping_rate() - 90.0).abs() < 0.1);
639 assert!((stats.properly_paired_rate() - 80.0).abs() < 0.1);
640 assert!((stats.duplicate_rate() - 10.0).abs() < 0.1);
641 }
642
643 #[test]
644 fn test_bam_stats_empty() {
645 let stats = BamStats::new();
646 assert_eq!(stats.mapping_rate(), 0.0);
647 assert_eq!(stats.properly_paired_rate(), 0.0);
648 assert_eq!(stats.duplicate_rate(), 0.0);
649 }
650}