genomicframe-core 0.2.0

High-performance genomics I/O and interoperability layer
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
//! SAM statistics computation
//!
//! Streaming statistics collection for SAM alignment files.
//! Memory-efficient: processes millions of alignments with O(1) memory for basic stats.

use crate::core::GenomicRecordIterator;
use crate::error::Result;
use crate::formats::bam::CigarOp;
use crate::formats::sam::{SamReader, SamRecord};
use crate::parallel::Mergeable;
use crate::stats::{CategoryCounter, RunningStats};

/// Comprehensive statistics for SAM files
#[derive(Debug, Clone)]
pub struct SamStats {
    /// Total number of reads
    pub total_reads: usize,

    /// Number of mapped reads
    pub mapped_reads: usize,

    /// Number of unmapped reads
    pub unmapped_reads: usize,

    /// Number of reads mapped in proper pairs
    pub properly_paired: usize,

    /// Number of duplicate reads
    pub duplicates: usize,

    /// Number of secondary alignments
    pub secondary_alignments: usize,

    /// Number of supplementary alignments
    pub supplementary_alignments: usize,

    /// Number of QC-failed reads
    pub qc_failed: usize,

    /// Mapping quality statistics (for mapped reads only)
    pub mapq_stats: RunningStats,

    /// Distribution of mapping qualities
    pub mapq_distribution: CategoryCounter<u8>,

    /// Read length statistics
    pub length_stats: RunningStats,

    /// Distribution of read lengths
    pub length_distribution: CategoryCounter<usize>,

    /// Reads per chromosome
    pub reads_per_chrom: CategoryCounter<String>,

    /// Number of reads with MAPQ >= 10
    pub high_quality_mapq10: usize,

    /// Number of reads with MAPQ >= 20
    pub high_quality_mapq20: usize,

    /// Number of reads with MAPQ >= 30
    pub high_quality_mapq30: usize,

    /// Paired-end statistics
    pub first_in_pair: usize,
    pub second_in_pair: usize,
    pub reverse_strand: usize,

    /// CIGAR operation statistics
    pub total_matches: u64,
    pub total_insertions: u64,
    pub total_deletions: u64,
    pub total_soft_clips: u64,
    pub total_hard_clips: u64,
}

impl Default for SamStats {
    fn default() -> Self {
        Self::new()
    }
}

impl SamStats {
    /// Create a new statistics collector
    pub fn new() -> Self {
        Self {
            total_reads: 0,
            mapped_reads: 0,
            unmapped_reads: 0,
            properly_paired: 0,
            duplicates: 0,
            secondary_alignments: 0,
            supplementary_alignments: 0,
            qc_failed: 0,
            mapq_stats: RunningStats::new(),
            mapq_distribution: CategoryCounter::new(),
            length_stats: RunningStats::new(),
            length_distribution: CategoryCounter::new(),
            reads_per_chrom: CategoryCounter::new(),
            high_quality_mapq10: 0,
            high_quality_mapq20: 0,
            high_quality_mapq30: 0,
            first_in_pair: 0,
            second_in_pair: 0,
            reverse_strand: 0,
            total_matches: 0,
            total_insertions: 0,
            total_deletions: 0,
            total_soft_clips: 0,
            total_hard_clips: 0,
        }
    }

    /// Update statistics with a single record (streaming)
    ///
    /// Optimized for single-pass processing: computes all statistics in one iteration
    /// over alignment data, avoiding repeated allocations and iterations.
    pub fn update(&mut self, record: &SamRecord) {
        self.total_reads += 1;

        // Extract flag once to avoid repeated method calls
        let flag = record.flag;

        // Flag-based counts (optimized with bitwise operations)
        let is_unmapped = (flag & 0x4) != 0;
        if is_unmapped {
            self.unmapped_reads += 1;
        } else {
            self.mapped_reads += 1;

            // Only track MAPQ for mapped reads
            let mapq = record.mapq;
            self.mapq_stats.push(mapq as f64);
            self.mapq_distribution.increment(mapq);

            // Branchless MAPQ quality counting (avoids multiple if statements)
            self.high_quality_mapq10 += (mapq >= 10) as usize;
            self.high_quality_mapq20 += (mapq >= 20) as usize;
            self.high_quality_mapq30 += (mapq >= 30) as usize;

            // Track reads per chromosome (avoid String allocation)
            if record.rname != "*" {
                self.reads_per_chrom.increment_str(&record.rname);
            }
        }

        // Use bitwise operations directly instead of method calls
        // This avoids function call overhead
        self.properly_paired += ((flag & 0x2) != 0) as usize;
        self.duplicates += ((flag & 0x400) != 0) as usize;
        self.secondary_alignments += ((flag & 0x100) != 0) as usize;
        self.supplementary_alignments += ((flag & 0x800) != 0) as usize;
        self.qc_failed += ((flag & 0x200) != 0) as usize;
        self.first_in_pair += ((flag & 0x40) != 0) as usize;
        self.second_in_pair += ((flag & 0x80) != 0) as usize;
        self.reverse_strand += ((flag & 0x10) != 0) as usize;

        // Read length (from sequence)
        let length = record.seq.len();
        self.length_stats.push(length as f64);
        self.length_distribution.increment(length);

        // CIGAR operation statistics
        for op in &record.cigar {
            match op {
                CigarOp::Match(len) | CigarOp::Equal(len) | CigarOp::Diff(len) => {
                    self.total_matches += *len as u64;
                }
                CigarOp::Ins(len) => {
                    self.total_insertions += *len as u64;
                }
                CigarOp::Del(len) => {
                    self.total_deletions += *len as u64;
                }
                CigarOp::SoftClip(len) => {
                    self.total_soft_clips += *len as u64;
                }
                CigarOp::HardClip(len) => {
                    self.total_hard_clips += *len as u64;
                }
                _ => {}
            }
        }
    }

    /// Compute statistics in parallel (not yet implemented for SAM)
    ///
    /// SAM files are text-based and harder to parallelize efficiently than binary BAM files.
    /// For now, use `compute()` for sequential processing, which is still fast.
    ///
    /// Future implementation may support:
    /// - Chunked parallel processing with line-boundary detection
    /// - Memory-mapped file access with parallel scanning
    ///
    /// For parallel statistics on alignment files, consider using BAM format instead.
    pub fn par_compute(_reader: &mut SamReader, _threads: usize) -> Result<Self> {
        Err(crate::error::Error::InvalidInput(
            "Parallel statistics computation not yet implemented for SAM format. Use compute() instead.".to_string()
        ))
    }

    /// Compute statistics from a SAM reader (sequential processing)
    ///
    /// SAM files are text-based, so parallel processing is more complex than BAM.
    /// For now, we use sequential processing which is still very fast for text parsing.
    ///
    /// # Example
    /// ```ignore
    /// use genomicframe_core::formats::sam::{SamReader, SamStats};
    ///
    /// let mut reader = SamReader::from_path("alignments.sam")?;
    /// reader.read_header()?;
    /// let stats = SamStats::compute(&mut reader)?;
    /// stats.print_summary();
    /// ```
    pub fn compute(reader: &mut SamReader) -> Result<Self> {
        let mut stats = Self::new();

        // Get the header to lookup reference names
        let header = reader.header().ok_or_else(|| {
            crate::error::Error::InvalidInput("SAM header not read. Call read_header() first.".to_string())
        })?;

        // Clone header to avoid borrow conflicts (SAM header is small)
        let _header = header.clone();

        while let Some(record) = reader.next_record()? {
            stats.update(&record);
        }

        Ok(stats)
    }

    /// Mean read length
    pub fn mean_length(&self) -> Option<f64> {
        self.length_stats.mean()
    }

    /// Standard deviation of read length
    pub fn std_length(&self) -> Option<f64> {
        self.length_stats.std_dev()
    }

    /// Min read length
    pub fn min_length(&self) -> Option<f64> {
        self.length_stats.min()
    }

    /// Max read length
    pub fn max_length(&self) -> Option<f64> {
        self.length_stats.max()
    }

    /// Mean mapping quality (for mapped reads)
    pub fn mean_mapq(&self) -> Option<f64> {
        self.mapq_stats.mean()
    }

    /// Standard deviation of mapping quality
    pub fn std_mapq(&self) -> Option<f64> {
        self.mapq_stats.std_dev()
    }

    /// Mapping rate (percentage of mapped reads)
    pub fn mapping_rate(&self) -> f64 {
        if self.total_reads == 0 {
            0.0
        } else {
            (self.mapped_reads as f64 / self.total_reads as f64) * 100.0
        }
    }

    /// Properly paired rate (percentage of properly paired reads)
    pub fn properly_paired_rate(&self) -> f64 {
        if self.total_reads == 0 {
            0.0
        } else {
            (self.properly_paired as f64 / self.total_reads as f64) * 100.0
        }
    }

    /// Duplicate rate (percentage of duplicate reads)
    pub fn duplicate_rate(&self) -> f64 {
        if self.total_reads == 0 {
            0.0
        } else {
            (self.duplicates as f64 / self.total_reads as f64) * 100.0
        }
    }

    /// Percentage of reads with MAPQ >= 10
    pub fn percent_mapq10(&self) -> f64 {
        if self.total_reads == 0 {
            0.0
        } else {
            (self.high_quality_mapq10 as f64 / self.total_reads as f64) * 100.0
        }
    }

    /// Percentage of reads with MAPQ >= 20
    pub fn percent_mapq20(&self) -> f64 {
        if self.total_reads == 0 {
            0.0
        } else {
            (self.high_quality_mapq20 as f64 / self.total_reads as f64) * 100.0
        }
    }

    /// Percentage of reads with MAPQ >= 30
    pub fn percent_mapq30(&self) -> f64 {
        if self.total_reads == 0 {
            0.0
        } else {
            (self.high_quality_mapq30 as f64 / self.total_reads as f64) * 100.0
        }
    }

    /// Print a human-readable summary of statistics
    pub fn print_summary(&self) {
        println!("=== SAM Statistics ===\n");

        println!("Basic Counts:");
        println!("  Total reads:       {:>12}", self.total_reads);
        println!("  Mapped reads:      {:>12}", self.mapped_reads);
        println!("  Unmapped reads:    {:>12}", self.unmapped_reads);
        println!();

        println!("Mapping Metrics:");
        println!("  Mapping rate:      {:>11.1}%", self.mapping_rate());
        println!(
            "  Mean MAPQ:         {:>12.2}",
            self.mean_mapq().unwrap_or(0.0)
        );
        println!(
            "  Std Dev MAPQ:      {:>12.2}",
            self.std_mapq().unwrap_or(0.0)
        );
        println!("  MAPQ >= 10:        {:>11.1}%", self.percent_mapq10());
        println!("  MAPQ >= 20:        {:>11.1}%", self.percent_mapq20());
        println!("  MAPQ >= 30:        {:>11.1}%", self.percent_mapq30());
        println!();

        println!("Paired-End Metrics:");
        println!("  Properly paired:   {:>11.1}%", self.properly_paired_rate());
        println!("  First in pair:     {:>12}", self.first_in_pair);
        println!("  Second in pair:    {:>12}", self.second_in_pair);
        println!();

        println!("Quality Flags:");
        println!("  Duplicates:        {:>11.1}%", self.duplicate_rate());
        println!("  Secondary:         {:>12}", self.secondary_alignments);
        println!("  Supplementary:     {:>12}", self.supplementary_alignments);
        println!("  QC failed:         {:>12}", self.qc_failed);
        println!();

        println!("Read Length:");
        println!(
            "  Mean:              {:>12.2}",
            self.mean_length().unwrap_or(0.0)
        );
        println!(
            "  Std Dev:           {:>12.2}",
            self.std_length().unwrap_or(0.0)
        );
        println!(
            "  Min:               {:>12.0}",
            self.min_length().unwrap_or(0.0)
        );
        println!(
            "  Max:               {:>12.0}",
            self.max_length().unwrap_or(0.0)
        );
        println!();

        println!("CIGAR Operations:");
        println!("  Matches/Mismatches: {:>11}", self.total_matches);
        println!("  Insertions:         {:>11}", self.total_insertions);
        println!("  Deletions:          {:>11}", self.total_deletions);
        println!("  Soft clips:         {:>11}", self.total_soft_clips);
        println!("  Hard clips:         {:>11}", self.total_hard_clips);
        println!();

        // Top chromosomes (sorted by count descending)
        if self.mapped_reads > 0 {
            println!("Top Chromosomes:");
            let mut chrom_vec: Vec<_> = self.reads_per_chrom.iter().collect();
            chrom_vec.sort_by(|a, b| b.1.cmp(a.1)); // Sort by count descending
            for (chrom, count) in chrom_vec.iter().take(10) {
                let percent = (**count as f64 / self.mapped_reads as f64) * 100.0;
                println!("  {:>12}: {:>10} ({:>5.1}%)", chrom, count, percent);
            }
            println!();
        }

        // MAPQ distribution (binned, sorted by MAPQ)
        if self.mapped_reads > 0 {
            println!("MAPQ Distribution:");
            let mut mapq_vec: Vec<_> = self.mapq_distribution.iter().collect();
            mapq_vec.sort_by_key(|a| a.0); // Sort by MAPQ ascending
            for (mapq, count) in mapq_vec.iter().take(15) {
                let percent = (**count as f64 / self.mapped_reads as f64) * 100.0;
                println!("  MAPQ {:>3}: {:>10} ({:>5.1}%)", mapq, count, percent);
            }
        }
    }
}

/// Implementation of Mergeable for parallel statistics computation
///
/// Merges SAM statistics from multiple threads/chunks.
/// All statistics are correctly combined using the underlying
/// Mergeable implementations for RunningStats and CategoryCounter.
impl Mergeable for SamStats {
    fn merge(&mut self, other: Self) {
        // Merge simple counters
        self.total_reads += other.total_reads;
        self.mapped_reads += other.mapped_reads;
        self.unmapped_reads += other.unmapped_reads;
        self.properly_paired += other.properly_paired;
        self.duplicates += other.duplicates;
        self.secondary_alignments += other.secondary_alignments;
        self.supplementary_alignments += other.supplementary_alignments;
        self.qc_failed += other.qc_failed;
        self.high_quality_mapq10 += other.high_quality_mapq10;
        self.high_quality_mapq20 += other.high_quality_mapq20;
        self.high_quality_mapq30 += other.high_quality_mapq30;
        self.first_in_pair += other.first_in_pair;
        self.second_in_pair += other.second_in_pair;
        self.reverse_strand += other.reverse_strand;
        self.total_matches += other.total_matches;
        self.total_insertions += other.total_insertions;
        self.total_deletions += other.total_deletions;
        self.total_soft_clips += other.total_soft_clips;
        self.total_hard_clips += other.total_hard_clips;

        // Merge running statistics
        self.mapq_stats.merge(other.mapq_stats);
        self.length_stats.merge(other.length_stats);

        // Merge categorical data
        self.mapq_distribution.merge(other.mapq_distribution);
        self.length_distribution.merge(other.length_distribution);
        self.reads_per_chrom.merge(other.reads_per_chrom);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_sam_stats_new() {
        let stats = SamStats::new();
        assert_eq!(stats.total_reads, 0);
        assert_eq!(stats.mapped_reads, 0);
        assert_eq!(stats.unmapped_reads, 0);
    }

    #[test]
    fn test_sam_stats_merge() {
        let mut stats1 = SamStats::new();
        stats1.total_reads = 100;
        stats1.mapped_reads = 80;
        stats1.unmapped_reads = 20;

        let mut stats2 = SamStats::new();
        stats2.total_reads = 50;
        stats2.mapped_reads = 45;
        stats2.unmapped_reads = 5;

        stats1.merge(stats2);

        assert_eq!(stats1.total_reads, 150);
        assert_eq!(stats1.mapped_reads, 125);
        assert_eq!(stats1.unmapped_reads, 25);
    }

    #[test]
    fn test_sam_stats_rates() {
        let mut stats = SamStats::new();
        stats.total_reads = 100;
        stats.mapped_reads = 90;
        stats.properly_paired = 80;
        stats.duplicates = 10;

        assert!((stats.mapping_rate() - 90.0).abs() < 0.1);
        assert!((stats.properly_paired_rate() - 80.0).abs() < 0.1);
        assert!((stats.duplicate_rate() - 10.0).abs() < 0.1);
    }

    #[test]
    fn test_sam_stats_empty() {
        let stats = SamStats::new();
        assert_eq!(stats.mapping_rate(), 0.0);
        assert_eq!(stats.properly_paired_rate(), 0.0);
        assert_eq!(stats.duplicate_rate(), 0.0);
    }
}