Skip to main content

genomicframe_core/formats/bed/
stats.rs

1//! BED statistics computation
2//!
3//! Streaming statistics collection for BED interval files.
4//! Memory-efficient: processes millions of intervals with O(1) memory for basic stats.
5
6use crate::core::GenomicRecordIterator;
7use crate::error::Result;
8use crate::parallel::Mergeable;
9use crate::stats::{CategoryCounter, RunningStats};
10
11use super::reader::{BedReader, BedRecord};
12
13/// Comprehensive statistics for BED files
14#[derive(Debug, Clone)]
15pub struct BedStats {
16    /// Total number of intervals
17    pub total_intervals: usize,
18
19    /// Number of BED3 records (minimal fields only)
20    pub bed3_count: usize,
21
22    /// Number of BED6 records (includes name, score, strand)
23    pub bed6_count: usize,
24
25    /// Number of BED12 records (includes blocks/exons)
26    pub bed12_count: usize,
27
28    /// Interval length statistics
29    pub length_stats: RunningStats,
30
31    /// Distribution of interval lengths
32    pub length_distribution: CategoryCounter<u64>,
33
34    /// Intervals per chromosome
35    pub intervals_per_chrom: CategoryCounter<String>,
36
37    /// Score statistics (for BED6+ records)
38    pub score_stats: RunningStats,
39
40    /// Strand distribution (for BED6+ records)
41    pub plus_strand: usize,
42    pub minus_strand: usize,
43    pub unstranded: usize,
44
45    /// Block/exon statistics (for BED12 records)
46    pub block_count_stats: RunningStats,
47    pub total_blocks: usize,
48
49    /// Total genomic span covered (sum of all interval lengths)
50    pub total_span: u64,
51
52    /// Min interval length
53    pub min_length: u64,
54
55    /// Max interval length
56    pub max_length: u64,
57}
58
59impl Default for BedStats {
60    fn default() -> Self {
61        Self::new()
62    }
63}
64
65impl BedStats {
66    /// Create a new statistics collector
67    pub fn new() -> Self {
68        Self {
69            total_intervals: 0,
70            bed3_count: 0,
71            bed6_count: 0,
72            bed12_count: 0,
73            length_stats: RunningStats::new(),
74            length_distribution: CategoryCounter::new(),
75            intervals_per_chrom: CategoryCounter::new(),
76            score_stats: RunningStats::new(),
77            plus_strand: 0,
78            minus_strand: 0,
79            unstranded: 0,
80            block_count_stats: RunningStats::new(),
81            total_blocks: 0,
82            total_span: 0,
83            min_length: u64::MAX,
84            max_length: 0,
85        }
86    }
87
88    /// Update statistics with a single record (streaming)
89    pub fn update(&mut self, record: &BedRecord) {
90        self.total_intervals += 1;
91
92        // Classify record type
93        if record.is_bed12() {
94            self.bed12_count += 1;
95        } else if record.is_bed6() {
96            self.bed6_count += 1;
97        } else {
98            self.bed3_count += 1;
99        }
100
101        // Interval length
102        let length = record.len();
103        self.length_stats.push(length as f64);
104        self.length_distribution.increment(length);
105        self.total_span += length;
106        self.min_length = self.min_length.min(length);
107        self.max_length = self.max_length.max(length);
108
109        // Chromosome distribution
110        self.intervals_per_chrom.increment_str(&record.chrom);
111
112        // Score statistics (BED6+)
113        if let Some(score) = record.score {
114            self.score_stats.push(score);
115        }
116
117        // Strand distribution (BED6+)
118        match record.strand {
119            Some('+') => self.plus_strand += 1,
120            Some('-') => self.minus_strand += 1,
121            _ => self.unstranded += 1,
122        }
123
124        // Block/exon statistics (BED12)
125        if let Some(block_count) = record.block_count {
126            self.block_count_stats.push(block_count as f64);
127            self.total_blocks += block_count as usize;
128        }
129    }
130
131    /// Compute statistics from a BED reader (sequential processing)
132    ///
133    /// # Example
134    /// ```ignore
135    /// use genomicframe_core::formats::bed::{BedReader, BedStats};
136    ///
137    /// let mut reader = BedReader::from_path("regions.bed")?;
138    /// let stats = BedStats::compute(&mut reader)?;
139    /// stats.print_summary();
140    /// ```
141    pub fn compute(reader: &mut BedReader) -> Result<Self> {
142        let mut stats = Self::new();
143
144        while let Some(record) = reader.next_record()? {
145            stats.update(&record);
146        }
147
148        Ok(stats)
149    }
150
151    /// Mean interval length
152    pub fn mean_length(&self) -> Option<f64> {
153        self.length_stats.mean()
154    }
155
156    /// Standard deviation of interval length
157    pub fn std_length(&self) -> Option<f64> {
158        self.length_stats.std_dev()
159    }
160
161    /// Mean score (for BED6+ records)
162    pub fn mean_score(&self) -> Option<f64> {
163        self.score_stats.mean()
164    }
165
166    /// Standard deviation of score
167    pub fn std_score(&self) -> Option<f64> {
168        self.score_stats.std_dev()
169    }
170
171    /// Mean number of blocks per interval (for BED12 records)
172    pub fn mean_blocks(&self) -> Option<f64> {
173        self.block_count_stats.mean()
174    }
175
176    /// Percentage of intervals on plus strand
177    pub fn plus_strand_percent(&self) -> f64 {
178        if self.total_intervals == 0 {
179            0.0
180        } else {
181            (self.plus_strand as f64 / self.total_intervals as f64) * 100.0
182        }
183    }
184
185    /// Percentage of intervals on minus strand
186    pub fn minus_strand_percent(&self) -> f64 {
187        if self.total_intervals == 0 {
188            0.0
189        } else {
190            (self.minus_strand as f64 / self.total_intervals as f64) * 100.0
191        }
192    }
193
194    /// Print a human-readable summary of statistics
195    pub fn print_summary(&self) {
196        println!("=== BED Statistics ===\n");
197
198        println!("Basic Counts:");
199        println!("  Total intervals:   {:>12}", self.total_intervals);
200        println!("  BED3 (minimal):    {:>12}", self.bed3_count);
201        println!("  BED6 (standard):   {:>12}", self.bed6_count);
202        println!("  BED12 (full):      {:>12}", self.bed12_count);
203        println!();
204
205        println!("Interval Lengths:");
206        println!(
207            "  Mean:              {:>12.2}",
208            self.mean_length().unwrap_or(0.0)
209        );
210        println!(
211            "  Std Dev:           {:>12.2}",
212            self.std_length().unwrap_or(0.0)
213        );
214        println!("  Min:               {:>12}", self.min_length);
215        println!("  Max:               {:>12}", self.max_length);
216        println!("  Total span:        {:>12}", self.total_span);
217        println!();
218
219        if self.bed6_count > 0 || self.bed12_count > 0 {
220            println!("Strand Distribution:");
221            println!(
222                "  Plus (+):          {:>11.1}%",
223                self.plus_strand_percent()
224            );
225            println!(
226                "  Minus (-):         {:>11.1}%",
227                self.minus_strand_percent()
228            );
229            println!("  Unstranded (.):    {:>12}", self.unstranded);
230            println!();
231
232            if self.score_stats.count() > 0 {
233                println!("Score Statistics:");
234                println!(
235                    "  Mean:              {:>12.2}",
236                    self.mean_score().unwrap_or(0.0)
237                );
238                println!(
239                    "  Std Dev:           {:>12.2}",
240                    self.std_score().unwrap_or(0.0)
241                );
242                println!();
243            }
244        }
245
246        if self.bed12_count > 0 {
247            println!("Block/Exon Structure (BED12):");
248            println!("  Total blocks:      {:>12}", self.total_blocks);
249            println!(
250                "  Mean blocks/int:   {:>12.2}",
251                self.mean_blocks().unwrap_or(0.0)
252            );
253            println!();
254        }
255
256        // Top chromosomes (sorted by count descending)
257        println!("Top Chromosomes:");
258        let mut chrom_vec: Vec<_> = self.intervals_per_chrom.iter().collect();
259        chrom_vec.sort_by(|a, b| b.1.cmp(a.1)); // Sort by count descending
260        for (chrom, count) in chrom_vec.iter().take(10) {
261            let percent = (**count as f64 / self.total_intervals as f64) * 100.0;
262            println!("  {:>12}: {:>10} ({:>5.1}%)", chrom, count, percent);
263        }
264        println!();
265
266        // Length distribution (show quantiles)
267        if self.total_intervals > 0 {
268            println!("Length Distribution (most common):");
269            let mut length_vec: Vec<_> = self.length_distribution.iter().collect();
270            length_vec.sort_by(|a, b| b.1.cmp(a.1)); // Sort by count descending
271            for (length, count) in length_vec.iter().take(10) {
272                let percent = (**count as f64 / self.total_intervals as f64) * 100.0;
273                println!("  {:>12} bp: {:>8} ({:>5.1}%)", length, count, percent);
274            }
275        }
276    }
277}
278
279/// Implementation of Mergeable for parallel statistics computation
280///
281/// Merges BED statistics from multiple threads/chunks.
282/// All statistics are correctly combined using the underlying
283/// Mergeable implementations for RunningStats and CategoryCounter.
284impl Mergeable for BedStats {
285    fn merge(&mut self, other: Self) {
286        // Merge simple counters
287        self.total_intervals += other.total_intervals;
288        self.bed3_count += other.bed3_count;
289        self.bed6_count += other.bed6_count;
290        self.bed12_count += other.bed12_count;
291        self.plus_strand += other.plus_strand;
292        self.minus_strand += other.minus_strand;
293        self.unstranded += other.unstranded;
294        self.total_blocks += other.total_blocks;
295        self.total_span += other.total_span;
296        self.min_length = self.min_length.min(other.min_length);
297        self.max_length = self.max_length.max(other.max_length);
298
299        // Merge running statistics
300        self.length_stats.merge(other.length_stats);
301        self.score_stats.merge(other.score_stats);
302        self.block_count_stats.merge(other.block_count_stats);
303
304        // Merge categorical data
305        self.length_distribution.merge(other.length_distribution);
306        self.intervals_per_chrom.merge(other.intervals_per_chrom);
307    }
308}
309
310#[cfg(test)]
311mod tests {
312    use super::*;
313
314    #[test]
315    fn test_bed_stats_new() {
316        let stats = BedStats::new();
317        assert_eq!(stats.total_intervals, 0);
318        assert_eq!(stats.bed3_count, 0);
319        assert_eq!(stats.bed6_count, 0);
320    }
321
322    #[test]
323    fn test_bed_stats_merge() {
324        let mut stats1 = BedStats::new();
325        stats1.total_intervals = 100;
326        stats1.bed3_count = 50;
327        stats1.bed6_count = 30;
328        stats1.total_span = 10000;
329
330        let mut stats2 = BedStats::new();
331        stats2.total_intervals = 50;
332        stats2.bed3_count = 25;
333        stats2.bed6_count = 15;
334        stats2.total_span = 5000;
335
336        stats1.merge(stats2);
337
338        assert_eq!(stats1.total_intervals, 150);
339        assert_eq!(stats1.bed3_count, 75);
340        assert_eq!(stats1.bed6_count, 45);
341        assert_eq!(stats1.total_span, 15000);
342    }
343
344    #[test]
345    fn test_bed_stats_percentages() {
346        let mut stats = BedStats::new();
347        stats.total_intervals = 100;
348        stats.plus_strand = 60;
349        stats.minus_strand = 40;
350
351        assert!((stats.plus_strand_percent() - 60.0).abs() < 0.1);
352        assert!((stats.minus_strand_percent() - 40.0).abs() < 0.1);
353    }
354
355    #[test]
356    fn test_bed_stats_empty() {
357        let stats = BedStats::new();
358        assert_eq!(stats.plus_strand_percent(), 0.0);
359        assert_eq!(stats.minus_strand_percent(), 0.0);
360    }
361}