genomicframe_core/formats/bed/
stats.rs1use crate::core::GenomicRecordIterator;
7use crate::error::Result;
8use crate::parallel::Mergeable;
9use crate::stats::{CategoryCounter, RunningStats};
10
11use super::reader::{BedReader, BedRecord};
12
13#[derive(Debug, Clone)]
15pub struct BedStats {
16 pub total_intervals: usize,
18
19 pub bed3_count: usize,
21
22 pub bed6_count: usize,
24
25 pub bed12_count: usize,
27
28 pub length_stats: RunningStats,
30
31 pub length_distribution: CategoryCounter<u64>,
33
34 pub intervals_per_chrom: CategoryCounter<String>,
36
37 pub score_stats: RunningStats,
39
40 pub plus_strand: usize,
42 pub minus_strand: usize,
43 pub unstranded: usize,
44
45 pub block_count_stats: RunningStats,
47 pub total_blocks: usize,
48
49 pub total_span: u64,
51
52 pub min_length: u64,
54
55 pub max_length: u64,
57}
58
59impl Default for BedStats {
60 fn default() -> Self {
61 Self::new()
62 }
63}
64
65impl BedStats {
66 pub fn new() -> Self {
68 Self {
69 total_intervals: 0,
70 bed3_count: 0,
71 bed6_count: 0,
72 bed12_count: 0,
73 length_stats: RunningStats::new(),
74 length_distribution: CategoryCounter::new(),
75 intervals_per_chrom: CategoryCounter::new(),
76 score_stats: RunningStats::new(),
77 plus_strand: 0,
78 minus_strand: 0,
79 unstranded: 0,
80 block_count_stats: RunningStats::new(),
81 total_blocks: 0,
82 total_span: 0,
83 min_length: u64::MAX,
84 max_length: 0,
85 }
86 }
87
88 pub fn update(&mut self, record: &BedRecord) {
90 self.total_intervals += 1;
91
92 if record.is_bed12() {
94 self.bed12_count += 1;
95 } else if record.is_bed6() {
96 self.bed6_count += 1;
97 } else {
98 self.bed3_count += 1;
99 }
100
101 let length = record.len();
103 self.length_stats.push(length as f64);
104 self.length_distribution.increment(length);
105 self.total_span += length;
106 self.min_length = self.min_length.min(length);
107 self.max_length = self.max_length.max(length);
108
109 self.intervals_per_chrom.increment_str(&record.chrom);
111
112 if let Some(score) = record.score {
114 self.score_stats.push(score);
115 }
116
117 match record.strand {
119 Some('+') => self.plus_strand += 1,
120 Some('-') => self.minus_strand += 1,
121 _ => self.unstranded += 1,
122 }
123
124 if let Some(block_count) = record.block_count {
126 self.block_count_stats.push(block_count as f64);
127 self.total_blocks += block_count as usize;
128 }
129 }
130
131 pub fn compute(reader: &mut BedReader) -> Result<Self> {
142 let mut stats = Self::new();
143
144 while let Some(record) = reader.next_record()? {
145 stats.update(&record);
146 }
147
148 Ok(stats)
149 }
150
151 pub fn mean_length(&self) -> Option<f64> {
153 self.length_stats.mean()
154 }
155
156 pub fn std_length(&self) -> Option<f64> {
158 self.length_stats.std_dev()
159 }
160
161 pub fn mean_score(&self) -> Option<f64> {
163 self.score_stats.mean()
164 }
165
166 pub fn std_score(&self) -> Option<f64> {
168 self.score_stats.std_dev()
169 }
170
171 pub fn mean_blocks(&self) -> Option<f64> {
173 self.block_count_stats.mean()
174 }
175
176 pub fn plus_strand_percent(&self) -> f64 {
178 if self.total_intervals == 0 {
179 0.0
180 } else {
181 (self.plus_strand as f64 / self.total_intervals as f64) * 100.0
182 }
183 }
184
185 pub fn minus_strand_percent(&self) -> f64 {
187 if self.total_intervals == 0 {
188 0.0
189 } else {
190 (self.minus_strand as f64 / self.total_intervals as f64) * 100.0
191 }
192 }
193
194 pub fn print_summary(&self) {
196 println!("=== BED Statistics ===\n");
197
198 println!("Basic Counts:");
199 println!(" Total intervals: {:>12}", self.total_intervals);
200 println!(" BED3 (minimal): {:>12}", self.bed3_count);
201 println!(" BED6 (standard): {:>12}", self.bed6_count);
202 println!(" BED12 (full): {:>12}", self.bed12_count);
203 println!();
204
205 println!("Interval Lengths:");
206 println!(
207 " Mean: {:>12.2}",
208 self.mean_length().unwrap_or(0.0)
209 );
210 println!(
211 " Std Dev: {:>12.2}",
212 self.std_length().unwrap_or(0.0)
213 );
214 println!(" Min: {:>12}", self.min_length);
215 println!(" Max: {:>12}", self.max_length);
216 println!(" Total span: {:>12}", self.total_span);
217 println!();
218
219 if self.bed6_count > 0 || self.bed12_count > 0 {
220 println!("Strand Distribution:");
221 println!(
222 " Plus (+): {:>11.1}%",
223 self.plus_strand_percent()
224 );
225 println!(
226 " Minus (-): {:>11.1}%",
227 self.minus_strand_percent()
228 );
229 println!(" Unstranded (.): {:>12}", self.unstranded);
230 println!();
231
232 if self.score_stats.count() > 0 {
233 println!("Score Statistics:");
234 println!(
235 " Mean: {:>12.2}",
236 self.mean_score().unwrap_or(0.0)
237 );
238 println!(
239 " Std Dev: {:>12.2}",
240 self.std_score().unwrap_or(0.0)
241 );
242 println!();
243 }
244 }
245
246 if self.bed12_count > 0 {
247 println!("Block/Exon Structure (BED12):");
248 println!(" Total blocks: {:>12}", self.total_blocks);
249 println!(
250 " Mean blocks/int: {:>12.2}",
251 self.mean_blocks().unwrap_or(0.0)
252 );
253 println!();
254 }
255
256 println!("Top Chromosomes:");
258 let mut chrom_vec: Vec<_> = self.intervals_per_chrom.iter().collect();
259 chrom_vec.sort_by(|a, b| b.1.cmp(a.1)); for (chrom, count) in chrom_vec.iter().take(10) {
261 let percent = (**count as f64 / self.total_intervals as f64) * 100.0;
262 println!(" {:>12}: {:>10} ({:>5.1}%)", chrom, count, percent);
263 }
264 println!();
265
266 if self.total_intervals > 0 {
268 println!("Length Distribution (most common):");
269 let mut length_vec: Vec<_> = self.length_distribution.iter().collect();
270 length_vec.sort_by(|a, b| b.1.cmp(a.1)); for (length, count) in length_vec.iter().take(10) {
272 let percent = (**count as f64 / self.total_intervals as f64) * 100.0;
273 println!(" {:>12} bp: {:>8} ({:>5.1}%)", length, count, percent);
274 }
275 }
276 }
277}
278
279impl Mergeable for BedStats {
285 fn merge(&mut self, other: Self) {
286 self.total_intervals += other.total_intervals;
288 self.bed3_count += other.bed3_count;
289 self.bed6_count += other.bed6_count;
290 self.bed12_count += other.bed12_count;
291 self.plus_strand += other.plus_strand;
292 self.minus_strand += other.minus_strand;
293 self.unstranded += other.unstranded;
294 self.total_blocks += other.total_blocks;
295 self.total_span += other.total_span;
296 self.min_length = self.min_length.min(other.min_length);
297 self.max_length = self.max_length.max(other.max_length);
298
299 self.length_stats.merge(other.length_stats);
301 self.score_stats.merge(other.score_stats);
302 self.block_count_stats.merge(other.block_count_stats);
303
304 self.length_distribution.merge(other.length_distribution);
306 self.intervals_per_chrom.merge(other.intervals_per_chrom);
307 }
308}
309
310#[cfg(test)]
311mod tests {
312 use super::*;
313
314 #[test]
315 fn test_bed_stats_new() {
316 let stats = BedStats::new();
317 assert_eq!(stats.total_intervals, 0);
318 assert_eq!(stats.bed3_count, 0);
319 assert_eq!(stats.bed6_count, 0);
320 }
321
322 #[test]
323 fn test_bed_stats_merge() {
324 let mut stats1 = BedStats::new();
325 stats1.total_intervals = 100;
326 stats1.bed3_count = 50;
327 stats1.bed6_count = 30;
328 stats1.total_span = 10000;
329
330 let mut stats2 = BedStats::new();
331 stats2.total_intervals = 50;
332 stats2.bed3_count = 25;
333 stats2.bed6_count = 15;
334 stats2.total_span = 5000;
335
336 stats1.merge(stats2);
337
338 assert_eq!(stats1.total_intervals, 150);
339 assert_eq!(stats1.bed3_count, 75);
340 assert_eq!(stats1.bed6_count, 45);
341 assert_eq!(stats1.total_span, 15000);
342 }
343
344 #[test]
345 fn test_bed_stats_percentages() {
346 let mut stats = BedStats::new();
347 stats.total_intervals = 100;
348 stats.plus_strand = 60;
349 stats.minus_strand = 40;
350
351 assert!((stats.plus_strand_percent() - 60.0).abs() < 0.1);
352 assert!((stats.minus_strand_percent() - 40.0).abs() < 0.1);
353 }
354
355 #[test]
356 fn test_bed_stats_empty() {
357 let stats = BedStats::new();
358 assert_eq!(stats.plus_strand_percent(), 0.0);
359 assert_eq!(stats.minus_strand_percent(), 0.0);
360 }
361}