bed_utils/
coverage.rs

1use num::Integer;
2use num_traits::{Num, NumAssignOps, NumCast};
3use std::collections::BTreeMap;
4
5use crate::bed::{map::GIntervalIndexSet, BEDLike, GenomicRange};
6
7#[derive(Debug, Clone)]
8pub struct Coverage<'a, N> {
9    total_count: f64,
10    intervals: &'a GIntervalIndexSet,
11    coverage: Vec<N>,
12}
13
14impl<'a, N: Num + NumCast + NumAssignOps + Copy> Coverage<'a, N> {
15    pub fn new(intervals: &'a GIntervalIndexSet) -> Self {
16        Self {
17            total_count: 0.0,
18            intervals,
19            coverage: vec![N::zero(); intervals.len()],
20        }
21    }
22
23    pub fn len(&self) -> usize {
24        self.coverage.len()
25    }
26
27    pub fn total_count(&self) -> f64 {
28        self.total_count
29    }
30
31    pub fn reset(&mut self) {
32        self.total_count = 0.0;
33        self.coverage.fill(N::zero());
34    }
35
36    pub fn get_index<D>(&self, tag: &D) -> impl Iterator<Item = usize> + '_
37    where
38        D: BEDLike,
39    {
40        self.intervals.find_index_of(tag)
41    }
42
43    pub fn insert<D>(&mut self, tag: &D, multiplicity: N)
44    where
45        D: BEDLike,
46    {
47        self.total_count += <f64 as NumCast>::from(multiplicity).unwrap();
48        self.intervals
49            .find_index_of(tag)
50            .for_each(|idx| self.coverage[idx] += multiplicity);
51    }
52
53    pub fn insert_at_index<D>(&mut self, index: usize, count: N) {
54        self.total_count += <f64 as NumCast>::from(count).unwrap();
55        self.coverage[index] += count;
56    }
57
58    pub fn regions(&self) -> impl Iterator<Item = &GenomicRange> + '_ {
59        self.intervals.iter()
60    }
61
62    pub fn get_region(&self, index: usize) -> Option<&GenomicRange> {
63        self.intervals.get(index)
64    }
65
66    pub fn get_coverage(&self) -> &Vec<N> {
67        &self.coverage
68    }
69}
70
71#[derive(Clone)]
72pub struct SparseCoverage<'a, N> {
73    total_count: f64,
74    intervals: &'a GIntervalIndexSet,
75    coverage: BTreeMap<usize, N>,
76}
77
78impl<'a, N: Num + NumCast + NumAssignOps + Copy> SparseCoverage<'a, N> {
79    pub fn new(intervals: &'a GIntervalIndexSet) -> Self {
80        Self {
81            total_count: 0.0,
82            intervals,
83            coverage: BTreeMap::new(),
84        }
85    }
86
87    pub fn len(&self) -> usize {
88        self.intervals.len()
89    }
90
91    pub fn total_count(&self) -> f64 {
92        self.total_count
93    }
94
95    pub fn reset(&mut self) {
96        self.total_count = 0.0;
97        self.coverage = BTreeMap::new();
98    }
99
100    pub fn get_index<D>(&self, tag: &D) -> impl Iterator<Item = usize> + '_
101    where
102        D: BEDLike,
103    {
104        self.intervals.find_index_of(tag)
105    }
106
107    pub fn insert<D>(&mut self, tag: &D, count: N)
108    where
109        D: BEDLike,
110    {
111        self.total_count += <f64 as NumCast>::from(count).unwrap();
112        self.intervals.find_index_of(tag).for_each(|idx| {
113            self.coverage
114                .entry(idx)
115                .and_modify(|x| *x += count)
116                .or_insert(count);
117        });
118    }
119
120    pub fn insert_at_index<D>(&mut self, index: usize, count: N) {
121        self.total_count += <f64 as NumCast>::from(count).unwrap();
122        self.coverage
123            .entry(index)
124            .and_modify(|x| *x += count)
125            .or_insert(count);
126    }
127
128    pub fn regions(&self) -> impl Iterator<Item = &GenomicRange> + '_ {
129        self.intervals.iter()
130    }
131
132    pub fn get_region(&self, index: usize) -> Option<&GenomicRange> {
133        self.intervals.get(index)
134    }
135
136    pub fn get_coverage(&self) -> &BTreeMap<usize, N> {
137        &self.coverage
138    }
139
140    pub fn get_coverage_as_vec(&self) -> Vec<N> {
141        let mut coverage = vec![N::zero(); self.intervals.len()];
142        self.coverage
143            .iter()
144            .for_each(|(idx, v)| coverage[*idx] = *v);
145        coverage
146    }
147}
148
149#[derive(Debug, Clone)]
150pub struct BinnedCoverage<'a, N> {
151    len: usize,
152    bin_size: u64,
153    consumed_tags: f64,
154    intervals: &'a GIntervalIndexSet,
155    coverage: Vec<Vec<N>>,
156}
157
158impl<'a, N: Num + NumCast + NumAssignOps + Copy> BinnedCoverage<'a, N> {
159    pub fn new(intervals: &'a GIntervalIndexSet, bin_size: u64) -> Self {
160        let coverage: Vec<Vec<N>> = intervals
161            .iter()
162            .map(|x| vec![N::zero(); x.len().div_ceil(bin_size) as usize])
163            .collect();
164        let len = coverage.iter().map(|x| x.len()).sum();
165        Self {
166            intervals,
167            len,
168            bin_size,
169            coverage,
170            consumed_tags: 0.0,
171        }
172    }
173
174    pub fn len(&self) -> usize {
175        self.len
176    }
177
178    pub fn total_count(&self) -> f64 {
179        self.consumed_tags
180    }
181
182    pub fn reset(&mut self) {
183        self.consumed_tags = 0.0;
184        self.coverage.iter_mut().for_each(|x| x.fill(N::zero()));
185    }
186
187    pub fn insert<D>(&mut self, tag: &D, count: N)
188    where
189        D: BEDLike,
190    {
191        self.consumed_tags += <f64 as NumCast>::from(count).unwrap();
192        self.intervals.find_full(tag).for_each(|(region, out_idx)| {
193            let i = tag
194                .start()
195                .saturating_sub(region.start())
196                .div_floor(&self.bin_size);
197            let j = (tag.end() - 1 - region.start())
198                .min(region.len() - 1)
199                .div_floor(&self.bin_size);
200            (i..=j).for_each(|in_idx| self.coverage[*out_idx][in_idx as usize] += count);
201        });
202    }
203
204    pub fn regions(&self) -> impl Iterator<Item = impl Iterator<Item = GenomicRange> + '_> + '_ {
205        self.intervals.iter().map(|x| x.split_by_len(self.bin_size))
206    }
207
208    pub fn get_coverage(&self) -> &Vec<Vec<N>> {
209        &self.coverage
210    }
211}
212
213#[derive(Debug, Clone)]
214pub struct SparseBinnedCoverage<'a, N> {
215    pub len: usize,
216    pub bin_size: u64,
217    pub consumed_tags: f64,
218    intervals: &'a GIntervalIndexSet,
219    accu_size: Vec<usize>,
220    coverage: BTreeMap<usize, N>,
221}
222
223impl<'a, N: Num + NumCast + NumAssignOps + Copy> SparseBinnedCoverage<'a, N> {
224    pub fn new(intervals: &'a GIntervalIndexSet, bin_size: u64) -> Self {
225        let mut len = 0;
226        let accu_size = intervals
227            .iter()
228            .map(|x| {
229                let n = x.len().div_ceil(bin_size) as usize;
230                let output = len;
231                len += n;
232                output
233            })
234            .collect();
235        Self {
236            len,
237            bin_size,
238            consumed_tags: 0.0,
239            intervals,
240            accu_size,
241            coverage: BTreeMap::new(),
242        }
243    }
244
245    pub fn len(&self) -> usize {
246        self.len
247    }
248
249    pub fn total_count(&self) -> f64 {
250        self.consumed_tags
251    }
252
253    pub fn reset(&mut self) {
254        self.consumed_tags = 0.0;
255        self.coverage = BTreeMap::new();
256    }
257
258    pub fn insert<D>(&mut self, tag: &D, count: N)
259    where
260        D: BEDLike,
261    {
262        self.consumed_tags += <f64 as NumCast>::from(count).unwrap();
263        self.intervals.find_full(tag).for_each(|(region, out_idx)| {
264            let i = tag
265                .start()
266                .saturating_sub(region.start())
267                .div_floor(&self.bin_size);
268            let j = (tag.end() - 1 - region.start())
269                .min(region.len() - 1)
270                .div_floor(&self.bin_size);
271            let n = self.accu_size[*out_idx];
272            (i..=j).for_each(|in_idx| {
273                let counter = self
274                    .coverage
275                    .entry(n + in_idx as usize)
276                    .or_insert(N::zero());
277                *counter += count;
278            });
279        });
280    }
281
282    pub fn regions(&self) -> impl Iterator<Item = impl Iterator<Item = GenomicRange> + '_> + '_ {
283        self.intervals.iter().map(|x| x.split_by_len(self.bin_size))
284    }
285
286    pub fn get_chrom(&self, index: usize) -> Option<&str> {
287        match self.accu_size.binary_search(&index) {
288            Ok(j) => {
289                if j < self.len {
290                    Some(self.intervals[j].chrom())
291                } else {
292                    None
293                }
294            }
295            Err(j) => {
296                if j - 1 < self.len {
297                    Some(self.intervals[j - 1].chrom())
298                } else {
299                    None
300                }
301            }
302        }
303    }
304
305    pub fn get_region(&self, index: usize) -> Option<GenomicRange> {
306        let region = match self.accu_size.binary_search(&index) {
307            Ok(j) => {
308                if j < self.len {
309                    let site = &self.intervals[j];
310                    let chr = site.chrom();
311                    let start = site.start();
312                    let end = (start + self.bin_size).min(site.end());
313                    Some(GenomicRange::new(chr, start, end))
314                } else {
315                    None
316                }
317            }
318            Err(j) => {
319                if j - 1 < self.len {
320                    let site = &self.intervals[j - 1];
321                    let chr = site.chrom();
322                    let prev = self.accu_size[j - 1];
323                    let start = site.start() + ((index - prev) as u64) * self.bin_size;
324                    let end = (start + self.bin_size).min(site.end());
325                    Some(GenomicRange::new(chr, start, end))
326                } else {
327                    None
328                }
329            }
330        };
331        region
332    }
333
334    pub fn get_coverage(&self) -> &BTreeMap<usize, N> {
335        &self.coverage
336    }
337
338    pub fn get_coverage_as_vec(&self) -> Vec<N> {
339        let mut coverage = vec![N::zero(); self.len];
340        self.coverage
341            .iter()
342            .for_each(|(idx, v)| coverage[*idx] = *v);
343        coverage
344    }
345}
346
347#[cfg(test)]
348mod bed_intersect_tests {
349    use super::*;
350    use crate::bed::{BedGraph, MergeBed};
351
352    use itertools::Itertools;
353    use rand::{thread_rng, Rng};
354
355    fn rand_bedgraph(chr: &str) -> BedGraph<f32> {
356        let mut rng = thread_rng();
357        let n: u64 = rng.gen_range(0..10000);
358        let l: u64 = rng.gen_range(5..50);
359        BedGraph::new(chr, n, n + l, 1.0)
360    }
361
362    #[test]
363    fn test_coverage() {
364        let regions: GIntervalIndexSet = vec![
365            GenomicRange::new("chr1".to_string(), 200, 500),
366            GenomicRange::new("chr1".to_string(), 1000, 2000),
367            GenomicRange::new("chr1".to_string(), 10000, 11000),
368            GenomicRange::new("chr10".to_string(), 10, 20),
369            GenomicRange::new("chr1".to_string(), 200, 500),
370        ]
371        .into_iter()
372        .collect();
373        let tags: Vec<GenomicRange> = vec![
374            GenomicRange::new("chr1".to_string(), 100, 210),
375            GenomicRange::new("chr1".to_string(), 100, 500),
376            GenomicRange::new("chr1".to_string(), 100, 5000),
377            GenomicRange::new("chr1".to_string(), 100, 200),
378            GenomicRange::new("chr1".to_string(), 1000, 1001),
379        ];
380
381        let mut cov1 = Coverage::new(&regions);
382        tags.iter().for_each(|x| cov1.insert(x, 1));
383        let result1: Vec<u64> = cov1.get_coverage().to_vec();
384        let mut cov2 = SparseCoverage::new(&regions);
385        tags.iter().for_each(|x| cov2.insert(x, 1));
386        let result2: Vec<u64> = cov2.get_coverage_as_vec();
387
388        assert_eq!(result1, result2);
389        assert_eq!(result1, vec![3, 2, 0, 0, 3]);
390
391        let mut cov3 = BinnedCoverage::new(&regions, 100);
392        tags.iter().for_each(|x| cov3.insert(x, 1));
393        let result3: Vec<u64> = cov3.get_coverage().iter().flatten().map(|x| *x).collect();
394        let mut cov4 = SparseBinnedCoverage::new(&regions, 100);
395        tags.iter().for_each(|x| cov4.insert(x, 1));
396        let result4: Vec<u64> = cov4.get_coverage_as_vec();
397
398        assert_eq!(result3, result4);
399        assert_eq!(
400            result3,
401            vec![3, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 2,]
402        );
403    }
404
405    #[test]
406    fn test_sparse_coverage() {
407        let regions = [
408            GenomicRange::new("chr1", 0, 2000),
409            GenomicRange::new("chr2", 100, 2100),
410            GenomicRange::new("chr3", 3000, 3500),
411        ]
412        .into_iter()
413        .collect();
414        let mut cov = SparseBinnedCoverage::new(&regions, 400);
415
416        cov.insert(&GenomicRange::new("chr1", 3, 5), 1);
417        cov.insert(&GenomicRange::new("chr2", 0, 500), 1);
418        cov.insert(&GenomicRange::new("chr2", 0, 501), 1);
419        cov.insert(&GenomicRange::new("chr3", 3400, 3401), 1);
420        cov.insert(&GenomicRange::new("chr4", 0, 501), 1);
421        cov.insert(&GenomicRange::new("chr3", 0, 501), 1);
422
423        assert_eq!(
424            vec![(0, 1), (5, 2), (6, 1), (11, 1)],
425            cov.get_coverage()
426                .iter()
427                .map(|(i, x)| (*i, *x))
428                .collect::<Vec<(usize, u64)>>()
429        );
430
431        let expected: Vec<_> = cov
432            .regions()
433            .flatten()
434            .zip(cov.get_coverage_as_vec().iter())
435            .flat_map(|(region, x)| if *x == 0 { None } else { Some((region, *x)) })
436            .collect();
437        assert_eq!(
438            expected,
439            cov.get_coverage()
440                .iter()
441                .map(|(i, v)| (cov.get_region(*i).unwrap(), *v))
442                .collect::<Vec<_>>(),
443        );
444    }
445
446    #[test]
447    fn test_sparse_bin_coverage() {
448        fn get_bedgraph(cov: SparseBinnedCoverage<'_, f32>) -> Vec<BedGraph<f32>> {
449            let expected: Vec<_> = cov
450                .regions()
451                .flatten()
452                .zip(cov.get_coverage_as_vec().iter())
453                .flat_map(|(region, x)| {
454                    if *x == 0.0 {
455                        None
456                    } else {
457                        Some(BedGraph::from_bed(&region, *x))
458                    }
459                })
460                .collect();
461            let chunks = expected.into_iter().chunk_by(|x| x.value);
462            chunks
463                .into_iter()
464                .flat_map(|(_, groups)| {
465                    groups.into_iter().merge_sorted_bed_with(|beds| {
466                        let mut iter = beds.into_iter();
467                        let mut first = iter.next().unwrap();
468                        if let Some(last) = iter.last() {
469                            first.set_end(last.end());
470                        }
471                        first
472                    })
473                })
474                .collect()
475        }
476
477        fn clip(x: &mut BedGraph<f32>, bin_size: u64) {
478            if x.start() % bin_size != 0 {
479                x.set_start(x.start() - x.start() % bin_size);
480            }
481            if x.end() % bin_size != 0 {
482                x.set_end(x.end() + bin_size - x.end() % bin_size);
483            }
484        }
485
486        fn compare(expected: Vec<BedGraph<f32>>, actual: Vec<BedGraph<f32>>) {
487            let (mis_a, mis_b): (Vec<_>, Vec<_>) = expected
488                .into_iter()
489                .zip(actual)
490                .flat_map(|(a, b)| if a != b { Some((a, b)) } else { None })
491                .unzip();
492            assert!(
493                mis_a.is_empty(),
494                "Bin_counting: {:?}\n\nMerging: {:?}",
495                &mis_a[..10],
496                &mis_b[..10]
497            );
498        }
499
500        let regions = [GenomicRange::new("chr1", 0, 1000000)]
501            .into_iter()
502            .collect();
503        let reads = (0..100000)
504            .map(|_| rand_bedgraph("chr1"))
505            .sorted_by(|a, b| a.compare(b))
506            .collect::<Vec<_>>();
507
508        let mut cov = SparseBinnedCoverage::new(&regions, 1);
509        reads.iter().for_each(|x| cov.insert(x, x.value));
510        compare(
511            get_bedgraph(cov),
512            reads.clone().into_iter().merge_sorted_bedgraph().collect(),
513        );
514
515        let mut cov = SparseBinnedCoverage::new(&regions, 71);
516        reads.iter().for_each(|x| cov.insert(x, x.value));
517        compare(
518            get_bedgraph(cov),
519            reads
520                .iter()
521                .map(|x| {
522                    let mut x = x.clone();
523                    clip(&mut x, 71);
524                    x
525                })
526                .merge_sorted_bedgraph()
527                .collect(),
528        );
529    }
530}