bed_utils/
coverage.rs

1use std::collections::BTreeMap;
2use num::Integer;
3use num_traits::{Num, NumAssignOps, NumCast};
4
5use crate::bed::{GenomicRange, BEDLike, map::GIntervalIndexSet};
6
7#[derive(Debug, Clone)]
8pub struct Coverage<'a, N> {
9    total_count: f64,
10    intervals: &'a GIntervalIndexSet,
11    coverage: Vec<N>,
12}
13
14impl <'a, N: Num + NumCast + NumAssignOps + Copy> Coverage<'a, N> {
15    pub fn new(intervals: &'a GIntervalIndexSet) -> Self {
16        Self {
17            total_count: 0.0,
18            intervals,
19            coverage: vec![N::zero(); intervals.len()],
20        }
21    }
22
23    pub fn len(&self) -> usize { self.coverage.len() }
24
25    pub fn total_count(&self) -> f64 { self.total_count }
26
27    pub fn reset(&mut self) {
28        self.total_count = 0.0;
29        self.coverage.fill(N::zero());
30    }
31
32    pub fn get_index<D>(&self, tag: &D) -> impl Iterator<Item = usize> + '_
33    where
34        D: BEDLike,
35    {
36        self.intervals.find_index_of(tag)
37    }
38
39    pub fn insert<D>(&mut self, tag: &D, multiplicity: N)
40    where
41        D: BEDLike,
42    {
43        self.total_count += <f64 as NumCast>::from(multiplicity).unwrap();
44        self.intervals.find_index_of(tag).for_each(|idx| self.coverage[idx] += multiplicity);
45    }
46
47    pub fn insert_at_index<D>(&mut self, index: usize, count: N)
48    {
49        self.total_count += <f64 as NumCast>::from(count).unwrap();
50        self.coverage[index] += count;
51    }
52
53    pub fn regions(&self) -> impl Iterator<Item = &GenomicRange> + '_
54    {
55        self.intervals.iter()
56    }
57
58    pub fn get_region(&self, index: usize) -> Option<&GenomicRange> {
59        self.intervals.get(index)
60    }
61
62    pub fn get_coverage(&self) -> &Vec<N> { &self.coverage }
63}
64
65#[derive(Clone)]
66pub struct SparseCoverage<'a, N> {
67    total_count: f64,
68    intervals: &'a GIntervalIndexSet,
69    coverage: BTreeMap<usize, N>,
70}
71
72impl <'a, N: Num + NumCast + NumAssignOps + Copy> SparseCoverage<'a, N> {
73    pub fn new(intervals: &'a GIntervalIndexSet) -> Self {
74        Self {
75            total_count: 0.0,
76            intervals,
77            coverage: BTreeMap::new(),
78        }
79    }
80
81    pub fn len(&self) -> usize { self.intervals.len() }
82
83    pub fn total_count(&self) -> f64 { self.total_count }
84
85    pub fn reset(&mut self) {
86        self.total_count = 0.0;
87        self.coverage = BTreeMap::new();
88    }
89
90    pub fn get_index<D>(&self, tag: &D) -> impl Iterator<Item = usize> + '_
91    where
92        D: BEDLike,
93    {
94        self.intervals.find_index_of(tag)
95    }
96
97    pub fn insert<D>(&mut self, tag: &D, count: N)
98    where
99        D: BEDLike,
100    {
101        self.total_count += <f64 as NumCast>::from(count).unwrap();
102        self.intervals.find_index_of(tag).for_each(|idx| {
103            self.coverage.entry(idx).and_modify(|x| *x += count).or_insert(count);
104        });
105    }
106
107    pub fn insert_at_index<D>(&mut self, index: usize, count: N)
108    {
109        self.total_count += <f64 as NumCast>::from(count).unwrap();
110        self.coverage.entry(index).and_modify(|x| *x += count).or_insert(count);
111    }
112
113    pub fn regions(&self) -> impl Iterator<Item = &GenomicRange> + '_ {
114        self.intervals.iter()
115    }
116
117    pub fn get_region(&self, index: usize) -> Option<&GenomicRange> {
118        self.intervals.get(index)
119    }
120
121    pub fn get_coverage(&self) -> &BTreeMap<usize, N> { &self.coverage }
122
123    pub fn get_coverage_as_vec(&self) -> Vec<N> {
124        let mut coverage = vec![N::zero(); self.intervals.len()];
125        self.coverage.iter().for_each(|(idx, v)| coverage[*idx] = *v);
126        coverage
127    }
128}
129
130#[derive(Debug, Clone)]
131pub struct BinnedCoverage<'a, N> {
132    len: usize,
133    bin_size: u64,
134    consumed_tags: f64,
135    intervals: &'a GIntervalIndexSet,
136    coverage: Vec<Vec<N>>,
137}
138
139impl <'a, N: Num + NumCast + NumAssignOps + Copy> BinnedCoverage<'a, N> {
140    pub fn new(intervals: &'a GIntervalIndexSet, bin_size: u64) -> Self {
141        let coverage: Vec<Vec<N>> = intervals.iter()
142            .map(|x| vec![N::zero(); x.len().div_ceil(bin_size) as usize])
143            .collect();
144        let len = coverage.iter().map(|x| x.len()).sum();
145        Self {intervals, len, bin_size, coverage, consumed_tags: 0.0}
146    }
147
148    pub fn len(&self) -> usize { self.len }
149
150    pub fn total_count(&self) -> f64 { self.consumed_tags }
151
152    pub fn reset(&mut self) {
153        self.consumed_tags = 0.0;
154        self.coverage.iter_mut().for_each(|x| x.fill(N::zero()));
155    }
156
157    pub fn insert<D>(&mut self, tag: &D, count: N)
158    where
159        D: BEDLike,
160    {
161        self.consumed_tags += <f64 as NumCast>::from(count).unwrap();
162        self.intervals.find_full(tag).for_each(|(region, out_idx)| {
163            let i = tag.start().saturating_sub(region.start()).div_floor(&self.bin_size);
164            let j = (tag.end() - 1 - region.start())
165                .min(region.len() - 1).div_floor(&self.bin_size);
166            (i..=j).for_each(|in_idx| self.coverage[*out_idx][in_idx as usize] += count);
167        });
168    }
169
170    pub fn regions(&self) -> impl Iterator<Item = impl Iterator<Item = GenomicRange> + '_> + '_
171    {
172        self.intervals.iter()
173            .map(|x| x.split_by_len(self.bin_size))
174    }
175
176    pub fn get_coverage(&self) -> &Vec<Vec<N>> { &self.coverage }
177}
178
179#[derive(Debug, Clone)]
180pub struct SparseBinnedCoverage<'a, N> {
181    pub len: usize,
182    pub bin_size: u64,
183    pub consumed_tags: f64,
184    intervals: &'a GIntervalIndexSet,
185    accu_size: Vec<usize>,
186    coverage: BTreeMap<usize, N>,
187}
188
189impl <'a, N: Num + NumCast + NumAssignOps + Copy> SparseBinnedCoverage<'a, N> {
190    pub fn new(intervals: &'a GIntervalIndexSet, bin_size: u64) -> Self {
191        let mut len = 0;
192        let accu_size = intervals.iter().map(|x| {
193            let n = x.len().div_ceil(bin_size) as usize;
194            let output = len;
195            len += n;
196            output
197        }).collect();
198        Self {
199            len, bin_size, consumed_tags: 0.0, intervals, accu_size,
200            coverage: BTreeMap::new()
201        }
202    }
203
204    pub fn len(&self) -> usize { self.len }
205
206    pub fn total_count(&self) -> f64 { self.consumed_tags }
207
208    pub fn reset(&mut self) {
209        self.consumed_tags = 0.0;
210        self.coverage = BTreeMap::new();
211    }
212
213    pub fn insert<D>(&mut self, tag: &D, count: N)
214    where
215        D: BEDLike,
216    {
217        self.consumed_tags += <f64 as NumCast>::from(count).unwrap();
218        self.intervals.find_full(tag).for_each(|(region, out_idx)| {
219            let i = tag.start().saturating_sub(region.start()).div_floor(&self.bin_size);
220            let j = (tag.end() - 1 - region.start())
221                .min(region.len() - 1).div_floor(&self.bin_size);
222            let n = self.accu_size[*out_idx];
223            (i..=j).for_each(|in_idx| {
224                let counter = self.coverage.entry(n + in_idx as usize).or_insert(N::zero());
225                *counter += count;
226            });
227        });
228    }
229
230    pub fn regions(&self) -> impl Iterator<Item = impl Iterator<Item = GenomicRange> + '_> + '_ {
231        self.intervals.iter()
232            .map(|x| x.split_by_len(self.bin_size))
233    }
234
235    pub fn get_chrom(&self, index: usize) -> Option<&str> {
236        match self.accu_size.binary_search(&index) {
237            Ok(j) => {
238                if j < self.len {
239                    Some(self.intervals[j].chrom())
240                } else {
241                    None
242                }
243            },
244            Err(j) => {
245                if j - 1 < self.len {
246                    Some(self.intervals[j-1].chrom())
247                } else {
248                    None
249                }
250            },
251        }
252    }
253
254    pub fn get_region(&self, index: usize) -> Option<GenomicRange> {
255        let region = match self.accu_size.binary_search(&index) {
256            Ok(j) => {
257                if j < self.len {
258                    let site = &self.intervals[j];
259                    let chr = site.chrom();
260                    let start = site.start();
261                    let end = (start + self.bin_size).min(site.end());
262                    Some(GenomicRange::new(chr, start, end))
263                } else {
264                    None
265                }
266            },
267            Err(j) => {
268                if j - 1 < self.len {
269                    let site = &self.intervals[j-1];
270                    let chr = site.chrom();
271                    let prev = self.accu_size[j-1];
272                    let start = site.start() + ((index - prev) as u64) * self.bin_size;
273                    let end = (start + self.bin_size).min(site.end());
274                    Some(GenomicRange::new(chr, start, end))
275                } else {
276                    None
277                }
278            }
279        };
280        region
281    }
282
283    pub fn get_coverage(&self) -> &BTreeMap<usize, N> { &self.coverage }
284
285    pub fn get_coverage_as_vec(&self) -> Vec<N> {
286        let mut coverage = vec![N::zero(); self.len];
287        self.coverage.iter().for_each(|(idx, v)| coverage[*idx] = *v);
288        coverage
289    }
290}
291
292#[cfg(test)]
293mod bed_intersect_tests {
294    use crate::bed::{merge_sorted_bed_with, merge_sorted_bedgraph, BedGraph};
295    use super::*;
296
297    use itertools::Itertools;
298    use rand::{thread_rng, Rng};
299
300    fn rand_bedgraph(chr: &str) -> BedGraph<f32> {
301        let mut rng = thread_rng();
302        let n: u64 = rng.gen_range(0..10000);
303        let l: u64 = rng.gen_range(5..50);
304        BedGraph::new(chr, n, n + l, 1.0)
305    }
306
307    #[test]
308    fn test_coverage() {
309        let regions: GIntervalIndexSet = vec![
310            GenomicRange::new("chr1".to_string(), 200, 500),
311            GenomicRange::new("chr1".to_string(), 1000, 2000),
312            GenomicRange::new("chr1".to_string(), 10000, 11000),
313            GenomicRange::new("chr10".to_string(), 10, 20),
314            GenomicRange::new("chr1".to_string(), 200, 500),
315        ].into_iter().collect();
316        let tags: Vec<GenomicRange> = vec![
317            GenomicRange::new("chr1".to_string(), 100, 210),
318            GenomicRange::new("chr1".to_string(), 100, 500),
319            GenomicRange::new("chr1".to_string(), 100, 5000),
320            GenomicRange::new("chr1".to_string(), 100, 200),
321            GenomicRange::new("chr1".to_string(), 1000, 1001),
322        ];
323
324        let mut cov1 = Coverage::new(&regions);
325        tags.iter().for_each(|x| cov1.insert(x, 1));
326        let result1: Vec<u64> = cov1.get_coverage().to_vec();
327        let mut cov2 = SparseCoverage::new(&regions);
328        tags.iter().for_each(|x| cov2.insert(x, 1));
329        let result2: Vec<u64> = cov2.get_coverage_as_vec();
330
331        assert_eq!(result1, result2);
332        assert_eq!(result1, vec![3, 2, 0, 0, 3]);
333
334        let mut cov3 = BinnedCoverage::new(&regions, 100);
335        tags.iter().for_each(|x| cov3.insert(x, 1));
336        let result3: Vec<u64> = cov3.get_coverage().iter().flatten().map(|x| *x).collect();
337        let mut cov4 = SparseBinnedCoverage::new(&regions, 100);
338        tags.iter().for_each(|x| cov4.insert(x, 1));
339        let result4: Vec<u64> = cov4.get_coverage_as_vec();
340
341        assert_eq!(result3, result4);
342        assert_eq!(
343            result3,
344            vec![
345                3, 2, 2,
346                2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
347                0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
348                0,
349                3, 2, 2,
350            ]
351        );
352    }
353
354    #[test]
355    fn test_sparse_coverage() {
356        let regions = [
357            GenomicRange::new("chr1", 0, 2000),
358            GenomicRange::new("chr2", 100, 2100),
359            GenomicRange::new("chr3", 3000, 3500),
360            ].into_iter().collect();
361        let mut cov = SparseBinnedCoverage::new(&regions, 400);
362
363        cov.insert(&GenomicRange::new("chr1", 3, 5), 1);
364        cov.insert(&GenomicRange::new("chr2", 0, 500), 1);
365        cov.insert(&GenomicRange::new("chr2", 0, 501), 1);
366        cov.insert(&GenomicRange::new("chr3", 3400, 3401), 1);
367        cov.insert(&GenomicRange::new("chr4", 0, 501), 1);
368        cov.insert(&GenomicRange::new("chr3", 0, 501), 1);
369
370        assert_eq!(
371            vec![(0, 1), (5, 2), (6, 1), (11, 1)],
372            cov.get_coverage().iter().map(|(i, x)| (*i, *x)).collect::<Vec<(usize, u64)>>()
373        );
374
375        let expected: Vec<_> = cov.regions().flatten().zip(
376            cov.get_coverage_as_vec().iter()
377        ).flat_map(|(region, x)| if *x == 0 {
378            None
379        } else {
380            Some((region, *x))
381        }).collect();
382        assert_eq!(
383            expected,
384            cov.get_coverage().iter().map(|(i, v)| (cov.get_region(*i).unwrap(), *v)).collect::<Vec<_>>(),
385        );
386    }
387
388    #[test]
389    fn test_sparse_bin_coverage() {
390        fn get_bedgraph(cov: SparseBinnedCoverage<'_, f32>) -> Vec<BedGraph<f32>> {
391            let expected: Vec<_> = cov.regions().flatten().zip(
392                cov.get_coverage_as_vec().iter()
393            ).flat_map(|(region, x)| if *x == 0.0 {
394                None
395            } else {
396                Some(BedGraph::from_bed(&region, *x))
397            }).collect();
398            let chunks = expected.into_iter().chunk_by(|x| x.value);
399            chunks
400                .into_iter()
401                .flat_map(|(_, groups)| {
402                    merge_sorted_bed_with(groups, |beds| {
403                        let mut iter = beds.into_iter();
404                        let mut first = iter.next().unwrap();
405                        if let Some(last) = iter.last() {
406                            first.set_end(last.end());
407                        }
408                        first
409                    })
410                })
411                .collect()
412        }
413
414        fn clip(x: &mut BedGraph<f32>, bin_size: u64) {
415            if x.start() % bin_size != 0 {
416                x.set_start(x.start() - x.start() % bin_size);
417            }
418            if x.end() % bin_size != 0 {
419                x.set_end(x.end() + bin_size - x.end() % bin_size);
420            }
421        }
422
423        fn compare(expected: Vec<BedGraph<f32>>, actual: Vec<BedGraph<f32>>) {
424            let (mis_a, mis_b): (Vec<_>, Vec<_>) = expected.into_iter().zip(actual).flat_map(|(a, b)| {
425                if a != b {
426                    Some((a,b))
427                } else {
428                    None
429                }
430            }).unzip();
431            assert!(mis_a.is_empty(), "Bin_counting: {:?}\n\nMerging: {:?}", &mis_a[..10], &mis_b[..10]);
432        }
433
434        let regions = [
435            GenomicRange::new("chr1", 0, 1000000),
436            ].into_iter().collect();
437        let reads = (0..100000).map(|_| rand_bedgraph("chr1"))
438            .sorted_by(|a, b| a.compare(b))
439            .collect::<Vec<_>>();
440
441        let mut cov = SparseBinnedCoverage::new(&regions, 1);
442        reads.iter().for_each(|x| cov.insert(x, x.value));
443        compare(get_bedgraph(cov), merge_sorted_bedgraph(reads.clone()).collect());
444
445        let mut cov = SparseBinnedCoverage::new(&regions, 71);
446        reads.iter().for_each(|x| cov.insert(x, x.value));
447        compare(
448            get_bedgraph(cov),
449            merge_sorted_bedgraph(
450                reads.iter().map(|x| {
451                    let mut x = x.clone();
452                    clip(&mut x, 71);
453                    x
454                }).collect::<Vec<_>>()
455            ).collect()
456        );
457    }
458}