1use std::collections::BTreeMap;
2use num::Integer;
3use num_traits::{Num, NumAssignOps, NumCast};
4
5use crate::bed::{GenomicRange, BEDLike, map::GIntervalIndexSet};
6
7#[derive(Debug, Clone)]
8pub struct Coverage<'a, N> {
9 total_count: f64,
10 intervals: &'a GIntervalIndexSet,
11 coverage: Vec<N>,
12}
13
14impl <'a, N: Num + NumCast + NumAssignOps + Copy> Coverage<'a, N> {
15 pub fn new(intervals: &'a GIntervalIndexSet) -> Self {
16 Self {
17 total_count: 0.0,
18 intervals,
19 coverage: vec![N::zero(); intervals.len()],
20 }
21 }
22
23 pub fn len(&self) -> usize { self.coverage.len() }
24
25 pub fn total_count(&self) -> f64 { self.total_count }
26
27 pub fn reset(&mut self) {
28 self.total_count = 0.0;
29 self.coverage.fill(N::zero());
30 }
31
32 pub fn get_index<D>(&self, tag: &D) -> impl Iterator<Item = usize> + '_
33 where
34 D: BEDLike,
35 {
36 self.intervals.find_index_of(tag)
37 }
38
39 pub fn insert<D>(&mut self, tag: &D, multiplicity: N)
40 where
41 D: BEDLike,
42 {
43 self.total_count += <f64 as NumCast>::from(multiplicity).unwrap();
44 self.intervals.find_index_of(tag).for_each(|idx| self.coverage[idx] += multiplicity);
45 }
46
47 pub fn insert_at_index<D>(&mut self, index: usize, count: N)
48 {
49 self.total_count += <f64 as NumCast>::from(count).unwrap();
50 self.coverage[index] += count;
51 }
52
53 pub fn regions(&self) -> impl Iterator<Item = &GenomicRange> + '_
54 {
55 self.intervals.iter()
56 }
57
58 pub fn get_region(&self, index: usize) -> Option<&GenomicRange> {
59 self.intervals.get(index)
60 }
61
62 pub fn get_coverage(&self) -> &Vec<N> { &self.coverage }
63}
64
65#[derive(Clone)]
66pub struct SparseCoverage<'a, N> {
67 total_count: f64,
68 intervals: &'a GIntervalIndexSet,
69 coverage: BTreeMap<usize, N>,
70}
71
72impl <'a, N: Num + NumCast + NumAssignOps + Copy> SparseCoverage<'a, N> {
73 pub fn new(intervals: &'a GIntervalIndexSet) -> Self {
74 Self {
75 total_count: 0.0,
76 intervals,
77 coverage: BTreeMap::new(),
78 }
79 }
80
81 pub fn len(&self) -> usize { self.intervals.len() }
82
83 pub fn total_count(&self) -> f64 { self.total_count }
84
85 pub fn reset(&mut self) {
86 self.total_count = 0.0;
87 self.coverage = BTreeMap::new();
88 }
89
90 pub fn get_index<D>(&self, tag: &D) -> impl Iterator<Item = usize> + '_
91 where
92 D: BEDLike,
93 {
94 self.intervals.find_index_of(tag)
95 }
96
97 pub fn insert<D>(&mut self, tag: &D, count: N)
98 where
99 D: BEDLike,
100 {
101 self.total_count += <f64 as NumCast>::from(count).unwrap();
102 self.intervals.find_index_of(tag).for_each(|idx| {
103 self.coverage.entry(idx).and_modify(|x| *x += count).or_insert(count);
104 });
105 }
106
107 pub fn insert_at_index<D>(&mut self, index: usize, count: N)
108 {
109 self.total_count += <f64 as NumCast>::from(count).unwrap();
110 self.coverage.entry(index).and_modify(|x| *x += count).or_insert(count);
111 }
112
113 pub fn regions(&self) -> impl Iterator<Item = &GenomicRange> + '_ {
114 self.intervals.iter()
115 }
116
117 pub fn get_region(&self, index: usize) -> Option<&GenomicRange> {
118 self.intervals.get(index)
119 }
120
121 pub fn get_coverage(&self) -> &BTreeMap<usize, N> { &self.coverage }
122
123 pub fn get_coverage_as_vec(&self) -> Vec<N> {
124 let mut coverage = vec![N::zero(); self.intervals.len()];
125 self.coverage.iter().for_each(|(idx, v)| coverage[*idx] = *v);
126 coverage
127 }
128}
129
130#[derive(Debug, Clone)]
131pub struct BinnedCoverage<'a, N> {
132 len: usize,
133 bin_size: u64,
134 consumed_tags: f64,
135 intervals: &'a GIntervalIndexSet,
136 coverage: Vec<Vec<N>>,
137}
138
139impl <'a, N: Num + NumCast + NumAssignOps + Copy> BinnedCoverage<'a, N> {
140 pub fn new(intervals: &'a GIntervalIndexSet, bin_size: u64) -> Self {
141 let coverage: Vec<Vec<N>> = intervals.iter()
142 .map(|x| vec![N::zero(); x.len().div_ceil(bin_size) as usize])
143 .collect();
144 let len = coverage.iter().map(|x| x.len()).sum();
145 Self {intervals, len, bin_size, coverage, consumed_tags: 0.0}
146 }
147
148 pub fn len(&self) -> usize { self.len }
149
150 pub fn total_count(&self) -> f64 { self.consumed_tags }
151
152 pub fn reset(&mut self) {
153 self.consumed_tags = 0.0;
154 self.coverage.iter_mut().for_each(|x| x.fill(N::zero()));
155 }
156
157 pub fn insert<D>(&mut self, tag: &D, count: N)
158 where
159 D: BEDLike,
160 {
161 self.consumed_tags += <f64 as NumCast>::from(count).unwrap();
162 self.intervals.find_full(tag).for_each(|(region, out_idx)| {
163 let i = tag.start().saturating_sub(region.start()).div_floor(&self.bin_size);
164 let j = (tag.end() - 1 - region.start())
165 .min(region.len() - 1).div_floor(&self.bin_size);
166 (i..=j).for_each(|in_idx| self.coverage[*out_idx][in_idx as usize] += count);
167 });
168 }
169
170 pub fn regions(&self) -> impl Iterator<Item = impl Iterator<Item = GenomicRange> + '_> + '_
171 {
172 self.intervals.iter()
173 .map(|x| x.split_by_len(self.bin_size))
174 }
175
176 pub fn get_coverage(&self) -> &Vec<Vec<N>> { &self.coverage }
177}
178
179#[derive(Debug, Clone)]
180pub struct SparseBinnedCoverage<'a, N> {
181 pub len: usize,
182 pub bin_size: u64,
183 pub consumed_tags: f64,
184 intervals: &'a GIntervalIndexSet,
185 accu_size: Vec<usize>,
186 coverage: BTreeMap<usize, N>,
187}
188
189impl <'a, N: Num + NumCast + NumAssignOps + Copy> SparseBinnedCoverage<'a, N> {
190 pub fn new(intervals: &'a GIntervalIndexSet, bin_size: u64) -> Self {
191 let mut len = 0;
192 let accu_size = intervals.iter().map(|x| {
193 let n = x.len().div_ceil(bin_size) as usize;
194 let output = len;
195 len += n;
196 output
197 }).collect();
198 Self {
199 len, bin_size, consumed_tags: 0.0, intervals, accu_size,
200 coverage: BTreeMap::new()
201 }
202 }
203
204 pub fn len(&self) -> usize { self.len }
205
206 pub fn total_count(&self) -> f64 { self.consumed_tags }
207
208 pub fn reset(&mut self) {
209 self.consumed_tags = 0.0;
210 self.coverage = BTreeMap::new();
211 }
212
213 pub fn insert<D>(&mut self, tag: &D, count: N)
214 where
215 D: BEDLike,
216 {
217 self.consumed_tags += <f64 as NumCast>::from(count).unwrap();
218 self.intervals.find_full(tag).for_each(|(region, out_idx)| {
219 let i = tag.start().saturating_sub(region.start()).div_floor(&self.bin_size);
220 let j = (tag.end() - 1 - region.start())
221 .min(region.len() - 1).div_floor(&self.bin_size);
222 let n = self.accu_size[*out_idx];
223 (i..=j).for_each(|in_idx| {
224 let counter = self.coverage.entry(n + in_idx as usize).or_insert(N::zero());
225 *counter += count;
226 });
227 });
228 }
229
230 pub fn regions(&self) -> impl Iterator<Item = impl Iterator<Item = GenomicRange> + '_> + '_ {
231 self.intervals.iter()
232 .map(|x| x.split_by_len(self.bin_size))
233 }
234
235 pub fn get_chrom(&self, index: usize) -> Option<&str> {
236 match self.accu_size.binary_search(&index) {
237 Ok(j) => {
238 if j < self.len {
239 Some(self.intervals[j].chrom())
240 } else {
241 None
242 }
243 },
244 Err(j) => {
245 if j - 1 < self.len {
246 Some(self.intervals[j-1].chrom())
247 } else {
248 None
249 }
250 },
251 }
252 }
253
254 pub fn get_region(&self, index: usize) -> Option<GenomicRange> {
255 let region = match self.accu_size.binary_search(&index) {
256 Ok(j) => {
257 if j < self.len {
258 let site = &self.intervals[j];
259 let chr = site.chrom();
260 let start = site.start();
261 let end = (start + self.bin_size).min(site.end());
262 Some(GenomicRange::new(chr, start, end))
263 } else {
264 None
265 }
266 },
267 Err(j) => {
268 if j - 1 < self.len {
269 let site = &self.intervals[j-1];
270 let chr = site.chrom();
271 let prev = self.accu_size[j-1];
272 let start = site.start() + ((index - prev) as u64) * self.bin_size;
273 let end = (start + self.bin_size).min(site.end());
274 Some(GenomicRange::new(chr, start, end))
275 } else {
276 None
277 }
278 }
279 };
280 region
281 }
282
283 pub fn get_coverage(&self) -> &BTreeMap<usize, N> { &self.coverage }
284
285 pub fn get_coverage_as_vec(&self) -> Vec<N> {
286 let mut coverage = vec![N::zero(); self.len];
287 self.coverage.iter().for_each(|(idx, v)| coverage[*idx] = *v);
288 coverage
289 }
290}
291
292#[cfg(test)]
293mod bed_intersect_tests {
294 use crate::bed::{merge_sorted_bed_with, merge_sorted_bedgraph, BedGraph};
295 use super::*;
296
297 use itertools::Itertools;
298 use rand::{thread_rng, Rng};
299
300 fn rand_bedgraph(chr: &str) -> BedGraph<f32> {
301 let mut rng = thread_rng();
302 let n: u64 = rng.gen_range(0..10000);
303 let l: u64 = rng.gen_range(5..50);
304 BedGraph::new(chr, n, n + l, 1.0)
305 }
306
307 #[test]
308 fn test_coverage() {
309 let regions: GIntervalIndexSet = vec![
310 GenomicRange::new("chr1".to_string(), 200, 500),
311 GenomicRange::new("chr1".to_string(), 1000, 2000),
312 GenomicRange::new("chr1".to_string(), 10000, 11000),
313 GenomicRange::new("chr10".to_string(), 10, 20),
314 GenomicRange::new("chr1".to_string(), 200, 500),
315 ].into_iter().collect();
316 let tags: Vec<GenomicRange> = vec![
317 GenomicRange::new("chr1".to_string(), 100, 210),
318 GenomicRange::new("chr1".to_string(), 100, 500),
319 GenomicRange::new("chr1".to_string(), 100, 5000),
320 GenomicRange::new("chr1".to_string(), 100, 200),
321 GenomicRange::new("chr1".to_string(), 1000, 1001),
322 ];
323
324 let mut cov1 = Coverage::new(®ions);
325 tags.iter().for_each(|x| cov1.insert(x, 1));
326 let result1: Vec<u64> = cov1.get_coverage().to_vec();
327 let mut cov2 = SparseCoverage::new(®ions);
328 tags.iter().for_each(|x| cov2.insert(x, 1));
329 let result2: Vec<u64> = cov2.get_coverage_as_vec();
330
331 assert_eq!(result1, result2);
332 assert_eq!(result1, vec![3, 2, 0, 0, 3]);
333
334 let mut cov3 = BinnedCoverage::new(®ions, 100);
335 tags.iter().for_each(|x| cov3.insert(x, 1));
336 let result3: Vec<u64> = cov3.get_coverage().iter().flatten().map(|x| *x).collect();
337 let mut cov4 = SparseBinnedCoverage::new(®ions, 100);
338 tags.iter().for_each(|x| cov4.insert(x, 1));
339 let result4: Vec<u64> = cov4.get_coverage_as_vec();
340
341 assert_eq!(result3, result4);
342 assert_eq!(
343 result3,
344 vec![
345 3, 2, 2,
346 2, 1, 1, 1, 1, 1, 1, 1, 1, 1,
347 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
348 0,
349 3, 2, 2,
350 ]
351 );
352 }
353
354 #[test]
355 fn test_sparse_coverage() {
356 let regions = [
357 GenomicRange::new("chr1", 0, 2000),
358 GenomicRange::new("chr2", 100, 2100),
359 GenomicRange::new("chr3", 3000, 3500),
360 ].into_iter().collect();
361 let mut cov = SparseBinnedCoverage::new(®ions, 400);
362
363 cov.insert(&GenomicRange::new("chr1", 3, 5), 1);
364 cov.insert(&GenomicRange::new("chr2", 0, 500), 1);
365 cov.insert(&GenomicRange::new("chr2", 0, 501), 1);
366 cov.insert(&GenomicRange::new("chr3", 3400, 3401), 1);
367 cov.insert(&GenomicRange::new("chr4", 0, 501), 1);
368 cov.insert(&GenomicRange::new("chr3", 0, 501), 1);
369
370 assert_eq!(
371 vec![(0, 1), (5, 2), (6, 1), (11, 1)],
372 cov.get_coverage().iter().map(|(i, x)| (*i, *x)).collect::<Vec<(usize, u64)>>()
373 );
374
375 let expected: Vec<_> = cov.regions().flatten().zip(
376 cov.get_coverage_as_vec().iter()
377 ).flat_map(|(region, x)| if *x == 0 {
378 None
379 } else {
380 Some((region, *x))
381 }).collect();
382 assert_eq!(
383 expected,
384 cov.get_coverage().iter().map(|(i, v)| (cov.get_region(*i).unwrap(), *v)).collect::<Vec<_>>(),
385 );
386 }
387
388 #[test]
389 fn test_sparse_bin_coverage() {
390 fn get_bedgraph(cov: SparseBinnedCoverage<'_, f32>) -> Vec<BedGraph<f32>> {
391 let expected: Vec<_> = cov.regions().flatten().zip(
392 cov.get_coverage_as_vec().iter()
393 ).flat_map(|(region, x)| if *x == 0.0 {
394 None
395 } else {
396 Some(BedGraph::from_bed(®ion, *x))
397 }).collect();
398 let chunks = expected.into_iter().chunk_by(|x| x.value);
399 chunks
400 .into_iter()
401 .flat_map(|(_, groups)| {
402 merge_sorted_bed_with(groups, |beds| {
403 let mut iter = beds.into_iter();
404 let mut first = iter.next().unwrap();
405 if let Some(last) = iter.last() {
406 first.set_end(last.end());
407 }
408 first
409 })
410 })
411 .collect()
412 }
413
414 fn clip(x: &mut BedGraph<f32>, bin_size: u64) {
415 if x.start() % bin_size != 0 {
416 x.set_start(x.start() - x.start() % bin_size);
417 }
418 if x.end() % bin_size != 0 {
419 x.set_end(x.end() + bin_size - x.end() % bin_size);
420 }
421 }
422
423 fn compare(expected: Vec<BedGraph<f32>>, actual: Vec<BedGraph<f32>>) {
424 let (mis_a, mis_b): (Vec<_>, Vec<_>) = expected.into_iter().zip(actual).flat_map(|(a, b)| {
425 if a != b {
426 Some((a,b))
427 } else {
428 None
429 }
430 }).unzip();
431 assert!(mis_a.is_empty(), "Bin_counting: {:?}\n\nMerging: {:?}", &mis_a[..10], &mis_b[..10]);
432 }
433
434 let regions = [
435 GenomicRange::new("chr1", 0, 1000000),
436 ].into_iter().collect();
437 let reads = (0..100000).map(|_| rand_bedgraph("chr1"))
438 .sorted_by(|a, b| a.compare(b))
439 .collect::<Vec<_>>();
440
441 let mut cov = SparseBinnedCoverage::new(®ions, 1);
442 reads.iter().for_each(|x| cov.insert(x, x.value));
443 compare(get_bedgraph(cov), merge_sorted_bedgraph(reads.clone()).collect());
444
445 let mut cov = SparseBinnedCoverage::new(®ions, 71);
446 reads.iter().for_each(|x| cov.insert(x, x.value));
447 compare(
448 get_bedgraph(cov),
449 merge_sorted_bedgraph(
450 reads.iter().map(|x| {
451 let mut x = x.clone();
452 clip(&mut x, 71);
453 x
454 }).collect::<Vec<_>>()
455 ).collect()
456 );
457 }
458}