1use num::Integer;
2use num_traits::{Num, NumAssignOps, NumCast};
3use std::collections::BTreeMap;
4
5use crate::bed::{map::GIntervalIndexSet, BEDLike, GenomicRange};
6
7#[derive(Debug, Clone)]
8pub struct Coverage<'a, N> {
9 total_count: f64,
10 intervals: &'a GIntervalIndexSet,
11 coverage: Vec<N>,
12}
13
14impl<'a, N: Num + NumCast + NumAssignOps + Copy> Coverage<'a, N> {
15 pub fn new(intervals: &'a GIntervalIndexSet) -> Self {
16 Self {
17 total_count: 0.0,
18 intervals,
19 coverage: vec![N::zero(); intervals.len()],
20 }
21 }
22
23 pub fn len(&self) -> usize {
24 self.coverage.len()
25 }
26
27 pub fn total_count(&self) -> f64 {
28 self.total_count
29 }
30
31 pub fn reset(&mut self) {
32 self.total_count = 0.0;
33 self.coverage.fill(N::zero());
34 }
35
36 pub fn get_index<D>(&self, tag: &D) -> impl Iterator<Item = usize> + '_
37 where
38 D: BEDLike,
39 {
40 self.intervals.find_index_of(tag)
41 }
42
43 pub fn insert<D>(&mut self, tag: &D, multiplicity: N)
44 where
45 D: BEDLike,
46 {
47 self.total_count += <f64 as NumCast>::from(multiplicity).unwrap();
48 self.intervals
49 .find_index_of(tag)
50 .for_each(|idx| self.coverage[idx] += multiplicity);
51 }
52
53 pub fn insert_at_index<D>(&mut self, index: usize, count: N) {
54 self.total_count += <f64 as NumCast>::from(count).unwrap();
55 self.coverage[index] += count;
56 }
57
58 pub fn regions(&self) -> impl Iterator<Item = &GenomicRange> + '_ {
59 self.intervals.iter()
60 }
61
62 pub fn get_region(&self, index: usize) -> Option<&GenomicRange> {
63 self.intervals.get(index)
64 }
65
66 pub fn get_coverage(&self) -> &Vec<N> {
67 &self.coverage
68 }
69}
70
71#[derive(Clone)]
72pub struct SparseCoverage<'a, N> {
73 total_count: f64,
74 intervals: &'a GIntervalIndexSet,
75 coverage: BTreeMap<usize, N>,
76}
77
78impl<'a, N: Num + NumCast + NumAssignOps + Copy> SparseCoverage<'a, N> {
79 pub fn new(intervals: &'a GIntervalIndexSet) -> Self {
80 Self {
81 total_count: 0.0,
82 intervals,
83 coverage: BTreeMap::new(),
84 }
85 }
86
87 pub fn len(&self) -> usize {
88 self.intervals.len()
89 }
90
91 pub fn total_count(&self) -> f64 {
92 self.total_count
93 }
94
95 pub fn reset(&mut self) {
96 self.total_count = 0.0;
97 self.coverage = BTreeMap::new();
98 }
99
100 pub fn get_index<D>(&self, tag: &D) -> impl Iterator<Item = usize> + '_
101 where
102 D: BEDLike,
103 {
104 self.intervals.find_index_of(tag)
105 }
106
107 pub fn insert<D>(&mut self, tag: &D, count: N)
108 where
109 D: BEDLike,
110 {
111 self.total_count += <f64 as NumCast>::from(count).unwrap();
112 self.intervals.find_index_of(tag).for_each(|idx| {
113 self.coverage
114 .entry(idx)
115 .and_modify(|x| *x += count)
116 .or_insert(count);
117 });
118 }
119
120 pub fn insert_at_index<D>(&mut self, index: usize, count: N) {
121 self.total_count += <f64 as NumCast>::from(count).unwrap();
122 self.coverage
123 .entry(index)
124 .and_modify(|x| *x += count)
125 .or_insert(count);
126 }
127
128 pub fn regions(&self) -> impl Iterator<Item = &GenomicRange> + '_ {
129 self.intervals.iter()
130 }
131
132 pub fn get_region(&self, index: usize) -> Option<&GenomicRange> {
133 self.intervals.get(index)
134 }
135
136 pub fn get_coverage(&self) -> &BTreeMap<usize, N> {
137 &self.coverage
138 }
139
140 pub fn get_coverage_as_vec(&self) -> Vec<N> {
141 let mut coverage = vec![N::zero(); self.intervals.len()];
142 self.coverage
143 .iter()
144 .for_each(|(idx, v)| coverage[*idx] = *v);
145 coverage
146 }
147}
148
149#[derive(Debug, Clone)]
150pub struct BinnedCoverage<'a, N> {
151 len: usize,
152 bin_size: u64,
153 consumed_tags: f64,
154 intervals: &'a GIntervalIndexSet,
155 coverage: Vec<Vec<N>>,
156}
157
158impl<'a, N: Num + NumCast + NumAssignOps + Copy> BinnedCoverage<'a, N> {
159 pub fn new(intervals: &'a GIntervalIndexSet, bin_size: u64) -> Self {
160 let coverage: Vec<Vec<N>> = intervals
161 .iter()
162 .map(|x| vec![N::zero(); x.len().div_ceil(bin_size) as usize])
163 .collect();
164 let len = coverage.iter().map(|x| x.len()).sum();
165 Self {
166 intervals,
167 len,
168 bin_size,
169 coverage,
170 consumed_tags: 0.0,
171 }
172 }
173
174 pub fn len(&self) -> usize {
175 self.len
176 }
177
178 pub fn total_count(&self) -> f64 {
179 self.consumed_tags
180 }
181
182 pub fn reset(&mut self) {
183 self.consumed_tags = 0.0;
184 self.coverage.iter_mut().for_each(|x| x.fill(N::zero()));
185 }
186
187 pub fn insert<D>(&mut self, tag: &D, count: N)
188 where
189 D: BEDLike,
190 {
191 self.consumed_tags += <f64 as NumCast>::from(count).unwrap();
192 self.intervals.find_full(tag).for_each(|(region, out_idx)| {
193 let i = tag
194 .start()
195 .saturating_sub(region.start())
196 .div_floor(&self.bin_size);
197 let j = (tag.end() - 1 - region.start())
198 .min(region.len() - 1)
199 .div_floor(&self.bin_size);
200 (i..=j).for_each(|in_idx| self.coverage[*out_idx][in_idx as usize] += count);
201 });
202 }
203
204 pub fn regions(&self) -> impl Iterator<Item = impl Iterator<Item = GenomicRange> + '_> + '_ {
205 self.intervals.iter().map(|x| x.split_by_len(self.bin_size))
206 }
207
208 pub fn get_coverage(&self) -> &Vec<Vec<N>> {
209 &self.coverage
210 }
211}
212
213#[derive(Debug, Clone)]
214pub struct SparseBinnedCoverage<'a, N> {
215 pub len: usize,
216 pub bin_size: u64,
217 pub consumed_tags: f64,
218 intervals: &'a GIntervalIndexSet,
219 accu_size: Vec<usize>,
220 coverage: BTreeMap<usize, N>,
221}
222
223impl<'a, N: Num + NumCast + NumAssignOps + Copy> SparseBinnedCoverage<'a, N> {
224 pub fn new(intervals: &'a GIntervalIndexSet, bin_size: u64) -> Self {
225 let mut len = 0;
226 let accu_size = intervals
227 .iter()
228 .map(|x| {
229 let n = x.len().div_ceil(bin_size) as usize;
230 let output = len;
231 len += n;
232 output
233 })
234 .collect();
235 Self {
236 len,
237 bin_size,
238 consumed_tags: 0.0,
239 intervals,
240 accu_size,
241 coverage: BTreeMap::new(),
242 }
243 }
244
245 pub fn len(&self) -> usize {
246 self.len
247 }
248
249 pub fn total_count(&self) -> f64 {
250 self.consumed_tags
251 }
252
253 pub fn reset(&mut self) {
254 self.consumed_tags = 0.0;
255 self.coverage = BTreeMap::new();
256 }
257
258 pub fn insert<D>(&mut self, tag: &D, count: N)
259 where
260 D: BEDLike,
261 {
262 self.consumed_tags += <f64 as NumCast>::from(count).unwrap();
263 self.intervals.find_full(tag).for_each(|(region, out_idx)| {
264 let i = tag
265 .start()
266 .saturating_sub(region.start())
267 .div_floor(&self.bin_size);
268 let j = (tag.end() - 1 - region.start())
269 .min(region.len() - 1)
270 .div_floor(&self.bin_size);
271 let n = self.accu_size[*out_idx];
272 (i..=j).for_each(|in_idx| {
273 let counter = self
274 .coverage
275 .entry(n + in_idx as usize)
276 .or_insert(N::zero());
277 *counter += count;
278 });
279 });
280 }
281
282 pub fn regions(&self) -> impl Iterator<Item = impl Iterator<Item = GenomicRange> + '_> + '_ {
283 self.intervals.iter().map(|x| x.split_by_len(self.bin_size))
284 }
285
286 pub fn get_chrom(&self, index: usize) -> Option<&str> {
287 match self.accu_size.binary_search(&index) {
288 Ok(j) => {
289 if j < self.len {
290 Some(self.intervals[j].chrom())
291 } else {
292 None
293 }
294 }
295 Err(j) => {
296 if j - 1 < self.len {
297 Some(self.intervals[j - 1].chrom())
298 } else {
299 None
300 }
301 }
302 }
303 }
304
305 pub fn get_region(&self, index: usize) -> Option<GenomicRange> {
306 let region = match self.accu_size.binary_search(&index) {
307 Ok(j) => {
308 if j < self.len {
309 let site = &self.intervals[j];
310 let chr = site.chrom();
311 let start = site.start();
312 let end = (start + self.bin_size).min(site.end());
313 Some(GenomicRange::new(chr, start, end))
314 } else {
315 None
316 }
317 }
318 Err(j) => {
319 if j - 1 < self.len {
320 let site = &self.intervals[j - 1];
321 let chr = site.chrom();
322 let prev = self.accu_size[j - 1];
323 let start = site.start() + ((index - prev) as u64) * self.bin_size;
324 let end = (start + self.bin_size).min(site.end());
325 Some(GenomicRange::new(chr, start, end))
326 } else {
327 None
328 }
329 }
330 };
331 region
332 }
333
334 pub fn get_coverage(&self) -> &BTreeMap<usize, N> {
335 &self.coverage
336 }
337
338 pub fn get_coverage_as_vec(&self) -> Vec<N> {
339 let mut coverage = vec![N::zero(); self.len];
340 self.coverage
341 .iter()
342 .for_each(|(idx, v)| coverage[*idx] = *v);
343 coverage
344 }
345}
346
347#[cfg(test)]
348mod bed_intersect_tests {
349 use super::*;
350 use crate::bed::{BedGraph, MergeBed};
351
352 use itertools::Itertools;
353 use rand::{thread_rng, Rng};
354
355 fn rand_bedgraph(chr: &str) -> BedGraph<f32> {
356 let mut rng = thread_rng();
357 let n: u64 = rng.gen_range(0..10000);
358 let l: u64 = rng.gen_range(5..50);
359 BedGraph::new(chr, n, n + l, 1.0)
360 }
361
362 #[test]
363 fn test_coverage() {
364 let regions: GIntervalIndexSet = vec![
365 GenomicRange::new("chr1".to_string(), 200, 500),
366 GenomicRange::new("chr1".to_string(), 1000, 2000),
367 GenomicRange::new("chr1".to_string(), 10000, 11000),
368 GenomicRange::new("chr10".to_string(), 10, 20),
369 GenomicRange::new("chr1".to_string(), 200, 500),
370 ]
371 .into_iter()
372 .collect();
373 let tags: Vec<GenomicRange> = vec![
374 GenomicRange::new("chr1".to_string(), 100, 210),
375 GenomicRange::new("chr1".to_string(), 100, 500),
376 GenomicRange::new("chr1".to_string(), 100, 5000),
377 GenomicRange::new("chr1".to_string(), 100, 200),
378 GenomicRange::new("chr1".to_string(), 1000, 1001),
379 ];
380
381 let mut cov1 = Coverage::new(®ions);
382 tags.iter().for_each(|x| cov1.insert(x, 1));
383 let result1: Vec<u64> = cov1.get_coverage().to_vec();
384 let mut cov2 = SparseCoverage::new(®ions);
385 tags.iter().for_each(|x| cov2.insert(x, 1));
386 let result2: Vec<u64> = cov2.get_coverage_as_vec();
387
388 assert_eq!(result1, result2);
389 assert_eq!(result1, vec![3, 2, 0, 0, 3]);
390
391 let mut cov3 = BinnedCoverage::new(®ions, 100);
392 tags.iter().for_each(|x| cov3.insert(x, 1));
393 let result3: Vec<u64> = cov3.get_coverage().iter().flatten().map(|x| *x).collect();
394 let mut cov4 = SparseBinnedCoverage::new(®ions, 100);
395 tags.iter().for_each(|x| cov4.insert(x, 1));
396 let result4: Vec<u64> = cov4.get_coverage_as_vec();
397
398 assert_eq!(result3, result4);
399 assert_eq!(
400 result3,
401 vec![3, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 2, 2,]
402 );
403 }
404
405 #[test]
406 fn test_sparse_coverage() {
407 let regions = [
408 GenomicRange::new("chr1", 0, 2000),
409 GenomicRange::new("chr2", 100, 2100),
410 GenomicRange::new("chr3", 3000, 3500),
411 ]
412 .into_iter()
413 .collect();
414 let mut cov = SparseBinnedCoverage::new(®ions, 400);
415
416 cov.insert(&GenomicRange::new("chr1", 3, 5), 1);
417 cov.insert(&GenomicRange::new("chr2", 0, 500), 1);
418 cov.insert(&GenomicRange::new("chr2", 0, 501), 1);
419 cov.insert(&GenomicRange::new("chr3", 3400, 3401), 1);
420 cov.insert(&GenomicRange::new("chr4", 0, 501), 1);
421 cov.insert(&GenomicRange::new("chr3", 0, 501), 1);
422
423 assert_eq!(
424 vec![(0, 1), (5, 2), (6, 1), (11, 1)],
425 cov.get_coverage()
426 .iter()
427 .map(|(i, x)| (*i, *x))
428 .collect::<Vec<(usize, u64)>>()
429 );
430
431 let expected: Vec<_> = cov
432 .regions()
433 .flatten()
434 .zip(cov.get_coverage_as_vec().iter())
435 .flat_map(|(region, x)| if *x == 0 { None } else { Some((region, *x)) })
436 .collect();
437 assert_eq!(
438 expected,
439 cov.get_coverage()
440 .iter()
441 .map(|(i, v)| (cov.get_region(*i).unwrap(), *v))
442 .collect::<Vec<_>>(),
443 );
444 }
445
446 #[test]
447 fn test_sparse_bin_coverage() {
448 fn get_bedgraph(cov: SparseBinnedCoverage<'_, f32>) -> Vec<BedGraph<f32>> {
449 let expected: Vec<_> = cov
450 .regions()
451 .flatten()
452 .zip(cov.get_coverage_as_vec().iter())
453 .flat_map(|(region, x)| {
454 if *x == 0.0 {
455 None
456 } else {
457 Some(BedGraph::from_bed(®ion, *x))
458 }
459 })
460 .collect();
461 let chunks = expected.into_iter().chunk_by(|x| x.value);
462 chunks
463 .into_iter()
464 .flat_map(|(_, groups)| {
465 groups.into_iter().merge_sorted_bed_with(|beds| {
466 let mut iter = beds.into_iter();
467 let mut first = iter.next().unwrap();
468 if let Some(last) = iter.last() {
469 first.set_end(last.end());
470 }
471 first
472 })
473 })
474 .collect()
475 }
476
477 fn clip(x: &mut BedGraph<f32>, bin_size: u64) {
478 if x.start() % bin_size != 0 {
479 x.set_start(x.start() - x.start() % bin_size);
480 }
481 if x.end() % bin_size != 0 {
482 x.set_end(x.end() + bin_size - x.end() % bin_size);
483 }
484 }
485
486 fn compare(expected: Vec<BedGraph<f32>>, actual: Vec<BedGraph<f32>>) {
487 let (mis_a, mis_b): (Vec<_>, Vec<_>) = expected
488 .into_iter()
489 .zip(actual)
490 .flat_map(|(a, b)| if a != b { Some((a, b)) } else { None })
491 .unzip();
492 assert!(
493 mis_a.is_empty(),
494 "Bin_counting: {:?}\n\nMerging: {:?}",
495 &mis_a[..10],
496 &mis_b[..10]
497 );
498 }
499
500 let regions = [GenomicRange::new("chr1", 0, 1000000)]
501 .into_iter()
502 .collect();
503 let reads = (0..100000)
504 .map(|_| rand_bedgraph("chr1"))
505 .sorted_by(|a, b| a.compare(b))
506 .collect::<Vec<_>>();
507
508 let mut cov = SparseBinnedCoverage::new(®ions, 1);
509 reads.iter().for_each(|x| cov.insert(x, x.value));
510 compare(
511 get_bedgraph(cov),
512 reads.clone().into_iter().merge_sorted_bedgraph().collect(),
513 );
514
515 let mut cov = SparseBinnedCoverage::new(®ions, 71);
516 reads.iter().for_each(|x| cov.insert(x, x.value));
517 compare(
518 get_bedgraph(cov),
519 reads
520 .iter()
521 .map(|x| {
522 let mut x = x.clone();
523 clip(&mut x, 71);
524 x
525 })
526 .merge_sorted_bedgraph()
527 .collect(),
528 );
529 }
530}