1use itertools::Itertools;
2use num::Num;
3use num_traits::NumAssignOps;
4
5use super::BedGraph;
6use crate::bed::{GenomicRange, Score, Strand};
7
8use std::{cmp::Ordering, iter::Sum, ops::Neg};
9
10pub trait BEDLike {
12 fn chrom(&self) -> &str;
14
15 fn set_chrom(&mut self, chrom: &str) -> &mut Self;
17
18 fn start(&self) -> u64;
20
21 fn set_start(&mut self, start: u64) -> &mut Self;
23
24 fn end(&self) -> u64;
26
27 fn set_end(&mut self, end: u64) -> &mut Self;
29
30 fn name(&self) -> Option<&str> {
32 None
33 }
34
35 fn score(&self) -> Option<Score> {
37 None
38 }
39
40 fn strand(&self) -> Option<Strand> {
42 None
43 }
44
45 fn len(&self) -> u64 {
48 self.end().saturating_sub(self.start())
49 }
50
51 fn compare(&self, other: &Self) -> Ordering {
52 self.chrom()
53 .cmp(other.chrom())
54 .then(self.start().cmp(&other.start()))
55 .then(self.end().cmp(&other.end()))
56 }
57
58 fn overlap<B: BEDLike>(&self, other: &B) -> Option<GenomicRange> {
60 if self.chrom() != other.chrom() {
61 None
62 } else {
63 let start = self.start().max(other.start());
64 let end = self.end().min(other.end());
65 if start >= end {
66 None
67 } else {
68 Some(GenomicRange::new(self.chrom(), start, end))
69 }
70 }
71 }
72
73 fn n_overlap<B: BEDLike>(&self, other: &B) -> u64 {
75 self.overlap(other).map_or(0, |x| x.len())
76 }
77
78 fn to_genomic_range(&self) -> GenomicRange {
80 GenomicRange::new(self.chrom(), self.start(), self.end())
81 }
82
83 fn split_by_len(&self, bin_size: u64) -> impl Iterator<Item = GenomicRange> {
86 let start = self.start();
87 let end = self.end();
88 (start..end)
89 .step_by(bin_size as usize)
90 .map(move |x| GenomicRange::new(self.chrom(), x, (x + bin_size).min(end)))
91 }
92
93 fn rsplit_by_len(&self, bin_size: u64) -> impl Iterator<Item = GenomicRange> {
97 let start = self.start();
98 let end = self.end();
99 (start + 1..=end)
100 .rev()
101 .step_by(bin_size as usize)
102 .map(move |x| GenomicRange::new(self.chrom(), x.saturating_sub(bin_size).max(start), x))
103 }
104}
105
106pub struct MergeBed<I, B, F> {
107 sorted_bed_iter: I,
108 merger: F,
109 accum: Option<((String, u64, u64), Vec<B>)>,
110}
111
112impl<I, F, B, O> Iterator for MergeBed<I, B, F>
113where
114 I: Iterator<Item = B>,
115 B: BEDLike,
116 F: Fn(Vec<B>) -> O,
117{
118 type Item = O;
119 fn next(&mut self) -> Option<Self::Item> {
120 loop {
121 match self.sorted_bed_iter.next() {
122 None => {
123 return if self.accum.is_none() {
124 None
125 } else {
126 let (_, accum) = std::mem::replace(&mut self.accum, None).unwrap();
127 Some((self.merger)(accum))
128 }
129 }
130 Some(record) => match self.accum.as_mut() {
131 None => {
132 self.accum = Some((
133 (record.chrom().to_string(), record.start(), record.end()),
134 vec![record],
135 ))
136 }
137 Some(((chr, s, e), accum)) => {
138 let chr_ = record.chrom();
139 let s_ = record.start();
140 let e_ = record.end();
141 if chr != chr_ || s_ > *e {
142 let (_, acc) = std::mem::replace(
143 &mut self.accum,
144 Some(((chr_.to_string(), s_, e_), vec![record])),
145 )
146 .unwrap();
147 return Some((self.merger)(acc));
148 } else if s_ < *s {
149 panic!("input is not sorted")
150 } else if e_ > *e {
151 *e = e_;
152 accum.push(record);
153 } else {
154 accum.push(record);
155 }
156 }
157 },
158 }
159 }
160 }
161}
162
163pub fn merge_sorted_bed_with<In, I, B, O, F>(sorted_bed_iter: In, merger: F) -> MergeBed<I, B, F>
166where
167 In: IntoIterator<IntoIter = I>,
168 I: Iterator<Item = B>,
169 B: BEDLike,
170 F: Fn(Vec<B>) -> O,
171{
172 MergeBed {
173 sorted_bed_iter: sorted_bed_iter.into_iter(),
174 merger,
175 accum: None,
176 }
177}
178
179pub fn merge_sorted_bed<I, B>(sorted_iter: I) -> impl Iterator<Item = GenomicRange>
182where
183 I: IntoIterator<Item = B>,
184 B: BEDLike,
185{
186 merge_sorted_bed_with(sorted_iter, |x| {
187 GenomicRange::new(
188 x[0].chrom(),
189 x.iter().map(|x| x.start()).min().unwrap(),
190 x.iter().map(|x| x.end()).max().unwrap(),
191 )
192 })
193}
194
195pub fn merge_sorted_bedgraph<I, V>(sorted_iter: I) -> impl Iterator<Item = BedGraph<V>>
196where
197 I: IntoIterator<Item = BedGraph<V>>,
198 V: Num + NumAssignOps + Sum + Neg<Output = V> + PartialOrd + Copy,
199{
200 merge_sorted_bed_with(sorted_iter, |bdgs| {
201 let point_groups = bdgs
203 .iter()
204 .flat_map(|bedgraph| {
205 [
206 (bedgraph.start(), bedgraph.value),
207 (bedgraph.end(), bedgraph.value.neg()),
208 ]
209 })
210 .sorted_unstable_by_key(|x| x.0)
211 .chunk_by(|x| (x.0, x.1 < V::zero()));
212 let mut point_groups = point_groups.into_iter();
213
214 let chrom = bdgs[0].chrom();
215 let ((mut prev_pos, _), first_group) = point_groups.next().unwrap();
216 let mut acc_val = first_group.into_iter().map(|x| x.1).sum();
217 let mut prev_bedgraph = BedGraph::new(chrom, prev_pos, prev_pos, acc_val);
218
219 let mut result = point_groups
220 .flat_map(|((pos, _), group)| {
221 let value_sum = group.into_iter().map(|x| x.1).sum();
222 let mut bedgraph = None;
223
224 if prev_pos != pos {
225 if acc_val == prev_bedgraph.value {
226 prev_bedgraph.set_end(pos);
227 } else {
228 bedgraph = Some(prev_bedgraph.clone());
229 prev_bedgraph = BedGraph::new(chrom, prev_pos, pos, acc_val);
230 }
231 }
232
233 acc_val += value_sum;
234 prev_pos = pos;
235 bedgraph
236 })
237 .collect::<Vec<_>>();
238 result.push(prev_bedgraph);
239 result
240 })
241 .flatten()
242}
243
244#[cfg(test)]
245mod bed_tests {
246 use super::*;
247 use crate::extsort::ExternalSorterBuilder;
248 use itertools::Itertools;
249 use rand::Rng;
250
251 fn rand_bed(n: usize) -> Vec<GenomicRange> {
252 let mut rng = rand::thread_rng();
253 let mut result = Vec::with_capacity(n);
254 for _ in 0..n {
255 let bed = GenomicRange::new(
256 format!("chr{}", rng.gen_range(1..20)),
257 rng.gen_range(0..10000),
258 rng.gen_range(1000000..2000000),
259 );
260 result.push(bed);
261 }
262 result
263 }
264
265 #[test]
266 fn test_sort() {
267 let data1 = vec![
268 GenomicRange::new("chr1", 1020, 1230),
269 GenomicRange::new("chr1", 0, 500),
270 GenomicRange::new("chr2", 500, 1000),
271 GenomicRange::new("chr1", 500, 1000),
272 GenomicRange::new("chr1", 1000, 1230),
273 GenomicRange::new("chr1", 1000, 1230),
274 ];
275 let data2 = rand_bed(100000);
276
277 [data1, data2].into_iter().for_each(|mut data| {
278 let sorted1 = ExternalSorterBuilder::new()
279 .num_threads(2)
280 .with_chunk_size(10000)
281 .build()
282 .unwrap()
283 .sort(data.clone().into_iter())
284 .unwrap()
285 .collect::<Result<Vec<_>, _>>()
286 .unwrap();
287 let sorted2 = ExternalSorterBuilder::new()
288 .with_chunk_size(10000)
289 .with_compression(4)
290 .build()
291 .unwrap()
292 .sort(data.clone().into_iter())
293 .unwrap()
294 .collect::<Result<Vec<_>, _>>()
295 .unwrap();
296 data.sort();
297 assert_eq!(data, sorted1);
298 assert_eq!(data, sorted2);
299 })
300 }
301
302 #[test]
303 fn test_split() {
304 assert_eq!(
305 GenomicRange::new("chr1", 0, 1230)
306 .split_by_len(500)
307 .collect::<Vec<_>>(),
308 vec![
309 GenomicRange::new("chr1", 0, 500),
310 GenomicRange::new("chr1", 500, 1000),
311 GenomicRange::new("chr1", 1000, 1230),
312 ],
313 );
314 assert_eq!(
315 GenomicRange::new("chr1", 0, 1230)
316 .rsplit_by_len(500)
317 .collect::<Vec<_>>(),
318 vec![
319 GenomicRange::new("chr1", 730, 1230),
320 GenomicRange::new("chr1", 230, 730),
321 GenomicRange::new("chr1", 0, 230),
322 ],
323 );
324 assert_eq!(
325 GenomicRange::new("chr1", 500, 2500)
326 .split_by_len(500)
327 .collect::<Vec<_>>(),
328 GenomicRange::new("chr1", 500, 2500)
329 .rsplit_by_len(500)
330 .sorted()
331 .collect::<Vec<_>>(),
332 );
333 assert_eq!(
334 GenomicRange::new("chr1", 500, 2500)
335 .split_by_len(1)
336 .collect::<Vec<_>>(),
337 GenomicRange::new("chr1", 500, 2500)
338 .rsplit_by_len(1)
339 .sorted()
340 .collect::<Vec<_>>(),
341 );
342 }
343
344 #[test]
345 fn test_overlap() {
346 assert_eq!(
347 GenomicRange::new("chr1", 100, 200).overlap(&GenomicRange::new("chr1", 150, 300)),
348 Some(GenomicRange::new("chr1", 150, 200)),
349 );
350
351 assert_eq!(
352 GenomicRange::new("chr1", 100, 200).overlap(&GenomicRange::new("chr1", 110, 190)),
353 Some(GenomicRange::new("chr1", 110, 190)),
354 );
355
356 assert_eq!(
357 GenomicRange::new("chr1", 100, 200).overlap(&GenomicRange::new("chr1", 90, 99)),
358 None,
359 );
360
361 assert_eq!(
362 GenomicRange::new("chr1", 111, 180).overlap(&GenomicRange::new("chr1", 110, 190)),
363 Some(GenomicRange::new("chr1", 111, 180)),
364 );
365
366 assert_eq!(
367 GenomicRange::new("chr1", 111, 200).overlap(&GenomicRange::new("chr1", 110, 190)),
368 Some(GenomicRange::new("chr1", 111, 190)),
369 );
370 }
371
372 #[test]
373 fn test_merge() {
374 let input = [
375 (0, 100),
376 (10, 20),
377 (50, 150),
378 (120, 160),
379 (155, 200),
380 (155, 220),
381 (500, 1000),
382 (2000, 2100),
383 (2100, 2200),
384 ]
385 .into_iter()
386 .map(|(a, b)| std::io::Result::Ok(GenomicRange::new("chr1", a, b)));
387 let expect: Vec<GenomicRange> = [(0, 220), (500, 1000), (2000, 2200)]
388 .into_iter()
389 .map(|(a, b)| GenomicRange::new("chr1", a, b))
390 .collect();
391 assert_eq!(
392 merge_sorted_bed(input.clone().map(|x| x.unwrap())).collect::<Vec<_>>(),
393 expect
394 );
395 }
396
397 #[test]
398 fn test_merge_bedgraph() {
399 let reads = vec![
400 BedGraph::new("chr1", 0, 100, 1),
401 BedGraph::new("chr1", 0, 100, 1),
402 BedGraph::new("chr1", 2, 100, 2),
403 BedGraph::new("chr1", 30, 60, 3),
404 BedGraph::new("chr1", 40, 50, 4),
405 BedGraph::new("chr1", 60, 65, 5),
406 ];
407
408 let actual: Vec<_> = merge_sorted_bedgraph(reads).collect();
409 let expected = vec![
410 BedGraph::new("chr1", 0, 2, 2),
411 BedGraph::new("chr1", 2, 30, 4),
412 BedGraph::new("chr1", 30, 40, 7),
413 BedGraph::new("chr1", 40, 50, 11),
414 BedGraph::new("chr1", 50, 60, 7),
415 BedGraph::new("chr1", 60, 65, 9),
416 BedGraph::new("chr1", 65, 100, 4),
417 ];
418 assert_eq!(expected, actual);
419 }
420}