bed_utils/bed/
bed_trait.rs

1use itertools::Itertools;
2use num::Num;
3use num_traits::NumAssignOps;
4
5use super::BedGraph;
6use crate::bed::{GenomicRange, Score, Strand};
7
8use std::{cmp::Ordering, iter::Sum, ops::Neg};
9
10/// Common BED fields
11pub trait BEDLike {
12    /// Return the chromosome name of the record
13    fn chrom(&self) -> &str;
14
15    /// Change the chromosome name of the record
16    fn set_chrom(&mut self, chrom: &str) -> &mut Self;
17
18    /// Return the 0-based start position of the record
19    fn start(&self) -> u64;
20
21    /// Change the 0-based start position of the record
22    fn set_start(&mut self, start: u64) -> &mut Self;
23
24    /// Return the end position (non-inclusive) of the record
25    fn end(&self) -> u64;
26
27    /// Change the end position (non-inclusive) of the record
28    fn set_end(&mut self, end: u64) -> &mut Self;
29
30    /// Return the name of the record
31    fn name(&self) -> Option<&str> {
32        None
33    }
34
35    /// Return the score of the record
36    fn score(&self) -> Option<Score> {
37        None
38    }
39
40    /// Return the strand of the record
41    fn strand(&self) -> Option<Strand> {
42        None
43    }
44
45    /// Return the length of the record. Return 0 if the end position is smaller
46    /// than the start position.
47    fn len(&self) -> u64 {
48        self.end().saturating_sub(self.start())
49    }
50
51    fn compare(&self, other: &Self) -> Ordering {
52        self.chrom()
53            .cmp(other.chrom())
54            .then(self.start().cmp(&other.start()))
55            .then(self.end().cmp(&other.end()))
56    }
57
58    /// Return the overlap
59    fn overlap<B: BEDLike>(&self, other: &B) -> Option<GenomicRange> {
60        if self.chrom() != other.chrom() {
61            None
62        } else {
63            let start = self.start().max(other.start());
64            let end = self.end().min(other.end());
65            if start >= end {
66                None
67            } else {
68                Some(GenomicRange::new(self.chrom(), start, end))
69            }
70        }
71    }
72
73    /// Return the size of overlap between two records
74    fn n_overlap<B: BEDLike>(&self, other: &B) -> u64 {
75        self.overlap(other).map_or(0, |x| x.len())
76    }
77
78    /// Convert the record to a `GenomicRange`
79    fn to_genomic_range(&self) -> GenomicRange {
80        GenomicRange::new(self.chrom(), self.start(), self.end())
81    }
82
83    /// Split into consecutive records with the specified length. The length of
84    /// the last record may be shorter.
85    fn split_by_len(&self, bin_size: u64) -> impl Iterator<Item = GenomicRange> {
86        let start = self.start();
87        let end = self.end();
88        (start..end)
89            .step_by(bin_size as usize)
90            .map(move |x| GenomicRange::new(self.chrom(), x, (x + bin_size).min(end)))
91    }
92
93    /// Split into consecutive records with the specified length starting from the end.
94    /// The result is in reverse order compared to `split_by_len`. The length of the last
95    /// record may be shorter.
96    fn rsplit_by_len(&self, bin_size: u64) -> impl Iterator<Item = GenomicRange> {
97        let start = self.start();
98        let end = self.end();
99        (start + 1..=end)
100            .rev()
101            .step_by(bin_size as usize)
102            .map(move |x| GenomicRange::new(self.chrom(), x.saturating_sub(bin_size).max(start), x))
103    }
104}
105
106pub struct MergeBed<I, B, F> {
107    sorted_bed_iter: I,
108    merger: F,
109    accum: Option<((String, u64, u64), Vec<B>)>,
110}
111
112impl<I, F, B, O> Iterator for MergeBed<I, B, F>
113where
114    I: Iterator<Item = B>,
115    B: BEDLike,
116    F: Fn(Vec<B>) -> O,
117{
118    type Item = O;
119    fn next(&mut self) -> Option<Self::Item> {
120        loop {
121            match self.sorted_bed_iter.next() {
122                None => {
123                    return if self.accum.is_none() {
124                        None
125                    } else {
126                        let (_, accum) = std::mem::replace(&mut self.accum, None).unwrap();
127                        Some((self.merger)(accum))
128                    }
129                }
130                Some(record) => match self.accum.as_mut() {
131                    None => {
132                        self.accum = Some((
133                            (record.chrom().to_string(), record.start(), record.end()),
134                            vec![record],
135                        ))
136                    }
137                    Some(((chr, s, e), accum)) => {
138                        let chr_ = record.chrom();
139                        let s_ = record.start();
140                        let e_ = record.end();
141                        if chr != chr_ || s_ > *e {
142                            let (_, acc) = std::mem::replace(
143                                &mut self.accum,
144                                Some(((chr_.to_string(), s_, e_), vec![record])),
145                            )
146                            .unwrap();
147                            return Some((self.merger)(acc));
148                        } else if s_ < *s {
149                            panic!("input is not sorted")
150                        } else if e_ > *e {
151                            *e = e_;
152                            accum.push(record);
153                        } else {
154                            accum.push(record);
155                        }
156                    }
157                },
158            }
159        }
160    }
161}
162
163/// Merge sorted BED records. Overlapping records are processed according to the
164/// function provided.
165pub fn merge_sorted_bed_with<In, I, B, O, F>(sorted_bed_iter: In, merger: F) -> MergeBed<I, B, F>
166where
167    In: IntoIterator<IntoIter = I>,
168    I: Iterator<Item = B>,
169    B: BEDLike,
170    F: Fn(Vec<B>) -> O,
171{
172    MergeBed {
173        sorted_bed_iter: sorted_bed_iter.into_iter(),
174        merger,
175        accum: None,
176    }
177}
178
179/// Merge sorted BED records. Overlapping records are concatenated into a single
180/// record.
181pub fn merge_sorted_bed<I, B>(sorted_iter: I) -> impl Iterator<Item = GenomicRange>
182where
183    I: IntoIterator<Item = B>,
184    B: BEDLike,
185{
186    merge_sorted_bed_with(sorted_iter, |x| {
187        GenomicRange::new(
188            x[0].chrom(),
189            x.iter().map(|x| x.start()).min().unwrap(),
190            x.iter().map(|x| x.end()).max().unwrap(),
191        )
192    })
193}
194
195pub fn merge_sorted_bedgraph<I, V>(sorted_iter: I) -> impl Iterator<Item = BedGraph<V>>
196where
197    I: IntoIterator<Item = BedGraph<V>>,
198    V: Num + NumAssignOps + Sum + Neg<Output = V> + PartialOrd + Copy,
199{
200    merge_sorted_bed_with(sorted_iter, |bdgs| {
201        // group by start and end points
202        let point_groups = bdgs
203            .iter()
204            .flat_map(|bedgraph| {
205                [
206                    (bedgraph.start(), bedgraph.value),
207                    (bedgraph.end(), bedgraph.value.neg()),
208                ]
209            })
210            .sorted_unstable_by_key(|x| x.0)
211            .chunk_by(|x| (x.0, x.1 < V::zero()));
212        let mut point_groups = point_groups.into_iter();
213
214        let chrom = bdgs[0].chrom();
215        let ((mut prev_pos, _), first_group) = point_groups.next().unwrap();
216        let mut acc_val = first_group.into_iter().map(|x| x.1).sum();
217        let mut prev_bedgraph = BedGraph::new(chrom, prev_pos, prev_pos, acc_val);
218
219        let mut result = point_groups
220            .flat_map(|((pos, _), group)| {
221                let value_sum = group.into_iter().map(|x| x.1).sum();
222                let mut bedgraph = None;
223
224                if prev_pos != pos {
225                    if acc_val == prev_bedgraph.value {
226                        prev_bedgraph.set_end(pos);
227                    } else {
228                        bedgraph = Some(prev_bedgraph.clone());
229                        prev_bedgraph = BedGraph::new(chrom, prev_pos, pos, acc_val);
230                    }
231                }
232
233                acc_val += value_sum;
234                prev_pos = pos;
235                bedgraph
236            })
237            .collect::<Vec<_>>();
238        result.push(prev_bedgraph);
239        result
240    })
241    .flatten()
242}
243
244#[cfg(test)]
245mod bed_tests {
246    use super::*;
247    use crate::extsort::ExternalSorterBuilder;
248    use itertools::Itertools;
249    use rand::Rng;
250
251    fn rand_bed(n: usize) -> Vec<GenomicRange> {
252        let mut rng = rand::thread_rng();
253        let mut result = Vec::with_capacity(n);
254        for _ in 0..n {
255            let bed = GenomicRange::new(
256                format!("chr{}", rng.gen_range(1..20)),
257                rng.gen_range(0..10000),
258                rng.gen_range(1000000..2000000),
259            );
260            result.push(bed);
261        }
262        result
263    }
264
265    #[test]
266    fn test_sort() {
267        let data1 = vec![
268            GenomicRange::new("chr1", 1020, 1230),
269            GenomicRange::new("chr1", 0, 500),
270            GenomicRange::new("chr2", 500, 1000),
271            GenomicRange::new("chr1", 500, 1000),
272            GenomicRange::new("chr1", 1000, 1230),
273            GenomicRange::new("chr1", 1000, 1230),
274        ];
275        let data2 = rand_bed(100000);
276
277        [data1, data2].into_iter().for_each(|mut data| {
278            let sorted1 = ExternalSorterBuilder::new()
279                .num_threads(2)
280                .with_chunk_size(10000)
281                .build()
282                .unwrap()
283                .sort(data.clone().into_iter())
284                .unwrap()
285                .collect::<Result<Vec<_>, _>>()
286                .unwrap();
287            let sorted2 = ExternalSorterBuilder::new()
288                .with_chunk_size(10000)
289                .with_compression(4)
290                .build()
291                .unwrap()
292                .sort(data.clone().into_iter())
293                .unwrap()
294                .collect::<Result<Vec<_>, _>>()
295                .unwrap();
296            data.sort();
297            assert_eq!(data, sorted1);
298            assert_eq!(data, sorted2);
299        })
300    }
301
302    #[test]
303    fn test_split() {
304        assert_eq!(
305            GenomicRange::new("chr1", 0, 1230)
306                .split_by_len(500)
307                .collect::<Vec<_>>(),
308            vec![
309                GenomicRange::new("chr1", 0, 500),
310                GenomicRange::new("chr1", 500, 1000),
311                GenomicRange::new("chr1", 1000, 1230),
312            ],
313        );
314        assert_eq!(
315            GenomicRange::new("chr1", 0, 1230)
316                .rsplit_by_len(500)
317                .collect::<Vec<_>>(),
318            vec![
319                GenomicRange::new("chr1", 730, 1230),
320                GenomicRange::new("chr1", 230, 730),
321                GenomicRange::new("chr1", 0, 230),
322            ],
323        );
324        assert_eq!(
325            GenomicRange::new("chr1", 500, 2500)
326                .split_by_len(500)
327                .collect::<Vec<_>>(),
328            GenomicRange::new("chr1", 500, 2500)
329                .rsplit_by_len(500)
330                .sorted()
331                .collect::<Vec<_>>(),
332        );
333        assert_eq!(
334            GenomicRange::new("chr1", 500, 2500)
335                .split_by_len(1)
336                .collect::<Vec<_>>(),
337            GenomicRange::new("chr1", 500, 2500)
338                .rsplit_by_len(1)
339                .sorted()
340                .collect::<Vec<_>>(),
341        );
342    }
343
344    #[test]
345    fn test_overlap() {
346        assert_eq!(
347            GenomicRange::new("chr1", 100, 200).overlap(&GenomicRange::new("chr1", 150, 300)),
348            Some(GenomicRange::new("chr1", 150, 200)),
349        );
350
351        assert_eq!(
352            GenomicRange::new("chr1", 100, 200).overlap(&GenomicRange::new("chr1", 110, 190)),
353            Some(GenomicRange::new("chr1", 110, 190)),
354        );
355
356        assert_eq!(
357            GenomicRange::new("chr1", 100, 200).overlap(&GenomicRange::new("chr1", 90, 99)),
358            None,
359        );
360
361        assert_eq!(
362            GenomicRange::new("chr1", 111, 180).overlap(&GenomicRange::new("chr1", 110, 190)),
363            Some(GenomicRange::new("chr1", 111, 180)),
364        );
365
366        assert_eq!(
367            GenomicRange::new("chr1", 111, 200).overlap(&GenomicRange::new("chr1", 110, 190)),
368            Some(GenomicRange::new("chr1", 111, 190)),
369        );
370    }
371
372    #[test]
373    fn test_merge() {
374        let input = [
375            (0, 100),
376            (10, 20),
377            (50, 150),
378            (120, 160),
379            (155, 200),
380            (155, 220),
381            (500, 1000),
382            (2000, 2100),
383            (2100, 2200),
384        ]
385        .into_iter()
386        .map(|(a, b)| std::io::Result::Ok(GenomicRange::new("chr1", a, b)));
387        let expect: Vec<GenomicRange> = [(0, 220), (500, 1000), (2000, 2200)]
388            .into_iter()
389            .map(|(a, b)| GenomicRange::new("chr1", a, b))
390            .collect();
391        assert_eq!(
392            merge_sorted_bed(input.clone().map(|x| x.unwrap())).collect::<Vec<_>>(),
393            expect
394        );
395    }
396
397    #[test]
398    fn test_merge_bedgraph() {
399        let reads = vec![
400            BedGraph::new("chr1", 0, 100, 1),
401            BedGraph::new("chr1", 0, 100, 1),
402            BedGraph::new("chr1", 2, 100, 2),
403            BedGraph::new("chr1", 30, 60, 3),
404            BedGraph::new("chr1", 40, 50, 4),
405            BedGraph::new("chr1", 60, 65, 5),
406        ];
407
408        let actual: Vec<_> = merge_sorted_bedgraph(reads).collect();
409        let expected = vec![
410            BedGraph::new("chr1", 0, 2, 2),
411            BedGraph::new("chr1", 2, 30, 4),
412            BedGraph::new("chr1", 30, 40, 7),
413            BedGraph::new("chr1", 40, 50, 11),
414            BedGraph::new("chr1", 50, 60, 7),
415            BedGraph::new("chr1", 60, 65, 9),
416            BedGraph::new("chr1", 65, 100, 4),
417        ];
418        assert_eq!(expected, actual);
419    }
420}