Skip to main content

bed_utils/bed/
map.rs

1use std::collections::{BTreeMap, VecDeque};
2use std::ops::Index;
3
4use super::{BEDLike, GenomicRange};
5use crate::intervaltree::{Interval, IterFind, IterLapper, Lapper};
6
7/// A map from genomic internvals to values, internally represented as a interval tree
8/// for efficient queries.
9/// Note that GIntervalMap permits duplicated records.
10#[derive(Debug, Clone)]
11pub struct GIntervalMap<D>(BTreeMap<String, Lapper<u64, D>>);
12
13impl<D> GIntervalMap<D> {
14    /// Create a new empty `GIntervalMap`.
15    pub fn new() -> Self {
16        Self(BTreeMap::new())
17    }
18
19    /// Return the number of records.
20    pub fn len(&self) -> usize {
21        self.0.values().map(|x| x.len()).sum()
22    }
23
24    /// Iterate over all records in the map. The records are returned in sorted order.
25    /// The order is determined by the chromosome (string comparison) and the start position and end position of the intervals.
26    pub fn iter(&self) -> Iter<'_, D> {
27        Iter(
28            self.0
29                .iter()
30                .map(|(k, v)| (k, v.iter()))
31                .collect::<VecDeque<_>>(),
32        )
33    }
34
35    /// This is very inefficient and should be avoided if possible.
36    pub fn insert<B: BEDLike>(&mut self, bed: &B, value: D) {
37        let chr = bed.chrom();
38        let interval = Interval {
39            start: bed.start(),
40            stop: bed.end(),
41            val: value,
42        };
43        let tree = self
44            .0
45            .entry(chr.to_string())
46            .or_insert(Lapper::new(Vec::new()));
47        tree.insert(interval);
48    }
49
50    /// Return regions that overlaop with the query.
51    pub fn find<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, D> {
52        let chr = bed.chrom().to_string();
53        match self.0.get(&chr) {
54            None => GIntervalQueryIter {
55                chrom: chr,
56                query_iter: None,
57            },
58            Some(tree) => GIntervalQueryIter {
59                chrom: chr,
60                query_iter: Some(tree.find(bed.start(), bed.end())),
61            },
62        }
63    }
64
65    /// Determine if the query overlaps with any record.
66    pub fn is_overlapped<B: BEDLike>(&self, bed: &B) -> bool {
67        self.find(bed).next().is_some()
68    }
69}
70
71impl<B: BEDLike, D> FromIterator<(B, D)> for GIntervalMap<D> {
72    fn from_iter<I: IntoIterator<Item = (B, D)>>(iter: I) -> Self {
73        let mut hmap: BTreeMap<String, Vec<_>> = BTreeMap::new();
74        for (bed, data) in iter {
75            let chr = bed.chrom();
76            let interval = Interval {
77                start: bed.start(),
78                stop: bed.end(),
79                val: data,
80            };
81            let vec = hmap.entry(chr.to_string()).or_insert(Vec::new());
82            vec.push(interval);
83        }
84        let hm = hmap
85            .into_iter()
86            .map(|(chr, vec)| (chr, Lapper::new(vec)))
87            .collect();
88        Self(hm)
89    }
90}
91
92/// An `GIntervalQueryIter` is returned by `Intervaltree::find` and iterates over the entries
93/// overlapping the query
94#[derive(Debug)]
95pub struct GIntervalQueryIter<'a, D> {
96    chrom: String,
97    query_iter: Option<IterFind<'a, u64, D>>,
98}
99
100impl<'a, D: 'a> Iterator for GIntervalQueryIter<'a, D> {
101    type Item = (GenomicRange, &'a D);
102
103    fn next(&mut self) -> Option<Self::Item> {
104        match self.query_iter {
105            None => return None,
106            Some(ref mut iter) => match iter.next() {
107                None => return None,
108                Some(interval) => {
109                    let bed = GenomicRange::new(&self.chrom, interval.start, interval.stop);
110                    Some((bed, &interval.val))
111                }
112            },
113        }
114    }
115}
116
117pub struct Iter<'a, D>(VecDeque<(&'a String, IterLapper<'a, u64, D>)>);
118
119impl<'a, D> Iterator for Iter<'a, D> {
120    type Item = (GenomicRange, &'a D);
121
122    fn next(&mut self) -> Option<Self::Item> {
123        if self.0.is_empty() {
124            return None;
125        }
126        match self.0[0].1.next() {
127            None => {
128                self.0.pop_front();
129                self.next()
130            }
131            Some(interval) => {
132                let bed = GenomicRange::new(self.0[0].0, interval.start, interval.stop);
133                Some((bed, &interval.val))
134            }
135        }
136    }
137}
138
139/// A Bed map that preserves the order.
140#[derive(Debug, Clone)]
141pub struct GIntervalIndexMap<D> {
142    data: Vec<D>,
143    indices: GIntervalMap<usize>,
144}
145
146impl<D> GIntervalIndexMap<D> {
147    pub fn new() -> Self {
148        Self {
149            data: Vec::new(),
150            indices: GIntervalMap::new(),
151        }
152    }
153
154    pub fn len(&self) -> usize {
155        self.data.len()
156    }
157
158    /// Note the results may be returned in arbitrary order.
159    pub fn find<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = (GenomicRange, &D)> {
160        self.indices.find(bed).map(|(g, i)| (g, &self.data[*i]))
161    }
162
163    /// Note the results may be returned in arbitrary order.
164    pub fn find_index_of<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, usize> {
165        self.indices.find(bed)
166    }
167
168    pub fn get(&self, index: usize) -> Option<&D> {
169        self.data.get(index)
170    }
171}
172
173impl<B: BEDLike, D> FromIterator<(B, D)> for GIntervalIndexMap<D> {
174    fn from_iter<I: IntoIterator<Item = (B, D)>>(iter: I) -> Self {
175        let mut data = Vec::new();
176        let indices = iter
177            .into_iter()
178            .enumerate()
179            .map(|(i, (bed, val))| {
180                data.push(val);
181                (bed, i)
182            })
183            .collect();
184        Self { data, indices }
185    }
186}
187
188/// A Bed map that preserves the order.
189#[derive(Debug, Clone)]
190pub struct GIntervalIndexSet {
191    data: Vec<GenomicRange>,
192    indices: GIntervalMap<usize>,
193}
194
195impl Index<usize> for GIntervalIndexSet {
196    type Output = GenomicRange;
197
198    fn index(&self, index: usize) -> &Self::Output {
199        &self.data[index]
200    }
201}
202
203impl GIntervalIndexSet {
204    pub fn len(&self) -> usize {
205        self.data.len()
206    }
207
208    pub fn iter(&self) -> impl Iterator<Item = &GenomicRange> {
209        self.data.iter()
210    }
211
212    pub fn is_overlapped<B: BEDLike>(&self, bed: &B) -> bool {
213        self.find_full(bed).next().is_some()
214    }
215
216    /// Note the results may be returned in arbitrary order.
217    pub fn find<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = GenomicRange> + '_ {
218        self.indices.find(bed).map(|x| x.0)
219    }
220
221    /// Note the results may be returned in arbitrary order.
222    pub fn find_index_of<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = usize> + '_ {
223        self.indices.find(bed).map(|x| *x.1)
224    }
225
226    pub fn find_full<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, usize> {
227        self.indices.find(bed)
228    }
229
230    pub fn get(&self, index: usize) -> Option<&GenomicRange> {
231        self.data.get(index)
232    }
233}
234
235impl IntoIterator for GIntervalIndexSet {
236    type IntoIter = std::vec::IntoIter<Self::Item>;
237    type Item = GenomicRange;
238
239    fn into_iter(self) -> Self::IntoIter {
240        self.data.into_iter()
241    }
242}
243
244impl<B: BEDLike> FromIterator<B> for GIntervalIndexSet {
245    fn from_iter<I: IntoIterator<Item = B>>(iter: I) -> Self {
246        let mut data = Vec::new();
247        let indices = iter
248            .into_iter()
249            .enumerate()
250            .map(|(i, bed)| {
251                data.push(bed.to_genomic_range());
252                (bed, i)
253            })
254            .collect();
255        Self { data, indices }
256    }
257}
258
259#[cfg(test)]
260mod bed_intersect_tests {
261    use super::*;
262    use itertools::Itertools;
263
264    #[test]
265    fn test_create() {
266        let beds: Vec<GenomicRange> = [
267            "chr1:200-500",
268            "chr1:200-500",
269            "chr1:1000-2000",
270            "chr1:1000-2000",
271        ]
272        .into_iter()
273        .map(|x| x.parse().unwrap())
274        .collect();
275        let values = vec![2, 3, 5, 5];
276        let mut map: GIntervalMap<_> = beds.into_iter().zip(values.into_iter()).collect();
277        map.insert(&GenomicRange::new("chr1", 1000, 2000), 5);
278        map.insert(&GenomicRange::new("chr1", 1000, 2000), 6);
279
280        let result: Vec<_> = map.iter().sorted().collect();
281        let expected = vec![
282            (GenomicRange::new("chr1", 200, 500), &2),
283            (GenomicRange::new("chr1", 200, 500), &3),
284            (GenomicRange::new("chr1", 1000, 2000), &5),
285            (GenomicRange::new("chr1", 1000, 2000), &5),
286            (GenomicRange::new("chr1", 1000, 2000), &5),
287            (GenomicRange::new("chr1", 1000, 2000), &6),
288        ];
289        assert_eq!(result, expected);
290    }
291
292    #[test]
293    fn test_intersect() {
294        let bed_set1: Vec<GenomicRange> = ["chr1:200-500", "chr1:200-500", "chr1:1000-2000"]
295            .into_iter()
296            .map(|x| x.parse().unwrap())
297            .collect();
298        let bed_set2: Vec<GenomicRange> = ["chr1:100-210", "chr1:100-200"]
299            .into_iter()
300            .map(|x| x.parse().unwrap())
301            .collect();
302        let tree: GIntervalIndexSet = bed_set1.clone().into_iter().collect();
303
304        assert_eq!(
305            tree.find(&bed_set2[0]).collect::<Vec<_>>(),
306            vec![
307                bed_set1.as_slice()[0].clone(),
308                bed_set1.as_slice()[0].clone()
309            ]
310        );
311
312        let result: Vec<GenomicRange> = bed_set2
313            .into_iter()
314            .filter(|x| tree.is_overlapped(x))
315            .collect();
316        let expected = vec!["chr1:100-210".parse().unwrap()];
317        assert_eq!(result, expected);
318    }
319
320    #[test]
321    fn test_iter() {
322        fn random_bed() -> GenomicRange {
323            let chrom = format!("chr{}", rand::random::<u8>() % 10 + 1);
324            let start = rand::random::<u64>() % 1000;
325            let end = start + rand::random::<u64>() % 100;
326            GenomicRange::new(&chrom, start, end)
327        }
328        let beds: Vec<_> = (0..10000).map(|_| (random_bed(), ())).collect();
329        let map: GIntervalMap<()> = beds.clone().into_iter().collect();
330        assert_eq!(
331            map.iter().map(|(g, _)| g).collect::<Vec<_>>(),
332            beds.into_iter()
333                .map(|(g, _)| g)
334                .sorted()
335                .collect::<Vec<_>>(),
336        );
337    }
338}