bed_utils/bed/
map.rs

1use std::ops::Index;
2use std::collections::{BTreeMap, VecDeque};
3
4use super::{GenomicRange, BEDLike};
5use crate::intervaltree::{Interval, IterFind, IterLapper, Lapper};
6
7/// A map from genomic internvals to values, internally represented as a interval tree
8/// for efficient queries.
9/// Note that GIntervalMap permits duplicated records.
10#[derive(Debug, Clone)]
11pub struct GIntervalMap<D>(BTreeMap<String, Lapper<u64, D>>);
12
13impl<D> GIntervalMap<D> {
14    /// Create a new empty `GIntervalMap`.
15    pub fn new() -> Self {
16        Self(BTreeMap::new())
17    }
18
19    /// Return the number of records.
20    pub fn len(&self) -> usize {
21        self.0.values().map(|x| x.len()).sum()
22    }
23
24    /// Iterate over all records in the map. The records are returned in sorted order.
25    /// The order is determined by the chromosome (string comparison) and the start position and end position of the intervals.
26    pub fn iter(&self) -> Iter<'_, D> {
27        Iter(self.0.iter().map(|(k, v)| (k, v.iter())).collect::<VecDeque<_>>())
28    }
29
30    /// This is very inefficient and should be avoided if possible.
31    pub fn insert<B: BEDLike>(&mut self, bed: &B, value: D) {
32        let chr = bed.chrom();
33        let interval = Interval { start: bed.start(), stop: bed.end(), val: value };
34        let tree = self.0.entry(chr.to_string()).or_insert(Lapper::new(Vec::new()));
35        tree.insert(interval);
36    }
37
38    /// Return regions that overlaop with the query.
39    pub fn find<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, D> {
40        let chr = bed.chrom().to_string();
41        match self.0.get(&chr) {
42            None => GIntervalQueryIter { chrom: chr, query_iter: None },
43            Some(tree) => GIntervalQueryIter { chrom: chr, query_iter: Some(tree.find(bed.start(), bed.end())) }
44        }
45    }
46
47    /// Determine if the query overlaps with any record.
48    pub fn is_overlapped<B: BEDLike>(&self, bed: &B) -> bool {
49        self.find(bed).next().is_some()
50    }
51}
52
53impl<B: BEDLike, D> FromIterator<(B, D)> for GIntervalMap<D> {
54    fn from_iter<I: IntoIterator<Item = (B, D)>>(iter: I) -> Self {
55        let mut hmap: BTreeMap<String, Vec<_>> = BTreeMap::new();
56        for (bed, data) in iter {
57            let chr = bed.chrom();
58            let interval = Interval { start: bed.start(), stop: bed.end(), val: data};
59            let vec = hmap.entry(chr.to_string()).or_insert(Vec::new());
60            vec.push(interval);
61        }
62        let hm = hmap.into_iter().map(|(chr, vec)| (chr, Lapper::new(vec))).collect();
63        Self(hm)
64    }
65}
66
67/// An `GIntervalQueryIter` is returned by `Intervaltree::find` and iterates over the entries
68/// overlapping the query
69#[derive(Debug)]
70pub struct GIntervalQueryIter<'a, D> {
71    chrom: String,
72    query_iter: Option<IterFind<'a, u64, D>>,
73}
74
75impl<'a, D: 'a> Iterator for GIntervalQueryIter<'a, D> {
76    type Item = (GenomicRange, &'a D);
77
78    fn next(&mut self) -> Option<Self::Item> {
79        match self.query_iter {
80            None => return None,
81            Some(ref mut iter) => match iter.next() {
82                None => return None,
83                Some(interval) => {
84                    let bed = GenomicRange::new(&self.chrom, interval.start, interval.stop);
85                    Some((bed, &interval.val))
86                }
87            }
88        }
89    }
90}
91
92pub struct Iter<'a, D>(VecDeque<(&'a String, IterLapper<'a, u64, D>)>);
93
94impl<'a, D> Iterator for Iter<'a, D> {
95    type Item = (GenomicRange, &'a D);
96
97    fn next(&mut self) -> Option<Self::Item> {
98        if self.0.is_empty() {
99            return None
100        }
101        match self.0[0].1.next() {
102            None => {
103                self.0.pop_front();
104                self.next()
105            }
106            Some(interval) => {
107                let bed = GenomicRange::new(self.0[0].0, interval.start, interval.stop);
108                Some((bed, &interval.val))
109            }
110        }
111    }
112}
113
114/// A Bed map that preserves the order.
115#[derive(Debug, Clone)]
116pub struct GIntervalIndexMap<D> {
117    data: Vec<D>,
118    indices: GIntervalMap<usize>,
119}
120
121impl<D> GIntervalIndexMap<D> {
122    pub fn new() -> Self {
123        Self { data: Vec::new(), indices: GIntervalMap::new() }
124    }
125
126    pub fn len(&self) -> usize {
127        self.data.len()
128    }
129
130    /// Note the results may be returned in arbitrary order.
131    pub fn find<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = (GenomicRange, &D)> {
132        self.indices.find(bed).map(|(g, i)| (g, &self.data[*i]))
133    }
134
135    /// Note the results may be returned in arbitrary order.
136    pub fn find_index_of<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, usize> {
137        self.indices.find(bed)
138    }
139
140    pub fn get(&self, index: usize) -> Option<&D> {
141        self.data.get(index)
142    }
143}
144
145impl<B: BEDLike, D> FromIterator<(B, D)> for GIntervalIndexMap<D> {
146    fn from_iter<I: IntoIterator<Item = (B, D)>>(iter: I) -> Self {
147        let mut data = Vec::new();
148        let indices = iter.into_iter().enumerate().map(|(i, (bed, val))| {
149            data.push(val);
150            (bed, i)
151        }).collect();
152        Self { data, indices }
153    }
154}
155
156
157/// A Bed map that preserves the order.
158#[derive(Debug, Clone)]
159pub struct GIntervalIndexSet {
160    data: Vec<GenomicRange>,
161    indices: GIntervalMap<usize>,
162}
163
164impl Index<usize> for GIntervalIndexSet {
165    type Output = GenomicRange;
166
167    fn index(&self, index: usize) -> &Self::Output {
168        &self.data[index]
169    }
170}
171
172impl GIntervalIndexSet {
173    pub fn len(&self) -> usize {
174        self.data.len()
175    }
176
177    pub fn iter(&self) -> impl Iterator<Item = &GenomicRange> {
178        self.data.iter()
179    }
180
181    pub fn is_overlapped<B: BEDLike>(&self, bed: &B) -> bool {
182        self.find_full(bed).next().is_some()
183    }
184
185    /// Note the results may be returned in arbitrary order.
186    pub fn find<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = GenomicRange> + '_ {
187        self.indices.find(bed).map(|x| x.0)
188    }
189
190    /// Note the results may be returned in arbitrary order.
191    pub fn find_index_of<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = usize> + '_ {
192        self.indices.find(bed).map(|x| *x.1)
193    }
194
195    pub fn find_full<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, usize> {
196        self.indices.find(bed)
197    }
198
199    pub fn get(&self, index: usize) -> Option<&GenomicRange> {
200        self.data.get(index)
201    }
202}
203
204impl IntoIterator for GIntervalIndexSet {
205    type IntoIter = std::vec::IntoIter<Self::Item>;
206    type Item = GenomicRange;
207
208    fn into_iter(self) -> Self::IntoIter {
209        self.data.into_iter()
210    }
211}
212
213impl<B: BEDLike> FromIterator<B> for GIntervalIndexSet {
214    fn from_iter<I: IntoIterator<Item = B>>(iter: I) -> Self {
215        let mut data = Vec::new();
216        let indices = iter.into_iter().enumerate().map(|(i, bed)| {
217            data.push(bed.to_genomic_range());
218            (bed, i)
219        }).collect();
220        Self { data, indices }
221    }
222}
223
224#[cfg(test)]
225mod bed_intersect_tests {
226    use super::*;
227    use itertools::Itertools;
228
229    #[test]
230    fn test_create() {
231        let beds: Vec<GenomicRange> = [
232            "chr1:200-500",
233            "chr1:200-500",
234            "chr1:1000-2000",
235            "chr1:1000-2000",
236        ].into_iter().map(|x| x.parse().unwrap()).collect();
237        let values = vec![2, 3, 5, 5];
238        let mut map: GIntervalMap<_> = beds.into_iter().zip(values.into_iter()).collect();
239        map.insert(&GenomicRange::new("chr1", 1000, 2000), 5);
240        map.insert(&GenomicRange::new("chr1", 1000, 2000), 6);
241
242        let result: Vec<_> = map.iter().sorted().collect();
243        let expected = vec![
244            (GenomicRange::new("chr1", 200, 500), &2),
245            (GenomicRange::new("chr1", 200, 500), &3),
246            (GenomicRange::new("chr1", 1000, 2000), &5), 
247            (GenomicRange::new("chr1", 1000, 2000), &5), 
248            (GenomicRange::new("chr1", 1000, 2000), &5),
249            (GenomicRange::new("chr1", 1000, 2000), &6),
250        ];
251        assert_eq!(result, expected);
252    }
253
254    #[test]
255    fn test_intersect() {
256        let bed_set1: Vec<GenomicRange> = [
257            "chr1:200-500",
258            "chr1:200-500",
259            "chr1:1000-2000",
260        ].into_iter().map(|x| x.parse().unwrap()).collect();
261        let bed_set2: Vec<GenomicRange> = [
262            "chr1:100-210",
263            "chr1:100-200",
264        ].into_iter().map(|x| x.parse().unwrap()).collect();
265        let tree: GIntervalIndexSet = bed_set1.clone().into_iter().collect();
266
267        assert_eq!(
268            tree.find(&bed_set2[0]).collect::<Vec<_>>(),
269            vec![bed_set1.as_slice()[0].clone(), bed_set1.as_slice()[0].clone()]
270        );
271
272        let result: Vec<GenomicRange> = bed_set2.into_iter().filter(|x| tree.is_overlapped(x)).collect();
273        let expected = vec!["chr1:100-210".parse().unwrap()];
274        assert_eq!(result, expected);
275    }
276
277    #[test]
278    fn test_iter() {
279        fn random_bed() -> GenomicRange {
280            let chrom = format!("chr{}", rand::random::<u8>() % 10 + 1);
281            let start = rand::random::<u64>() % 1000;
282            let end = start + rand::random::<u64>() % 100;
283            GenomicRange::new(&chrom, start, end)
284        }
285        let beds: Vec<_> = (0..10000).map(|_| (random_bed(), ())).collect();
286        let map: GIntervalMap<()> = beds.clone().into_iter().collect();
287        assert_eq!(
288            map.iter().map(|(g, _)| g).collect::<Vec<_>>(),
289            beds.into_iter().map(|(g, _)| g).sorted().collect::<Vec<_>>(),
290        );
291    }
292}