bed_utils/bed/
map.rs

1use std::ops::Index;
2use std::collections::{HashMap, VecDeque};
3
4use super::{GenomicRange, BEDLike};
5use crate::intervaltree::{Interval, IterFind, IterLapper, Lapper};
6
7/// A map from genomic internvals to values, internally represented as a interval tree
8/// for efficient queries.
9/// Note that GIntervalMap permits duplicated records.
10#[derive(Debug, Clone)]
11pub struct GIntervalMap<D>(HashMap<String, Lapper<u64, D>>);
12
13impl<D> GIntervalMap<D> {
14    pub fn new() -> Self {
15        Self(HashMap::new())
16    }
17
18    /// Return the number of records.
19    pub fn len(&self) -> usize {
20        self.0.values().map(|x| x.len()).sum()
21    }
22
23    pub fn iter(&self) -> Iter<'_, D> {
24        Iter(self.0.iter().map(|(k, v)| (k, v.iter())).collect::<VecDeque<_>>())
25    }
26
27    /// This is very inefficient and should be avoided if possible.
28    pub fn insert<B: BEDLike>(&mut self, bed: &B, value: D) {
29        let chr = bed.chrom();
30        let interval = Interval { start: bed.start(), stop: bed.end(), val: value };
31        let tree = self.0.entry(chr.to_string()).or_insert(Lapper::new(Vec::new()));
32        tree.insert(interval);
33    }
34
35    /// Return regions that overlaop with the query.
36    pub fn find<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, D> {
37        let chr = bed.chrom().to_string();
38        match self.0.get(&chr) {
39            None => GIntervalQueryIter { chrom: chr, query_iter: None },
40            Some(tree) => GIntervalQueryIter { chrom: chr, query_iter: Some(tree.find(bed.start(), bed.end())) }
41        }
42    }
43
44    /// Determine if the query overlaps with any record.
45    pub fn is_overlapped<B: BEDLike>(&self, bed: &B) -> bool {
46        self.find(bed).next().is_some()
47    }
48}
49
50impl<B: BEDLike, D> FromIterator<(B, D)> for GIntervalMap<D> {
51    fn from_iter<I: IntoIterator<Item = (B, D)>>(iter: I) -> Self {
52        let mut hmap: HashMap<String, Vec<_>> = HashMap::new();
53        for (bed, data) in iter {
54            let chr = bed.chrom();
55            let interval = Interval { start: bed.start(), stop: bed.end(), val: data};
56            let vec = hmap.entry(chr.to_string()).or_insert(Vec::new());
57            vec.push(interval);
58        }
59        let hm = hmap.into_iter().map(|(chr, vec)| (chr, Lapper::new(vec))).collect();
60        Self(hm)
61    }
62}
63
64/// An `GIntervalQueryIter` is returned by `Intervaltree::find` and iterates over the entries
65/// overlapping the query
66#[derive(Debug)]
67pub struct GIntervalQueryIter<'a, D> {
68    chrom: String,
69    query_iter: Option<IterFind<'a, u64, D>>,
70}
71
72impl<'a, D: 'a> Iterator for GIntervalQueryIter<'a, D> {
73    type Item = (GenomicRange, &'a D);
74
75    fn next(&mut self) -> Option<Self::Item> {
76        match self.query_iter {
77            None => return None,
78            Some(ref mut iter) => match iter.next() {
79                None => return None,
80                Some(interval) => {
81                    let bed = GenomicRange::new(&self.chrom, interval.start, interval.stop);
82                    Some((bed, &interval.val))
83                }
84            }
85        }
86    }
87}
88
89pub struct Iter<'a, D>(VecDeque<(&'a String, IterLapper<'a, u64, D>)>);
90
91impl<'a, D> Iterator for Iter<'a, D> {
92    type Item = (GenomicRange, &'a D);
93
94    fn next(&mut self) -> Option<Self::Item> {
95        if self.0.is_empty() {
96            return None
97        }
98        match self.0[0].1.next() {
99            None => {
100                self.0.pop_front();
101                self.next()
102            }
103            Some(interval) => {
104                let bed = GenomicRange::new(self.0[0].0, interval.start, interval.stop);
105                Some((bed, &interval.val))
106            }
107        }
108    }
109}
110
111/// A Bed map that preserves the order.
112#[derive(Debug, Clone)]
113pub struct GIntervalIndexMap<D> {
114    data: Vec<D>,
115    indices: GIntervalMap<usize>,
116}
117
118impl<D> GIntervalIndexMap<D> {
119    pub fn new() -> Self {
120        Self { data: Vec::new(), indices: GIntervalMap::new() }
121    }
122
123    pub fn len(&self) -> usize {
124        self.data.len()
125    }
126
127    /// Note the results may be returned in arbitrary order.
128    pub fn find<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = (GenomicRange, &D)> {
129        self.indices.find(bed).map(|(g, i)| (g, &self.data[*i]))
130    }
131
132    /// Note the results may be returned in arbitrary order.
133    pub fn find_index_of<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, usize> {
134        self.indices.find(bed)
135    }
136
137    pub fn get(&self, index: usize) -> Option<&D> {
138        self.data.get(index)
139    }
140}
141
142impl<B: BEDLike, D> FromIterator<(B, D)> for GIntervalIndexMap<D> {
143    fn from_iter<I: IntoIterator<Item = (B, D)>>(iter: I) -> Self {
144        let mut data = Vec::new();
145        let indices = iter.into_iter().enumerate().map(|(i, (bed, val))| {
146            data.push(val);
147            (bed, i)
148        }).collect();
149        Self { data, indices }
150    }
151}
152
153
154/// A Bed map that preserves the order.
155#[derive(Debug, Clone)]
156pub struct GIntervalIndexSet {
157    data: Vec<GenomicRange>,
158    indices: GIntervalMap<usize>,
159}
160
161impl Index<usize> for GIntervalIndexSet {
162    type Output = GenomicRange;
163
164    fn index(&self, index: usize) -> &Self::Output {
165        &self.data[index]
166    }
167}
168
169impl GIntervalIndexSet {
170    pub fn len(&self) -> usize {
171        self.data.len()
172    }
173
174    pub fn iter(&self) -> impl Iterator<Item = &GenomicRange> {
175        self.data.iter()
176    }
177
178    pub fn is_overlapped<B: BEDLike>(&self, bed: &B) -> bool {
179        self.find_full(bed).next().is_some()
180    }
181
182    /// Note the results may be returned in arbitrary order.
183    pub fn find<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = GenomicRange> + '_ {
184        self.indices.find(bed).map(|x| x.0)
185    }
186
187    /// Note the results may be returned in arbitrary order.
188    pub fn find_index_of<B: BEDLike>(&self, bed: &B) -> impl Iterator<Item = usize> + '_ {
189        self.indices.find(bed).map(|x| *x.1)
190    }
191
192    pub fn find_full<B: BEDLike>(&self, bed: &B) -> GIntervalQueryIter<'_, usize> {
193        self.indices.find(bed)
194    }
195
196    pub fn get(&self, index: usize) -> Option<&GenomicRange> {
197        self.data.get(index)
198    }
199}
200
201impl IntoIterator for GIntervalIndexSet {
202    type IntoIter = std::vec::IntoIter<Self::Item>;
203    type Item = GenomicRange;
204
205    fn into_iter(self) -> Self::IntoIter {
206        self.data.into_iter()
207    }
208}
209
210impl<B: BEDLike> FromIterator<B> for GIntervalIndexSet {
211    fn from_iter<I: IntoIterator<Item = B>>(iter: I) -> Self {
212        let mut data = Vec::new();
213        let indices = iter.into_iter().enumerate().map(|(i, bed)| {
214            data.push(bed.to_genomic_range());
215            (bed, i)
216        }).collect();
217        Self { data, indices }
218    }
219}
220
221#[cfg(test)]
222mod bed_intersect_tests {
223    use super::*;
224    use itertools::Itertools;
225
226    #[test]
227    fn test_create() {
228        let beds: Vec<GenomicRange> = [
229            "chr1:200-500",
230            "chr1:200-500",
231            "chr1:1000-2000",
232            "chr1:1000-2000",
233        ].into_iter().map(|x| x.parse().unwrap()).collect();
234        let values = vec![2, 3, 5, 5];
235        let mut map: GIntervalMap<_> = beds.into_iter().zip(values.into_iter()).collect();
236        map.insert(&GenomicRange::new("chr1", 1000, 2000), 5);
237        map.insert(&GenomicRange::new("chr1", 1000, 2000), 6);
238
239        let result: Vec<_> = map.iter().sorted().collect();
240        let expected = vec![
241            (GenomicRange::new("chr1", 200, 500), &2),
242            (GenomicRange::new("chr1", 200, 500), &3),
243            (GenomicRange::new("chr1", 1000, 2000), &5), 
244            (GenomicRange::new("chr1", 1000, 2000), &5), 
245            (GenomicRange::new("chr1", 1000, 2000), &5),
246            (GenomicRange::new("chr1", 1000, 2000), &6),
247        ];
248        assert_eq!(result, expected);
249    }
250
251    #[test]
252    fn test_intersect() {
253        let bed_set1: Vec<GenomicRange> = [
254            "chr1:200-500",
255            "chr1:200-500",
256            "chr1:1000-2000",
257        ].into_iter().map(|x| x.parse().unwrap()).collect();
258        let bed_set2: Vec<GenomicRange> = [
259            "chr1:100-210",
260            "chr1:100-200",
261        ].into_iter().map(|x| x.parse().unwrap()).collect();
262        let tree: GIntervalIndexSet = bed_set1.clone().into_iter().collect();
263
264        assert_eq!(
265            tree.find(&bed_set2[0]).collect::<Vec<_>>(),
266            vec![bed_set1.as_slice()[0].clone(), bed_set1.as_slice()[0].clone()]
267        );
268
269        let result: Vec<GenomicRange> = bed_set2.into_iter().filter(|x| tree.is_overlapped(x)).collect();
270        let expected = vec!["chr1:100-210".parse().unwrap()];
271        assert_eq!(result, expected);
272
273    }
274}