bio/data_structures/
annot_map.rs

1//! Efficient container for locations annotated across a set of named
2//! reference sequences.
3//!
4//! # Example
5//!
6//! ```
7//! extern crate bio_types;
8//! use bio::data_structures::annot_map::AnnotMap;
9//! use bio_types::annot::contig::Contig;
10//! use bio_types::strand::ReqStrand;
11//!
12//! // Insert a String annotation into the annotation map at a specified location.
13//! let mut genes: AnnotMap<String, String> = AnnotMap::new();
14//! let tma22 = Contig::new(
15//!     "chrX".to_owned(),
16//!     461829,
17//!     462426 - 461829,
18//!     ReqStrand::Forward,
19//! );
20//! genes.insert_at("TMA22".to_owned(), &tma22);
21//!
22//! // Find annotations that overlap a specific query
23//! let query = Contig::new("chrX".to_owned(), 462400, 100, ReqStrand::Forward);
24//! let hits: Vec<&String> = genes.find(&query).map(|e| e.data()).collect();
25//! assert_eq!(hits, vec!["TMA22"]);
26//! ```
27
28use std::collections::HashMap;
29use std::hash::Hash;
30
31use crate::data_structures::interval_tree;
32use crate::data_structures::interval_tree::{IntervalTree, IntervalTreeIterator};
33use crate::utils::Interval;
34use bio_types::annot::loc::Loc;
35
36/// Efficient container for querying annotations, using `HashMap` and
37/// `IntervalTree`.
38///
39/// The container is parameterized over the type of the reference
40/// sequence names `R` (which is often a `String`) and the type of the
41/// contained objects `T`.
42///
43/// The container finds annotations that overlap a specific query
44/// location. Overlaps are identified without regard for strandedness
45/// and without regard for e.g. spliced-out introns within the
46/// annotation or the query.
47///
48/// Thus, the overlapping annotations identified by querying a
49/// `AnnotMap` may need further filtering.
50#[derive(Clone, Eq, PartialEq, Debug, Serialize, Deserialize)]
51pub struct AnnotMap<R, T>
52where
53    R: Hash + Eq,
54{
55    refid_itrees: HashMap<R, IntervalTree<isize, T>>,
56}
57
58impl<R, T> Default for AnnotMap<R, T>
59where
60    R: Eq + Hash,
61{
62    fn default() -> Self {
63        AnnotMap {
64            refid_itrees: HashMap::new(),
65        }
66    }
67}
68
69impl<R, T> AnnotMap<R, T>
70where
71    R: Eq + Hash,
72{
73    /// Create a new, empty `AnnotMap`. Used in conjunction with `insert_at`
74    /// or `insert_loc`.
75    pub fn new() -> Self {
76        Default::default()
77    }
78
79    /// Insert an object into the container at a specified location (`Loc`).
80    ///
81    /// # Arguments
82    ///
83    /// * `data` - any type of data to be inserted at the location / region
84    /// * `location` - any object with the `Loc` trait implemented, determining
85    ///   the Range at which to insert the `data`
86    ///
87    /// # Example
88    ///
89    /// ```
90    /// extern crate bio_types;
91    /// use bio::data_structures::annot_map::AnnotMap;
92    /// use bio_types::annot::contig::Contig;
93    /// use bio_types::strand::ReqStrand;
94    ///
95    /// let mut genes: AnnotMap<String, String> = AnnotMap::new();
96    /// let tma22 = Contig::new(
97    ///     "chrX".to_owned(),
98    ///     461829,
99    ///     462426 - 461829,
100    ///     ReqStrand::Forward,
101    /// );
102    /// genes.insert_at("TMA22".to_owned(), &tma22);
103    /// ```
104    pub fn insert_at<L>(&mut self, data: T, location: &L)
105    where
106        R: Eq + Hash + Clone,
107        L: Loc<RefID = R>,
108    {
109        let itree = self
110            .refid_itrees
111            .entry(location.refid().clone())
112            .or_default();
113        let rng = location.start()..(location.start() + (location.length() as isize));
114        itree.insert(rng, data);
115    }
116
117    /// Create an `Iterator` that will visit all entries that overlap
118    /// a query location.
119    pub fn find<'a, L>(&'a self, location: &'a L) -> AnnotMapIterator<'a, R, T>
120    where
121        L: Loc<RefID = R>,
122    {
123        if let Some(itree) = self.refid_itrees.get(location.refid()) {
124            let interval = location.start()..(location.start() + (location.length() as isize));
125            let itree_iter = itree.find(interval);
126            AnnotMapIterator {
127                itree_iter: Some(itree_iter),
128                refid: location.refid(),
129            }
130        } else {
131            AnnotMapIterator {
132                itree_iter: None,
133                refid: location.refid(),
134            }
135        }
136    }
137}
138
139impl<R, T> AnnotMap<R, T>
140where
141    R: Eq + Hash + Clone,
142    T: Loc<RefID = R>,
143{
144    /// Insert an object with the `Loc` trait into the container at
145    /// its location.
146    ///
147    /// This inserts all of `data` at the Range of length `data.length()`
148    /// that starts at `data.start()`.
149    ///
150    /// # Example
151    ///
152    /// ```
153    /// extern crate bio_types;
154    /// use bio::data_structures::annot_map::AnnotMap;
155    /// use bio_types::annot::contig::Contig;
156    /// use bio_types::strand::ReqStrand;
157    ///
158    /// let mut gene_locs = AnnotMap::new();
159    /// let tma19 = Contig::new(
160    ///     String::from("chrXI"),
161    ///     334412,
162    ///     (334916 - 334412),
163    ///     ReqStrand::Reverse,
164    /// );
165    /// let assert_copy = tma19.clone();
166    /// gene_locs.insert_loc(tma19);
167    /// // Find annotations that overlap a specific query
168    /// let query = Contig::new(String::from("chrXI"), 334400, 100, ReqStrand::Reverse);
169    /// let hits: Vec<&Contig<String, ReqStrand>> = gene_locs.find(&query).map(|e| e.data()).collect();
170    /// assert_eq!(hits, vec![&assert_copy]);
171    /// ```
172    pub fn insert_loc(&mut self, data: T) {
173        let itree = self.refid_itrees.entry(data.refid().clone()).or_default();
174        let rng = data.start()..(data.start() + (data.length() as isize));
175        itree.insert(rng, data);
176    }
177}
178
179/// A view of one annotation in a `AnnotMap` container.
180#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug, Serialize)]
181pub struct Entry<'a, R, T>
182where
183    R: Eq + Hash,
184{
185    itree_entry: interval_tree::Entry<'a, isize, T>,
186    refid: &'a R,
187}
188
189impl<'a, R, T> Entry<'a, R, T>
190where
191    R: Eq + Hash,
192{
193    /// Return a reference to the data value in the `AnnotMap`.
194    pub fn data(&self) -> &'a T {
195        self.itree_entry.data()
196    }
197
198    /// Return a reference to the interval spanned by the annotation.
199    pub fn interval(&self) -> &'a Interval<isize> {
200        self.itree_entry.interval()
201    }
202
203    /// Return a reference to the identifier of the annotated reference sequence.
204    pub fn refid(&self) -> &'a R {
205        self.refid
206    }
207}
208
209/// An iterator over annotation entries (of type `Entry`) in a
210/// `AnnotMap`.
211///
212/// This struct is created by the `find` function on `AnnotMap`.
213#[derive(Clone, Eq, PartialEq, Hash, Debug, Serialize)]
214pub struct AnnotMapIterator<'a, R, T>
215where
216    R: Eq + Hash,
217{
218    itree_iter: Option<IntervalTreeIterator<'a, isize, T>>,
219    refid: &'a R,
220}
221
222impl<'a, R, T> Iterator for AnnotMapIterator<'a, R, T>
223where
224    R: 'a + Eq + Hash,
225    T: 'a,
226{
227    type Item = Entry<'a, R, T>;
228
229    fn next(&mut self) -> Option<Self::Item> {
230        match self.itree_iter {
231            Some(ref mut iter) => match iter.next() {
232                Some(next_itree) => Some(Entry {
233                    itree_entry: next_itree,
234                    refid: self.refid,
235                }),
236                None => None,
237            },
238            None => None,
239        }
240    }
241}
242
243#[cfg(test)]
244mod tests {
245    use super::*;
246
247    use bio_types::annot::contig::Contig;
248    use bio_types::strand::ReqStrand;
249
250    #[test]
251    fn lookup() {
252        let mut genes: AnnotMap<String, String> = AnnotMap::new();
253        genes.insert_at(
254            "TMA22".to_owned(),
255            &Contig::new(
256                "chrX".to_owned(),
257                461829,
258                462426 - 461829,
259                ReqStrand::Forward,
260            ),
261        );
262        genes.insert_at(
263            "TMA19".to_owned(),
264            &Contig::new(
265                "chrXI".to_owned(),
266                334412,
267                334916 - 334412,
268                ReqStrand::Reverse,
269            ),
270        );
271
272        let query = Contig::new("chrX".to_owned(), 462400, 100, ReqStrand::Forward);
273        let hits: Vec<&String> = genes.find(&query).map(|e| e.data()).collect();
274        assert_eq!(hits, vec!["TMA22"]);
275
276        let query = Contig::new("chrXI".to_owned(), 334400, 100, ReqStrand::Forward);
277        let hits: Vec<&String> = genes.find(&query).map(|e| e.data()).collect();
278        assert_eq!(hits, vec!["TMA19"]);
279
280        let query = Contig::new("chrXI".to_owned(), 334916, 100, ReqStrand::Forward);
281        let hits: Vec<&String> = genes.find(&query).map(|e| e.data()).collect();
282        assert!(hits.is_empty());
283
284        let query = Contig::new("chrX".to_owned(), 461729, 100, ReqStrand::Forward);
285        let hits: Vec<&String> = genes.find(&query).map(|e| e.data()).collect();
286        assert!(hits.is_empty());
287
288        let query = Contig::new("chrXI".to_owned(), 462400, 100, ReqStrand::Forward);
289        let hits: Vec<&String> = genes.find(&query).map(|e| e.data()).collect();
290        assert!(hits.is_empty());
291
292        let query = Contig::new("NotFound".to_owned(), 0, 0, ReqStrand::Forward);
293        let hits: Vec<&String> = genes.find(&query).map(|e| e.data()).collect();
294        assert!(hits.is_empty());
295    }
296
297    #[test]
298    fn overlaps() {
299        let mut genes: AnnotMap<String, String> = AnnotMap::new();
300
301        genes.insert_at(
302            "a".to_owned(),
303            &Contig::new("chr01".to_owned(), 1000, 1000, ReqStrand::Forward),
304        );
305        genes.insert_at(
306            "b".to_owned(),
307            &Contig::new("chr01".to_owned(), 1300, 1000, ReqStrand::Forward),
308        );
309        genes.insert_at(
310            "c".to_owned(),
311            &Contig::new("chr01".to_owned(), 1700, 1000, ReqStrand::Forward),
312        );
313        genes.insert_at(
314            "d".to_owned(),
315            &Contig::new("chr01".to_owned(), 2200, 1000, ReqStrand::Forward),
316        );
317
318        let query = Contig::new("chr01".to_owned(), 1050, 100, ReqStrand::Forward);
319        let mut hits: Vec<&String> = genes.find(&query).map(|e| e.data()).collect();
320        hits.sort();
321        assert_eq!(hits, vec!["a"]);
322
323        let query = Contig::new("chr01".to_owned(), 1450, 100, ReqStrand::Forward);
324        let mut hits: Vec<&String> = genes.find(&query).map(|e| e.data()).collect();
325        hits.sort();
326        assert_eq!(hits, vec!["a", "b"]);
327
328        let query = Contig::new("chr01".to_owned(), 1850, 100, ReqStrand::Forward);
329        let mut hits: Vec<&String> = genes.find(&query).map(|e| e.data()).collect();
330        hits.sort();
331        assert_eq!(hits, vec!["a", "b", "c"]);
332
333        let query = Contig::new("chr01".to_owned(), 2250, 100, ReqStrand::Forward);
334        let mut hits: Vec<&String> = genes.find(&query).map(|e| e.data()).collect();
335        hits.sort();
336        assert_eq!(hits, vec!["b", "c", "d"]);
337
338        let query = Contig::new("chr01".to_owned(), 2650, 100, ReqStrand::Forward);
339        let mut hits: Vec<&String> = genes.find(&query).map(|e| e.data()).collect();
340        hits.sort();
341        assert_eq!(hits, vec!["c", "d"]);
342    }
343}