nclist/
lib.rs

1//! A nested containment list (NClist) is a datastructure that can be queried for elements
2//! overlapping intervals. It was invented and published by Alexander V and Alekseyenko Christopher
3//! J. Lee in Bioinformatics in
4//! 2007 (doi: [10.1093/bioinformatics/btl647](https://doi.org/10.1093/bioinformatics/btl647)).
5//!
6//! # How it works
7//! The `NClist` internals rely on the observation that when a set of intervals, where all are
8//! non-contained (based on their interval bounds) in any of the other intervals in the set, are
9//! sorted on their start coordinate are also sorted on their end coordinate. If this requirement
10//! is fulfilled the items overlapping an interval can be found by a binary search on the query
11//! start and returning items until the query end coordinate has been passed, giving a complexity
12//! of `O(log(N) + M)` where N is the size of the set and M is the number of overlaps.
13//!
14//! The only remaining problem is the intervals that **are** contained in another interval. This was
15//! solved by taking out these intervals and storing them in a separate set, linking this set to the
16//! original interval. Now when you search for overlaps you check for contained intervals and also
17//! search these nested sets. This can be implemented recursively (as shown in the paper) or
18//! using a queue (which was used for this crate). This means that the worst case complexity becomes
19//! O(N) if all intervals are contained within its parent
20//!
21//! The linked article also provides details about an on-disk version that can also be efficiently
22//! searched, but this crate implementation is in-memory and stores the items in a (single) `Vec`.
23//!
24//! # How to use
25//! You can create a searchable `NClist<T>` from a `Vec<T>` if you implement the
26//! `Interval` trait for `T` The `Interval` trait also requires that `T` is `Ord`. Creating the
27//! NClist validates that the end coordinate is greater than start. This means negative and
28//! zero-width intervals cannot be used in an `NClist<T>`.
29//!
30//! # Example
31//! ```
32//! use nclist::NClist;
33//! // Given a set of `T` where `T` implements the `Interval` trait
34//! let v = vec![(10..15), (10..20), (1..8)];
35//! // Create the NClist, this consumes v
36//! let nc = NClist::from_vec(v).unwrap();
37//! // count overlaps, the query is provided as a reference to a `std::ops::Range`
38//! assert_eq!(nc.count_overlaps(&(10..12)), 2);
39//! // remember, intervals are half open
40//! assert_eq!(nc.count_overlaps(&(20..30)), 0);
41//!
42//! //or query them using an iterator
43//! let mut q = nc.overlaps(&(7..10));
44//! assert_eq!(q.next(), Some(&(1..8)));
45//! assert_eq!(q.next(), None);
46//!
47//! ```
48//!
49//! # Recommendations for use
50//! The `NClist<T>` is not mutable. Any mutable access to the items could invalidate the
51//! interval bounds (interior mutability using for example a `RefCell` could solve this). Also
52//! insertion and deletion are not supported. I can speculate that an interval tree would also be a
53//! better for this type of access. For usage in bioinformatics where interval data is often
54//! provided as (sorted) lists (gff, gtf, bed) the `NClist<T>` is a perfect fit and has very nice
55//! ergonomics.  Obviously the implementation works better when nesting depth is limited, but
56//! performance in simple tests seem consistently better than rust-bio's IntervalTree
57use std::collections::VecDeque;
58use std::convert::TryFrom;
59use std::ops::Range;
60
61use itertools::Itertools;
62
63/// The interval trait needs to be implemented for `T` before you can create an `NClist<T>`.
64/// An interval is half-open, inclusive start and exclusive end (like `std::ops::Range<T>`), but 
65/// `end > start` must always be true.
66pub trait Interval {
67    /// The coordinate type of the interval. This type must implement `std::ops::Ord`
68    type Coord: Ord;
69
70    /// Return a reference to the start coordinate of the interval (inclusive)
71    fn start(&self) -> &Self::Coord;
72
73    /// Return a reference to the end coordinate of the interval (non-inclusive)
74    fn end(&self) -> &Self::Coord;
75}
76
77/// Interval is already implemented for `std::ops::Range`.
78impl<N> Interval for Range<N> where N: Ord{
79    type Coord = N;
80
81    #[inline(always)]
82    fn start(&self) -> &Self::Coord {
83        &self.start
84    }
85
86    #[inline(always)]
87    fn end(&self) -> &Self::Coord {
88        &self.end
89    }
90}
91
92#[derive(Debug)]
93pub struct NClist<T> where T: Interval {
94    intervals: Vec<T>,
95    contained: Vec<Option<(usize, usize)>>
96}
97
98struct SlicedNClist<'a, T> where T: 'a + Interval {
99    intervals: &'a [T],
100    contained: &'a [Option<(usize, usize)>],
101    stop_at: &'a T::Coord
102}
103
104pub struct Overlaps<'a, T> where T: 'a + Interval {
105    nclist: &'a NClist<T>,
106    range:  &'a Range<T::Coord>,
107    current_pos: usize,
108    current_end: usize,
109    sublists: VecDeque<(usize, usize)>,
110}
111
112pub struct OrderedOverlaps<'a, T> where T: 'a + Interval {
113    nclist: &'a NClist<T>,
114    range:  &'a Range<T::Coord>,
115    current: SlicedNClist<'a, T>,
116    queue: Vec<SlicedNClist<'a, T>>
117}
118
119impl<T> NClist<T> where T: Interval {
120    fn new() -> NClist<T> {
121        NClist { intervals: Vec::new(), contained: vec![Some((0,0))] }
122    }
123
124    pub fn from_vec(mut v: Vec<T>) -> Result<NClist<T>, &'static str> {
125        if v.iter().any(|e| e.end() <= e.start()) {
126            return Err("Cannot use intervals with zero or negative width");
127        }
128        v.sort_by(|a, b| a.start().cmp(b.start())
129                  .then(a.end().cmp(b.end()).reverse()));
130
131        let mut list = NClist::new();
132        let mut sublists = VecDeque::from(vec![NClistBuilder { intervals: v, contained_pos: 0}]);
133
134        while !sublists.is_empty() {
135            build_nclist(&mut sublists, &mut list);
136        }
137        Ok(list)
138    }
139
140    /// Count the number of elements overlapping the `Range` r. Counting overlaps is slightly
141    /// faster than iterating over the overlaps. This method is preferred when only the number of
142    /// overlapping elements is required.
143    pub fn count_overlaps(&self, r: &Range<T::Coord>) -> usize {
144        if r.end <= r.start {
145            return 0;
146        }
147        let mut count = 0;
148        let mut queue = VecDeque::new();
149        queue.push_back(self.contained[0].unwrap());
150        while let Some((start, end)) = queue.pop_front() {
151            self.slice(start, end, &r.start, &r.end)
152                .for_each(|(_, contained)| {
153                    count += 1;
154                    if let Some(subrange) = *contained {
155                        queue.push_back(subrange);
156                    }
157                });
158        }
159        count
160    }
161
162    /// Returns an iterator that returns overlapping elements to query `r`. During iteration
163    /// contained intervals are pushed to a queue an processed in order after yielding the
164    /// non-overlapping regions.
165    pub fn overlaps<'a>(&'a self, r: &'a Range<T::Coord>) -> Overlaps<'a , T> {
166        let current_slice = self.contained[0].as_ref().unwrap();
167        //empty or negative width intervals do not overlap anything
168        let start = if r.end > r.start {
169            self.bin_search_end(current_slice.0, current_slice.1, &r.start)
170        } else {
171            current_slice.1
172        };
173
174        Overlaps { nclist: self, range: r, current_pos: start, current_end: current_slice.1, sublists: VecDeque::new() }
175    }
176
177    /// Returns an iterator that returns overlapping elements to query `r` ordered by start
178    /// coordinate. This is less efficient that returning without ordering, but doesn't require
179    /// allocating storage for all overlapping elements.
180    pub fn overlaps_ordered<'a>(&'a self, r: &'a Range<T::Coord>) -> OrderedOverlaps<'a , T> {
181        let &(mut start, end) = self.contained[0].as_ref().unwrap();
182        if r.end <= r.start {
183            start = end;
184        }
185        OrderedOverlaps { nclist: self, range: r, current: self.slice(start, end, &r.start, &r.end), queue: Vec::new() }
186    }
187
188    /// Return the intervals `Vec`. This will run without allocation and return the intervals in a
189    /// different order then provided.
190    pub fn into_vec(self) -> Vec<T> {
191        self.into()
192    }
193
194    #[inline]
195    fn slice<'a>(&'a self, mut start: usize, end: usize, q:  &T::Coord, q_end: &'a T::Coord) -> SlicedNClist<'a, T> {
196        start += match self.intervals[start..end].binary_search_by(|e| e.end().cmp(q))
197        {
198            Ok(n) => n + 1,
199            Err(n) => n
200        };
201        SlicedNClist { intervals: &self.intervals[start..end], contained: &self.contained[start+1..end+1], stop_at: q_end }
202    }
203
204    #[inline]
205    fn bin_search_end(&self, start: usize, end: usize, q: &T::Coord) -> usize {
206        match self.intervals[start..end].binary_search_by(|e| e.end().cmp(q)) {
207            Ok(n) => n + 1,
208            Err(n) => n
209        }
210    }
211}
212
213impl<'a, T> Iterator for SlicedNClist<'a, T> where T: Interval {
214    type Item = (&'a T, &'a Option<(usize, usize)>);
215    #[inline]
216    fn next(&mut self) -> Option<Self::Item> {
217        if let Some((i, ref mut intervals)) = self.intervals.split_first() {
218            if i.start() >= self.stop_at {
219                None
220            } else {
221                let (c, ref mut contained) = self.contained.split_first().unwrap();
222                std::mem::replace(&mut self.intervals, intervals);
223                std::mem::replace(&mut self.contained, contained);
224                Some((i, c))
225            }
226        } else {
227            None
228        }
229    }
230}
231
232impl<'a, T> Iterator for Overlaps<'a, T> where T: Interval {
233    type Item = &'a T;
234    #[inline]
235    fn next(&mut self) -> Option<Self::Item> {
236        let remaining = self.current_end - self.current_pos;
237
238        if remaining == 0 || *self.nclist.intervals[self.current_pos].start() >= self.range.end {
239            if let Some((mut new_start, new_end)) = self.sublists.pop_front() {
240                new_start += self.nclist.bin_search_end(new_start, new_end, &self.range.start);
241                self.current_pos = new_start;
242                self.current_end = new_end;
243                self.next()
244            } else {
245                None
246            }
247        } else {
248            let pos = self.current_pos;
249            self.current_pos += 1;
250            if let Some(next_sublist) = self.nclist.contained[self.current_pos] {
251                self.sublists.push_back(next_sublist);
252            }
253            Some(&self.nclist.intervals[pos])
254        }
255    }
256}
257
258impl<'a, T> Iterator for OrderedOverlaps<'a, T> where T: Interval {
259    type Item = &'a T;
260    #[inline]
261    fn next(&mut self) -> Option<Self::Item> {
262        if let Some((interval, contained)) = self.current.next() {
263            if let Some((start, end)) = *contained {
264                let mut ns = self.nclist.slice(start, end, &self.range.start, &self.range.end);
265                std::mem::swap(&mut self.current, &mut ns);
266                self.queue.push(ns);
267            }
268            Some(interval)
269        } else if let Some(sublist) = self.queue.pop() {
270            self.current = sublist;
271            self.next()
272        } else {
273            None
274        }
275    }
276}
277
278/// Internal intermediate sublist used for creating `NClist<T>`
279struct NClistBuilder<T> {
280    intervals: Vec<T>,
281    contained_pos: usize,
282}
283
284fn build_nclist<T: Interval>(sublists: &mut VecDeque<NClistBuilder<T>>, result: &mut NClist<T>) {
285    if let Some(sublist) = sublists.pop_front() {
286        //iterate over all ranges and take out contained intervals
287        let mut it = sublist.intervals.into_iter().peekable();
288
289        let sublist_start = result.intervals.len();
290        while let Some(e) = it.next() {
291            let interval_pos = result.intervals.len();
292            let contained: Vec<_> = it.peeking_take_while(|n| n.end() < e.end()).collect();
293            if !contained.is_empty() {
294                sublists.push_back(NClistBuilder {intervals: contained, contained_pos: interval_pos + 1});
295            }
296            result.intervals.push(e);
297            result.contained.push(None);
298        }
299
300        //store the position and the length of the sublist
301        result.contained[sublist.contained_pos] = Some((sublist_start, result.intervals.len()));
302    }
303}
304
305impl<T> TryFrom<Vec<T>> for NClist<T> where T: Interval {
306    type Error = &'static str;
307    fn try_from(v: Vec<T>) -> Result<Self, Self::Error> {
308        NClist::from_vec(v)
309    }
310}
311
312/// Return the intervals `Vec`. This will run without allocation and return the intervals in a
313/// different order then provided.
314impl<T> Into<Vec<T>> for NClist<T> where T: Interval {
315    fn into(self) -> Vec<T> {
316        self.intervals
317    }
318}
319
320#[cfg(test)]
321mod tests {
322    use super::*;
323
324    // This test is copied from Rust's stdlib. This software relies on the fact that a binary
325    // search returns the last matching element. If the stdlib implementation changes this should
326    // be caught.
327    //
328    // See:
329    // https://github.com/rust-lang/rust/blob/975e83a32ad8c2c894391711d227786614d61a50/src/libcore/tests/slice.rs#L68
330    #[test]
331    fn test_binary_search_implementation_details() {
332        let b = [1, 1, 2, 2, 3, 3, 3];
333        assert_eq!(b.binary_search(&1), Ok(1));
334        assert_eq!(b.binary_search(&2), Ok(3));
335        assert_eq!(b.binary_search(&3), Ok(6));
336        let b = [1, 1, 1, 1, 1, 3, 3, 3, 3];
337        assert_eq!(b.binary_search(&1), Ok(4));
338        assert_eq!(b.binary_search(&3), Ok(8));
339        let b = [1, 1, 1, 1, 3, 3, 3, 3, 3];
340        assert_eq!(b.binary_search(&1), Ok(3));
341        assert_eq!(b.binary_search(&3), Ok(8));
342    }
343
344    #[test]
345    fn from_vec() {
346        let list: Vec<Range<u64>> = vec![(10..15), (10..20), (1..8)].into_iter().collect();
347        let nclist = NClist::from_vec(list).unwrap();
348
349        assert_eq!(nclist.intervals.len(), 3);
350        assert!(nclist.contained[0].is_some());
351        assert!(nclist.contained[1].is_none());
352        assert!(nclist.contained[2].is_some());
353        assert!(nclist.contained[3].is_none());
354
355        let list: Vec<Range<u64>> = Vec::new();
356        let nclist = NClist::from_vec(list).unwrap();
357        assert_eq!(nclist.intervals.len(), 0);
358    }
359
360    #[test]
361    fn interval_width() {
362        let list: Vec<Range<u64>> = vec![(5..20), (19..20), (7..7)].into_iter().collect();
363        assert!(NClist::from_vec(list).is_err());
364        let list: Vec<Range<u64>> = vec![(5..20), (20..19), (7..8)].into_iter().collect();
365        assert!(NClist::from_vec(list).is_err());
366    }
367
368    #[test]
369    fn illegal_width_queries() {
370        let list: Vec<Range<u64>> = vec![(5..20), (19..20), (7..8)].into_iter().collect();
371        let nclist = NClist::from_vec(list).unwrap();
372        assert_eq!(nclist.count_overlaps(&(7..7)), 0);
373        assert_eq!(nclist.count_overlaps(&(8..7)), 0);
374        assert_eq!(nclist.count_overlaps(&(19..19)), 0);
375        assert_eq!(nclist.overlaps(&(7..7)).count(), 0);
376        assert_eq!(nclist.overlaps(&(8..7)).count(), 0);
377        assert_eq!(nclist.overlaps(&(19..19)).count(), 0);
378        assert_eq!(nclist.overlaps_ordered(&(7..7)).count(), 0);
379        assert_eq!(nclist.overlaps_ordered(&(8..7)).count(), 0);
380        assert_eq!(nclist.overlaps_ordered(&(19..19)).count(), 0);
381    }
382
383    #[test]
384    fn count() {
385        let list: Vec<Range<u64>> = vec![(10..15), (10..20), (1..8)].into_iter().collect();
386        let nclist = NClist::from_vec(list).unwrap();
387
388        assert_eq!(nclist.intervals.len(), 3);
389        assert_eq!(nclist.count_overlaps(&(5..20)), 3);
390        assert_eq!(nclist.count_overlaps(&(14..18)), 2);
391        assert_eq!(nclist.count_overlaps(&(150..180)), 0);
392        assert_eq!(nclist.count_overlaps(&(10..10)), 0);
393        assert_eq!(nclist.count_overlaps(&(10..11)), 2);
394        assert_eq!(nclist.count_overlaps(&(9..10)), 0);
395        assert_eq!(nclist.count_overlaps(&(8..9)), 0);
396        assert_eq!(nclist.count_overlaps(&(8..10)), 0);
397        assert_eq!(nclist.count_overlaps(&(20..100)), 0);
398
399        let list: Vec<Range<u64>> = Vec::new();
400        let nclist = NClist::from_vec(list).unwrap();
401        assert_eq!(nclist.count_overlaps(&(100..200)), 0);
402
403    }
404    #[test]
405    fn overlaps() {
406        let list: Vec<Range<u64>> = vec![(10..15), (10..20), (1..8)].into_iter().collect();
407        let nclist = NClist::from_vec(list).unwrap();
408
409        assert_eq!(nclist.intervals.len(), 3);
410        assert_eq!(nclist.overlaps(&(5..20)).count(), 3);
411
412        let mut q = nclist.overlaps_ordered(&(5..20));
413        assert_eq!(q.next(), Some(&(1..8)));
414        assert_eq!(q.next(), Some(&(10..20)));
415        assert_eq!(q.next(), Some(&(10..15)));
416        assert_eq!(q.next(), None);
417
418        assert_eq!(nclist.overlaps(&(20..100)).count(), 0);
419        assert_eq!(nclist.overlaps(&(8..10)).count(), 0);
420        assert_eq!(nclist.overlaps(&(8..9)).count(), 0);
421    }
422
423    #[test]
424    fn duplicate_intervals() {
425        let list: Vec<Range<u64>> = vec![(10..15), (11..13), (10..20), (1..8), (11..13), (16..18)].into_iter().collect();
426        let nclist = NClist::from_vec(list).unwrap();
427        println!("{:?}", nclist);
428
429        assert_eq!(nclist.overlaps(&(5..20)).count(), 6);
430        assert_eq!(nclist.overlaps(&(11..13)).count(), 4);
431
432        let mut q = nclist.overlaps_ordered(&(11..17));
433        assert_eq!(q.next(), Some(&(10..20)));
434        assert_eq!(q.next(), Some(&(10..15)));
435        assert_eq!(q.next(), Some(&(11..13)));
436        assert_eq!(q.next(), Some(&(11..13)));
437        assert_eq!(q.next(), Some(&(16..18)));
438        assert_eq!(q.next(), None);
439
440        assert_eq!(nclist.overlaps(&(20..100)).count(), 0);
441        assert_eq!(nclist.overlaps(&(8..10)).count(), 0);
442        assert_eq!(nclist.overlaps(&(8..9)).count(), 0);
443    }
444}