gtars_overlaprs/
bits.rs

1use num_traits::{
2    PrimInt, Unsigned,
3    identities::{one, zero},
4};
5
6use super::Overlapper;
7use gtars_core::models::Interval;
8
9/// A Binary Interval Search data structure for fast genomic interval overlap queries.
10///
11/// From the journal article: <https://academic.oup.com/bioinformatics/article/29/1/1/273289>
12///
13/// BITS (Binary Interval Search) is an efficient data structure for finding overlapping
14/// intervals using binary search. It maintains sorted lists of interval start and end
15/// positions, enabling fast identification of intervals that overlap with a query range.
16///
17/// # Examples
18///
19/// ```
20/// use gtars_overlaprs::{Bits, Overlapper, Interval};
21///
22/// // Create intervals for read alignments
23/// let reads = vec![
24///     Interval { start: 100u32, end: 150, val: "read1" },
25///     Interval { start: 200, end: 250, val: "read2" },
26///     Interval { start: 225, end: 275, val: "read3" },
27/// ];
28///
29/// let bits = Bits::build(reads);
30///
31/// // Query for reads overlapping position 210-240
32/// let overlaps = bits.find(210, 240);
33/// assert_eq!(overlaps.len(), 2); // read2 and read3
34///
35/// // Count overlaps without allocating
36/// let count = bits.count(210, 240);
37/// assert_eq!(count, 2);
38/// ```
39///
40/// # Advanced Features
41///
42/// ## Sequential Queries with `seek`
43///
44/// For sorted queries, use `seek` with a cursor for better performance:
45///
46/// ```
47/// use gtars_overlaprs::{Bits, Overlapper, Interval};
48///
49/// let intervals = (0u32..100).step_by(5)
50///     .map(|x| Interval { start: x, end: x + 2, val: true })
51///     .collect::<Vec<_>>();
52/// let bits = Bits::build(intervals);
53///
54/// let mut cursor = 0;
55/// for i in 10u32..20 {
56///     let overlaps: Vec<_> = bits.seek(i, i + 5, &mut cursor).collect();
57///     // Process overlaps...
58/// }
59/// ```
60///
61/// # See Also
62///
63/// - [`Overlapper`] - The trait that `Bits` implements
64/// - [`crate::AIList`] - An alternative implementation optimized for high-coverage regions
65#[derive(Debug, Clone)]
66pub struct Bits<I, T>
67where
68    I: PrimInt + Unsigned + Send + Sync,
69    T: Eq + Clone + Send + Sync,
70{
71    /// List of intervals
72    pub intervals: Vec<Interval<I, T>>,
73    /// Sorted list of start positions,
74    starts: Vec<I>,
75    /// Sorted list of end positions,
76    ends: Vec<I>,
77    /// The length of the longest interval
78    max_len: I,
79    /// The calculated number of positions covered by the intervals
80    cov: Option<I>,
81    /// Whether or not overlaps have been merged
82    pub overlaps_merged: bool,
83}
84
85impl<I, T> Overlapper<I, T> for Bits<I, T>
86where
87    I: PrimInt + Unsigned + Send + Sync,
88    T: Eq + Clone + Send + Sync,
89{
90    /// Create a new instance of Bits by passing in a vector of Intervals. This vector will
91    /// immediately be sorted by start order.
92    /// ```
93    /// use gtars_overlaprs::{Bits, Overlapper};
94    /// use gtars_core::models::Interval;
95    ///
96    /// let data = (0..20).step_by(5)
97    ///                   .map(|x| Interval{start: x, end: x + 10, val: true})
98    ///                   .collect::<Vec<Interval<usize, bool>>>();
99    /// let bits = Bits::build(data);
100    /// ```
101    fn build(mut intervals: Vec<Interval<I, T>>) -> Self
102    where
103        Self: Sized,
104    {
105        intervals.sort();
106        let (mut starts, mut ends): (Vec<_>, Vec<_>) =
107            intervals.iter().map(|x| (x.start, x.end)).unzip();
108        starts.sort();
109        ends.sort();
110        let mut max_len = zero::<I>();
111        for interval in intervals.iter() {
112            let i_len = interval
113                .end
114                .checked_sub(&interval.start)
115                .unwrap_or_else(zero::<I>);
116            if i_len > max_len {
117                max_len = i_len;
118            }
119        }
120        Bits {
121            intervals,
122            starts,
123            ends,
124            max_len,
125            cov: None,
126            overlaps_merged: false,
127        }
128    }
129
130    /// Find all intervals that overlap start .. stop
131    /// ```
132    /// use gtars_overlaprs::{Bits, Overlapper};
133    /// use gtars_core::models::Interval;
134    ///
135    /// let bits = Bits::build((0..100).step_by(5)
136    ///                                 .map(|x| Interval{start: x, end: x+2 , val: true})
137    ///                                 .collect::<Vec<Interval<usize, bool>>>());
138    /// assert_eq!(bits.find_iter(5, 11).count(), 2);
139    /// ```
140    #[inline]
141    fn find(&self, start: I, stop: I) -> Vec<Interval<I, T>> {
142        let finder = IterFind {
143            inner: self,
144            off: Self::lower_bound(
145                start.checked_sub(&self.max_len).unwrap_or_else(zero::<I>),
146                &self.intervals,
147            ),
148            start,
149            stop,
150        };
151
152        // TODO: we are literally just collection the iterator, which feels like smell
153        // but how do we write the trait in such a way that we return an actual
154        // iterator instead of a vector of intervals?
155        finder.into_iter().cloned().collect()
156    }
157
158    fn find_iter<'a>(
159        &'a self,
160        start: I,
161        stop: I,
162    ) -> Box<dyn Iterator<Item = &'a Interval<I, T>> + 'a> {
163        let finder = IterFind {
164            inner: self,
165            off: Self::lower_bound(
166                start.checked_sub(&self.max_len).unwrap_or_else(zero::<I>),
167                &self.intervals,
168            ),
169            start,
170            stop,
171        };
172        Box::new(finder)
173    }
174}
175
176impl<I, T> Bits<I, T>
177where
178    I: PrimInt + Unsigned + Send + Sync,
179    T: Eq + Clone + Send + Sync,
180{
181    /// Insert a new interval after the BITS has been created. This is very
182    /// inefficient and should be avoided if possible.
183    ///
184    /// SIDE EFFECTS: This clears cov() and overlaps_merged
185    /// meaning that those will have to be recomputed after a insert
186    /// ```
187    /// use gtars_overlaprs::{Bits, Overlapper};
188    /// use gtars_core::models::Interval;
189    ///
190    /// let data : Vec<Interval<usize, usize>>= vec!{
191    ///     Interval{start:0,  end:5,  val:1},
192    ///     Interval{start:6,  end:10, val:2},
193    /// };
194    /// let mut bits = Bits::build(data);
195    /// bits.insert(Interval{start:0, end:20, val:5});
196    /// assert_eq!(bits.len(), 3);
197    /// assert_eq!(bits.find_iter(1,3).collect::<Vec<&Interval<usize,usize>>>(),
198    ///     vec![
199    ///         &Interval{start:0, end:5, val:1},
200    ///         &Interval{start:0, end:20, val:5},
201    ///     ]
202    /// );
203    ///
204    /// ```
205    pub fn insert(&mut self, elem: Interval<I, T>) {
206        let starts_insert_index = Self::bsearch_seq(elem.start, &self.starts);
207        let stops_insert_index = Self::bsearch_seq(elem.end, &self.ends);
208        let intervals_insert_index = Self::bsearch_seq_ref(&elem, &self.intervals);
209        let i_len = elem.end.checked_sub(&elem.start).unwrap_or_else(zero::<I>);
210        if i_len > self.max_len {
211            self.max_len = i_len;
212        }
213        self.starts.insert(starts_insert_index, elem.start);
214        self.ends.insert(stops_insert_index, elem.end);
215        self.intervals.insert(intervals_insert_index, elem);
216        self.cov = None;
217        self.overlaps_merged = false;
218    }
219
220    /// Get the number over intervals in Bits
221    #[inline]
222    pub fn len(&self) -> usize {
223        self.intervals.len()
224    }
225
226    /// Check if BITS is empty (i.e. has no intervals)
227    #[inline]
228    pub fn is_empty(&self) -> bool {
229        self.intervals.is_empty()
230    }
231
232    /// Return an iterator over the intervals in Bits
233    #[inline]
234    pub fn iter(&'_ self) -> IterBits<'_, I, T> {
235        IterBits {
236            inner: self,
237            pos: 0,
238        }
239    }
240
241    /// Determine the first index that we should start checking for overlaps for via a binary
242    /// search.
243    /// Assumes that the maximum interval length in `intervals` has been subtracted from
244    /// `start`, otherwise the result is undefined
245    #[inline]
246    pub fn lower_bound(start: I, intervals: &[Interval<I, T>]) -> usize {
247        let mut size = intervals.len();
248        let mut low = 0;
249
250        while size > 0 {
251            let half = size / 2;
252            let other_half = size - half;
253            let probe = low + half;
254            let other_low = low + other_half;
255            let v = &intervals[probe];
256            size = half;
257            low = if v.start < start { other_low } else { low }
258        }
259        low
260    }
261
262    /// Binary search for the insertion position of a key in a sorted slice.
263    ///
264    /// Returns the index where `key` should be inserted to maintain sort order.
265    /// This is a convenience wrapper around [`bsearch_seq_ref`](Self::bsearch_seq_ref).
266    ///
267    /// # Arguments
268    ///
269    /// * `key` - The value to search for
270    /// * `elems` - A sorted slice to search in
271    ///
272    /// # Returns
273    ///
274    /// The index where `key` should be inserted.
275    #[inline]
276    pub fn bsearch_seq<K>(key: K, elems: &[K]) -> usize
277    where
278        K: PartialEq + PartialOrd,
279    {
280        Self::bsearch_seq_ref(&key, elems)
281    }
282
283    /// Binary search for the insertion position of a key reference in a sorted slice.
284    ///
285    /// Returns the index where `key` should be inserted to maintain sort order.
286    /// Uses an efficient binary search algorithm optimized for branch prediction.
287    ///
288    /// # Arguments
289    ///
290    /// * `key` - A reference to the value to search for
291    /// * `elems` - A sorted slice to search in
292    ///
293    /// # Returns
294    ///
295    /// The index where `key` should be inserted to maintain sort order:
296    /// - `0` if the key should be inserted at the beginning
297    /// - `elems.len()` if the key should be inserted at the end
298    /// - Otherwise, the first index where `elems[index] >= key`
299    #[inline]
300    pub fn bsearch_seq_ref<K>(key: &K, elems: &[K]) -> usize
301    where
302        K: PartialEq + PartialOrd,
303    {
304        if elems.is_empty() || elems[0] >= *key {
305            return 0;
306        } else if elems[elems.len() - 1] < *key {
307            return elems.len();
308        }
309
310        let mut cursor = 0;
311        let mut length = elems.len();
312        while length > 1 {
313            let half = length >> 1;
314            length -= half;
315            cursor += (usize::from(elems[cursor + half - 1] < *key)) * half;
316        }
317        cursor
318    }
319
320    /// Count all intervals that overlap start .. stop. This performs two binary search in order to
321    /// find all the excluded elements, and then deduces the intersection from there. See
322    /// [BITS](https://arxiv.org/pdf/1208.3407.pdf) for more details.
323    /// ```
324    /// use gtars_overlaprs::{Bits, Overlapper};
325    /// use gtars_core::models::Interval;
326    ///
327    /// let bits = Bits::build((0..100).step_by(5)
328    ///                                 .map(|x| Interval{start: x, end: x+2 , val: true})
329    ///                                 .collect::<Vec<Interval<usize, bool>>>());
330    /// assert_eq!(bits.count(5, 11), 2);
331    /// ```
332    #[inline]
333    pub fn count(&self, start: I, stop: I) -> usize {
334        let len = self.intervals.len();
335        // Plus one to account for half-openness of bits intervals compared to BITS paper
336        let first = Self::bsearch_seq(start + one::<I>(), &self.ends);
337        let last = Self::bsearch_seq(stop, &self.starts);
338        let num_cant_after = len - last;
339        len - first - num_cant_after
340    }
341
342    /// Find all intevals that overlap start .. stop. This method will work when queries
343    /// to this Bits are in sorted (start) order. It uses a linear search from the last query
344    /// instead of a binary search. A reference to a cursor must be passed in. This reference will
345    /// be modified and should be reused in the next query. This allows seek to not need to make
346    /// the Bits object mutable, and thus use the same Bits accross threads.
347    /// ```
348    /// use gtars_overlaprs::{Bits, Overlapper};
349    /// use gtars_core::models::Interval;
350    ///
351    /// let bits = Bits::build((0..100).step_by(5)
352    ///                                 .map(|x| Interval{start: x, end: x+2 , val: true})
353    ///                                 .collect::<Vec<Interval<usize, bool>>>());
354    /// let mut cursor = 0;
355    /// for i in bits.iter() {
356    ///    assert_eq!(bits.seek(i.start, i.end, &mut cursor).count(), 1);
357    /// }
358    /// ```
359    #[inline]
360    pub fn seek<'a>(&'a self, start: I, stop: I, cursor: &mut usize) -> IterFind<'a, I, T> {
361        if *cursor == 0 || (*cursor < self.intervals.len() && self.intervals[*cursor].start > start)
362        {
363            *cursor = Self::lower_bound(
364                start.checked_sub(&self.max_len).unwrap_or_else(zero::<I>),
365                &self.intervals,
366            );
367        }
368
369        while *cursor + 1 < self.intervals.len()
370            && self.intervals[*cursor + 1].start
371                < start.checked_sub(&self.max_len).unwrap_or_else(zero::<I>)
372        {
373            *cursor += 1;
374        }
375
376        IterFind {
377            inner: self,
378            off: *cursor,
379            start,
380            stop,
381        }
382    }
383}
384
385/// An iterator over intervals in a [`Bits`] structure that overlap with a query range.
386///
387/// This struct is created by the [`find_iter`](Overlapper::find_iter) method on [`Bits`],
388/// or by the [`seek`](Bits::seek) method for sequential queries. It yields references to
389/// intervals that overlap with the specified query range without allocating a vector.
390///
391/// # Examples
392///
393/// ```
394/// use gtars_overlaprs::{Bits, Overlapper, Interval};
395///
396/// let intervals = vec![
397///     Interval { start: 10u32, end: 20, val: "a" },
398///     Interval { start: 15, end: 25, val: "b" },
399/// ];
400///
401/// let bits = Bits::build(intervals);
402///
403/// // The iterator is created by find_iter
404/// for interval in bits.find_iter(12, 18) {
405///     println!("Found: {}", interval.val);
406/// }
407/// ```
408#[derive(Debug)]
409pub struct IterFind<'a, I, T>
410where
411    T: Eq + Clone + Send + Sync + 'a,
412    I: PrimInt + Unsigned + Send + Sync,
413{
414    inner: &'a Bits<I, T>,
415    off: usize,
416    start: I,
417    stop: I,
418}
419
420impl<'a, I, T> Iterator for IterFind<'a, I, T>
421where
422    T: Eq + Clone + Send + Sync + 'a,
423    I: PrimInt + Unsigned + Send + Sync,
424{
425    type Item = &'a Interval<I, T>;
426
427    #[inline]
428    // interval.start < stop && interval.end > start
429    fn next(&mut self) -> Option<Self::Item> {
430        while self.off < self.inner.intervals.len() {
431            //let mut generator = self.inner.intervals[self.off..].iter();
432            //while let Some(interval) = generator.next() {
433            let interval = &self.inner.intervals[self.off];
434            self.off += 1;
435            if interval.overlap(self.start, self.stop) {
436                return Some(interval);
437            } else if interval.start >= self.stop {
438                break;
439            }
440        }
441        None
442    }
443}
444
445/// An iterator over all intervals in a [`Bits`] structure, in sorted order.
446///
447/// This struct is created by the [`iter`](Bits::iter) method. It yields references to all
448/// intervals in the `Bits` structure in sorted order by start position, regardless of overlap.
449///
450/// # Examples
451///
452/// ```
453/// use gtars_overlaprs::{Bits, Overlapper, Interval};
454///
455/// let intervals = vec![
456///     Interval { start: 10u32, end: 20, val: 1 },
457///     Interval { start: 15, end: 25, val: 2 },
458///     Interval { start: 30, end: 40, val: 3 },
459/// ];
460///
461/// let bits = Bits::build(intervals);
462///
463/// // Iterate over all intervals
464/// for interval in bits.iter() {
465///     println!("Interval: {}-{}", interval.start, interval.end);
466/// }
467/// ```
468pub struct IterBits<'a, I, T>
469where
470    T: Eq + Clone + Send + Sync + 'a,
471    I: PrimInt + Unsigned + Send + Sync,
472{
473    inner: &'a Bits<I, T>,
474    pos: usize,
475}
476
477impl<'a, I, T> Iterator for IterBits<'a, I, T>
478where
479    T: Eq + Clone + Send + Sync + 'a,
480    I: PrimInt + Unsigned + Send + Sync,
481{
482    type Item = &'a Interval<I, T>;
483
484    fn next(&mut self) -> Option<Self::Item> {
485        if self.pos >= self.inner.intervals.len() {
486            None
487        } else {
488            self.pos += 1;
489            self.inner.intervals.get(self.pos - 1)
490        }
491    }
492}
493
494impl<I, T> IntoIterator for Bits<I, T>
495where
496    T: Eq + Clone + Send + Sync,
497    I: PrimInt + Unsigned + Send + Sync,
498{
499    type Item = Interval<I, T>;
500    type IntoIter = ::std::vec::IntoIter<Self::Item>;
501
502    fn into_iter(self) -> Self::IntoIter {
503        self.intervals.into_iter()
504    }
505}
506
507impl<'a, I, T> IntoIterator for &'a Bits<I, T>
508where
509    T: Eq + Clone + Send + Sync + 'a,
510    I: PrimInt + Unsigned + Send + Sync,
511{
512    type Item = &'a Interval<I, T>;
513    type IntoIter = std::slice::Iter<'a, Interval<I, T>>;
514
515    fn into_iter(self) -> std::slice::Iter<'a, Interval<I, T>> {
516        self.intervals.iter()
517    }
518}
519
520impl<'a, I, T> IntoIterator for &'a mut Bits<I, T>
521where
522    T: Eq + Clone + Send + Sync + 'a,
523    I: PrimInt + Unsigned + Send + Sync,
524{
525    type Item = &'a mut Interval<I, T>;
526    type IntoIter = std::slice::IterMut<'a, Interval<I, T>>;
527
528    fn into_iter(self) -> std::slice::IterMut<'a, Interval<I, T>> {
529        self.intervals.iter_mut()
530    }
531}
532
533#[cfg(test)]
534mod tests {
535    use super::*;
536
537    use pretty_assertions::{assert_eq, assert_ne};
538    use rstest::{fixture, rstest};
539
540    #[fixture]
541    fn intervals() -> Vec<Interval<u32, &'static str>> {
542        vec![
543            Interval {
544                start: 1,
545                end: 5,
546                val: "a",
547            },
548            Interval {
549                start: 3,
550                end: 7,
551                val: "b",
552            },
553            Interval {
554                start: 6,
555                end: 10,
556                val: "c",
557            },
558            Interval {
559                start: 8,
560                end: 12,
561                val: "d",
562            },
563        ]
564    }
565
566    #[rstest]
567    fn test_build_and_len(intervals: Vec<Interval<u32, &'static str>>) {
568        let ailist = Bits::build(intervals.clone());
569        assert_eq!(ailist.len(), intervals.len());
570        assert_ne!(ailist.is_empty(), true);
571    }
572
573    #[rstest]
574    fn test_find_overlapping_intervals(intervals: Vec<Interval<u32, &'static str>>) {
575        let ailist = Bits::build(intervals);
576
577        // Query that overlaps with "a" and "b"
578        let results = ailist.find(2, 4);
579        let vals: Vec<&str> = results.iter().map(|i| i.val).collect();
580        assert_eq!(vals.contains(&"a"), true);
581        assert_eq!(vals.contains(&"b"), true);
582        assert_eq!(vals.contains(&"c"), false);
583
584        // Query that overlaps with "c" and "d"
585        let results = ailist.find(9, 11);
586        let vals: Vec<&str> = results.iter().map(|i| i.val).collect();
587        assert_eq!(vals.contains(&"c"), true);
588        assert_eq!(vals.contains(&"d"), true);
589        assert_eq!(vals.contains(&"a"), false);
590    }
591
592    #[rstest]
593    fn test_find_no_overlap(intervals: Vec<Interval<u32, &'static str>>) {
594        let ailist = Bits::build(intervals);
595
596        // Query outside all intervals
597        let results = ailist.find(13, 15);
598        assert_eq!(results.is_empty(), true);
599    }
600
601    #[rstest]
602    fn test_empty_ailist() {
603        let ailist: Bits<u32, &str> = Bits::build(vec![]);
604
605        assert_eq!(ailist.len(), 0);
606        assert_eq!(ailist.is_empty(), true);
607
608        let results = ailist.find(1, 2);
609
610        assert_eq!(results.is_empty(), true);
611    }
612}