gtars_overlaprs/bits.rs
1use num_traits::{
2 PrimInt, Unsigned,
3 identities::{one, zero},
4};
5
6use super::Overlapper;
7use gtars_core::models::Interval;
8
9/// A Binary Interval Search data structure for fast genomic interval overlap queries.
10///
11/// From the journal article: <https://academic.oup.com/bioinformatics/article/29/1/1/273289>
12///
13/// BITS (Binary Interval Search) is an efficient data structure for finding overlapping
14/// intervals using binary search. It maintains sorted lists of interval start and end
15/// positions, enabling fast identification of intervals that overlap with a query range.
16///
17/// # Examples
18///
19/// ```
20/// use gtars_overlaprs::{Bits, Overlapper, Interval};
21///
22/// // Create intervals for read alignments
23/// let reads = vec![
24/// Interval { start: 100u32, end: 150, val: "read1" },
25/// Interval { start: 200, end: 250, val: "read2" },
26/// Interval { start: 225, end: 275, val: "read3" },
27/// ];
28///
29/// let bits = Bits::build(reads);
30///
31/// // Query for reads overlapping position 210-240
32/// let overlaps = bits.find(210, 240);
33/// assert_eq!(overlaps.len(), 2); // read2 and read3
34///
35/// // Count overlaps without allocating
36/// let count = bits.count(210, 240);
37/// assert_eq!(count, 2);
38/// ```
39///
40/// # Advanced Features
41///
42/// ## Sequential Queries with `seek`
43///
44/// For sorted queries, use `seek` with a cursor for better performance:
45///
46/// ```
47/// use gtars_overlaprs::{Bits, Overlapper, Interval};
48///
49/// let intervals = (0u32..100).step_by(5)
50/// .map(|x| Interval { start: x, end: x + 2, val: true })
51/// .collect::<Vec<_>>();
52/// let bits = Bits::build(intervals);
53///
54/// let mut cursor = 0;
55/// for i in 10u32..20 {
56/// let overlaps: Vec<_> = bits.seek(i, i + 5, &mut cursor).collect();
57/// // Process overlaps...
58/// }
59/// ```
60///
61/// # See Also
62///
63/// - [`Overlapper`] - The trait that `Bits` implements
64/// - [`crate::AIList`] - An alternative implementation optimized for high-coverage regions
65#[derive(Debug, Clone)]
66pub struct Bits<I, T>
67where
68 I: PrimInt + Unsigned + Send + Sync,
69 T: Eq + Clone + Send + Sync,
70{
71 /// List of intervals
72 pub intervals: Vec<Interval<I, T>>,
73 /// Sorted list of start positions,
74 starts: Vec<I>,
75 /// Sorted list of end positions,
76 ends: Vec<I>,
77 /// The length of the longest interval
78 max_len: I,
79 /// The calculated number of positions covered by the intervals
80 cov: Option<I>,
81 /// Whether or not overlaps have been merged
82 pub overlaps_merged: bool,
83}
84
85impl<I, T> Overlapper<I, T> for Bits<I, T>
86where
87 I: PrimInt + Unsigned + Send + Sync,
88 T: Eq + Clone + Send + Sync,
89{
90 /// Create a new instance of Bits by passing in a vector of Intervals. This vector will
91 /// immediately be sorted by start order.
92 /// ```
93 /// use gtars_overlaprs::{Bits, Overlapper};
94 /// use gtars_core::models::Interval;
95 ///
96 /// let data = (0..20).step_by(5)
97 /// .map(|x| Interval{start: x, end: x + 10, val: true})
98 /// .collect::<Vec<Interval<usize, bool>>>();
99 /// let bits = Bits::build(data);
100 /// ```
101 fn build(mut intervals: Vec<Interval<I, T>>) -> Self
102 where
103 Self: Sized,
104 {
105 intervals.sort();
106 let (mut starts, mut ends): (Vec<_>, Vec<_>) =
107 intervals.iter().map(|x| (x.start, x.end)).unzip();
108 starts.sort();
109 ends.sort();
110 let mut max_len = zero::<I>();
111 for interval in intervals.iter() {
112 let i_len = interval
113 .end
114 .checked_sub(&interval.start)
115 .unwrap_or_else(zero::<I>);
116 if i_len > max_len {
117 max_len = i_len;
118 }
119 }
120 Bits {
121 intervals,
122 starts,
123 ends,
124 max_len,
125 cov: None,
126 overlaps_merged: false,
127 }
128 }
129
130 /// Find all intervals that overlap start .. stop
131 /// ```
132 /// use gtars_overlaprs::{Bits, Overlapper};
133 /// use gtars_core::models::Interval;
134 ///
135 /// let bits = Bits::build((0..100).step_by(5)
136 /// .map(|x| Interval{start: x, end: x+2 , val: true})
137 /// .collect::<Vec<Interval<usize, bool>>>());
138 /// assert_eq!(bits.find_iter(5, 11).count(), 2);
139 /// ```
140 #[inline]
141 fn find(&self, start: I, stop: I) -> Vec<Interval<I, T>> {
142 let finder = IterFind {
143 inner: self,
144 off: Self::lower_bound(
145 start.checked_sub(&self.max_len).unwrap_or_else(zero::<I>),
146 &self.intervals,
147 ),
148 start,
149 stop,
150 };
151
152 // TODO: we are literally just collection the iterator, which feels like smell
153 // but how do we write the trait in such a way that we return an actual
154 // iterator instead of a vector of intervals?
155 finder.into_iter().cloned().collect()
156 }
157
158 fn find_iter<'a>(
159 &'a self,
160 start: I,
161 stop: I,
162 ) -> Box<dyn Iterator<Item = &'a Interval<I, T>> + 'a> {
163 let finder = IterFind {
164 inner: self,
165 off: Self::lower_bound(
166 start.checked_sub(&self.max_len).unwrap_or_else(zero::<I>),
167 &self.intervals,
168 ),
169 start,
170 stop,
171 };
172 Box::new(finder)
173 }
174}
175
176impl<I, T> Bits<I, T>
177where
178 I: PrimInt + Unsigned + Send + Sync,
179 T: Eq + Clone + Send + Sync,
180{
181 /// Insert a new interval after the BITS has been created. This is very
182 /// inefficient and should be avoided if possible.
183 ///
184 /// SIDE EFFECTS: This clears cov() and overlaps_merged
185 /// meaning that those will have to be recomputed after a insert
186 /// ```
187 /// use gtars_overlaprs::{Bits, Overlapper};
188 /// use gtars_core::models::Interval;
189 ///
190 /// let data : Vec<Interval<usize, usize>>= vec!{
191 /// Interval{start:0, end:5, val:1},
192 /// Interval{start:6, end:10, val:2},
193 /// };
194 /// let mut bits = Bits::build(data);
195 /// bits.insert(Interval{start:0, end:20, val:5});
196 /// assert_eq!(bits.len(), 3);
197 /// assert_eq!(bits.find_iter(1,3).collect::<Vec<&Interval<usize,usize>>>(),
198 /// vec![
199 /// &Interval{start:0, end:5, val:1},
200 /// &Interval{start:0, end:20, val:5},
201 /// ]
202 /// );
203 ///
204 /// ```
205 pub fn insert(&mut self, elem: Interval<I, T>) {
206 let starts_insert_index = Self::bsearch_seq(elem.start, &self.starts);
207 let stops_insert_index = Self::bsearch_seq(elem.end, &self.ends);
208 let intervals_insert_index = Self::bsearch_seq_ref(&elem, &self.intervals);
209 let i_len = elem.end.checked_sub(&elem.start).unwrap_or_else(zero::<I>);
210 if i_len > self.max_len {
211 self.max_len = i_len;
212 }
213 self.starts.insert(starts_insert_index, elem.start);
214 self.ends.insert(stops_insert_index, elem.end);
215 self.intervals.insert(intervals_insert_index, elem);
216 self.cov = None;
217 self.overlaps_merged = false;
218 }
219
220 /// Get the number over intervals in Bits
221 #[inline]
222 pub fn len(&self) -> usize {
223 self.intervals.len()
224 }
225
226 /// Check if BITS is empty (i.e. has no intervals)
227 #[inline]
228 pub fn is_empty(&self) -> bool {
229 self.intervals.is_empty()
230 }
231
232 /// Return an iterator over the intervals in Bits
233 #[inline]
234 pub fn iter(&'_ self) -> IterBits<'_, I, T> {
235 IterBits {
236 inner: self,
237 pos: 0,
238 }
239 }
240
241 /// Determine the first index that we should start checking for overlaps for via a binary
242 /// search.
243 /// Assumes that the maximum interval length in `intervals` has been subtracted from
244 /// `start`, otherwise the result is undefined
245 #[inline]
246 pub fn lower_bound(start: I, intervals: &[Interval<I, T>]) -> usize {
247 let mut size = intervals.len();
248 let mut low = 0;
249
250 while size > 0 {
251 let half = size / 2;
252 let other_half = size - half;
253 let probe = low + half;
254 let other_low = low + other_half;
255 let v = &intervals[probe];
256 size = half;
257 low = if v.start < start { other_low } else { low }
258 }
259 low
260 }
261
262 /// Binary search for the insertion position of a key in a sorted slice.
263 ///
264 /// Returns the index where `key` should be inserted to maintain sort order.
265 /// This is a convenience wrapper around [`bsearch_seq_ref`](Self::bsearch_seq_ref).
266 ///
267 /// # Arguments
268 ///
269 /// * `key` - The value to search for
270 /// * `elems` - A sorted slice to search in
271 ///
272 /// # Returns
273 ///
274 /// The index where `key` should be inserted.
275 #[inline]
276 pub fn bsearch_seq<K>(key: K, elems: &[K]) -> usize
277 where
278 K: PartialEq + PartialOrd,
279 {
280 Self::bsearch_seq_ref(&key, elems)
281 }
282
283 /// Binary search for the insertion position of a key reference in a sorted slice.
284 ///
285 /// Returns the index where `key` should be inserted to maintain sort order.
286 /// Uses an efficient binary search algorithm optimized for branch prediction.
287 ///
288 /// # Arguments
289 ///
290 /// * `key` - A reference to the value to search for
291 /// * `elems` - A sorted slice to search in
292 ///
293 /// # Returns
294 ///
295 /// The index where `key` should be inserted to maintain sort order:
296 /// - `0` if the key should be inserted at the beginning
297 /// - `elems.len()` if the key should be inserted at the end
298 /// - Otherwise, the first index where `elems[index] >= key`
299 #[inline]
300 pub fn bsearch_seq_ref<K>(key: &K, elems: &[K]) -> usize
301 where
302 K: PartialEq + PartialOrd,
303 {
304 if elems.is_empty() || elems[0] >= *key {
305 return 0;
306 } else if elems[elems.len() - 1] < *key {
307 return elems.len();
308 }
309
310 let mut cursor = 0;
311 let mut length = elems.len();
312 while length > 1 {
313 let half = length >> 1;
314 length -= half;
315 cursor += (usize::from(elems[cursor + half - 1] < *key)) * half;
316 }
317 cursor
318 }
319
320 /// Count all intervals that overlap start .. stop. This performs two binary search in order to
321 /// find all the excluded elements, and then deduces the intersection from there. See
322 /// [BITS](https://arxiv.org/pdf/1208.3407.pdf) for more details.
323 /// ```
324 /// use gtars_overlaprs::{Bits, Overlapper};
325 /// use gtars_core::models::Interval;
326 ///
327 /// let bits = Bits::build((0..100).step_by(5)
328 /// .map(|x| Interval{start: x, end: x+2 , val: true})
329 /// .collect::<Vec<Interval<usize, bool>>>());
330 /// assert_eq!(bits.count(5, 11), 2);
331 /// ```
332 #[inline]
333 pub fn count(&self, start: I, stop: I) -> usize {
334 let len = self.intervals.len();
335 // Plus one to account for half-openness of bits intervals compared to BITS paper
336 let first = Self::bsearch_seq(start + one::<I>(), &self.ends);
337 let last = Self::bsearch_seq(stop, &self.starts);
338 let num_cant_after = len - last;
339 len - first - num_cant_after
340 }
341
342 /// Find all intevals that overlap start .. stop. This method will work when queries
343 /// to this Bits are in sorted (start) order. It uses a linear search from the last query
344 /// instead of a binary search. A reference to a cursor must be passed in. This reference will
345 /// be modified and should be reused in the next query. This allows seek to not need to make
346 /// the Bits object mutable, and thus use the same Bits accross threads.
347 /// ```
348 /// use gtars_overlaprs::{Bits, Overlapper};
349 /// use gtars_core::models::Interval;
350 ///
351 /// let bits = Bits::build((0..100).step_by(5)
352 /// .map(|x| Interval{start: x, end: x+2 , val: true})
353 /// .collect::<Vec<Interval<usize, bool>>>());
354 /// let mut cursor = 0;
355 /// for i in bits.iter() {
356 /// assert_eq!(bits.seek(i.start, i.end, &mut cursor).count(), 1);
357 /// }
358 /// ```
359 #[inline]
360 pub fn seek<'a>(&'a self, start: I, stop: I, cursor: &mut usize) -> IterFind<'a, I, T> {
361 if *cursor == 0 || (*cursor < self.intervals.len() && self.intervals[*cursor].start > start)
362 {
363 *cursor = Self::lower_bound(
364 start.checked_sub(&self.max_len).unwrap_or_else(zero::<I>),
365 &self.intervals,
366 );
367 }
368
369 while *cursor + 1 < self.intervals.len()
370 && self.intervals[*cursor + 1].start
371 < start.checked_sub(&self.max_len).unwrap_or_else(zero::<I>)
372 {
373 *cursor += 1;
374 }
375
376 IterFind {
377 inner: self,
378 off: *cursor,
379 start,
380 stop,
381 }
382 }
383}
384
385/// An iterator over intervals in a [`Bits`] structure that overlap with a query range.
386///
387/// This struct is created by the [`find_iter`](Overlapper::find_iter) method on [`Bits`],
388/// or by the [`seek`](Bits::seek) method for sequential queries. It yields references to
389/// intervals that overlap with the specified query range without allocating a vector.
390///
391/// # Examples
392///
393/// ```
394/// use gtars_overlaprs::{Bits, Overlapper, Interval};
395///
396/// let intervals = vec![
397/// Interval { start: 10u32, end: 20, val: "a" },
398/// Interval { start: 15, end: 25, val: "b" },
399/// ];
400///
401/// let bits = Bits::build(intervals);
402///
403/// // The iterator is created by find_iter
404/// for interval in bits.find_iter(12, 18) {
405/// println!("Found: {}", interval.val);
406/// }
407/// ```
408#[derive(Debug)]
409pub struct IterFind<'a, I, T>
410where
411 T: Eq + Clone + Send + Sync + 'a,
412 I: PrimInt + Unsigned + Send + Sync,
413{
414 inner: &'a Bits<I, T>,
415 off: usize,
416 start: I,
417 stop: I,
418}
419
420impl<'a, I, T> Iterator for IterFind<'a, I, T>
421where
422 T: Eq + Clone + Send + Sync + 'a,
423 I: PrimInt + Unsigned + Send + Sync,
424{
425 type Item = &'a Interval<I, T>;
426
427 #[inline]
428 // interval.start < stop && interval.end > start
429 fn next(&mut self) -> Option<Self::Item> {
430 while self.off < self.inner.intervals.len() {
431 //let mut generator = self.inner.intervals[self.off..].iter();
432 //while let Some(interval) = generator.next() {
433 let interval = &self.inner.intervals[self.off];
434 self.off += 1;
435 if interval.overlap(self.start, self.stop) {
436 return Some(interval);
437 } else if interval.start >= self.stop {
438 break;
439 }
440 }
441 None
442 }
443}
444
445/// An iterator over all intervals in a [`Bits`] structure, in sorted order.
446///
447/// This struct is created by the [`iter`](Bits::iter) method. It yields references to all
448/// intervals in the `Bits` structure in sorted order by start position, regardless of overlap.
449///
450/// # Examples
451///
452/// ```
453/// use gtars_overlaprs::{Bits, Overlapper, Interval};
454///
455/// let intervals = vec![
456/// Interval { start: 10u32, end: 20, val: 1 },
457/// Interval { start: 15, end: 25, val: 2 },
458/// Interval { start: 30, end: 40, val: 3 },
459/// ];
460///
461/// let bits = Bits::build(intervals);
462///
463/// // Iterate over all intervals
464/// for interval in bits.iter() {
465/// println!("Interval: {}-{}", interval.start, interval.end);
466/// }
467/// ```
468pub struct IterBits<'a, I, T>
469where
470 T: Eq + Clone + Send + Sync + 'a,
471 I: PrimInt + Unsigned + Send + Sync,
472{
473 inner: &'a Bits<I, T>,
474 pos: usize,
475}
476
477impl<'a, I, T> Iterator for IterBits<'a, I, T>
478where
479 T: Eq + Clone + Send + Sync + 'a,
480 I: PrimInt + Unsigned + Send + Sync,
481{
482 type Item = &'a Interval<I, T>;
483
484 fn next(&mut self) -> Option<Self::Item> {
485 if self.pos >= self.inner.intervals.len() {
486 None
487 } else {
488 self.pos += 1;
489 self.inner.intervals.get(self.pos - 1)
490 }
491 }
492}
493
494impl<I, T> IntoIterator for Bits<I, T>
495where
496 T: Eq + Clone + Send + Sync,
497 I: PrimInt + Unsigned + Send + Sync,
498{
499 type Item = Interval<I, T>;
500 type IntoIter = ::std::vec::IntoIter<Self::Item>;
501
502 fn into_iter(self) -> Self::IntoIter {
503 self.intervals.into_iter()
504 }
505}
506
507impl<'a, I, T> IntoIterator for &'a Bits<I, T>
508where
509 T: Eq + Clone + Send + Sync + 'a,
510 I: PrimInt + Unsigned + Send + Sync,
511{
512 type Item = &'a Interval<I, T>;
513 type IntoIter = std::slice::Iter<'a, Interval<I, T>>;
514
515 fn into_iter(self) -> std::slice::Iter<'a, Interval<I, T>> {
516 self.intervals.iter()
517 }
518}
519
520impl<'a, I, T> IntoIterator for &'a mut Bits<I, T>
521where
522 T: Eq + Clone + Send + Sync + 'a,
523 I: PrimInt + Unsigned + Send + Sync,
524{
525 type Item = &'a mut Interval<I, T>;
526 type IntoIter = std::slice::IterMut<'a, Interval<I, T>>;
527
528 fn into_iter(self) -> std::slice::IterMut<'a, Interval<I, T>> {
529 self.intervals.iter_mut()
530 }
531}
532
533#[cfg(test)]
534mod tests {
535 use super::*;
536
537 use pretty_assertions::{assert_eq, assert_ne};
538 use rstest::{fixture, rstest};
539
540 #[fixture]
541 fn intervals() -> Vec<Interval<u32, &'static str>> {
542 vec![
543 Interval {
544 start: 1,
545 end: 5,
546 val: "a",
547 },
548 Interval {
549 start: 3,
550 end: 7,
551 val: "b",
552 },
553 Interval {
554 start: 6,
555 end: 10,
556 val: "c",
557 },
558 Interval {
559 start: 8,
560 end: 12,
561 val: "d",
562 },
563 ]
564 }
565
566 #[rstest]
567 fn test_build_and_len(intervals: Vec<Interval<u32, &'static str>>) {
568 let ailist = Bits::build(intervals.clone());
569 assert_eq!(ailist.len(), intervals.len());
570 assert_ne!(ailist.is_empty(), true);
571 }
572
573 #[rstest]
574 fn test_find_overlapping_intervals(intervals: Vec<Interval<u32, &'static str>>) {
575 let ailist = Bits::build(intervals);
576
577 // Query that overlaps with "a" and "b"
578 let results = ailist.find(2, 4);
579 let vals: Vec<&str> = results.iter().map(|i| i.val).collect();
580 assert_eq!(vals.contains(&"a"), true);
581 assert_eq!(vals.contains(&"b"), true);
582 assert_eq!(vals.contains(&"c"), false);
583
584 // Query that overlaps with "c" and "d"
585 let results = ailist.find(9, 11);
586 let vals: Vec<&str> = results.iter().map(|i| i.val).collect();
587 assert_eq!(vals.contains(&"c"), true);
588 assert_eq!(vals.contains(&"d"), true);
589 assert_eq!(vals.contains(&"a"), false);
590 }
591
592 #[rstest]
593 fn test_find_no_overlap(intervals: Vec<Interval<u32, &'static str>>) {
594 let ailist = Bits::build(intervals);
595
596 // Query outside all intervals
597 let results = ailist.find(13, 15);
598 assert_eq!(results.is_empty(), true);
599 }
600
601 #[rstest]
602 fn test_empty_ailist() {
603 let ailist: Bits<u32, &str> = Bits::build(vec![]);
604
605 assert_eq!(ailist.len(), 0);
606 assert_eq!(ailist.is_empty(), true);
607
608 let results = ailist.find(1, 2);
609
610 assert_eq!(results.is_empty(), true);
611 }
612}