Skip to main content

cyanea_omics/
interval.rs

1//! Sorted interval collection with overlap queries.
2//!
3//! [`IntervalSet`] stores [`GenomicInterval`]s in a `Vec` and supports
4//! overlap/containment queries. After calling [`IntervalSet::build_index()`],
5//! queries are accelerated by per-chromosome interval trees for O(log n + k)
6//! performance.
7
8use std::collections::{BTreeMap, BTreeSet};
9
10use cyanea_core::Summarizable;
11
12use crate::genomic::GenomicInterval;
13use crate::interval_tree::{Interval, IntervalTree};
14
15/// A collection of genomic intervals with overlap query support.
16///
17/// Queries use O(n) linear scan by default. Call [`IntervalSet::build_index()`]
18/// to construct per-chromosome interval trees for O(log n + k) queries.
19pub struct IntervalSet {
20    intervals: Vec<GenomicInterval>,
21    sorted: bool,
22    /// Per-chromosome interval trees. Values store indices into `intervals`.
23    trees: Option<BTreeMap<String, IntervalTree<usize>>>,
24}
25
26impl IntervalSet {
27    /// Create an empty interval set.
28    pub fn new() -> Self {
29        Self {
30            intervals: Vec::new(),
31            sorted: true,
32            trees: None,
33        }
34    }
35
36    /// Create an interval set from existing intervals.
37    pub fn from_intervals(intervals: Vec<GenomicInterval>) -> Self {
38        Self {
39            sorted: false,
40            intervals,
41            trees: None,
42        }
43    }
44
45    /// Add an interval. Marks the set as unsorted and invalidates the index.
46    pub fn push(&mut self, interval: GenomicInterval) {
47        self.sorted = false;
48        self.trees = None;
49        self.intervals.push(interval);
50    }
51
52    /// Number of intervals in the set.
53    pub fn len(&self) -> usize {
54        self.intervals.len()
55    }
56
57    /// Whether the set is empty.
58    pub fn is_empty(&self) -> bool {
59        self.intervals.is_empty()
60    }
61
62    /// Sort intervals by chromosome then start position.
63    pub fn sort(&mut self) {
64        self.intervals
65            .sort_by(|a, b| a.chrom.cmp(&b.chrom).then(a.start.cmp(&b.start)));
66        self.sorted = true;
67    }
68
69    /// Build per-chromosome interval tree indices for O(log n + k) queries.
70    ///
71    /// This is called automatically on first query if the set is sorted,
72    /// but can also be called explicitly. Invalidated by [`push()`](IntervalSet::push).
73    pub fn build_index(&mut self) {
74        let mut by_chrom: BTreeMap<String, Vec<Interval<usize>>> = BTreeMap::new();
75        for (idx, iv) in self.intervals.iter().enumerate() {
76            by_chrom
77                .entry(iv.chrom.clone())
78                .or_default()
79                .push(Interval::new(iv.start, iv.end, idx));
80        }
81
82        let trees: BTreeMap<String, IntervalTree<usize>> = by_chrom
83            .into_iter()
84            .map(|(chrom, intervals)| (chrom, IntervalTree::from_unsorted(intervals)))
85            .collect();
86
87        self.trees = Some(trees);
88    }
89
90    /// Return all intervals that overlap the query.
91    ///
92    /// Uses the interval tree index if available, otherwise falls back to O(n) scan.
93    pub fn overlapping(&self, query: &GenomicInterval) -> Vec<&GenomicInterval> {
94        if let Some(trees) = &self.trees {
95            if let Some(tree) = trees.get(&query.chrom) {
96                return tree
97                    .query(query.start, query.end)
98                    .into_iter()
99                    .map(|hit| &self.intervals[hit.data])
100                    .collect();
101            }
102            return Vec::new();
103        }
104
105        // Fallback: linear scan
106        self.intervals
107            .iter()
108            .filter(|iv| iv.overlaps(query))
109            .collect()
110    }
111
112    /// Return all intervals containing a given position.
113    ///
114    /// Uses the interval tree index if available, otherwise falls back to O(n) scan.
115    pub fn containing(&self, chrom: &str, position: u64) -> Vec<&GenomicInterval> {
116        if let Some(trees) = &self.trees {
117            if let Some(tree) = trees.get(chrom) {
118                return tree
119                    .query(position, position + 1)
120                    .into_iter()
121                    .map(|hit| &self.intervals[hit.data])
122                    .collect();
123            }
124            return Vec::new();
125        }
126
127        // Fallback: linear scan
128        self.intervals
129            .iter()
130            .filter(|iv| iv.chrom == chrom && iv.contains(position))
131            .collect()
132    }
133
134    /// Merge all overlapping intervals and return a new, sorted set.
135    ///
136    /// Adjacent intervals on the same chromosome that overlap or abut are
137    /// merged into a single interval.
138    pub fn merge_overlapping(&self) -> IntervalSet {
139        if self.intervals.is_empty() {
140            return IntervalSet::new();
141        }
142
143        let mut sorted: Vec<GenomicInterval> = self.intervals.clone();
144        sorted.sort_by(|a, b| a.chrom.cmp(&b.chrom).then(a.start.cmp(&b.start)));
145
146        let mut merged = Vec::new();
147        let mut current = sorted[0].clone();
148
149        for iv in &sorted[1..] {
150            if iv.chrom == current.chrom && iv.start <= current.end {
151                current.end = current.end.max(iv.end);
152            } else {
153                merged.push(current);
154                current = iv.clone();
155            }
156        }
157        merged.push(current);
158
159        IntervalSet {
160            intervals: merged,
161            sorted: true,
162            trees: None,
163        }
164    }
165
166    /// Borrow the intervals as a slice.
167    pub fn intervals(&self) -> &[GenomicInterval] {
168        &self.intervals
169    }
170
171    /// Consume the set and return the inner intervals.
172    pub fn into_intervals(self) -> Vec<GenomicInterval> {
173        self.intervals
174    }
175
176    /// Total bases covered on a given chromosome (after merging overlaps).
177    pub fn coverage(&self, chrom: &str) -> u64 {
178        let chrom_intervals: Vec<GenomicInterval> = self
179            .intervals
180            .iter()
181            .filter(|iv| iv.chrom == chrom)
182            .cloned()
183            .collect();
184
185        if chrom_intervals.is_empty() {
186            return 0;
187        }
188
189        let subset = IntervalSet::from_intervals(chrom_intervals);
190        let merged = subset.merge_overlapping();
191        merged.intervals.iter().map(|iv| iv.len()).sum()
192    }
193}
194
195impl Default for IntervalSet {
196    fn default() -> Self {
197        Self::new()
198    }
199}
200
201impl Summarizable for IntervalSet {
202    fn summary(&self) -> String {
203        let chroms: BTreeSet<&str> = self.intervals.iter().map(|iv| iv.chrom.as_str()).collect();
204        format!(
205            "IntervalSet: {} intervals across {} chromosomes",
206            self.len(),
207            chroms.len()
208        )
209    }
210}
211
212#[cfg(test)]
213mod tests {
214    use super::*;
215
216    fn iv(chrom: &str, start: u64, end: u64) -> GenomicInterval {
217        GenomicInterval::new(chrom, start, end).unwrap()
218    }
219
220    #[test]
221    fn test_empty_set() {
222        let set = IntervalSet::new();
223        assert!(set.is_empty());
224        assert_eq!(set.len(), 0);
225    }
226
227    #[test]
228    fn test_push_and_sort() {
229        let mut set = IntervalSet::new();
230        set.push(iv("chr1", 200, 300));
231        set.push(iv("chr1", 100, 150));
232        assert!(!set.sorted);
233
234        set.sort();
235        assert!(set.sorted);
236        assert_eq!(set.intervals[0].start, 100);
237        assert_eq!(set.intervals[1].start, 200);
238    }
239
240    #[test]
241    fn test_overlapping() {
242        let set = IntervalSet::from_intervals(vec![
243            iv("chr1", 100, 200),
244            iv("chr1", 300, 400),
245            iv("chr2", 100, 200),
246        ]);
247
248        let query = iv("chr1", 150, 250);
249        let hits = set.overlapping(&query);
250        assert_eq!(hits.len(), 1);
251        assert_eq!(hits[0].start, 100);
252    }
253
254    #[test]
255    fn test_containing() {
256        let set = IntervalSet::from_intervals(vec![
257            iv("chr1", 100, 200),
258            iv("chr1", 150, 250),
259            iv("chr1", 300, 400),
260        ]);
261
262        let hits = set.containing("chr1", 175);
263        assert_eq!(hits.len(), 2);
264
265        let hits = set.containing("chr1", 350);
266        assert_eq!(hits.len(), 1);
267
268        let hits = set.containing("chr2", 100);
269        assert_eq!(hits.len(), 0);
270    }
271
272    #[test]
273    fn test_merge_overlapping() {
274        let set = IntervalSet::from_intervals(vec![
275            iv("chr1", 100, 200),
276            iv("chr1", 150, 300),
277            iv("chr1", 500, 600),
278            iv("chr2", 100, 200),
279        ]);
280
281        let merged = set.merge_overlapping();
282        assert_eq!(merged.len(), 3);
283        assert_eq!(merged.intervals[0].start, 100);
284        assert_eq!(merged.intervals[0].end, 300);
285        assert_eq!(merged.intervals[1].start, 500);
286    }
287
288    #[test]
289    fn test_coverage() {
290        let set = IntervalSet::from_intervals(vec![
291            iv("chr1", 100, 200),
292            iv("chr1", 150, 300),
293            iv("chr1", 500, 600),
294        ]);
295
296        // merged: [100,300) = 200bp + [500,600) = 100bp = 300bp
297        assert_eq!(set.coverage("chr1"), 300);
298        assert_eq!(set.coverage("chr2"), 0);
299    }
300
301    #[test]
302    fn test_summary() {
303        let set = IntervalSet::from_intervals(vec![
304            iv("chr1", 100, 200),
305            iv("chr2", 100, 200),
306        ]);
307        assert_eq!(set.summary(), "IntervalSet: 2 intervals across 2 chromosomes");
308    }
309
310    #[test]
311    fn test_merge_empty() {
312        let set = IntervalSet::new();
313        let merged = set.merge_overlapping();
314        assert!(merged.is_empty());
315    }
316
317    // --- Indexed query tests ---
318
319    #[test]
320    fn test_indexed_overlapping() {
321        let mut set = IntervalSet::from_intervals(vec![
322            iv("chr1", 100, 200),
323            iv("chr1", 300, 400),
324            iv("chr2", 100, 200),
325        ]);
326        set.build_index();
327
328        let query = iv("chr1", 150, 250);
329        let hits = set.overlapping(&query);
330        assert_eq!(hits.len(), 1);
331        assert_eq!(hits[0].start, 100);
332    }
333
334    #[test]
335    fn test_indexed_containing() {
336        let mut set = IntervalSet::from_intervals(vec![
337            iv("chr1", 100, 200),
338            iv("chr1", 150, 250),
339            iv("chr1", 300, 400),
340        ]);
341        set.build_index();
342
343        let hits = set.containing("chr1", 175);
344        assert_eq!(hits.len(), 2);
345
346        let hits = set.containing("chr1", 350);
347        assert_eq!(hits.len(), 1);
348
349        let hits = set.containing("chr2", 100);
350        assert_eq!(hits.len(), 0);
351    }
352
353    #[test]
354    fn test_indexed_matches_linear() {
355        let intervals = vec![
356            iv("chr1", 0, 50),
357            iv("chr1", 30, 80),
358            iv("chr1", 100, 200),
359            iv("chr2", 0, 100),
360        ];
361        let linear_set = IntervalSet::from_intervals(intervals.clone());
362        let mut indexed_set = IntervalSet::from_intervals(intervals);
363        indexed_set.build_index();
364
365        let query = iv("chr1", 40, 120);
366        let mut linear_hits: Vec<u64> = linear_set.overlapping(&query).iter().map(|iv| iv.start).collect();
367        let mut indexed_hits: Vec<u64> = indexed_set.overlapping(&query).iter().map(|iv| iv.start).collect();
368        linear_hits.sort();
369        indexed_hits.sort();
370        assert_eq!(linear_hits, indexed_hits);
371    }
372
373    #[test]
374    fn test_push_invalidates_index() {
375        let mut set = IntervalSet::from_intervals(vec![iv("chr1", 100, 200)]);
376        set.build_index();
377        assert!(set.trees.is_some());
378
379        set.push(iv("chr1", 300, 400));
380        assert!(set.trees.is_none());
381    }
382}