1use std::collections::{BTreeMap, BTreeSet};
9
10use cyanea_core::Summarizable;
11
12use crate::genomic::GenomicInterval;
13use crate::interval_tree::{Interval, IntervalTree};
14
15pub struct IntervalSet {
20 intervals: Vec<GenomicInterval>,
21 sorted: bool,
22 trees: Option<BTreeMap<String, IntervalTree<usize>>>,
24}
25
26impl IntervalSet {
27 pub fn new() -> Self {
29 Self {
30 intervals: Vec::new(),
31 sorted: true,
32 trees: None,
33 }
34 }
35
36 pub fn from_intervals(intervals: Vec<GenomicInterval>) -> Self {
38 Self {
39 sorted: false,
40 intervals,
41 trees: None,
42 }
43 }
44
45 pub fn push(&mut self, interval: GenomicInterval) {
47 self.sorted = false;
48 self.trees = None;
49 self.intervals.push(interval);
50 }
51
52 pub fn len(&self) -> usize {
54 self.intervals.len()
55 }
56
57 pub fn is_empty(&self) -> bool {
59 self.intervals.is_empty()
60 }
61
62 pub fn sort(&mut self) {
64 self.intervals
65 .sort_by(|a, b| a.chrom.cmp(&b.chrom).then(a.start.cmp(&b.start)));
66 self.sorted = true;
67 }
68
69 pub fn build_index(&mut self) {
74 let mut by_chrom: BTreeMap<String, Vec<Interval<usize>>> = BTreeMap::new();
75 for (idx, iv) in self.intervals.iter().enumerate() {
76 by_chrom
77 .entry(iv.chrom.clone())
78 .or_default()
79 .push(Interval::new(iv.start, iv.end, idx));
80 }
81
82 let trees: BTreeMap<String, IntervalTree<usize>> = by_chrom
83 .into_iter()
84 .map(|(chrom, intervals)| (chrom, IntervalTree::from_unsorted(intervals)))
85 .collect();
86
87 self.trees = Some(trees);
88 }
89
90 pub fn overlapping(&self, query: &GenomicInterval) -> Vec<&GenomicInterval> {
94 if let Some(trees) = &self.trees {
95 if let Some(tree) = trees.get(&query.chrom) {
96 return tree
97 .query(query.start, query.end)
98 .into_iter()
99 .map(|hit| &self.intervals[hit.data])
100 .collect();
101 }
102 return Vec::new();
103 }
104
105 self.intervals
107 .iter()
108 .filter(|iv| iv.overlaps(query))
109 .collect()
110 }
111
112 pub fn containing(&self, chrom: &str, position: u64) -> Vec<&GenomicInterval> {
116 if let Some(trees) = &self.trees {
117 if let Some(tree) = trees.get(chrom) {
118 return tree
119 .query(position, position + 1)
120 .into_iter()
121 .map(|hit| &self.intervals[hit.data])
122 .collect();
123 }
124 return Vec::new();
125 }
126
127 self.intervals
129 .iter()
130 .filter(|iv| iv.chrom == chrom && iv.contains(position))
131 .collect()
132 }
133
134 pub fn merge_overlapping(&self) -> IntervalSet {
139 if self.intervals.is_empty() {
140 return IntervalSet::new();
141 }
142
143 let mut sorted: Vec<GenomicInterval> = self.intervals.clone();
144 sorted.sort_by(|a, b| a.chrom.cmp(&b.chrom).then(a.start.cmp(&b.start)));
145
146 let mut merged = Vec::new();
147 let mut current = sorted[0].clone();
148
149 for iv in &sorted[1..] {
150 if iv.chrom == current.chrom && iv.start <= current.end {
151 current.end = current.end.max(iv.end);
152 } else {
153 merged.push(current);
154 current = iv.clone();
155 }
156 }
157 merged.push(current);
158
159 IntervalSet {
160 intervals: merged,
161 sorted: true,
162 trees: None,
163 }
164 }
165
166 pub fn intervals(&self) -> &[GenomicInterval] {
168 &self.intervals
169 }
170
171 pub fn into_intervals(self) -> Vec<GenomicInterval> {
173 self.intervals
174 }
175
176 pub fn coverage(&self, chrom: &str) -> u64 {
178 let chrom_intervals: Vec<GenomicInterval> = self
179 .intervals
180 .iter()
181 .filter(|iv| iv.chrom == chrom)
182 .cloned()
183 .collect();
184
185 if chrom_intervals.is_empty() {
186 return 0;
187 }
188
189 let subset = IntervalSet::from_intervals(chrom_intervals);
190 let merged = subset.merge_overlapping();
191 merged.intervals.iter().map(|iv| iv.len()).sum()
192 }
193}
194
195impl Default for IntervalSet {
196 fn default() -> Self {
197 Self::new()
198 }
199}
200
201impl Summarizable for IntervalSet {
202 fn summary(&self) -> String {
203 let chroms: BTreeSet<&str> = self.intervals.iter().map(|iv| iv.chrom.as_str()).collect();
204 format!(
205 "IntervalSet: {} intervals across {} chromosomes",
206 self.len(),
207 chroms.len()
208 )
209 }
210}
211
212#[cfg(test)]
213mod tests {
214 use super::*;
215
216 fn iv(chrom: &str, start: u64, end: u64) -> GenomicInterval {
217 GenomicInterval::new(chrom, start, end).unwrap()
218 }
219
220 #[test]
221 fn test_empty_set() {
222 let set = IntervalSet::new();
223 assert!(set.is_empty());
224 assert_eq!(set.len(), 0);
225 }
226
227 #[test]
228 fn test_push_and_sort() {
229 let mut set = IntervalSet::new();
230 set.push(iv("chr1", 200, 300));
231 set.push(iv("chr1", 100, 150));
232 assert!(!set.sorted);
233
234 set.sort();
235 assert!(set.sorted);
236 assert_eq!(set.intervals[0].start, 100);
237 assert_eq!(set.intervals[1].start, 200);
238 }
239
240 #[test]
241 fn test_overlapping() {
242 let set = IntervalSet::from_intervals(vec![
243 iv("chr1", 100, 200),
244 iv("chr1", 300, 400),
245 iv("chr2", 100, 200),
246 ]);
247
248 let query = iv("chr1", 150, 250);
249 let hits = set.overlapping(&query);
250 assert_eq!(hits.len(), 1);
251 assert_eq!(hits[0].start, 100);
252 }
253
254 #[test]
255 fn test_containing() {
256 let set = IntervalSet::from_intervals(vec![
257 iv("chr1", 100, 200),
258 iv("chr1", 150, 250),
259 iv("chr1", 300, 400),
260 ]);
261
262 let hits = set.containing("chr1", 175);
263 assert_eq!(hits.len(), 2);
264
265 let hits = set.containing("chr1", 350);
266 assert_eq!(hits.len(), 1);
267
268 let hits = set.containing("chr2", 100);
269 assert_eq!(hits.len(), 0);
270 }
271
272 #[test]
273 fn test_merge_overlapping() {
274 let set = IntervalSet::from_intervals(vec![
275 iv("chr1", 100, 200),
276 iv("chr1", 150, 300),
277 iv("chr1", 500, 600),
278 iv("chr2", 100, 200),
279 ]);
280
281 let merged = set.merge_overlapping();
282 assert_eq!(merged.len(), 3);
283 assert_eq!(merged.intervals[0].start, 100);
284 assert_eq!(merged.intervals[0].end, 300);
285 assert_eq!(merged.intervals[1].start, 500);
286 }
287
288 #[test]
289 fn test_coverage() {
290 let set = IntervalSet::from_intervals(vec![
291 iv("chr1", 100, 200),
292 iv("chr1", 150, 300),
293 iv("chr1", 500, 600),
294 ]);
295
296 assert_eq!(set.coverage("chr1"), 300);
298 assert_eq!(set.coverage("chr2"), 0);
299 }
300
301 #[test]
302 fn test_summary() {
303 let set = IntervalSet::from_intervals(vec![
304 iv("chr1", 100, 200),
305 iv("chr2", 100, 200),
306 ]);
307 assert_eq!(set.summary(), "IntervalSet: 2 intervals across 2 chromosomes");
308 }
309
310 #[test]
311 fn test_merge_empty() {
312 let set = IntervalSet::new();
313 let merged = set.merge_overlapping();
314 assert!(merged.is_empty());
315 }
316
317 #[test]
320 fn test_indexed_overlapping() {
321 let mut set = IntervalSet::from_intervals(vec![
322 iv("chr1", 100, 200),
323 iv("chr1", 300, 400),
324 iv("chr2", 100, 200),
325 ]);
326 set.build_index();
327
328 let query = iv("chr1", 150, 250);
329 let hits = set.overlapping(&query);
330 assert_eq!(hits.len(), 1);
331 assert_eq!(hits[0].start, 100);
332 }
333
334 #[test]
335 fn test_indexed_containing() {
336 let mut set = IntervalSet::from_intervals(vec![
337 iv("chr1", 100, 200),
338 iv("chr1", 150, 250),
339 iv("chr1", 300, 400),
340 ]);
341 set.build_index();
342
343 let hits = set.containing("chr1", 175);
344 assert_eq!(hits.len(), 2);
345
346 let hits = set.containing("chr1", 350);
347 assert_eq!(hits.len(), 1);
348
349 let hits = set.containing("chr2", 100);
350 assert_eq!(hits.len(), 0);
351 }
352
353 #[test]
354 fn test_indexed_matches_linear() {
355 let intervals = vec![
356 iv("chr1", 0, 50),
357 iv("chr1", 30, 80),
358 iv("chr1", 100, 200),
359 iv("chr2", 0, 100),
360 ];
361 let linear_set = IntervalSet::from_intervals(intervals.clone());
362 let mut indexed_set = IntervalSet::from_intervals(intervals);
363 indexed_set.build_index();
364
365 let query = iv("chr1", 40, 120);
366 let mut linear_hits: Vec<u64> = linear_set.overlapping(&query).iter().map(|iv| iv.start).collect();
367 let mut indexed_hits: Vec<u64> = indexed_set.overlapping(&query).iter().map(|iv| iv.start).collect();
368 linear_hits.sort();
369 indexed_hits.sort();
370 assert_eq!(linear_hits, indexed_hits);
371 }
372
373 #[test]
374 fn test_push_invalidates_index() {
375 let mut set = IntervalSet::from_intervals(vec![iv("chr1", 100, 200)]);
376 set.build_index();
377 assert!(set.trees.is_some());
378
379 set.push(iv("chr1", 300, 400));
380 assert!(set.trees.is_none());
381 }
382}