Skip to main content

rsomics_intervals/
algebra.rs

1use crate::index::IntervalIndex;
2use crate::interval::Interval;
3use crate::set::IntervalSet;
4
5fn mk(chrom: &str, start: u64, end: u64) -> Interval {
6    Interval {
7        chrom: chrom.to_string(),
8        start,
9        end,
10        strand: None,
11    }
12}
13
14// merge overlapping/touching half-open intervals per chrom — bedtools merge -d 0; strand dropped
15#[must_use]
16pub fn merge(input: &IntervalSet) -> IntervalSet {
17    let mut work = input.clone();
18    work.sort();
19    let mut out = IntervalSet::new();
20    for (chrom, ivs) in work.iter_chroms() {
21        if ivs.is_empty() {
22            continue;
23        }
24        let mut cur_start = ivs[0].start;
25        let mut cur_end = ivs[0].end;
26        for iv in &ivs[1..] {
27            if iv.start <= cur_end {
28                cur_end = cur_end.max(iv.end);
29            } else {
30                out.push(mk(chrom, cur_start, cur_end));
31                cur_start = iv.start;
32                cur_end = iv.end;
33            }
34        }
35        out.push(mk(chrom, cur_start, cur_end));
36    }
37    out.sort();
38    out
39}
40
41// clip each overlapping pair to the overlap — bedtools intersect default (a-duplicates retained)
42#[must_use]
43pub fn intersect(a: &IntervalSet, b: &IntervalSet) -> IntervalSet {
44    // coitrees index over b: O(n_a·log n_b + output); an active-set sweep degrades to O(n_a·n_b) on long-a/dense-b (CNV vs SNP)
45    let bx = IntervalIndex::build(b);
46    let mut out = IntervalSet::new();
47    for (chrom, a_ivs) in a.iter_chroms() {
48        for ai in a_ivs {
49            bx.for_each_overlap(chrom, ai.start, ai.end, |bi| {
50                let lo = ai.start.max(bi.start);
51                let hi = ai.end.min(bi.end);
52                if hi > lo {
53                    out.push(mk(chrom, lo, hi));
54                }
55            });
56        }
57    }
58    out.sort();
59    out
60}
61
62// subtract b's coverage from a — bedtools subtract (without -A)
63#[must_use]
64pub fn subtract(a: &IntervalSet, b: &IntervalSet) -> IntervalSet {
65    let b_merged = merge(b);
66    let mut out = IntervalSet::new();
67    for (chrom, a_ivs) in a.iter_chroms() {
68        let b_ivs = b_merged.get(chrom).unwrap_or(&[]);
69        let mut av: Vec<&Interval> = a_ivs.iter().collect();
70        av.sort_unstable_by_key(|x| (x.start, x.end));
71        // b_merged is sorted + disjoint; monotone `lo` makes this O(n+m) not O(n·m).
72        let mut lo = 0usize;
73        for ai in &av {
74            while lo < b_ivs.len() && b_ivs[lo].end <= ai.start {
75                lo += 1;
76            }
77            for (s, e) in subtract_one(ai, &b_ivs[lo..]) {
78                out.push(mk(chrom, s, e));
79            }
80        }
81    }
82    out.sort();
83    out
84}
85
86fn subtract_one(a: &Interval, b_ivs: &[Interval]) -> Vec<(u64, u64)> {
87    let mut cursor = a.start;
88    let mut out = Vec::new();
89    for b in b_ivs {
90        if b.end <= cursor {
91            continue;
92        }
93        if b.start >= a.end {
94            break;
95        }
96        if b.start > cursor {
97            out.push((cursor, b.start.min(a.end)));
98        }
99        cursor = cursor.max(b.end);
100        if cursor >= a.end {
101            break;
102        }
103    }
104    if cursor < a.end {
105        out.push((cursor, a.end));
106    }
107    out
108}
109
110// uncovered regions per chrom, bounded by chrom_sizes — bedtools complement -g
111#[must_use]
112pub fn complement(input: &IntervalSet, chrom_sizes: &[(String, u64)]) -> IntervalSet {
113    let merged = merge(input);
114    let mut out = IntervalSet::new();
115    for (chrom, size) in chrom_sizes {
116        let ivs = merged.get(chrom).unwrap_or(&[]);
117        let mut cursor: u64 = 0;
118        for iv in ivs {
119            if iv.start > cursor {
120                out.push(mk(chrom, cursor, iv.start));
121            }
122            cursor = cursor.max(iv.end);
123            if cursor >= *size {
124                break;
125            }
126        }
127        if cursor < *size {
128            out.push(mk(chrom, cursor, *size));
129        }
130    }
131    out.sort();
132    out
133}
134
135#[must_use]
136pub fn coverage_bases(input: &IntervalSet) -> Vec<(String, u64)> {
137    let merged = merge(input);
138    let mut out = Vec::new();
139    for (chrom, ivs) in merged.iter_chroms() {
140        let total: u64 = ivs.iter().map(Interval::len).sum();
141        out.push((chrom.to_string(), total));
142    }
143    out
144}
145
146#[cfg(test)]
147mod tests {
148    use super::*;
149
150    fn iv(chrom: &str, start: u64, end: u64) -> Interval {
151        Interval::new(chrom, start, end).unwrap()
152    }
153
154    fn set(items: impl IntoIterator<Item = Interval>) -> IntervalSet {
155        let mut s: IntervalSet = items.into_iter().collect();
156        s.sort();
157        s
158    }
159
160    #[test]
161    fn merge_collapses_overlapping_and_touching() {
162        let s = set([
163            iv("chr1", 100, 200),
164            iv("chr1", 150, 250),
165            iv("chr1", 250, 300),
166            iv("chr1", 400, 500),
167        ]);
168        let m = merge(&s);
169        let v = m.get("chr1").unwrap();
170        assert_eq!(v.len(), 2);
171        assert_eq!((v[0].start, v[0].end), (100, 300));
172        assert_eq!((v[1].start, v[1].end), (400, 500));
173    }
174
175    #[test]
176    fn merge_independent_per_chrom() {
177        let s = set([iv("chr1", 100, 200), iv("chr2", 100, 200)]);
178        let m = merge(&s);
179        assert_eq!(m.get("chr1").unwrap().len(), 1);
180        assert_eq!(m.get("chr2").unwrap().len(), 1);
181    }
182
183    #[test]
184    fn intersect_clips_to_overlap() {
185        let a = set([iv("chr1", 100, 200), iv("chr1", 300, 400)]);
186        let b = set([iv("chr1", 150, 350)]);
187        let i = intersect(&a, &b);
188        let v = i.get("chr1").unwrap();
189        assert_eq!(v.len(), 2);
190        assert_eq!((v[0].start, v[0].end), (150, 200));
191        assert_eq!((v[1].start, v[1].end), (300, 350));
192    }
193
194    #[test]
195    fn intersect_long_a_then_short_a_dense_b() {
196        let a = set([iv("chr1", 0, 100), iv("chr1", 10, 11)]);
197        let b = set([iv("chr1", 5, 15), iv("chr1", 50, 60)]);
198        let i = intersect(&a, &b);
199        let v = i.get("chr1").unwrap();
200        assert_eq!(
201            v.iter().map(|x| (x.start, x.end)).collect::<Vec<_>>(),
202            vec![(5, 15), (10, 11), (50, 60)]
203        );
204    }
205
206    #[test]
207    fn intersect_different_chroms_yield_nothing() {
208        let a = set([iv("chr1", 100, 200)]);
209        let b = set([iv("chr2", 100, 200)]);
210        assert!(intersect(&a, &b).is_empty());
211    }
212
213    #[test]
214    fn subtract_punches_holes() {
215        let a = set([iv("chr1", 100, 500)]);
216        let b = set([iv("chr1", 200, 300), iv("chr1", 400, 450)]);
217        let s = subtract(&a, &b);
218        let v = s.get("chr1").unwrap();
219        assert_eq!(v.len(), 3);
220        assert_eq!((v[0].start, v[0].end), (100, 200));
221        assert_eq!((v[1].start, v[1].end), (300, 400));
222        assert_eq!((v[2].start, v[2].end), (450, 500));
223    }
224
225    #[test]
226    fn subtract_full_cover_yields_nothing() {
227        let a = set([iv("chr1", 100, 200)]);
228        let b = set([iv("chr1", 50, 250)]);
229        assert!(subtract(&a, &b).is_empty());
230    }
231
232    #[test]
233    fn complement_against_genome_sizes() {
234        let s = set([iv("chr1", 100, 200), iv("chr1", 400, 500)]);
235        let sizes = vec![("chr1".to_string(), 1000)];
236        let c = complement(&s, &sizes);
237        let v = c.get("chr1").unwrap();
238        assert_eq!(v.len(), 3);
239        assert_eq!((v[0].start, v[0].end), (0, 100));
240        assert_eq!((v[1].start, v[1].end), (200, 400));
241        assert_eq!((v[2].start, v[2].end), (500, 1000));
242    }
243
244    #[test]
245    fn complement_chrom_with_no_intervals_is_full_length() {
246        let s = IntervalSet::new();
247        let sizes = vec![("chr1".to_string(), 1000)];
248        let c = complement(&s, &sizes);
249        let v = c.get("chr1").unwrap();
250        assert_eq!(v.len(), 1);
251        assert_eq!((v[0].start, v[0].end), (0, 1000));
252    }
253
254    #[test]
255    fn coverage_bases_dedupes_overlap() {
256        let s = set([iv("chr1", 100, 200), iv("chr1", 150, 250)]);
257        let cov = coverage_bases(&s);
258        assert_eq!(cov, vec![("chr1".to_string(), 150)]);
259    }
260}