Skip to main content

cyanea_omics/
coverage.rs

1//! Run-length encoded coverage vectors for memory-efficient genome-wide depth.
2//!
3//! [`RleCoverage`] stores depth values as (depth, run_length) pairs, which is
4//! highly efficient for typical genomic coverage profiles where long stretches
5//! share the same depth.
6
7use crate::genomic::GenomicInterval;
8
9/// Run-length encoded coverage vector.
10///
11/// Stores coverage depth as a sequence of (depth, length) runs.
12/// This is memory-efficient for genome-wide coverage where long runs of
13/// identical depth are common (e.g., depth=0 in intergenic regions).
14#[derive(Debug, Clone)]
15pub struct RleCoverage {
16    /// (depth, run_length) pairs in positional order.
17    runs: Vec<(u32, u64)>,
18    /// Total number of positions covered by this vector.
19    total_length: u64,
20}
21
22impl RleCoverage {
23    /// Build an RLE coverage vector from a set of intervals on a chromosome.
24    ///
25    /// Each interval contributes +1 to all positions it covers. The resulting
26    /// coverage vector spans `[0, chrom_length)`.
27    pub fn from_intervals(intervals: &[GenomicInterval], chrom_length: u64) -> Self {
28        if chrom_length == 0 {
29            return Self {
30                runs: Vec::new(),
31                total_length: 0,
32            };
33        }
34
35        // Use sweep-line approach: collect +1/-1 events
36        let mut events: Vec<(u64, i32)> = Vec::with_capacity(intervals.len() * 2);
37        for iv in intervals {
38            let start = iv.start.min(chrom_length);
39            let end = iv.end.min(chrom_length);
40            if start < end {
41                events.push((start, 1));
42                events.push((end, -1));
43            }
44        }
45        events.sort_by_key(|&(pos, delta)| (pos, std::cmp::Reverse(delta)));
46
47        // Sweep line
48        let mut runs = Vec::new();
49        let mut depth: i64 = 0;
50        let mut prev_pos: u64 = 0;
51
52        for (pos, delta) in &events {
53            let pos = *pos;
54            if pos > prev_pos {
55                runs.push((depth as u32, pos - prev_pos));
56                prev_pos = pos;
57            }
58            depth += *delta as i64;
59        }
60
61        // Remaining positions to end of chromosome
62        if prev_pos < chrom_length {
63            runs.push((depth as u32, chrom_length - prev_pos));
64        }
65
66        // Merge consecutive runs with same depth
67        let merged = merge_runs(runs);
68
69        Self {
70            runs: merged,
71            total_length: chrom_length,
72        }
73    }
74
75    /// Build an RLE coverage vector from a dense depth array.
76    pub fn from_depths(depths: &[u32]) -> Self {
77        if depths.is_empty() {
78            return Self {
79                runs: Vec::new(),
80                total_length: 0,
81            };
82        }
83
84        let mut runs = Vec::new();
85        let mut current_depth = depths[0];
86        let mut run_len: u64 = 1;
87
88        for &d in &depths[1..] {
89            if d == current_depth {
90                run_len += 1;
91            } else {
92                runs.push((current_depth, run_len));
93                current_depth = d;
94                run_len = 1;
95            }
96        }
97        runs.push((current_depth, run_len));
98
99        Self {
100            runs,
101            total_length: depths.len() as u64,
102        }
103    }
104
105    /// Get the coverage depth at a specific position.
106    ///
107    /// Returns 0 if the position is out of range.
108    pub fn get(&self, position: u64) -> u32 {
109        if position >= self.total_length {
110            return 0;
111        }
112
113        let mut offset: u64 = 0;
114        for &(depth, length) in &self.runs {
115            offset += length;
116            if position < offset {
117                return depth;
118            }
119        }
120
121        0
122    }
123
124    /// Mean coverage across all positions.
125    pub fn mean_coverage(&self) -> f64 {
126        if self.total_length == 0 {
127            return 0.0;
128        }
129
130        let sum: u64 = self
131            .runs
132            .iter()
133            .map(|&(depth, length)| depth as u64 * length)
134            .sum();
135
136        sum as f64 / self.total_length as f64
137    }
138
139    /// Number of bases with depth above a threshold.
140    pub fn bases_above(&self, threshold: u32) -> u64 {
141        self.runs
142            .iter()
143            .filter(|&&(depth, _)| depth > threshold)
144            .map(|&(_, length)| length)
145            .sum()
146    }
147
148    /// Expand the RLE coverage to a dense vector.
149    pub fn to_vec(&self) -> Vec<u32> {
150        let mut result = Vec::with_capacity(self.total_length as usize);
151        for &(depth, length) in &self.runs {
152            result.extend(std::iter::repeat(depth).take(length as usize));
153        }
154        result
155    }
156
157    /// Total length of the coverage vector.
158    pub fn len(&self) -> u64 {
159        self.total_length
160    }
161
162    /// Whether the coverage vector is empty.
163    pub fn is_empty(&self) -> bool {
164        self.total_length == 0
165    }
166
167    /// Number of RLE runs.
168    pub fn num_runs(&self) -> usize {
169        self.runs.len()
170    }
171}
172
173fn merge_runs(runs: Vec<(u32, u64)>) -> Vec<(u32, u64)> {
174    if runs.is_empty() {
175        return runs;
176    }
177
178    let mut merged = Vec::with_capacity(runs.len());
179    let mut current = runs[0];
180
181    for &(depth, length) in &runs[1..] {
182        if depth == current.0 {
183            current.1 += length;
184        } else {
185            merged.push(current);
186            current = (depth, length);
187        }
188    }
189    merged.push(current);
190
191    merged
192}
193
194#[cfg(test)]
195mod tests {
196    use super::*;
197
198    fn iv(start: u64, end: u64) -> GenomicInterval {
199        GenomicInterval::new("chr1", start, end).unwrap()
200    }
201
202    #[test]
203    fn empty_coverage() {
204        let cov = RleCoverage::from_intervals(&[], 100);
205        assert_eq!(cov.len(), 100);
206        assert_eq!(cov.mean_coverage(), 0.0);
207        assert_eq!(cov.bases_above(0), 0);
208        assert_eq!(cov.get(50), 0);
209    }
210
211    #[test]
212    fn zero_length() {
213        let cov = RleCoverage::from_intervals(&[], 0);
214        assert!(cov.is_empty());
215        assert_eq!(cov.mean_coverage(), 0.0);
216    }
217
218    #[test]
219    fn single_interval() {
220        let cov = RleCoverage::from_intervals(&[iv(10, 20)], 100);
221        assert_eq!(cov.get(5), 0);
222        assert_eq!(cov.get(10), 1);
223        assert_eq!(cov.get(15), 1);
224        assert_eq!(cov.get(19), 1);
225        assert_eq!(cov.get(20), 0);
226        assert_eq!(cov.get(50), 0);
227        assert_eq!(cov.bases_above(0), 10);
228        assert!((cov.mean_coverage() - 0.1).abs() < 1e-10);
229    }
230
231    #[test]
232    fn overlapping_intervals() {
233        let cov = RleCoverage::from_intervals(&[iv(10, 30), iv(20, 40)], 100);
234        assert_eq!(cov.get(15), 1); // only first
235        assert_eq!(cov.get(25), 2); // both
236        assert_eq!(cov.get(35), 1); // only second
237        assert_eq!(cov.get(45), 0); // neither
238    }
239
240    #[test]
241    fn from_depths_basic() {
242        let depths = vec![0, 0, 1, 1, 1, 2, 2, 0];
243        let cov = RleCoverage::from_depths(&depths);
244        assert_eq!(cov.len(), 8);
245        assert_eq!(cov.num_runs(), 4);
246        assert_eq!(cov.get(0), 0);
247        assert_eq!(cov.get(2), 1);
248        assert_eq!(cov.get(5), 2);
249        assert_eq!(cov.get(7), 0);
250    }
251
252    #[test]
253    fn from_depths_empty() {
254        let cov = RleCoverage::from_depths(&[]);
255        assert!(cov.is_empty());
256    }
257
258    #[test]
259    fn round_trip() {
260        let depths = vec![0, 0, 3, 3, 3, 1, 0, 0, 0, 5];
261        let cov = RleCoverage::from_depths(&depths);
262        let expanded = cov.to_vec();
263        assert_eq!(expanded, depths);
264    }
265
266    #[test]
267    fn mean_coverage_value() {
268        let cov = RleCoverage::from_depths(&[0, 1, 2, 3, 4]);
269        assert!((cov.mean_coverage() - 2.0).abs() < 1e-10);
270    }
271
272    #[test]
273    fn bases_above_threshold() {
274        let cov = RleCoverage::from_depths(&[0, 1, 2, 3, 4, 5]);
275        assert_eq!(cov.bases_above(2), 3); // positions with depth 3, 4, 5
276        assert_eq!(cov.bases_above(0), 5); // positions with depth > 0
277        assert_eq!(cov.bases_above(10), 0);
278    }
279
280    #[test]
281    fn get_out_of_range() {
282        let cov = RleCoverage::from_depths(&[1, 2, 3]);
283        assert_eq!(cov.get(3), 0);
284        assert_eq!(cov.get(100), 0);
285    }
286
287    #[test]
288    fn interval_coverage_to_vec() {
289        let cov = RleCoverage::from_intervals(&[iv(2, 5)], 8);
290        let expanded = cov.to_vec();
291        assert_eq!(expanded, vec![0, 0, 1, 1, 1, 0, 0, 0]);
292    }
293
294    #[test]
295    fn heavily_overlapping() {
296        // 5 intervals all covering [0, 10)
297        let intervals: Vec<GenomicInterval> = (0..5).map(|_| iv(0, 10)).collect();
298        let cov = RleCoverage::from_intervals(&intervals, 10);
299        assert_eq!(cov.get(0), 5);
300        assert_eq!(cov.get(9), 5);
301        assert_eq!(cov.num_runs(), 1); // one run of depth 5
302    }
303
304    #[test]
305    fn uniform_depth() {
306        let depths = vec![3; 1000];
307        let cov = RleCoverage::from_depths(&depths);
308        assert_eq!(cov.num_runs(), 1);
309        assert!((cov.mean_coverage() - 3.0).abs() < 1e-10);
310    }
311
312    #[test]
313    fn intervals_beyond_chrom_length() {
314        // Interval extends past chrom_length — should be clamped
315        let cov = RleCoverage::from_intervals(&[iv(5, 200)], 10);
316        assert_eq!(cov.len(), 10);
317        assert_eq!(cov.get(5), 1);
318        assert_eq!(cov.get(9), 1);
319        let expanded = cov.to_vec();
320        assert_eq!(expanded.len(), 10);
321    }
322}