Skip to main content

rsomics_bed_total_bp/
lib.rs

1use rsomics_common::{Result, RsomicsError};
2use std::io::BufRead;
3
4/// Sum `end - start` across all BED records; return the total base-pair count.
5///
6/// Header lines (`#`) and blank lines are skipped. Overlapping intervals are
7/// NOT merged — this reports the raw sum, identical to `bedtools genomecov -bg | awk '{sum+=($3-$2)} END{print sum}'`
8/// on a pre-sorted, non-merged BED.
9pub fn total_bp<R: BufRead>(reader: R) -> Result<u64> {
10    let mut total: u64 = 0;
11    for line in reader.lines() {
12        let line = line.map_err(RsomicsError::Io)?;
13        if line.starts_with('#') || line.is_empty() {
14            continue;
15        }
16        let mut fields = line.splitn(4, '\t');
17        let _chrom = fields.next().unwrap_or("");
18        let start_str = fields.next().unwrap_or("");
19        let end_str = fields.next().unwrap_or("");
20
21        let start: u64 = start_str
22            .parse()
23            .map_err(|e| RsomicsError::InvalidInput(format!("start: {e}")))?;
24        let end: u64 = end_str
25            .parse()
26            .map_err(|e| RsomicsError::InvalidInput(format!("end: {e}")))?;
27        total += end.saturating_sub(start);
28    }
29    Ok(total)
30}
31
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Runs `total_bp` over an in-memory BED string, panicking if it fails.
    fn sum_of(bed: &str) -> u64 {
        total_bp(Cursor::new(bed)).unwrap()
    }

    #[test]
    fn basic() {
        // Two 100 bp records on different chromosomes.
        assert_eq!(sum_of("chr1\t0\t100\nchr2\t200\t300\n"), 200);
    }

    #[test]
    fn skip_header() {
        // The leading `#` line must not be parsed as a record.
        assert_eq!(sum_of("# comment\nchr1\t0\t50\n"), 50);
    }

    #[test]
    fn overlapping_counted_twice() {
        // The total is the raw sum: the 50 shared bases are counted in both records.
        assert_eq!(sum_of("chr1\t0\t100\nchr1\t50\t150\n"), 200);
    }
}