Skip to main content

rsomics_bed_reldist/
lib.rs

1//! Relative distance statistics between two BED interval sets.
2//!
3//! ## Algorithm
4//!
5//! For each query interval in file A, finds the nearest upstream and downstream
6//! intervals in file B. The relative distance is:
7//!
8//!   `relDist = min(distLeft, distRight) / (distLeft + distRight)`
9//!
10//! where `distLeft` is the distance to the B interval whose end is closest
11//! upstream, and `distRight` is the distance to the B interval whose start is
12//! closest downstream. Only intervals on the same chromosome are considered.
13//!
14//! Both files must be sorted by chromosome then start position (matching
15//! `bedtools sort` / `sort -k1,1 -k2,2n` output).
16//!
17//! The relative distance is binned into 100 equal bins over [0.0, 0.5]. The
18//! output reports, for each bin, the observed fraction and count.
19//!
20//! ## Reference
21//!
22//! Favorov et al. (2012). Exploring Massive, Genome Scale Datasets with the
23//! `GenometriCorr` Package. PLOS Computational Biology 8(5): e1002529.
24//! DOI: 10.1371/journal.pcbi.1002529
25
26use std::io::{BufRead, BufReader, Read, Write};
27
28use rsomics_common::{Result, RsomicsError};
29
30const NUM_BINS: usize = 100;
31
32/// One row of the reldist output table.
33#[derive(Debug, Clone)]
34pub struct RelDistRow {
35    pub rel_dist: f64,
36    pub count: u64,
37    pub total: u64,
38    pub fraction: f64,
39}
40
41/// Parsed BED3 record (chrom, start, end).
42#[derive(Debug, Clone)]
43struct Bed3 {
44    chrom: String,
45    start: i64,
46    end: i64,
47}
48
49fn parse_bed3(line: &str) -> Option<Bed3> {
50    let t = line.trim_end_matches(['\n', '\r']);
51    if t.is_empty() || t.starts_with('#') || t.starts_with("track") || t.starts_with("browser") {
52        return None;
53    }
54    let mut cols = t.splitn(4, '\t');
55    let chrom = cols.next()?.to_owned();
56    let start: i64 = cols.next()?.parse().ok()?;
57    let end: i64 = cols.next()?.parse().ok()?;
58    Some(Bed3 { chrom, start, end })
59}
60
61fn read_bed<R: Read>(r: R) -> Result<Vec<Bed3>> {
62    let mut records = Vec::new();
63    for line in BufReader::new(r).lines() {
64        let line = line.map_err(RsomicsError::Io)?;
65        if let Some(rec) = parse_bed3(&line) {
66            records.push(rec);
67        }
68    }
69    Ok(records)
70}
71
72/// Compute relative distances between query intervals in `a` and reference
73/// intervals in `b`, returning a 100-bin histogram table.
74///
75/// Both `a` and `b` must be sorted by (chrom, start).
76pub fn reldist<RA: Read, RB: Read>(a: RA, b: RB) -> Result<Vec<RelDistRow>> {
77    let a_recs = read_bed(a)?;
78    let b_recs = read_bed(b)?;
79
80    let mut bins = [0u64; NUM_BINS];
81    let mut total: u64 = 0;
82
83    // Group b by chromosome for O(log n) per-record lookup.
84    let mut chrom_ranges: Vec<(&str, usize, usize)> = Vec::new();
85    {
86        let mut i = 0;
87        while i < b_recs.len() {
88            let chrom = b_recs[i].chrom.as_str();
89            let start = i;
90            while i < b_recs.len() && b_recs[i].chrom == chrom {
91                i += 1;
92            }
93            chrom_ranges.push((chrom, start, i));
94        }
95    }
96
97    for a_rec in &a_recs {
98        let Some((_ch, b_start, b_end)) = chrom_ranges
99            .iter()
100            .find(|(c, _, _)| *c == a_rec.chrom.as_str())
101        else {
102            continue;
103        };
104        let b_slice = &b_recs[*b_start..*b_end];
105        if b_slice.is_empty() {
106            continue;
107        }
108
109        let a_mid = i64::midpoint(a_rec.start, a_rec.end);
110
111        // Find leftmost b whose start > a_mid → right neighbor.
112        let right_idx = b_slice.partition_point(|b| b.start <= a_mid);
113
114        let dist_left = if right_idx > 0 {
115            let b_left = &b_slice[right_idx - 1];
116            let b_mid = i64::midpoint(b_left.start, b_left.end);
117            (a_mid - b_mid).max(0)
118        } else {
119            i64::MAX
120        };
121
122        let dist_right = if right_idx < b_slice.len() {
123            let b_right = &b_slice[right_idx];
124            let b_mid = i64::midpoint(b_right.start, b_right.end);
125            (b_mid - a_mid).max(0)
126        } else {
127            i64::MAX
128        };
129
130        if dist_left == i64::MAX && dist_right == i64::MAX {
131            continue;
132        }
133
134        let d_total = match (dist_left, dist_right) {
135            (i64::MAX, _) => dist_right * 2,
136            (_, i64::MAX) => dist_left * 2,
137            (l, r) => l + r,
138        };
139        if d_total == 0 {
140            // Overlapping — relative distance is 0 → bin 0.
141            bins[0] += 1;
142            total += 1;
143            continue;
144        }
145
146        let min_dist = dist_left.min(dist_right);
147        // Both min_dist and d_total are non-negative; rel ∈ [0.0, 0.5].
148        #[allow(clippy::cast_precision_loss)]
149        let rel = min_dist as f64 / d_total as f64;
150        // Multiply by 2 to map [0.0, 0.5] → [0.0, 1.0], then scale to [0, NUM_BINS).
151        // rel is always non-negative, so truncation is the only concern (no sign loss).
152        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
153        let bin = (rel * 2.0 * NUM_BINS as f64) as usize;
154        let bin = bin.min(NUM_BINS - 1);
155        bins[bin] += 1;
156        total += 1;
157    }
158
159    let rows: Vec<RelDistRow> = (0..NUM_BINS)
160        .map(|i| {
161            // Bin centres: domain [0, 0.5], each bin covers 0.5/NUM_BINS.
162            #[allow(clippy::cast_precision_loss)]
163            let rel_dist = (i as f64 + 0.5) * 0.5 / NUM_BINS as f64;
164            let count = bins[i];
165            let fraction = if total > 0 {
166                count as f64 / total as f64
167            } else {
168                0.0
169            };
170            RelDistRow {
171                rel_dist,
172                count,
173                total,
174                fraction,
175            }
176        })
177        .collect();
178
179    Ok(rows)
180}
181
182/// Write reldist results to `w` in tab-separated format (matching bedtools reldist output).
183pub fn write_reldist<W: Write>(rows: &[RelDistRow], w: &mut W) -> Result<()> {
184    writeln!(w, "rel_dist\tcount\ttotal\tfraction").map_err(RsomicsError::Io)?;
185    for row in rows {
186        writeln!(
187            w,
188            "{:.2}\t{}\t{}\t{:.4}",
189            row.rel_dist, row.count, row.total, row.fraction
190        )
191        .map_err(RsomicsError::Io)?;
192    }
193    Ok(())
194}
195
196#[cfg(test)]
197mod tests {
198    use super::*;
199    use std::io::Cursor;
200
201    fn run(a: &str, b: &str) -> Vec<RelDistRow> {
202        reldist(Cursor::new(a), Cursor::new(b)).unwrap()
203    }
204
205    #[test]
206    fn output_has_100_bins() {
207        let a = "chr1\t100\t200\n";
208        let b = "chr1\t50\t80\nchr1\t250\t300\n";
209        let rows = run(a, b);
210        assert_eq!(rows.len(), 100, "must have 100 bins");
211    }
212
213    #[test]
214    fn empty_a_returns_zero_counts() {
215        let rows = run("", "chr1\t100\t200\n");
216        let total: u64 = rows.iter().map(|r| r.count).sum();
217        assert_eq!(total, 0);
218    }
219
220    #[test]
221    fn empty_b_returns_zero_counts() {
222        let rows = run("chr1\t100\t200\n", "");
223        let total: u64 = rows.iter().map(|r| r.count).sum();
224        assert_eq!(total, 0);
225    }
226
227    #[test]
228    fn total_equals_num_a_records_with_neighbors() {
229        let a = "chr1\t100\t200\nchr1\t400\t500\n";
230        let b = "chr1\t50\t80\nchr1\t250\t300\nchr1\t600\t700\n";
231        let rows = run(a, b);
232        let total = rows.iter().map(|r| r.count).sum::<u64>();
233        assert_eq!(total, 2);
234    }
235
236    #[test]
237    fn fractions_sum_to_one_when_records_exist() {
238        let a = "chr1\t100\t200\nchr1\t400\t500\nchr1\t700\t800\n";
239        let b = "chr1\t50\t80\nchr1\t300\t350\nchr1\t600\t650\nchr1\t900\t950\n";
240        let rows = run(a, b);
241        let total_count: u64 = rows.iter().map(|r| r.count).sum();
242        if total_count > 0 {
243            let frac_sum: f64 = rows.iter().map(|r| r.fraction).sum();
244            assert!(
245                (frac_sum - 1.0).abs() < 1e-9,
246                "fractions must sum to 1.0 (got {frac_sum})"
247            );
248        }
249    }
250
251    #[test]
252    fn different_chromosomes_ignored() {
253        let a = "chr1\t100\t200\n";
254        let b = "chr2\t50\t80\n";
255        let rows = run(a, b);
256        let total: u64 = rows.iter().map(|r| r.count).sum();
257        assert_eq!(total, 0, "different chromosomes must not be matched");
258    }
259}