rsomics_bed_reldist/
lib.rs1use std::io::{BufRead, BufReader, Read, Write};
27
28use rsomics_common::{Result, RsomicsError};
29
30const NUM_BINS: usize = 100;
31
32#[derive(Debug, Clone)]
34pub struct RelDistRow {
35 pub rel_dist: f64,
36 pub count: u64,
37 pub total: u64,
38 pub fraction: f64,
39}
40
41#[derive(Debug, Clone)]
43struct Bed3 {
44 chrom: String,
45 start: i64,
46 end: i64,
47}
48
49fn parse_bed3(line: &str) -> Option<Bed3> {
50 let t = line.trim_end_matches(['\n', '\r']);
51 if t.is_empty() || t.starts_with('#') || t.starts_with("track") || t.starts_with("browser") {
52 return None;
53 }
54 let mut cols = t.splitn(4, '\t');
55 let chrom = cols.next()?.to_owned();
56 let start: i64 = cols.next()?.parse().ok()?;
57 let end: i64 = cols.next()?.parse().ok()?;
58 Some(Bed3 { chrom, start, end })
59}
60
61fn read_bed<R: Read>(r: R) -> Result<Vec<Bed3>> {
62 let mut records = Vec::new();
63 for line in BufReader::new(r).lines() {
64 let line = line.map_err(RsomicsError::Io)?;
65 if let Some(rec) = parse_bed3(&line) {
66 records.push(rec);
67 }
68 }
69 Ok(records)
70}
71
72pub fn reldist<RA: Read, RB: Read>(a: RA, b: RB) -> Result<Vec<RelDistRow>> {
77 let a_recs = read_bed(a)?;
78 let b_recs = read_bed(b)?;
79
80 let mut bins = [0u64; NUM_BINS];
81 let mut total: u64 = 0;
82
83 let mut chrom_ranges: Vec<(&str, usize, usize)> = Vec::new();
85 {
86 let mut i = 0;
87 while i < b_recs.len() {
88 let chrom = b_recs[i].chrom.as_str();
89 let start = i;
90 while i < b_recs.len() && b_recs[i].chrom == chrom {
91 i += 1;
92 }
93 chrom_ranges.push((chrom, start, i));
94 }
95 }
96
97 for a_rec in &a_recs {
98 let Some((_ch, b_start, b_end)) = chrom_ranges
99 .iter()
100 .find(|(c, _, _)| *c == a_rec.chrom.as_str())
101 else {
102 continue;
103 };
104 let b_slice = &b_recs[*b_start..*b_end];
105 if b_slice.is_empty() {
106 continue;
107 }
108
109 let a_mid = i64::midpoint(a_rec.start, a_rec.end);
110
111 let right_idx = b_slice.partition_point(|b| b.start <= a_mid);
113
114 let dist_left = if right_idx > 0 {
115 let b_left = &b_slice[right_idx - 1];
116 let b_mid = i64::midpoint(b_left.start, b_left.end);
117 (a_mid - b_mid).max(0)
118 } else {
119 i64::MAX
120 };
121
122 let dist_right = if right_idx < b_slice.len() {
123 let b_right = &b_slice[right_idx];
124 let b_mid = i64::midpoint(b_right.start, b_right.end);
125 (b_mid - a_mid).max(0)
126 } else {
127 i64::MAX
128 };
129
130 if dist_left == i64::MAX && dist_right == i64::MAX {
131 continue;
132 }
133
134 let d_total = match (dist_left, dist_right) {
135 (i64::MAX, _) => dist_right * 2,
136 (_, i64::MAX) => dist_left * 2,
137 (l, r) => l + r,
138 };
139 if d_total == 0 {
140 bins[0] += 1;
142 total += 1;
143 continue;
144 }
145
146 let min_dist = dist_left.min(dist_right);
147 #[allow(clippy::cast_precision_loss)]
149 let rel = min_dist as f64 / d_total as f64;
150 #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
153 let bin = (rel * 2.0 * NUM_BINS as f64) as usize;
154 let bin = bin.min(NUM_BINS - 1);
155 bins[bin] += 1;
156 total += 1;
157 }
158
159 let rows: Vec<RelDistRow> = (0..NUM_BINS)
160 .map(|i| {
161 #[allow(clippy::cast_precision_loss)]
163 let rel_dist = (i as f64 + 0.5) * 0.5 / NUM_BINS as f64;
164 let count = bins[i];
165 let fraction = if total > 0 {
166 count as f64 / total as f64
167 } else {
168 0.0
169 };
170 RelDistRow {
171 rel_dist,
172 count,
173 total,
174 fraction,
175 }
176 })
177 .collect();
178
179 Ok(rows)
180}
181
182pub fn write_reldist<W: Write>(rows: &[RelDistRow], w: &mut W) -> Result<()> {
184 writeln!(w, "rel_dist\tcount\ttotal\tfraction").map_err(RsomicsError::Io)?;
185 for row in rows {
186 writeln!(
187 w,
188 "{:.2}\t{}\t{}\t{:.4}",
189 row.rel_dist, row.count, row.total, row.fraction
190 )
191 .map_err(RsomicsError::Io)?;
192 }
193 Ok(())
194}
195
196#[cfg(test)]
197mod tests {
198 use super::*;
199 use std::io::Cursor;
200
201 fn run(a: &str, b: &str) -> Vec<RelDistRow> {
202 reldist(Cursor::new(a), Cursor::new(b)).unwrap()
203 }
204
205 #[test]
206 fn output_has_100_bins() {
207 let a = "chr1\t100\t200\n";
208 let b = "chr1\t50\t80\nchr1\t250\t300\n";
209 let rows = run(a, b);
210 assert_eq!(rows.len(), 100, "must have 100 bins");
211 }
212
213 #[test]
214 fn empty_a_returns_zero_counts() {
215 let rows = run("", "chr1\t100\t200\n");
216 let total: u64 = rows.iter().map(|r| r.count).sum();
217 assert_eq!(total, 0);
218 }
219
220 #[test]
221 fn empty_b_returns_zero_counts() {
222 let rows = run("chr1\t100\t200\n", "");
223 let total: u64 = rows.iter().map(|r| r.count).sum();
224 assert_eq!(total, 0);
225 }
226
227 #[test]
228 fn total_equals_num_a_records_with_neighbors() {
229 let a = "chr1\t100\t200\nchr1\t400\t500\n";
230 let b = "chr1\t50\t80\nchr1\t250\t300\nchr1\t600\t700\n";
231 let rows = run(a, b);
232 let total = rows.iter().map(|r| r.count).sum::<u64>();
233 assert_eq!(total, 2);
234 }
235
236 #[test]
237 fn fractions_sum_to_one_when_records_exist() {
238 let a = "chr1\t100\t200\nchr1\t400\t500\nchr1\t700\t800\n";
239 let b = "chr1\t50\t80\nchr1\t300\t350\nchr1\t600\t650\nchr1\t900\t950\n";
240 let rows = run(a, b);
241 let total_count: u64 = rows.iter().map(|r| r.count).sum();
242 if total_count > 0 {
243 let frac_sum: f64 = rows.iter().map(|r| r.fraction).sum();
244 assert!(
245 (frac_sum - 1.0).abs() < 1e-9,
246 "fractions must sum to 1.0 (got {frac_sum})"
247 );
248 }
249 }
250
251 #[test]
252 fn different_chromosomes_ignored() {
253 let a = "chr1\t100\t200\n";
254 let b = "chr2\t50\t80\n";
255 let rows = run(a, b);
256 let total: u64 = rows.iter().map(|r| r.count).sum();
257 assert_eq!(total, 0, "different chromosomes must not be matched");
258 }
259}