Skip to main content

rsomics_bed_multiinter/
lib.rs

1//! Multi-file interval intersection depth sweep.
2//!
3//! ## Algorithm
4//!
5//! For each position in the genome, reports how many of the N input BED files
6//! have at least one interval covering that position. Uses a coordinate-sweep
7//! (priority-queue over start/end events) per chromosome, emitting a row for
8//! every contiguous run of constant depth.
9//!
10//! Output columns (tab-separated):
11//! ```text
12//! chrom  start  end  depth  list  file1  file2  ...  fileN
13//! ```
14//! - `depth`: number of files with coverage at this position.
15//! - `list`: comma-separated names (1-based indices, or `-names` labels) of
16//!   covering files; `"none"` when depth = 0.
17//! - `file1..fileN`: 1 if that file has coverage, 0 otherwise.
18//!
19//! All input files must be sorted by chrom then start. Chromosomes are
20//! processed in the order they first appear across files (matching bedtools).
21//!
22//! ## Reference
23//!
24//! `BEDTools` multiinter — Quinlan & Hall (2010). Bioinformatics 26(6): 841–842.
25//! DOI: 10.1093/bioinformatics/btq033
26
27use std::collections::BinaryHeap;
28use std::io::{BufRead, BufReader, BufWriter, Read, Write};
29
30use rsomics_common::{Result, RsomicsError};
31
32// ── Data types ─────────────────────────────────────────────────────────────
33
34/// A parsed BED record (first three columns only).
35#[derive(Debug, Clone)]
36struct Bed3 {
37    chrom: String,
38    start: i64,
39    end: i64,
40}
41
42/// One coordinate event: a start or end of a file's interval.
43#[derive(Debug, Eq, PartialEq)]
44struct Event {
45    /// Coordinate position.
46    coord: i64,
47    /// Whether this is a start (true) or end (false) event.
48    is_start: bool,
49    /// Which file (0-based index) this event belongs to.
50    file_idx: usize,
51}
52
53/// Events are ordered by coord ascending, then ends before starts at the same
54/// coord (matching bedtools' tie-breaking: process END before START so an
55/// interval [10,20) and [20,30) do not share a reported segment at coord 20).
56impl Ord for Event {
57    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
58        // BinaryHeap is a max-heap; we want min-heap, so reverse.
59        other
60            .coord
61            .cmp(&self.coord)
62            .then(self.is_start.cmp(&other.is_start)) // false (end) < true (start)
63    }
64}
65impl PartialOrd for Event {
66    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
67        Some(self.cmp(other))
68    }
69}
70
71// ── Reader ─────────────────────────────────────────────────────────────────
72
73struct FileReader<R: BufRead> {
74    reader: R,
75    /// The buffered "next" record (peeked but not yet consumed).
76    peeked: Option<Bed3>,
77}
78
79impl<R: BufRead> FileReader<R> {
80    fn new(reader: R) -> Result<Self> {
81        let mut fr = FileReader {
82            reader,
83            peeked: None,
84        };
85        fr.peeked = fr.read_next()?;
86        Ok(fr)
87    }
88
89    fn read_next(&mut self) -> Result<Option<Bed3>> {
90        loop {
91            let mut line = String::new();
92            let n = self.reader.read_line(&mut line).map_err(RsomicsError::Io)?;
93            if n == 0 {
94                return Ok(None);
95            }
96            let t = line.trim_end_matches(['\n', '\r']);
97            if t.is_empty()
98                || t.starts_with('#')
99                || t.starts_with("track")
100                || t.starts_with("browser")
101            {
102                continue;
103            }
104            let mut cols = t.splitn(4, '\t');
105            let chrom = match cols.next() {
106                Some(c) => c.to_owned(),
107                None => continue,
108            };
109            let start: i64 = match cols.next().and_then(|s| s.parse().ok()) {
110                Some(v) => v,
111                None => continue,
112            };
113            let end: i64 = match cols.next().and_then(|s| s.parse().ok()) {
114                Some(v) => v,
115                None => continue,
116            };
117            return Ok(Some(Bed3 { chrom, start, end }));
118        }
119    }
120
121    /// Peek at the current record without consuming it.
122    fn peek(&self) -> Option<&Bed3> {
123        self.peeked.as_ref()
124    }
125
126    /// Consume and return the current record, loading the next.
127    fn next(&mut self) -> Result<Option<Bed3>> {
128        let cur = self.peeked.take();
129        self.peeked = self.read_next()?;
130        Ok(cur)
131    }
132}
133
134// ── Core sweep ─────────────────────────────────────────────────────────────
135
136/// Run the multi-interval intersection sweep.
137///
138/// Reads N BED files from `readers` (each `(impl Read)`), sweeps coordinates,
139/// and writes the result table to `out`.
140pub fn multiinter<R: Read, W: Write>(
141    readers: Vec<R>,
142    names: &[String],
143    header: bool,
144    out: W,
145) -> Result<()> {
146    let n = readers.len();
147    let mut file_readers: Vec<FileReader<BufReader<R>>> = readers
148        .into_iter()
149        .map(|r| FileReader::new(BufReader::new(r)))
150        .collect::<Result<Vec<_>>>()?;
151
152    let mut writer = BufWriter::new(out);
153
154    if header {
155        write!(writer, "chrom\tstart\tend\tnum\tlist").map_err(RsomicsError::Io)?;
156        for name in names {
157            write!(writer, "\t{name}").map_err(RsomicsError::Io)?;
158        }
159        writeln!(writer).map_err(RsomicsError::Io)?;
160    }
161
162    // Determine chromosome processing order: first appearance across all files.
163    // We collect the first chrom of each file (their "current" chrom) and build
164    // an ordered list as we process.
165    let mut processed_chroms: Vec<String> = Vec::new();
166
167    loop {
168        // Find the lexicographically smallest chrom among all files' current records.
169        let next_chrom = file_readers
170            .iter()
171            .filter_map(|fr| fr.peek().map(|r| r.chrom.as_str()))
172            .min()
173            .map(ToOwned::to_owned);
174
175        let Some(chrom) = next_chrom else {
176            break;
177        };
178
179        if !processed_chroms.contains(&chrom) {
180            processed_chroms.push(chrom.clone());
181        }
182
183        // Build the event queue for this chromosome from all files.
184        let mut queue: BinaryHeap<Event> = BinaryHeap::new();
185
186        for (file_idx, fr) in file_readers.iter_mut().enumerate() {
187            // Consume all intervals on this chromosome, pushing events.
188            while fr.peek().is_some_and(|r| r.chrom == chrom) {
189                let rec = fr.next()?.unwrap();
190                queue.push(Event {
191                    coord: rec.start,
192                    is_start: true,
193                    file_idx,
194                });
195                queue.push(Event {
196                    coord: rec.end,
197                    is_start: false,
198                    file_idx,
199                });
200            }
201        }
202
203        // Sweep: track which files are "active" (have depth > 0) at each coord.
204        // Per-file depth tracks nesting so we know exactly when presence changes.
205        // A new output row is emitted only when the PRESENCE SET changes — matching
206        // bedtools' behaviour of outputting 0/1 per file, not raw depth.
207        let mut depth = vec![0u32; n];
208        let mut active_count = 0usize;
209        let mut segment_start: i64 = 0;
210        let mut in_segment = false;
211
212        while let Some(event) = queue.pop() {
213            let coord = event.coord;
214
215            // Does this event change which files are PRESENT (depth 0→1 or 1→0)?
216            let presence_changes = if event.is_start {
217                depth[event.file_idx] == 0
218            } else {
219                depth[event.file_idx] == 1
220            };
221
222            // When presence set changes: close the current open segment (if any).
223            if presence_changes && in_segment && coord > segment_start {
224                emit_row(
225                    &mut writer,
226                    &chrom,
227                    segment_start,
228                    coord,
229                    active_count,
230                    &depth,
231                    names,
232                )?;
233            }
234
235            // Update depth.
236            if event.is_start {
237                depth[event.file_idx] += 1;
238                if depth[event.file_idx] == 1 {
239                    active_count += 1;
240                }
241            } else {
242                depth[event.file_idx] -= 1;
243                if depth[event.file_idx] == 0 {
244                    active_count = active_count.saturating_sub(1);
245                }
246            }
247
248            // After updating, start a new segment at coord if there are active files.
249            if presence_changes {
250                if active_count > 0 {
251                    segment_start = coord;
252                    in_segment = true;
253                } else {
254                    in_segment = false;
255                }
256            }
257        }
258    }
259
260    writer.flush().map_err(RsomicsError::Io)?;
261    Ok(())
262}
263
264fn emit_row<W: Write>(
265    w: &mut W,
266    chrom: &str,
267    start: i64,
268    end: i64,
269    active_count: usize,
270    depth: &[u32],
271    names: &[String],
272) -> Result<()> {
273    // Build the comma-separated list of covering file names.
274    let mut list = String::new();
275    let mut first = true;
276    for (i, &d) in depth.iter().enumerate() {
277        if d > 0 {
278            if !first {
279                list.push(',');
280            }
281            list.push_str(&names[i]);
282            first = false;
283        }
284    }
285    if list.is_empty() {
286        list.push_str("none");
287    }
288
289    write!(w, "{chrom}\t{start}\t{end}\t{active_count}\t{list}").map_err(RsomicsError::Io)?;
290    for &d in depth {
291        write!(w, "\t{}", u32::from(d > 0)).map_err(RsomicsError::Io)?;
292    }
293    writeln!(w).map_err(RsomicsError::Io)
294}
295
296// ── Tests ───────────────────────────────────────────────────────────────────
297
298#[cfg(test)]
299mod tests {
300    use std::io::Cursor;
301
302    use super::*;
303
304    fn run(inputs: &[&str], names: &[&str]) -> String {
305        let readers: Vec<Cursor<&str>> = inputs.iter().map(|s| Cursor::new(*s)).collect();
306        let name_strings: Vec<String> = names.iter().copied().map(str::to_string).collect();
307        let mut out = Vec::new();
308        multiinter(readers, &name_strings, false, &mut out).unwrap();
309        String::from_utf8(out).unwrap()
310    }
311
312    #[test]
313    fn no_overlap_two_files() {
314        let out = run(&["chr1\t100\t200\n", "chr1\t300\t400\n"], &["f1", "f2"]);
315        let lines: Vec<&str> = out.lines().collect();
316        // Two non-overlapping intervals → 2 rows, each with depth 1
317        assert_eq!(lines.len(), 2, "lines: {out:?}");
318        assert!(lines[0].contains("\t1\tf1\t1\t0"), "row0: {}", lines[0]);
319        assert!(lines[1].contains("\t1\tf2\t0\t1"), "row1: {}", lines[1]);
320    }
321
322    #[test]
323    fn full_overlap_two_files() {
324        let out = run(&["chr1\t100\t300\n", "chr1\t100\t300\n"], &["f1", "f2"]);
325        let lines: Vec<&str> = out.lines().collect();
326        assert_eq!(lines.len(), 1, "lines: {out:?}");
327        assert!(lines[0].contains("\t2\tf1,f2\t1\t1"), "row: {}", lines[0]);
328    }
329
330    #[test]
331    fn partial_overlap_three_files() {
332        // A:[100,200) B:[150,250) C:[50,120)
333        // Events: 50(S,C),100(S,A),120(E,C),150(S,B),200(E,A),250(E,B)
334        // Segments: [50,100)→C only, [100,120)→A+C, [120,150)→A only,
335        //           [150,200)→A+B, [200,250)→B only
336        let out = run(
337            &["chr1\t100\t200\n", "chr1\t150\t250\n", "chr1\t50\t120\n"],
338            &["f1", "f2", "f3"],
339        );
340        let lines: Vec<&str> = out.lines().collect();
341        assert_eq!(lines.len(), 5, "lines: {out:?}");
342        // [50,100) depth=1, list=f3
343        assert!(lines[0].starts_with("chr1\t50\t100\t1\tf3"), "{}", lines[0]);
344        // [100,120) depth=2, list=f1,f3
345        assert!(
346            lines[1].starts_with("chr1\t100\t120\t2\tf1,f3"),
347            "{}",
348            lines[1]
349        );
350        // [120,150) depth=1, list=f1
351        assert!(
352            lines[2].starts_with("chr1\t120\t150\t1\tf1"),
353            "{}",
354            lines[2]
355        );
356        // [150,200) depth=2, list=f1,f2
357        assert!(
358            lines[3].starts_with("chr1\t150\t200\t2\tf1,f2"),
359            "{}",
360            lines[3]
361        );
362        // [200,250) depth=1, list=f2
363        assert!(
364            lines[4].starts_with("chr1\t200\t250\t1\tf2"),
365            "{}",
366            lines[4]
367        );
368    }
369
370    #[test]
371    fn header_printed_when_requested() {
372        let readers: Vec<Cursor<&str>> = vec![Cursor::new("chr1\t10\t20\n")];
373        let names = vec!["s1".to_string()];
374        let mut out = Vec::new();
375        multiinter(readers, &names, true, &mut out).unwrap();
376        let s = String::from_utf8(out).unwrap();
377        assert!(s.starts_with("chrom\tstart\tend\tnum\tlist\ts1"), "{s}");
378    }
379
380    #[test]
381    fn multi_chrom_ordering() {
382        let out = run(
383            &["chr1\t10\t20\nchr2\t10\t20\n", "chr2\t15\t25\n"],
384            &["f1", "f2"],
385        );
386        let lines: Vec<&str> = out.lines().collect();
387        // chr1: f1 only → 1 row
388        // chr2: [10,15) f1, [15,20) f1+f2, [20,25) f2 → 3 rows
389        assert_eq!(lines.len(), 4, "lines:\n{out}");
390        assert!(lines[0].starts_with("chr1"), "{}", lines[0]);
391        assert!(lines[1].starts_with("chr2\t10\t15"), "{}", lines[1]);
392        assert!(lines[2].starts_with("chr2\t15\t20"), "{}", lines[2]);
393        assert!(lines[3].starts_with("chr2\t20\t25"), "{}", lines[3]);
394    }
395
396    #[test]
397    fn empty_file_does_not_crash() {
398        let out = run(&["", "chr1\t100\t200\n"], &["f1", "f2"]);
399        let lines: Vec<&str> = out.lines().collect();
400        assert_eq!(lines.len(), 1);
401        assert!(lines[0].contains("\t1\tf2\t0\t1"), "{}", lines[0]);
402    }
403}