use std::collections::BinaryHeap;
use std::io::{BufRead, BufReader, BufWriter, Read, Write};
use rsomics_common::{Result, RsomicsError};
#[derive(Debug, Clone)]
struct Bed3 {
chrom: String,
start: i64,
end: i64,
}
#[derive(Debug, Eq, PartialEq)]
struct Event {
coord: i64,
is_start: bool,
file_idx: usize,
}
impl Ord for Event {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
other
.coord
.cmp(&self.coord)
.then(self.is_start.cmp(&other.is_start)) }
}
impl PartialOrd for Event {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
struct FileReader<R: BufRead> {
reader: R,
peeked: Option<Bed3>,
}
impl<R: BufRead> FileReader<R> {
fn new(reader: R) -> Result<Self> {
let mut fr = FileReader {
reader,
peeked: None,
};
fr.peeked = fr.read_next()?;
Ok(fr)
}
fn read_next(&mut self) -> Result<Option<Bed3>> {
loop {
let mut line = String::new();
let n = self.reader.read_line(&mut line).map_err(RsomicsError::Io)?;
if n == 0 {
return Ok(None);
}
let t = line.trim_end_matches(['\n', '\r']);
if t.is_empty()
|| t.starts_with('#')
|| t.starts_with("track")
|| t.starts_with("browser")
{
continue;
}
let mut cols = t.splitn(4, '\t');
let chrom = match cols.next() {
Some(c) => c.to_owned(),
None => continue,
};
let start: i64 = match cols.next().and_then(|s| s.parse().ok()) {
Some(v) => v,
None => continue,
};
let end: i64 = match cols.next().and_then(|s| s.parse().ok()) {
Some(v) => v,
None => continue,
};
return Ok(Some(Bed3 { chrom, start, end }));
}
}
fn peek(&self) -> Option<&Bed3> {
self.peeked.as_ref()
}
fn next(&mut self) -> Result<Option<Bed3>> {
let cur = self.peeked.take();
self.peeked = self.read_next()?;
Ok(cur)
}
}
pub fn multiinter<R: Read, W: Write>(
readers: Vec<R>,
names: &[String],
header: bool,
out: W,
) -> Result<()> {
let n = readers.len();
let mut file_readers: Vec<FileReader<BufReader<R>>> = readers
.into_iter()
.map(|r| FileReader::new(BufReader::new(r)))
.collect::<Result<Vec<_>>>()?;
let mut writer = BufWriter::new(out);
if header {
write!(writer, "chrom\tstart\tend\tnum\tlist").map_err(RsomicsError::Io)?;
for name in names {
write!(writer, "\t{name}").map_err(RsomicsError::Io)?;
}
writeln!(writer).map_err(RsomicsError::Io)?;
}
let mut processed_chroms: Vec<String> = Vec::new();
loop {
let next_chrom = file_readers
.iter()
.filter_map(|fr| fr.peek().map(|r| r.chrom.as_str()))
.min()
.map(ToOwned::to_owned);
let Some(chrom) = next_chrom else {
break;
};
if !processed_chroms.contains(&chrom) {
processed_chroms.push(chrom.clone());
}
let mut queue: BinaryHeap<Event> = BinaryHeap::new();
for (file_idx, fr) in file_readers.iter_mut().enumerate() {
while fr.peek().is_some_and(|r| r.chrom == chrom) {
let rec = fr.next()?.unwrap();
queue.push(Event {
coord: rec.start,
is_start: true,
file_idx,
});
queue.push(Event {
coord: rec.end,
is_start: false,
file_idx,
});
}
}
let mut depth = vec![0u32; n];
let mut active_count = 0usize;
let mut segment_start: i64 = 0;
let mut in_segment = false;
while let Some(event) = queue.pop() {
let coord = event.coord;
let presence_changes = if event.is_start {
depth[event.file_idx] == 0
} else {
depth[event.file_idx] == 1
};
if presence_changes && in_segment && coord > segment_start {
emit_row(
&mut writer,
&chrom,
segment_start,
coord,
active_count,
&depth,
names,
)?;
}
if event.is_start {
depth[event.file_idx] += 1;
if depth[event.file_idx] == 1 {
active_count += 1;
}
} else {
depth[event.file_idx] -= 1;
if depth[event.file_idx] == 0 {
active_count = active_count.saturating_sub(1);
}
}
if presence_changes {
if active_count > 0 {
segment_start = coord;
in_segment = true;
} else {
in_segment = false;
}
}
}
}
writer.flush().map_err(RsomicsError::Io)?;
Ok(())
}
fn emit_row<W: Write>(
w: &mut W,
chrom: &str,
start: i64,
end: i64,
active_count: usize,
depth: &[u32],
names: &[String],
) -> Result<()> {
let mut list = String::new();
let mut first = true;
for (i, &d) in depth.iter().enumerate() {
if d > 0 {
if !first {
list.push(',');
}
list.push_str(&names[i]);
first = false;
}
}
if list.is_empty() {
list.push_str("none");
}
write!(w, "{chrom}\t{start}\t{end}\t{active_count}\t{list}").map_err(RsomicsError::Io)?;
for &d in depth {
write!(w, "\t{}", u32::from(d > 0)).map_err(RsomicsError::Io)?;
}
writeln!(w).map_err(RsomicsError::Io)
}
#[cfg(test)]
mod tests {
use std::io::Cursor;
use super::*;
fn run(inputs: &[&str], names: &[&str]) -> String {
let readers: Vec<Cursor<&str>> = inputs.iter().map(|s| Cursor::new(*s)).collect();
let name_strings: Vec<String> = names.iter().copied().map(str::to_string).collect();
let mut out = Vec::new();
multiinter(readers, &name_strings, false, &mut out).unwrap();
String::from_utf8(out).unwrap()
}
#[test]
fn no_overlap_two_files() {
let out = run(&["chr1\t100\t200\n", "chr1\t300\t400\n"], &["f1", "f2"]);
let lines: Vec<&str> = out.lines().collect();
assert_eq!(lines.len(), 2, "lines: {out:?}");
assert!(lines[0].contains("\t1\tf1\t1\t0"), "row0: {}", lines[0]);
assert!(lines[1].contains("\t1\tf2\t0\t1"), "row1: {}", lines[1]);
}
#[test]
fn full_overlap_two_files() {
let out = run(&["chr1\t100\t300\n", "chr1\t100\t300\n"], &["f1", "f2"]);
let lines: Vec<&str> = out.lines().collect();
assert_eq!(lines.len(), 1, "lines: {out:?}");
assert!(lines[0].contains("\t2\tf1,f2\t1\t1"), "row: {}", lines[0]);
}
#[test]
fn partial_overlap_three_files() {
let out = run(
&["chr1\t100\t200\n", "chr1\t150\t250\n", "chr1\t50\t120\n"],
&["f1", "f2", "f3"],
);
let lines: Vec<&str> = out.lines().collect();
assert_eq!(lines.len(), 5, "lines: {out:?}");
assert!(lines[0].starts_with("chr1\t50\t100\t1\tf3"), "{}", lines[0]);
assert!(
lines[1].starts_with("chr1\t100\t120\t2\tf1,f3"),
"{}",
lines[1]
);
assert!(
lines[2].starts_with("chr1\t120\t150\t1\tf1"),
"{}",
lines[2]
);
assert!(
lines[3].starts_with("chr1\t150\t200\t2\tf1,f2"),
"{}",
lines[3]
);
assert!(
lines[4].starts_with("chr1\t200\t250\t1\tf2"),
"{}",
lines[4]
);
}
#[test]
fn header_printed_when_requested() {
let readers: Vec<Cursor<&str>> = vec![Cursor::new("chr1\t10\t20\n")];
let names = vec!["s1".to_string()];
let mut out = Vec::new();
multiinter(readers, &names, true, &mut out).unwrap();
let s = String::from_utf8(out).unwrap();
assert!(s.starts_with("chrom\tstart\tend\tnum\tlist\ts1"), "{s}");
}
#[test]
fn multi_chrom_ordering() {
let out = run(
&["chr1\t10\t20\nchr2\t10\t20\n", "chr2\t15\t25\n"],
&["f1", "f2"],
);
let lines: Vec<&str> = out.lines().collect();
assert_eq!(lines.len(), 4, "lines:\n{out}");
assert!(lines[0].starts_with("chr1"), "{}", lines[0]);
assert!(lines[1].starts_with("chr2\t10\t15"), "{}", lines[1]);
assert!(lines[2].starts_with("chr2\t15\t20"), "{}", lines[2]);
assert!(lines[3].starts_with("chr2\t20\t25"), "{}", lines[3]);
}
#[test]
fn empty_file_does_not_crash() {
let out = run(&["", "chr1\t100\t200\n"], &["f1", "f2"]);
let lines: Vec<&str> = out.lines().collect();
assert_eq!(lines.len(), 1);
assert!(lines[0].contains("\t1\tf2\t0\t1"), "{}", lines[0]);
}
}