use std::collections::HashMap;
use omics::coordinate::Contig;
use omics::coordinate::Strand;
use omics::coordinate::interval::interbase::Interval;
use omics::coordinate::position::Number;
use rust_lapper as lapper;
use crate::alignment::section::header::Record as Header;
use crate::liftover::stepthrough::interval_pair::ContiguousIntervalPair;
pub mod builder;
pub use builder::Builder;
pub type ChromosomeDictionary = HashMap<String, Number>;
#[derive(Clone, Debug, Eq, PartialEq)]
struct AnnotatedPair {
chain_id: usize,
pair: ContiguousIntervalPair,
}
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct LiftoverResult {
chain: Header,
segments: Vec<ContiguousIntervalPair>,
}
impl LiftoverResult {
pub fn chain(&self) -> &Header {
&self.chain
}
pub fn segments(&self) -> &[ContiguousIntervalPair] {
&self.segments
}
pub fn into_segments(self) -> Vec<ContiguousIntervalPair> {
self.segments
}
}
#[derive(Debug)]
pub struct Machine {
inner: HashMap<Contig, lapper::Lapper<Number, AnnotatedPair>>,
chains: HashMap<usize, Header>,
reference_chromosomes: ChromosomeDictionary,
query_chromosomes: ChromosomeDictionary,
}
impl Machine {
pub fn reference_chromosomes(&self) -> &ChromosomeDictionary {
&self.reference_chromosomes
}
pub fn query_chromosomes(&self) -> &ChromosomeDictionary {
&self.query_chromosomes
}
pub fn liftover(&self, interval: Interval) -> Option<Vec<LiftoverResult>> {
let entry = self.inner.get(interval.contig())?;
let (start, stop) = match interval.strand() {
Strand::Positive => (
interval.start().position().get(),
interval.end().position().get(),
),
Strand::Negative => (
interval.end().position().get(),
interval.start().position().get(),
),
};
let strand = interval.strand();
let mut hits = Vec::<(usize, ContiguousIntervalPair)>::new();
for e in entry.find(start, stop) {
if e.val.pair.reference().strand() != strand {
continue;
}
let pair = e.val.pair.clone().clamp(interval.clone()).unwrap();
hits.push((e.val.chain_id, pair));
}
if hits.is_empty() {
return None;
}
hits.sort_by_key(|(id, _)| *id);
let mut results = Vec::<LiftoverResult>::new();
for (chain_id, pair) in hits {
match results.last_mut() {
Some(last) if last.chain().id() == chain_id => {
last.segments.push(pair);
}
_ => {
let chain = self.chains[&chain_id].clone();
results.push(LiftoverResult {
chain,
segments: vec![pair],
});
}
}
}
for result in &mut results {
result.segments.sort_by(|a, b| {
a.reference()
.start()
.cmp(b.reference().start())
.then_with(|| a.reference().end().cmp(b.reference().end()))
});
}
results.sort_by(|a, b| {
b.chain()
.score()
.cmp(&a.chain().score())
.then_with(|| a.chain().id().cmp(&b.chain().id()))
});
Some(results)
}
}
#[cfg(test)]
mod tests {
use omics::coordinate::Contig;
use omics::coordinate::interbase::Coordinate;
use omics::coordinate::position::interbase::Position;
use super::*;
use crate::Reader;
use crate::liftover::machine;
#[test]
pub fn test_valid_positive_strand_liftover() -> Result<(), Box<dyn std::error::Error>> {
let data = b"chain 0 seq0 10 + 0 10 seq1 10 + 0 10 0\n10";
let reader = Reader::new(&data[..]);
let machine = machine::Builder.try_build_from(reader)?;
let from = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Positive, 1u64);
let to = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Positive, 7u64);
let interval = Interval::try_new(from, to)?;
let results = machine.liftover(interval).unwrap();
assert_eq!(results.len(), 1);
assert_eq!(results[0].segments().len(), 1);
let result = &results[0].segments()[0];
assert_eq!(result.reference().contig().as_str(), "seq0");
assert_eq!(result.reference().start().strand(), Strand::Positive);
assert_eq!(result.reference().start().position(), &Position::new(1));
assert_eq!(result.reference().end().strand(), Strand::Positive);
assert_eq!(result.reference().end().position(), &Position::new(7));
assert_eq!(result.query().contig().as_str(), "seq1");
assert_eq!(result.query().start().strand(), Strand::Positive);
assert_eq!(result.query().start().position(), &Position::new(1));
assert_eq!(result.query().end().strand(), Strand::Positive);
assert_eq!(result.query().end().position(), &Position::new(7));
Ok(())
}
#[test]
pub fn test_valid_negative_strand_liftover() -> Result<(), Box<dyn std::error::Error>> {
let data = b"chain 0 seq0 10 - 0 10 seq1 10 - 0 10 0\n10";
let reader = Reader::new(&data[..]);
let machine = machine::Builder.try_build_from(reader)?;
let from = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Negative, 7u64);
let to = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Negative, 1u64);
let interval = Interval::try_new(from, to)?;
let results = machine.liftover(interval).unwrap();
assert_eq!(results.len(), 1);
assert_eq!(results[0].segments().len(), 1);
let result = &results[0].segments()[0];
assert_eq!(result.reference().contig().as_str(), "seq0");
assert_eq!(result.reference().start().strand(), Strand::Negative);
assert_eq!(result.reference().start().position(), &Position::new(7));
assert_eq!(result.reference().end().strand(), Strand::Negative);
assert_eq!(result.reference().end().position(), &Position::new(1));
assert_eq!(result.query().contig().as_str(), "seq1");
assert_eq!(result.query().start().strand(), Strand::Negative);
assert_eq!(result.query().start().position(), &Position::new(7));
assert_eq!(result.query().end().strand(), Strand::Negative);
assert_eq!(result.query().end().position(), &Position::new(1));
Ok(())
}
#[test]
pub fn test_valid_positive_to_negative_strand_liftover()
-> Result<(), Box<dyn std::error::Error>> {
let data = b"chain 0 seq0 10 + 0 10 seq1 10 - 0 10 0\n10";
let reader = Reader::new(&data[..]);
let machine = machine::Builder.try_build_from(reader)?;
let from = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Positive, 1u64);
let to = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Positive, 7u64);
let interval = Interval::try_new(from, to)?;
let results = machine.liftover(interval).unwrap();
assert_eq!(results.len(), 1);
assert_eq!(results[0].segments().len(), 1);
let result = &results[0].segments()[0];
assert_eq!(result.reference().contig().as_str(), "seq0");
assert_eq!(result.reference().start().strand(), Strand::Positive);
assert_eq!(result.reference().start().position(), &Position::new(1));
assert_eq!(result.reference().end().strand(), Strand::Positive);
assert_eq!(result.reference().end().position(), &Position::new(7));
assert_eq!(result.query().contig().as_str(), "seq1");
assert_eq!(result.query().start().strand(), Strand::Negative);
assert_eq!(result.query().start().position(), &Position::new(9));
assert_eq!(result.query().end().strand(), Strand::Negative);
assert_eq!(result.query().end().position(), &Position::new(3));
Ok(())
}
#[test]
pub fn test_valid_negative_to_positive_strand_liftover()
-> Result<(), Box<dyn std::error::Error>> {
let data = b"chain 0 seq0 10 - 0 10 seq1 10 + 0 10 0\n10";
let reader = Reader::new(&data[..]);
let machine = machine::Builder.try_build_from(reader)?;
let from = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Negative, 7u64);
let to = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Negative, 1u64);
let interval = Interval::try_new(from, to)?;
let results = machine.liftover(interval).unwrap();
assert_eq!(results.len(), 1);
assert_eq!(results[0].segments().len(), 1);
let result = &results[0].segments()[0];
assert_eq!(result.reference().contig().as_str(), "seq0");
assert_eq!(result.reference().start().strand(), Strand::Negative);
assert_eq!(result.reference().start().position(), &Position::new(7));
assert_eq!(result.reference().end().strand(), Strand::Negative);
assert_eq!(result.reference().end().position(), &Position::new(1));
assert_eq!(result.query().contig().as_str(), "seq1");
assert_eq!(result.query().start().strand(), Strand::Positive);
assert_eq!(result.query().start().position(), &Position::new(3));
assert_eq!(result.query().end().strand(), Strand::Positive);
assert_eq!(result.query().end().position(), &Position::new(9));
Ok(())
}
#[test]
pub fn test_nonexistent_positive_strand_interval_liftover()
-> Result<(), Box<dyn std::error::Error>> {
let data = b"chain 0 seq0 10 - 0 10 seq1 10 - 0 10 0\n10";
let reader = Reader::new(&data[..]);
let machine = machine::Builder.try_build_from(reader)?;
let from = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Positive, 1u64);
let to = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Positive, 7u64);
let interval = Interval::try_new(from, to)?;
let results = machine.liftover(interval);
assert_eq!(results, None);
Ok(())
}
#[test]
pub fn test_nonexistent_negative_strand_interval_liftover()
-> Result<(), Box<dyn std::error::Error>> {
let data = b"chain 0 seq0 10 + 0 10 seq1 10 + 0 10 0\n10";
let reader = Reader::new(&data[..]);
let machine = machine::Builder.try_build_from(reader)?;
let from = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Negative, 7u64);
let to = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Negative, 1u64);
let interval = Interval::try_new(from, to)?;
let results = machine.liftover(interval);
assert_eq!(results, None);
Ok(())
}
#[test]
fn test_grouped_by_chain() -> Result<(), Box<dyn std::error::Error>> {
let data =
b"chain 100 seq0 10 + 0 10 seq1 10 + 0 10 0\n10\n\nchain 50 seq0 10 + 0 10 seq2 10 + 0 10 1\n10";
let reader = Reader::new(&data[..]);
let machine = machine::Builder.try_build_from(reader)?;
let from = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Positive, 1u64);
let to = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Positive, 5u64);
let interval = Interval::try_new(from, to)?;
let results = machine.liftover(interval).unwrap();
assert_eq!(results.len(), 2);
assert_eq!(results[0].chain().score(), 100);
assert_eq!(results[0].chain().id(), 0);
assert_eq!(results[0].segments().len(), 1);
assert_eq!(results[0].segments()[0].query().contig().as_str(), "seq1");
assert_eq!(results[1].chain().score(), 50);
assert_eq!(results[1].chain().id(), 1);
assert_eq!(results[1].segments().len(), 1);
assert_eq!(results[1].segments()[0].query().contig().as_str(), "seq2");
Ok(())
}
#[test]
fn test_gapped_chain_returns_multiple_segments() -> Result<(), Box<dyn std::error::Error>> {
let data = b"chain 0 seq0 10 + 0 10 seq1 10 + 0 10 0\n4\t2\t2\n4";
let reader = Reader::new(&data[..]);
let machine = machine::Builder.try_build_from(reader)?;
let from = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Positive, 0u64);
let to = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Positive, 10u64);
let interval = Interval::try_new(from, to)?;
let results = machine.liftover(interval).unwrap();
assert_eq!(results.len(), 1);
assert_eq!(results[0].segments().len(), 2);
assert_eq!(results[0].segments()[0].reference().count_entities(), 4);
assert_eq!(results[0].segments()[1].reference().count_entities(), 4);
Ok(())
}
#[test]
fn test_liftover_ordering_is_deterministic() -> Result<(), Box<dyn std::error::Error>> {
let data = b"chain 30 seq0 10 + 0 10 seq3 10 + 0 10 2\n10\n\n\
chain 100 seq0 10 + 0 10 seq1 10 + 0 10 0\n10\n\n\
chain 50 seq0 10 + 0 10 seq2 10 + 0 10 1\n10";
let reader = Reader::new(&data[..]);
let machine = machine::Builder.try_build_from(reader)?;
let from = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Positive, 0u64);
let to = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Positive, 10u64);
let interval = Interval::try_new(from, to)?;
for _ in 0..10 {
let results = machine.liftover(interval.clone()).unwrap();
assert_eq!(results.len(), 3);
assert_eq!(results[0].chain().score(), 100);
assert_eq!(results[1].chain().score(), 50);
assert_eq!(results[2].chain().score(), 30);
}
Ok(())
}
#[test]
fn test_segments_ordered_by_reference_position() -> Result<(), Box<dyn std::error::Error>> {
let data = b"chain 0 seq0 20 + 0 20 seq1 20 + 0 20 0\n5\t2\t2\n5\t2\t2\n6";
let reader = Reader::new(&data[..]);
let machine = machine::Builder.try_build_from(reader)?;
let from = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Positive, 0u64);
let to = Coordinate::new(Contig::new_unchecked("seq0"), Strand::Positive, 20u64);
let interval = Interval::try_new(from, to)?;
let results = machine.liftover(interval).unwrap();
assert_eq!(results.len(), 1);
assert_eq!(results[0].segments().len(), 3);
let starts: Vec<_> = results[0]
.segments()
.iter()
.map(|s| s.reference().start().position().get())
.collect();
assert_eq!(starts, vec![0, 7, 14]);
Ok(())
}
}