use std::collections::HashMap;
use std::collections::hash_map::Entry;
use std::io::BufRead;
use omics::coordinate::Contig;
use omics::coordinate::Strand;
use omics::coordinate::position::Number;
use rust_lapper as lapper;
use thiserror::Error;
use super::AnnotatedPair;
use super::ChromosomeDictionary;
use super::Machine;
use crate::alignment;
use crate::alignment::section::header::Record as Header;
use crate::liftover;
use crate::reader;
type Iv = lapper::Interval<Number, AnnotatedPair>;
#[derive(Debug, Error)]
pub enum Error {
#[error("invalid data section: {0}")]
InvalidSections(alignment::section::sections::Error),
#[error("stepthrough error: {0}")]
StepthroughError(liftover::stepthrough::Error),
#[error(
"duplicate chain id `{id}` with inconsistent headers: expected `{expected}`, found \
`{found}`"
)]
DuplicateChainId {
id: usize,
expected: Header,
found: Header,
},
}
type Result<T> = std::result::Result<T, Error>;
#[allow(missing_debug_implementations)]
pub struct Builder;
impl Builder {
#[allow(clippy::result_large_err)]
pub fn try_build_from<T>(&self, mut reader: reader::Reader<T>) -> Result<Machine>
where
T: BufRead,
{
let mut hm = HashMap::<Contig, Vec<Iv>>::default();
let mut chains = HashMap::<usize, Header>::new();
let mut reference_chromosomes = ChromosomeDictionaryBuilder::default();
let mut query_chromosomes = ChromosomeDictionaryBuilder::default();
for result in reader.sections() {
let section = result.map_err(Error::InvalidSections)?;
let header = section.header().clone();
match chains.entry(header.id()) {
Entry::Occupied(e) => {
if *e.get() != header {
return Err(Error::DuplicateChainId {
id: header.id(),
expected: e.get().clone(),
found: header,
});
}
}
Entry::Vacant(e) => {
e.insert(header.clone());
}
}
query_chromosomes.update(
header.query_sequence().chromosome_name().to_string(),
header.query_sequence().chromosome_size(),
);
reference_chromosomes.update(
header.reference_sequence().chromosome_name().to_string(),
header.reference_sequence().chromosome_size(),
);
let contig = Contig::new_unchecked(header.reference_sequence().chromosome_name());
let entry = hm.entry(contig).or_default();
for pair_result in section.stepthrough().map_err(Error::StepthroughError)? {
let pair = pair_result.map_err(Error::StepthroughError)?;
let (start, stop) = match pair.reference().strand() {
Strand::Positive => {
let start = pair.reference().start().position().get();
let end = pair.reference().end().position().get();
(start, end)
}
Strand::Negative => {
let start = pair.reference().end().position().get();
let end = pair.reference().start().position().get();
(start, end)
}
};
entry.push(lapper::Interval {
start,
stop,
val: AnnotatedPair {
chain_id: header.id(),
pair,
},
})
}
}
let mut inner = HashMap::<Contig, lapper::Lapper<Number, AnnotatedPair>>::new();
for (k, v) in hm.into_iter() {
inner.insert(k, lapper::Lapper::new(v));
}
let reference_chromosomes = reference_chromosomes.build();
let query_chromosomes = query_chromosomes.build();
Ok(Machine {
inner,
chains,
reference_chromosomes,
query_chromosomes,
})
}
}
impl Default for Builder {
fn default() -> Self {
Self
}
}
#[derive(Default)]
struct ChromosomeDictionaryBuilder(ChromosomeDictionary);
impl ChromosomeDictionaryBuilder {
fn update(&mut self, contig: String, size: Number) {
match self.0.get(&contig) {
Some(existing) => {
assert!(
*existing == size,
"the current size conflicts with the previously set size"
);
}
None => {
self.0.insert(contig, size);
}
}
}
fn build(self) -> ChromosomeDictionary {
self.0
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::Reader;
#[test]
fn duplicate_chain_id_with_inconsistent_headers() {
let data =
b"chain 100 seq0 10 + 0 10 seq1 10 + 0 10 0\n10\n\nchain 50 seq0 10 + 0 10 seq2 10 + 0 10 0\n10";
let reader = Reader::new(&data[..]);
let err = Builder.try_build_from(reader).unwrap_err();
assert!(
matches!(err, Error::DuplicateChainId { id: 0, .. }),
"expected DuplicateChainId error, got: {err}"
);
}
#[test]
fn duplicate_chain_id_with_consistent_headers_succeeds() {
let data =
b"chain 100 seq0 10 + 0 10 seq1 10 + 0 10 0\n10\n\nchain 100 seq0 10 + 0 10 seq1 10 + 0 10 0\n10";
let reader = Reader::new(&data[..]);
assert!(Builder.try_build_from(reader).is_ok());
}
}