use std::collections::{BTreeMap, HashMap};
use biocommons_bioutils::assemblies::{Assembly, Sequence, ASSEMBLY_INFOS};
use crate::common::cli::CANONICAL;
#[derive(thiserror::Error, Debug)]
pub enum ContigMapError {
#[error("the sequence name/alias/accession `{0}` is not known")]
UnknownSequence(String),
}
pub struct ContigMap {
pub assembly: Assembly,
pub name_map: HashMap<String, usize>,
}
impl ContigMap {
pub fn new(assembly: Assembly) -> Self {
let mut name_map = HashMap::new();
let info = &ASSEMBLY_INFOS[assembly];
for (idx, seq) in info.sequences.iter().enumerate() {
name_map.insert(seq.name.clone(), idx);
name_map.insert(seq.refseq_ac.clone(), idx);
name_map.insert(seq.genbank_ac.clone(), idx);
for alias in seq.aliases.iter() {
name_map.insert(alias.clone(), idx);
}
}
Self { assembly, name_map }
}
pub fn chrom_to_idx(
&self,
chrom: &noodles_vcf::record::Chromosome,
) -> Result<usize, ContigMapError> {
match chrom {
noodles_vcf::record::Chromosome::Name(s)
| noodles_vcf::record::Chromosome::Symbol(s) => self.chrom_name_to_idx(s),
}
}
pub fn chrom_name_to_idx(&self, chrom: &str) -> Result<usize, ContigMapError> {
self.name_map
.get(chrom)
.copied()
.ok_or_else(|| ContigMapError::UnknownSequence(chrom.to_string()))
}
pub fn chrom_name_to_seq(&self, chrom: &str) -> Result<&Sequence, ContigMapError> {
Ok(&ASSEMBLY_INFOS[self.assembly].sequences[self.chrom_name_to_idx(chrom)?])
}
}
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
struct Key {
chrom: String,
pos: noodles_vcf::record::Position,
reference: String,
alternative: String,
idx: usize,
}
fn build_key(record: &noodles_vcf::Record, i: usize) -> Key {
Key {
chrom: record.chromosome().to_string(),
pos: record.position(),
reference: record.reference_bases().to_string(),
alternative: record
.alternate_bases()
.first()
.expect("must have alternate allele")
.to_string(),
idx: i,
}
}
pub struct MultiQuery<'r, 'h, R>
where
R: std::io::Read + std::io::Seek,
{
queries: Vec<noodles_vcf::reader::Query<'r, 'h, R>>,
records: BTreeMap<Key, noodles_vcf::Record>,
}
impl<'r, 'h, R> MultiQuery<'r, 'h, R>
where
R: std::io::Read + std::io::Seek,
{
pub fn new(
mut record_iters: Vec<noodles_vcf::reader::Query<'r, 'h, R>>,
) -> std::io::Result<Self> {
let mut records = BTreeMap::new();
for (i, iter) in record_iters.iter_mut().enumerate() {
if let Some(result) = iter.next() {
let record = result?;
let key = build_key(&record, i);
records.insert(key, record);
}
}
Ok(Self {
queries: record_iters,
records,
})
}
}
impl<'r, 'h, R> Iterator for MultiQuery<'r, 'h, R>
where
R: std::io::Read + std::io::Seek,
{
type Item = std::io::Result<(usize, noodles_vcf::Record)>;
fn next(&mut self) -> Option<Self::Item> {
let (Key { idx, .. }, record) = self.records.pop_first()?;
if let Some(result) = self.queries[idx].next() {
match result {
Ok(record) => {
let key = build_key(&record, idx);
self.records.insert(key, record);
}
Err(e) => return Some(Err(e)),
}
}
Some(Ok((idx, record)))
}
}
pub fn guess_assembly(
vcf_header: &noodles_vcf::Header,
ambiguous_ok: bool,
initial_assembly: Option<Assembly>,
) -> Result<Assembly, anyhow::Error> {
let mut result = initial_assembly;
let assembly_infos = [
(Assembly::Grch37p10, &ASSEMBLY_INFOS[Assembly::Grch37p10]),
(Assembly::Grch38, &ASSEMBLY_INFOS[Assembly::Grch38]),
];
for (assembly, info) in assembly_infos.iter() {
let contig_map = ContigMap::new(*assembly);
let mut lengths = HashMap::new();
for seq in &info.sequences {
if CANONICAL.contains(&seq.name.as_str()) {
lengths.insert(
contig_map.name_map.get(seq.name.as_str()).unwrap(),
seq.length,
);
}
}
let mut incompatible = 0;
let mut compatible = 0;
for (name, data) in vcf_header.contigs() {
if let Some(length) = data.length() {
let idx = contig_map.name_map.get(name.as_ref());
if let Some(idx) = idx {
let name = &info.sequences[*idx].name;
if CANONICAL.contains(&name.as_ref()) {
if *lengths.get(idx).unwrap() == length {
compatible += 1;
} else {
incompatible += 1;
}
}
}
} else {
tracing::warn!(
"Cannot guess assembly because no length for contig {}",
&name
);
compatible = 0;
break;
}
}
if compatible > 0 && incompatible == 0 {
if let Some(result) = result {
if result != *assembly && !ambiguous_ok {
return Err(anyhow::anyhow!(
"Found ambiguity; initial={:?}, previous={:?}, current={:?}",
initial_assembly,
result,
assembly,
));
}
} else {
result = Some(*assembly);
}
} else {
if let Some(initial_assembly) = initial_assembly {
if initial_assembly == *assembly {
return Err(anyhow::anyhow!(
"Incompatible with initial assembly {:?}",
result.unwrap()
));
}
}
}
}
if let Some(result) = result {
Ok(result)
} else {
Err(anyhow::anyhow!("No matching assembly found"))
}
}
#[cfg(test)]
mod test {
use biocommons_bioutils::assemblies::Assembly;
use super::*;
#[test]
fn guess_assembly_helix_chrmt_ambiguous_ok_initial_none() -> Result<(), anyhow::Error> {
let path = "tests/freqs/reading/helix.chrM.vcf";
let mut reader = noodles_vcf::reader::Builder::default().build_from_path(path)?;
let header = reader.read_header()?;
let actual = guess_assembly(&header, true, None)?;
assert_eq!(actual, Assembly::Grch37p10);
Ok(())
}
#[test]
fn guess_assembly_helix_chrmt_ambiguous_ok_initial_override() -> Result<(), anyhow::Error> {
let path = "tests/freqs/reading/helix.chrM.vcf";
let mut reader = noodles_vcf::reader::Builder::default().build_from_path(path)?;
let header = reader.read_header()?;
let actual = guess_assembly(&header, true, Some(Assembly::Grch37p10))?;
assert_eq!(actual, Assembly::Grch37p10);
Ok(())
}
#[test]
fn guess_assembly_helix_chrmt_ambiguous_ok_initial_override_fails() -> Result<(), anyhow::Error>
{
let path = "tests/freqs/reading/helix.chrM.vcf";
let mut reader = noodles_vcf::reader::Builder::default().build_from_path(path)?;
let header = reader.read_header()?;
assert!(guess_assembly(&header, false, Some(Assembly::Grch37)).is_err());
Ok(())
}
#[test]
fn guess_assembly_helix_chrmt_ambiguous_fail() -> Result<(), anyhow::Error> {
let path = "tests/freqs/reading/helix.chrM.vcf";
let mut reader = noodles_vcf::reader::Builder::default().build_from_path(path)?;
let header = reader.read_header()?;
assert!(guess_assembly(&header, false, None).is_err());
Ok(())
}
#[test]
fn contig_map_smoke() {
ContigMap::new(Assembly::Grch37p10);
ContigMap::new(Assembly::Grch38);
}
#[test]
fn test_multiquery() -> Result<(), anyhow::Error> {
let mut readers = vec![
noodles_vcf::indexed_reader::Builder::default()
.build_from_path("tests/freqs/reading/gnomad.chrM.vcf.bgz")?,
noodles_vcf::indexed_reader::Builder::default()
.build_from_path("tests/freqs/reading/helix.chrM.vcf.bgz")?,
];
let headers: Vec<_> = readers
.iter_mut()
.map(|reader| reader.read_header())
.collect::<Result<_, _>>()?;
let start = noodles_core::position::Position::try_from(1)?;
let stop = noodles_core::position::Position::try_from(16569)?;
let region = noodles_core::region::Region::new("chrM", start..=stop);
let queries: Vec<_> = readers
.iter_mut()
.zip(&headers)
.map(|(reader, header)| reader.query(header, ®ion))
.collect::<Result<_, _>>()?;
let multi_query = MultiQuery::new(queries)?;
let mut records = Vec::new();
for result in multi_query {
records.push(result?);
}
insta::assert_debug_snapshot!(records);
Ok(())
}
}