use biocommons_bioutils::assemblies::{ASSEMBLY_INFOS, Assembly, Sequence};
use indexmap::IndexMap;
#[derive(Debug, Clone)]
pub struct ContigManager {
alias_to_accession: IndexMap<String, String>,
accession_to_info: IndexMap<String, Sequence>,
name_to_chrom_no: IndexMap<String, u32>,
assembly: String,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ContigInfo {
pub name_without_chr: String,
pub name_with_chr: String,
pub accession: String,
pub chrom_no: u32,
}
const CHR_X: u32 = 23;
const CHR_Y: u32 = 24;
const CHR_M: u32 = 25;
impl ContigManager {
pub fn new(assembly_name: &str) -> Self {
let mut alias_to_accession = IndexMap::new();
let mut accession_to_info = IndexMap::new();
let mut name_to_chrom_no = IndexMap::new();
let assembly = match assembly_name.to_lowercase().as_str() {
"grch37" | "grch37p10" => Some(Assembly::Grch37p10),
"grch38" => Some(Assembly::Grch38),
_ => None,
};
if let Some(assembly) = assembly {
for seq in &ASSEMBLY_INFOS[assembly].sequences {
if !["Primary Assembly", "non-nuclear"].contains(&&*seq.assembly_unit)
|| seq.sequence_role != "assembled-molecule"
{
tracing::debug!("Skipping non-primary sequence: {:?}", seq);
continue;
}
accession_to_info.insert(seq.refseq_ac.clone(), seq.clone());
alias_to_accession.insert(seq.name.clone(), seq.refseq_ac.clone());
alias_to_accession.insert(seq.refseq_ac.clone(), seq.refseq_ac.clone());
for alias in &seq.aliases {
alias_to_accession.insert(alias.clone(), seq.refseq_ac.clone());
}
}
} else {
tracing::debug!(
"Unknown assembly '{}', no assembly-derived contig mappings available",
assembly_name
);
}
for i in 1..=22 {
name_to_chrom_no.insert(format!("{}", i), i);
name_to_chrom_no.insert(format!("chr{}", i), i);
}
name_to_chrom_no.insert("X".to_string(), CHR_X);
name_to_chrom_no.insert("chrX".to_string(), CHR_X);
name_to_chrom_no.insert("Y".to_string(), CHR_Y);
name_to_chrom_no.insert("chrY".to_string(), CHR_Y);
name_to_chrom_no.insert("MT".to_string(), CHR_M);
name_to_chrom_no.insert("chrMT".to_string(), CHR_M);
name_to_chrom_no.insert("M".to_string(), CHR_M);
name_to_chrom_no.insert("chrM".to_string(), CHR_M);
let mut additional_aliases = IndexMap::new();
for (accession, info) in &accession_to_info {
let name = &info.name;
if !name.starts_with("chr") {
let chr_name = format!("chr{}", name);
if !alias_to_accession.contains_key(&chr_name) {
additional_aliases.insert(chr_name, accession.clone());
}
}
else if let Some(stripped_name) = name.strip_prefix("chr")
&& !alias_to_accession.contains_key(stripped_name)
{
additional_aliases.insert(stripped_name.to_string(), accession.clone());
}
}
alias_to_accession.extend(additional_aliases);
let mt_acc = alias_to_accession
.get("chrMT")
.or_else(|| alias_to_accession.get("MT"))
.cloned();
if let Some(ref mt_acc) = mt_acc {
alias_to_accession.insert("M".to_string(), mt_acc.clone());
alias_to_accession.insert("chrM".to_string(), mt_acc.clone());
}
Self {
assembly: assembly_name.to_string(),
alias_to_accession,
accession_to_info,
name_to_chrom_no,
}
}
pub fn assembly(&self) -> &str {
&self.assembly
}
#[inline]
pub fn get_accession(&self, alias: &str) -> Option<&String> {
self.alias_to_accession.get(alias)
}
#[inline]
pub(crate) fn get_primary_name(&self, alias: &str) -> Option<&String> {
self.get_accession(alias)
.and_then(|ac| self.accession_to_info.get(ac))
.map(|info| &info.name)
}
#[inline]
pub fn get_chrom_no(&self, alias: &str) -> Option<u32> {
self.get_primary_name(alias)
.and_then(|name| self.name_to_chrom_no.get(name).copied())
.or_else(|| self.name_to_chrom_no.get(alias).copied())
}
#[inline]
pub fn is_autosomal(chrom_no: u32) -> bool {
(1..=22).contains(&chrom_no)
}
#[inline]
pub fn is_gonosomal(chrom_no: u32) -> bool {
chrom_no == CHR_X || chrom_no == CHR_Y
}
#[inline]
pub fn is_chr_x(chrom_no: u32) -> bool {
chrom_no == CHR_X
}
#[inline]
pub fn is_chr_y(chrom_no: u32) -> bool {
chrom_no == CHR_Y
}
#[inline]
pub fn is_mitochondrial(chrom_no: u32) -> bool {
chrom_no == CHR_M
}
pub fn is_canonical(chrom_no: u32) -> bool {
(1..=25).contains(&chrom_no)
}
#[inline]
pub fn is_autosomal_alias(&self, alias: &str) -> bool {
self.get_chrom_no(alias).is_some_and(Self::is_autosomal)
}
#[inline]
pub fn is_gonosomal_alias(&self, alias: &str) -> bool {
self.get_chrom_no(alias).is_some_and(Self::is_gonosomal)
}
#[inline]
pub fn is_chr_x_alias(&self, alias: &str) -> bool {
self.get_chrom_no(alias).is_some_and(Self::is_chr_x)
}
#[inline]
pub fn is_chr_y_alias(&self, alias: &str) -> bool {
self.get_chrom_no(alias).is_some_and(Self::is_chr_y)
}
#[inline]
pub fn is_mitochondrial_alias(&self, alias: &str) -> bool {
self.get_chrom_no(alias).is_some_and(Self::is_mitochondrial)
}
#[inline]
pub fn is_canonical_alias(&self, alias: &str) -> bool {
self.get_chrom_no(alias).is_some_and(Self::is_canonical)
}
pub fn get_contig_info(&self, alias: &str) -> Option<ContigInfo> {
let accession = self.get_accession(alias)?;
let seq_info = self.accession_to_info.get(accession)?;
let chrom_no = self.get_chrom_no(alias)?;
let primary_name = &seq_info.name;
let (name_with_chr, name_without_chr) = if primary_name.starts_with("chr") {
(
primary_name.clone(),
primary_name.strip_prefix("chr").unwrap().to_string(),
)
} else {
(format!("chr{}", primary_name), primary_name.clone())
};
Some(ContigInfo {
name_without_chr,
name_with_chr,
accession: accession.clone(),
chrom_no,
})
}
pub fn sequences(&self) -> impl Iterator<Item = &Sequence> {
self.accession_to_info.values()
}
}