use crate::annotate::cli::{TranscriptPickMode, TranscriptPickType};
use crate::annotate::seqvars::reference::{
InMemoryFastaAccess, ReferenceReader, UnbufferedIndexedFastaAccess,
};
use crate::common::contig::ContigManager;
use crate::db::TranscriptDatabase;
use crate::db::create::models::Reason;
use crate::{
annotate::seqvars::csq::ALT_ALN_METHOD,
pbs::txs::{GeneToTxId, Strand, Transcript, TranscriptTag, TxSeqDatabase},
};
use bio::data_structures::interval_tree::ArrayBackedIntervalTree;
use enumflags2::BitFlags;
use hgvs::{
data::error::Error,
data::{
cdot::json::NCBI_ALN_METHOD,
interface::{
Provider as ProviderInterface, TxExonsRecord, TxForRegionRecord, TxIdentityInfo,
TxInfoRecord, TxMappingOptionsRecord,
},
},
sequences::{TranslationTable, seq_md5},
};
use indexmap::IndexMap;
use itertools::Itertools;
use rustc_hash::FxHashMap;
use std::path::Path;
use std::sync::Arc;
type IntervalTree = ArrayBackedIntervalTree<i32, u32>;
pub struct TxIntervalTrees {
pub contig_to_idx: FxHashMap<String, usize>,
pub trees: Vec<IntervalTree>,
}
impl TxIntervalTrees {
pub fn new(db: &TxSeqDatabase) -> Self {
let (contig_to_idx, trees) = Self::build_indices(db);
Self {
contig_to_idx,
trees,
}
}
fn build_indices(db: &TxSeqDatabase) -> (FxHashMap<String, usize>, Vec<IntervalTree>) {
let assembly_name = db.assembly();
let mut contig_to_idx = FxHashMap::default();
let mut trees: Vec<IntervalTree> = Vec::new();
let tx_db = db.tx_db.as_ref().expect("no tx_db?");
let mut txs_counted = 0;
for (tx_id, tx) in tx_db.transcripts.iter().enumerate() {
for genome_alignment in &tx.genome_alignments {
if genome_alignment
.genome_build
.eq_ignore_ascii_case(&assembly_name)
{
let contig = &genome_alignment.contig;
let contig_idx = *contig_to_idx.entry(contig.clone()).or_insert_with(|| {
let idx = trees.len();
trees.push(IntervalTree::new());
idx
});
let mut start = i32::MAX;
let mut stop = i32::MIN;
for exon in &genome_alignment.exons {
start = std::cmp::min(start, exon.alt_start_i);
stop = std::cmp::max(stop, exon.alt_end_i);
}
if start <= stop {
trees[contig_idx].insert(start..stop, tx_id as u32);
}
}
}
txs_counted += 1;
}
tracing::debug!(
"Indexed {} transcripts across {} contigs for assembly {}",
txs_counted,
trees.len(),
assembly_name
);
trees.iter_mut().for_each(|t| t.index());
(contig_to_idx, trees)
}
pub fn get_tx_for_region(
&self,
tx_seq_db: &TxSeqDatabase,
alt_ac: &str,
_alt_aln_method: &str,
start_i: i32,
end_i: i32,
) -> Result<Vec<TxForRegionRecord>, Error> {
let contig_idx = match self.contig_to_idx.get(alt_ac) {
Some(idx) => *idx,
None => return Ok(Vec::new()),
};
let query = start_i..end_i;
let tx_idxs = self.trees[contig_idx].find(query);
Ok(tx_idxs
.iter()
.map(|entry| {
let tx = &tx_seq_db.tx_db.as_ref().expect("no tx_db?").transcripts
[*entry.data() as usize];
assert_eq!(
tx.genome_alignments.len(),
1,
"Can only have one alignment in Mehari"
);
let alt_strand = tx.genome_alignments.first().unwrap().strand;
TxForRegionRecord {
tx_ac: tx.id.clone(),
alt_ac: alt_ac.to_string(),
alt_strand: match Strand::try_from(alt_strand).expect("invalid strand") {
Strand::Plus => 1,
Strand::Minus => -1,
_ => unreachable!("invalid strand {}", alt_strand),
},
alt_aln_method: ALT_ALN_METHOD.to_string(),
start_i,
end_i,
}
})
.collect())
}
}
pub trait PbsTranscriptExt {
fn is_clean(&self) -> bool;
fn is_incomplete_3p(&self) -> bool;
fn available_cds_len(&self, tx_len: i32) -> Option<i32>;
}
impl PbsTranscriptExt for Transcript {
fn is_clean(&self) -> bool {
self.filter_reason.unwrap_or(0) == 0
}
fn is_incomplete_3p(&self) -> bool {
let flags = BitFlags::<Reason>::from_bits_truncate(self.filter_reason.unwrap_or(0));
flags.intersects(Reason::MissingStopCodon | Reason::ThreePrimeEndTruncated)
}
fn available_cds_len(&self, tx_len: i32) -> Option<i32> {
self.start_codon
.map(|start| self.stop_codon.unwrap_or(tx_len) - start)
}
}
#[derive(Debug, Clone, Default, derive_builder::Builder)]
#[builder(pattern = "immutable")]
pub struct Config {
#[builder(default)]
pub pick_transcript: Vec<TranscriptPickType>,
#[builder(default)]
pub pick_transcript_mode: TranscriptPickMode,
}
pub struct Provider {
pub tx_seq_db: TxSeqDatabase,
pub tx_trees: TxIntervalTrees,
pub contig_manager: Arc<ContigManager>,
gene_map: FxHashMap<String, u32>,
tx_map: FxHashMap<String, u32>,
seq_map: FxHashMap<String, u32>,
assembly_map: IndexMap<String, String>,
contig_alias_map: FxHashMap<String, String>,
picked_gene_to_tx_id: Option<Vec<GeneToTxId>>,
reference_reader: Option<ReferenceReaderImpl>,
data_version: String,
schema_version: String,
}
enum ReferenceReaderImpl {
InMemory(InMemoryFastaAccess),
Unbuffered(UnbufferedIndexedFastaAccess),
}
impl ReferenceReader for ReferenceReaderImpl {
fn get(
&self,
ac: &str,
start: Option<u64>,
end: Option<u64>,
) -> anyhow::Result<Option<Vec<u8>>> {
match self {
ReferenceReaderImpl::InMemory(v) => v.get(ac, start, end),
ReferenceReaderImpl::Unbuffered(v) => v.get(ac, start, end),
}
}
}
fn transcript_length(tx: &Transcript) -> i32 {
let mut max_tx_length = 0;
for genome_alignment in tx.genome_alignments.iter() {
let mut tx_length = 0;
for exon_alignment in genome_alignment.exons.iter() {
tx_length += exon_alignment.alt_cds_end_i() - exon_alignment.alt_cds_start_i();
}
if tx_length > max_tx_length {
max_tx_length = tx_length;
}
}
max_tx_length
}
impl Provider {
pub fn new(
mut tx_seq_db: TxSeqDatabase,
reference: Option<impl AsRef<Path>>,
in_memory_reference: bool,
config: Config,
) -> Self {
let assembly_name = tx_seq_db.assembly().clone();
let contig_manager = Arc::new(ContigManager::new(&assembly_name));
let mut assembly_map = IndexMap::new();
if let Some(tx_db) = &tx_seq_db.tx_db {
for tx in &tx_db.transcripts {
for aln in &tx.genome_alignments {
if aln.genome_build.eq_ignore_ascii_case(&assembly_name)
&& !assembly_map.contains_key(&aln.contig)
{
let common_name = contig_manager
.get_primary_name(&aln.contig)
.cloned()
.unwrap_or_else(|| aln.contig.clone());
assembly_map.insert(aln.contig.clone(), common_name);
}
}
}
}
let tx_trees = TxIntervalTrees::new(&tx_seq_db);
let mut contig_alias_map = FxHashMap::default();
if let Some(tx_db) = &tx_seq_db.tx_db {
for tx in &tx_db.transcripts {
for aln in &tx.genome_alignments {
let db_contig = aln.contig.clone();
contig_alias_map.insert(db_contig.clone(), db_contig.clone());
if let Some(primary) = contig_manager.get_primary_name(&db_contig) {
contig_alias_map.insert(primary.to_string(), db_contig.clone());
}
if let Some(acc) = contig_manager.get_accession(&db_contig) {
contig_alias_map.insert(acc.to_string(), db_contig.clone());
}
if let Some(info) = contig_manager.get_contig_info(&db_contig) {
contig_alias_map.insert(info.name_with_chr.clone(), db_contig.clone());
contig_alias_map.insert(info.name_without_chr.clone(), db_contig.clone());
}
}
}
}
let gene_map = FxHashMap::from_iter(
tx_seq_db
.tx_db
.as_ref()
.expect("no tx_db?")
.gene_to_tx
.iter()
.enumerate()
.map(|(idx, entry)| (entry.gene_id.clone(), idx as u32)),
);
let tx_map = FxHashMap::from_iter(
tx_seq_db
.tx_db
.as_ref()
.expect("no tx_db?")
.transcripts
.iter()
.enumerate()
.map(|(idx, tx)| (tx.id.clone(), idx as u32)),
);
let seq_map = FxHashMap::from_iter(
tx_seq_db
.seq_db
.as_ref()
.expect("no seq_db?")
.aliases
.iter()
.zip(
tx_seq_db
.seq_db
.as_ref()
.expect("no seq_db?")
.aliases_idx
.iter(),
)
.map(|(alias, idx)| (alias.clone(), *idx)),
);
let picked_gene_to_tx_id = Self::picked_genes_to_tx_map(&mut tx_seq_db, &config, &tx_map);
let data_version = format!(
"{}{}{:?}",
tx_seq_db.version.as_ref().unwrap_or(&"".to_string()),
tx_seq_db
.source_version
.iter()
.map(|v| format!("{:#?}", v))
.join(","),
config
);
let schema_version = data_version.clone();
let reference_reader = match (reference, in_memory_reference) {
(Some(path), false) => Some(ReferenceReaderImpl::Unbuffered(
UnbufferedIndexedFastaAccess::from_path(path, contig_manager.clone())
.expect("Failed to open reference FASTA file"),
)),
(Some(path), true) => Some(ReferenceReaderImpl::InMemory(
InMemoryFastaAccess::from_path(path, contig_manager.clone())
.expect("Failed to open reference FASTA file"),
)),
(None, _) => None,
};
Self {
tx_seq_db,
tx_trees,
contig_manager,
contig_alias_map,
gene_map,
tx_map,
seq_map,
assembly_map,
picked_gene_to_tx_id,
reference_reader,
data_version,
schema_version,
}
}
fn picked_genes_to_tx_map(
tx_seq_db: &mut TxSeqDatabase,
config: &Config,
tx_map: &FxHashMap<String, u32>,
) -> Option<Vec<GeneToTxId>> {
if config.pick_transcript.is_empty() || tx_seq_db.tx_db.is_none() {
return None;
}
let tx_db = tx_seq_db.tx_db.as_mut().unwrap();
let mut new_gene_to_tx = Vec::new();
fn tag_to_picktype(tag: &i32) -> Option<TranscriptPickType> {
match TranscriptTag::try_from(*tag).unwrap() {
TranscriptTag::Unknown | TranscriptTag::Other | TranscriptTag::OtherBackport => {
None
}
TranscriptTag::Basic => Some(TranscriptPickType::Basic),
TranscriptTag::EnsemblCanonical => Some(TranscriptPickType::EnsemblCanonical),
TranscriptTag::ManeSelect => Some(TranscriptPickType::ManeSelect),
TranscriptTag::ManePlusClinical => Some(TranscriptPickType::ManePlusClinical),
TranscriptTag::RefSeqSelect => Some(TranscriptPickType::RefSeqSelect),
TranscriptTag::Selenoprotein => None,
TranscriptTag::GencodePrimary => Some(TranscriptPickType::GencodePrimary),
TranscriptTag::EnsemblGraft => None,
TranscriptTag::BasicBackport => Some(TranscriptPickType::BasicBackport),
TranscriptTag::EnsemblCanonicalBackport => {
Some(TranscriptPickType::EnsemblCanonicalBackport)
}
TranscriptTag::ManeSelectBackport => Some(TranscriptPickType::ManeSelectBackport),
TranscriptTag::ManePlusClinicalBackport => {
Some(TranscriptPickType::ManePlusClinicalBackport)
}
TranscriptTag::RefSeqSelectBackport => {
Some(TranscriptPickType::RefSeqSelectBackport)
}
TranscriptTag::SelenoproteinBackport => None,
TranscriptTag::GencodePrimaryBackport => {
Some(TranscriptPickType::GencodePrimaryBackport)
}
}
}
let transcript_id_to_source = |tx_id: &str| -> String {
if tx_id.starts_with("ENST") {
"Ensembl".to_string()
} else if tx_id.starts_with('N') || tx_id.starts_with('X') {
"RefSeq".to_string()
} else {
"Other".to_string()
}
};
for entry in tx_db.gene_to_tx.iter() {
let mut longest_tx_per_source: FxHashMap<String, (bool, usize, i32)> =
FxHashMap::default();
let mut tx_tags = entry
.tx_ids
.iter()
.filter_map(|tx_id| {
tx_map.get(tx_id).map(|tx_idx| {
let tx = &tx_db.transcripts[*tx_idx as usize];
let tags = tx.tags.iter().filter_map(tag_to_picktype).collect_vec();
let length = transcript_length(tx);
let source = transcript_id_to_source(tx_id);
let is_clean = tx.is_clean();
(tx_id, tags, length, source, is_clean)
})
})
.enumerate()
.map(|(i, (tx_id, tags, length, source, is_clean))| {
longest_tx_per_source
.entry(source)
.and_modify(|(prev_clean, prev_i, prev_length)| {
if (is_clean, length) > (*prev_clean, *prev_length) {
*prev_clean = is_clean;
*prev_i = i;
*prev_length = length;
}
})
.or_insert((is_clean, i, length));
(tx_id, tags, length)
})
.collect_vec();
for (_, (_, i, _)) in longest_tx_per_source.iter() {
tx_tags[*i].1.push(TranscriptPickType::Length);
}
let tx_ids = match config.pick_transcript_mode {
TranscriptPickMode::First => {
let tx_id = config
.pick_transcript
.iter()
.filter_map(|pick| {
tx_tags
.iter()
.find(|(_, tags, _)| tags.contains(pick))
.map(|(tx_id, _, _)| tx_id)
})
.next();
if let Some(tx_id) = tx_id {
vec![tx_id.to_string()]
} else {
vec![]
}
}
TranscriptPickMode::All => {
tx_tags
.iter()
.filter_map(|(tx_id, tags, _)| {
tags.iter()
.any(|tag| config.pick_transcript.contains(tag))
.then_some(tx_id.to_string())
})
.collect()
}
};
let new_entry = if !tx_ids.is_empty() {
GeneToTxId {
gene_id: entry.gene_id.clone(),
tx_ids,
filtered: Some(false),
filter_reason: None,
}
} else {
tracing::trace!(
"no transcript found for gene {} with the chosen transcript picking strategy: {:?}",
&entry.gene_id,
&config.pick_transcript
);
GeneToTxId {
gene_id: entry.gene_id.clone(),
tx_ids: vec![],
filtered: Some(true),
filter_reason: Some(Reason::NoTranscriptLeft as u32),
}
};
tracing::trace!(
"picked transcripts {:?} for gene {}",
new_entry.tx_ids,
new_entry.gene_id
);
new_gene_to_tx.push(new_entry);
}
Some(new_gene_to_tx)
}
pub fn assembly(&self) -> String {
self.tx_seq_db.assembly().clone()
}
pub fn transcript_picking(&self) -> bool {
self.picked_gene_to_tx_id.is_some()
}
pub fn get_picked_transcripts(&self, hgnc_id: &str) -> Option<Vec<String>> {
self.gene_map.get(hgnc_id).and_then(|gene_idx| {
let gene_to_tx = if let Some(picked_gene_to_tx_id) = self.picked_gene_to_tx_id.as_ref()
{
picked_gene_to_tx_id
} else {
&self.tx_seq_db.tx_db.as_ref().expect("no tx_db?").gene_to_tx
};
let gene = &gene_to_tx[*gene_idx as usize];
if let Some(true) = gene.filtered {
None
} else {
Some(gene.tx_ids.clone())
}
})
}
pub fn get_tx(&self, tx_id: &str) -> Option<&Transcript> {
self.tx_map.get(tx_id).and_then(|idx| {
let tx = &self
.tx_seq_db
.tx_db
.as_ref()
.expect("no tx_db?")
.transcripts[*idx as usize];
if let Some(true) = tx.filtered {
None
} else {
Some(tx)
}
})
}
pub fn reference_available(&self) -> bool {
self.reference_reader.is_some()
}
}
impl ProviderInterface for Provider {
fn data_version(&self) -> &str {
&self.data_version
}
fn schema_version(&self) -> &str {
&self.schema_version
}
fn get_assembly_map(
&self,
_assembly: &str,
) -> Result<IndexMap<String, String>, hgvs::data::error::Error> {
Ok(self.assembly_map.clone())
}
fn get_gene_info(&self, _hgnc: &str) -> Result<hgvs::data::interface::GeneInfoRecord, Error> {
panic!("not implemented");
}
fn get_pro_ac_for_tx_ac(&self, tx_ac: &str) -> Result<Option<String>, Error> {
let tx_idx = *self
.tx_map
.get(tx_ac)
.ok_or(Error::NoTranscriptFound(tx_ac.to_string()))?;
let tx_idx = tx_idx as usize;
let tx = &self
.tx_seq_db
.tx_db
.as_ref()
.expect("no tx_db?")
.transcripts[tx_idx];
Ok(tx.protein.clone())
}
fn get_seq_part(
&self,
ac: &str,
begin: Option<usize>,
end: Option<usize>,
) -> Result<String, Error> {
let is_contig = self.contig_alias_map.contains_key(ac);
let seq = if is_contig && self.reference_available() {
let reader = self.reference_reader.as_ref().unwrap();
let seq = reader
.get(ac, begin.map(|x| x as u64), end.map(|x| x as u64))
.map_err(|e| {
tracing::error!("Failed to fetch sequence for {}: {}", ac, e);
Error::NoSequenceRecord(format!("{} - {}", ac, e))
})?
.ok_or_else(|| Error::NoSequenceRecord(ac.to_string()))?;
return String::from_utf8(seq).map_err(|_| {
Error::NoSequenceRecord("Failed converting seq to UTF-8.".to_string())
});
} else {
let seq_idx = *self
.seq_map
.get(ac)
.ok_or_else(|| Error::NoSequenceRecord(ac.to_string()))?;
let seq_idx = seq_idx as usize;
self.tx_seq_db.seq_db.as_ref().expect("no seq_db?").seqs[seq_idx].as_bytes()
};
let slice = match (begin, end) {
(Some(begin), Some(end)) => {
let begin = std::cmp::min(begin, seq.len());
let end = std::cmp::min(end, seq.len());
&seq[begin..end]
}
(Some(begin), None) => {
let begin = std::cmp::min(begin, seq.len());
&seq[begin..]
}
(None, Some(end)) => &seq[..end],
(None, None) => seq,
};
Ok(String::from_utf8_lossy(slice).to_string())
}
fn get_acs_for_protein_seq(&self, seq: &str) -> Result<Vec<String>, Error> {
Ok(vec![format!("MD5_{}", seq_md5(seq, true)?)])
}
fn get_similar_transcripts(
&self,
_tx_ac: &str,
) -> Result<Vec<hgvs::data::interface::TxSimilarityRecord>, Error> {
panic!("not implemented");
}
fn get_tx_exons(
&self,
tx_ac: &str,
alt_ac: &str,
_alt_aln_method: &str,
) -> Result<Vec<TxExonsRecord>, Error> {
let db_contig = self
.contig_alias_map
.get(alt_ac)
.map(String::as_str)
.unwrap_or(alt_ac);
let tx_idx = *self
.tx_map
.get(tx_ac)
.ok_or(Error::NoTranscriptFound(tx_ac.to_string()))?;
let tx_idx = tx_idx as usize;
let tx = &self
.tx_seq_db
.tx_db
.as_ref()
.expect("no tx_db?")
.transcripts[tx_idx];
let hgnc = tx.gene_id.clone();
let tx_ac_str = tx_ac.to_string();
let alt_ac_str = db_contig.to_string();
let alt_aln_method_str = ALT_ALN_METHOD.to_string();
for genome_alignment in &tx.genome_alignments {
if genome_alignment.contig == db_contig {
let alt_strand =
match Strand::try_from(genome_alignment.strand).expect("invalid strand") {
Strand::Plus => 1,
Strand::Minus => -1,
_ => unreachable!("invalid strand {}", &genome_alignment.strand),
};
let mut exons = Vec::with_capacity(genome_alignment.exons.len());
for exon in &genome_alignment.exons {
exons.push(TxExonsRecord {
hgnc: hgnc.clone(),
tx_ac: tx_ac_str.clone(),
alt_ac: alt_ac_str.clone(),
alt_aln_method: alt_aln_method_str.clone(),
alt_strand,
ord: exon.ord,
tx_start_i: exon.alt_cds_start_i.map(|val| val - 1).unwrap_or(-1),
tx_end_i: exon.alt_cds_end_i.unwrap_or(-1),
alt_start_i: exon.alt_start_i,
alt_end_i: exon.alt_end_i,
cigar: exon.cigar.clone(),
tx_aseq: None,
alt_aseq: None,
tx_exon_set_id: i32::MAX,
alt_exon_set_id: i32::MAX,
tx_exon_id: i32::MAX,
alt_exon_id: i32::MAX,
exon_aln_id: i32::MAX,
});
}
exons.sort_unstable_by_key(|e| e.alt_start_i);
return Ok(exons);
}
}
Err(Error::NoAlignmentFound(
tx_ac.to_string(),
alt_ac.to_string(),
))
}
fn get_tx_exon_coords(
&self,
tx_ac: &str,
alt_ac: &str,
_alt_aln_method: &str,
) -> Result<Vec<(i32, i32)>, Error> {
let db_contig = self
.contig_alias_map
.get(alt_ac)
.map(String::as_str)
.unwrap_or(alt_ac);
let tx_idx = *self
.tx_map
.get(tx_ac)
.ok_or(Error::NoTranscriptFound(tx_ac.to_string()))?;
let tx = &self
.tx_seq_db
.tx_db
.as_ref()
.expect("no tx_db?")
.transcripts[tx_idx as usize];
for genome_alignment in &tx.genome_alignments {
if genome_alignment.contig == db_contig {
let mut coords = Vec::with_capacity(genome_alignment.exons.len());
for exon in &genome_alignment.exons {
let tx_start = exon.alt_cds_start_i.map(|val| val - 1).unwrap_or(-1);
let tx_end = exon.alt_cds_end_i.unwrap_or(-1);
coords.push((exon.alt_start_i, tx_start, tx_end));
}
coords.sort_unstable_by_key(|c| c.0);
return Ok(coords.into_iter().map(|(_, s, e)| (s, e)).collect());
}
}
Err(Error::NoAlignmentFound(
tx_ac.to_string(),
db_contig.to_string(),
))
}
fn get_cds_start_end(
&self,
tx_ac: &str,
alt_ac: &str,
_alt_aln_method: &str,
) -> Result<(Option<i32>, Option<i32>), Error> {
let db_contig = self
.contig_alias_map
.get(alt_ac)
.map(String::as_str)
.unwrap_or(alt_ac);
let tx_idx = *self
.tx_map
.get(tx_ac)
.ok_or(Error::NoTranscriptFound(tx_ac.to_string()))?;
let tx = &self
.tx_seq_db
.tx_db
.as_ref()
.expect("no tx_db?")
.transcripts[tx_idx as usize];
for genome_alignment in &tx.genome_alignments {
if genome_alignment.contig == db_contig {
return Ok((tx.start_codon, tx.stop_codon));
}
}
Err(Error::NoAlignmentFound(
tx_ac.to_string(),
db_contig.to_string(),
))
}
fn get_tx_for_gene(&self, gene: &str) -> Result<Vec<TxInfoRecord>, Error> {
let tx_acs = if let Some(tx_acs) = self.get_picked_transcripts(gene) {
tx_acs
} else {
tracing::warn!("no transcripts found for gene: {}", gene);
return Ok(Vec::default());
};
tx_acs
.iter()
.filter_map(|tx_ac| -> Option<Result<TxInfoRecord, Error>> {
if let Some(tx_idx) = self.tx_map.get(tx_ac) {
let tx_idx = *tx_idx as usize;
let tx = &self
.tx_seq_db
.tx_db
.as_ref()
.expect("no tx_db?")
.transcripts[tx_idx];
if let Some(genome_alignment) = tx.genome_alignments.first() {
Some(Ok(TxInfoRecord {
hgnc: tx.gene_id.clone(),
cds_start_i: tx.start_codon,
cds_end_i: tx.stop_codon,
tx_ac: tx.id.clone(),
alt_ac: genome_alignment.contig.to_string(),
alt_aln_method: "splign".into(),
}))
} else {
Some(Err(Error::NoAlignmentFound(
tx_ac.to_string(),
format!("{:?}", self.assembly()),
)))
}
} else {
tracing::warn!("transcript ID not found {} for gene {}", tx_ac, gene);
None
}
})
.collect::<Result<Vec<_>, _>>()
}
fn get_tx_for_region(
&self,
alt_ac: &str,
_alt_aln_method: &str,
start_i: i32,
end_i: i32,
) -> Result<Vec<TxForRegionRecord>, Error> {
let db_contig = self
.contig_alias_map
.get(alt_ac)
.map(String::as_str)
.unwrap_or(alt_ac);
let contig_idx = match self.tx_trees.contig_to_idx.get(db_contig) {
Some(idx) => *idx,
None => return Ok(Vec::new()),
};
let query = start_i..end_i;
let tx_idxs = self.tx_trees.trees[contig_idx].find(query);
Ok(tx_idxs
.iter()
.map(|entry| {
let tx = &self
.tx_seq_db
.tx_db
.as_ref()
.expect("no tx_db?")
.transcripts[*entry.data() as usize];
assert_eq!(
tx.genome_alignments.len(),
1,
"Can only have one alignment in Mehari"
);
let alt_strand = tx.genome_alignments.first().unwrap().strand;
TxForRegionRecord {
tx_ac: tx.id.clone(),
alt_ac: alt_ac.to_string(),
alt_strand: match Strand::try_from(alt_strand).expect("invalid strand") {
Strand::Plus => 1,
Strand::Minus => -1,
_ => unreachable!("invalid strand {}", alt_strand),
},
alt_aln_method: ALT_ALN_METHOD.to_string(),
start_i,
end_i,
}
})
.collect())
}
fn get_tx_identity_info(&self, tx_ac: &str) -> Result<TxIdentityInfo, Error> {
let tx_idx = *self
.tx_map
.get(tx_ac)
.ok_or(Error::NoTranscriptFound(tx_ac.to_string()))?;
let tx_idx = tx_idx as usize;
let tx = &self
.tx_seq_db
.tx_db
.as_ref()
.expect("no tx_db?")
.transcripts[tx_idx];
let is_selenoprotein = tx.tags.contains(&(TranscriptTag::Selenoprotein as i32));
let hgnc = tx.gene_id.clone();
let mut tmp = tx
.genome_alignments
.first()
.unwrap()
.exons
.iter()
.map(|exon| {
(
exon.ord,
exon.alt_cds_end_i
.map(|alt_cds_end_i| alt_cds_end_i + 1 - exon.alt_cds_start_i.unwrap())
.unwrap_or_default(),
)
})
.collect::<Vec<(i32, i32)>>();
tmp.sort();
let is_mitochondrial = self
.contig_manager
.is_mitochondrial_alias(tx.genome_alignments.first().unwrap().contig.as_str());
let lengths = tmp.into_iter().map(|(_, length)| length).collect();
Ok(TxIdentityInfo {
tx_ac: tx_ac.to_string(),
alt_ac: tx_ac.to_string(), alt_aln_method: String::from("transcript"),
cds_start_i: tx.start_codon,
cds_end_i: tx.stop_codon,
lengths,
hgnc,
translation_table: if is_mitochondrial {
TranslationTable::VertebrateMitochondrial
} else if is_selenoprotein {
TranslationTable::Selenocysteine
} else {
TranslationTable::Standard
},
})
}
fn get_tx_info(
&self,
tx_ac: &str,
alt_ac: &str,
_alt_aln_method: &str,
) -> Result<TxInfoRecord, Error> {
let db_contig = self
.contig_alias_map
.get(alt_ac)
.map(String::as_str)
.unwrap_or(alt_ac);
let tx_idx = *self
.tx_map
.get(tx_ac)
.ok_or(Error::NoTranscriptFound(tx_ac.to_string()))?;
let tx_idx = tx_idx as usize;
let tx = &self
.tx_seq_db
.tx_db
.as_ref()
.expect("no tx_db?")
.transcripts[tx_idx];
for genome_alignment in &tx.genome_alignments {
if genome_alignment.contig == db_contig {
return Ok(TxInfoRecord {
hgnc: tx.gene_id.clone(),
cds_start_i: tx.start_codon,
cds_end_i: tx.stop_codon,
tx_ac: tx.id.clone(),
alt_ac: alt_ac.to_string(),
alt_aln_method: ALT_ALN_METHOD.to_string(),
});
}
}
Err(Error::NoAlignmentFound(
tx_ac.to_string(),
alt_ac.to_string(),
))
}
fn get_tx_mapping_options(&self, tx_ac: &str) -> Result<Vec<TxMappingOptionsRecord>, Error> {
let tx = self
.get_tx(tx_ac)
.ok_or_else(|| Error::NoTranscriptFound(tx_ac.to_string()))?;
let current_build = self.assembly();
let options = tx
.genome_alignments
.iter()
.filter(|aln| aln.genome_build.eq_ignore_ascii_case(¤t_build))
.map(|aln| TxMappingOptionsRecord {
tx_ac: tx_ac.to_string(),
alt_ac: aln.contig.clone(),
alt_aln_method: NCBI_ALN_METHOD.to_string(),
})
.collect::<Vec<_>>();
if options.is_empty() {
return Err(Error::NoAlignmentFound(tx_ac.to_string(), current_build));
}
Ok(options)
}
}
#[cfg(test)]
mod test {
#[test]
fn test_sync() {
fn is_sync<T: Sync>() {}
is_sync::<super::Provider>();
}
}