use crate::db::create::models::{
Fix, GeneId, Identifier, Reason, TranscriptExt, TranscriptId, TranscriptLoader,
};
use crate::db::create::reference::SequenceProvider;
use anyhow::Error;
use derive_new::new;
use enumflags2::{BitFlag, BitFlags};
use hgvs::data::cdot::json::models::{BioType, Gene, Tag, Transcript};
use hgvs::sequences::{TranslationTable, translate_cds};
use itertools::Itertools;
use once_cell::sync::Lazy;
use rayon::iter::Either;
use rayon::prelude::{IntoParallelRefIterator, ParallelIterator};
use seqrepo::AliasOrSeqId;
use std::cmp::Reverse;
use std::collections::{HashMap, HashSet};
use std::time::Instant;
pub(crate) fn filter_initial_gene_id_entries(loader: &mut TranscriptLoader) -> Result<(), Error> {
if loader.disable_filters {
return Ok(());
}
tracing::info!("Filtering Gene Id entries …");
for (gene_id, tx_ids) in &loader.gene_id_to_transcript_ids {
if tx_ids.is_empty() {
*loader
.discards
.entry(Identifier::Gene(gene_id.clone()))
.or_default() |= Reason::NoTranscripts;
}
if let Some(gene) = loader.gene_id_to_gene.get(gene_id) {
if gene
.biotype
.as_ref()
.is_some_and(|bt| bt.iter().any(|b| DISCARD_BIOTYPES_GENES.contains(b)))
{
*loader
.discards
.entry(Identifier::Gene(gene_id.clone()))
.or_default() |= Reason::Biotype;
}
} else {
*loader
.discards
.entry(Identifier::Gene(gene_id.clone()))
.or_default() |= Reason::MissingGene;
}
}
tracing::info!("… done filter Gene Id entries");
Ok(())
}
pub(crate) fn filter_genes(loader: &mut TranscriptLoader) -> Result<(), Error> {
if loader.disable_filters {
return Ok(());
}
tracing::info!("Filtering genes …");
let missing_symbol = |_gene_id: GeneId, gene: &Gene, _txs: &[&Transcript]| -> bool {
gene.gene_symbol.is_none() || gene.gene_symbol.as_ref().unwrap().is_empty()
};
let biotype = |_gene_id: GeneId, gene: &Gene, _txs: &[&Transcript]| -> bool {
gene.biotype
.as_ref()
.map(|bt| bt.iter().any(|bt| DISCARD_BIOTYPES_GENES.contains(bt)))
.unwrap_or(false)
};
let predicted_only = |_gene_id: GeneId, _gene: &Gene, txs: &[&Transcript]| -> bool {
!txs.is_empty() && txs.iter().all(|tx| tx.id.starts_with('X'))
};
type Filter = fn(GeneId, &Gene, &[&Transcript]) -> bool;
let filters: [(Filter, Reason); 3] = [
(missing_symbol, Reason::MissingGeneSymbol),
(biotype, Reason::Biotype),
(predicted_only, Reason::PredictedTranscriptsOnly),
];
let discarded_genes = loader
.gene_id_to_gene
.iter()
.filter_map(|(gene_id, gene)| {
let txs = loader
.gene_id_to_transcript_ids
.get(gene_id)
.map(|tx_ids| {
tx_ids
.iter()
.filter_map(|tx_id| loader.transcript_id_to_transcript.get(tx_id))
.collect_vec()
})
.unwrap_or_default();
let reason = filters
.iter()
.filter_map(|(f, r)| f(gene_id.clone(), gene, &txs).then_some(*r))
.fold(BitFlags::<Reason>::default(), |a, b| a | b);
let empty = reason.is_empty();
(!empty).then_some((Identifier::Gene(gene_id.clone()), reason))
})
.collect_vec();
for (id, reason) in discarded_genes {
loader.mark_discarded(&id, reason)?;
}
tracing::info!("… done filtering genes");
Ok(())
}
pub(crate) fn filter_transcripts(loader: &mut TranscriptLoader) -> Result<(), Error> {
if loader.disable_filters {
return Ok(());
}
tracing::info!("Filtering transcripts …");
#[derive(new)]
struct Params<'a> {
tx: &'a Transcript,
tx_id: &'a str,
}
let missing_hgnc = |p: &Params| -> bool {
p.tx.hgnc.is_none()
|| p.tx.hgnc.as_ref().unwrap().is_empty()
|| p.tx.hgnc.as_ref().unwrap().starts_with("GENE:")
};
let empty_genome_builds = |p: &Params| -> bool { p.tx.genome_builds.is_empty() };
let cds_start_end_nf = |p: &Params| -> bool {
p.tx.genome_builds.values().any(|gb| {
if let Some(tag) = &gb.tag {
tag.contains(&Tag::Other("cds_start_NF".into()))
|| tag.contains(&Tag::Other("cds_end_NF".into()))
} else {
false
}
})
};
let partial = |p: &Params| -> bool {
if !matches!(p.tx.partial, Some(1)) {
return false;
}
if p.tx_id.starts_with("NR") {
false
} else if p.tx.protein_coding() {
if let Some(alignment) = p.tx.genome_builds.values().next() {
let exons = alignment.exons.clone();
if exons.is_empty() {
return true;
}
let is_reverse = matches!(
alignment.strand,
hgvs::data::cdot::json::models::Strand::Minus
);
let five_prime_trunc = {
let cds_start = alignment.cds_start;
let first = if is_reverse {
exons.iter().max_by_key(|e| e.alt_start_i).unwrap()
} else {
exons.iter().min_by_key(|e| e.alt_start_i).unwrap()
};
Some(first.alt_start_i) == cds_start
};
let three_prime_trunc = {
let cds_end = alignment.cds_end;
let last = if is_reverse {
exons.iter().min_by_key(|e| e.alt_end_i).unwrap()
} else {
exons.iter().max_by_key(|e| e.alt_end_i).unwrap()
};
Some(last.alt_end_i) == cds_end
};
five_prime_trunc || three_prime_trunc
} else {
false
}
} else {
true
}
};
let predicted = |p: &Params| -> bool { p.tx_id.starts_with('X') };
let biotype = |p: &Params| -> bool {
p.tx.biotype
.as_ref()
.map(|bt| {
bt.iter()
.any(|bt| DISCARD_BIOTYPES_TRANSCRIPTS.contains(bt))
})
.unwrap_or(false)
};
type Filter = fn(&Params) -> bool;
let tx_filters: [(Filter, Reason); 6] = [
(missing_hgnc, Reason::MissingHgncId),
(empty_genome_builds, Reason::EmptyGenomeBuilds),
(partial, Reason::OnlyPartialAlignmentInRefSeq),
(predicted, Reason::PredictedTranscript),
(biotype, Reason::Biotype),
(cds_start_end_nf, Reason::CdsStartOrEndNotConfirmed),
];
#[derive(new)]
struct GroupParams<'a> {
idx: usize,
tx_ids: &'a [TranscriptId],
txs: &'a [&'a Transcript],
}
type GroupFilter = fn(&GroupParams) -> bool;
let old_version = |p: &GroupParams| -> bool {
let versioned = group_transcripts_by_release_and_version(p.txs);
let (tx_ac, tx_version) = p.tx_ids[p.idx].split_version();
for release in p.txs[p.idx].genome_builds.keys() {
if let Some(other) = versioned.get(&(release.into(), tx_ac.to_string()))
&& other.iter().any(|(version, _tx)| *version > tx_version)
{
return true;
}
}
false
};
let nm_instead_of_nr = |p: &GroupParams| -> bool {
p.tx_ids[p.idx].starts_with("NR_") && p.tx_ids.iter().any(|tx_id| tx_id.starts_with("NM_"))
};
let gene_id_grouped_tx_filters: [(GroupFilter, Reason); 2] = [
(old_version, Reason::OldVersion),
(nm_instead_of_nr, Reason::UseNmTranscriptInsteadOfNr),
];
let start = Instant::now();
let discarded = loader
.transcript_id_to_transcript
.iter()
.filter_map(|(tx_id, tx)| {
let p = Params::new(tx, tx_id);
let reason = tx_filters
.iter()
.filter_map(|(f, r)| f(&p).then_some(*r))
.fold(BitFlags::<Reason>::default(), |a, b| a | b);
let empty = reason.is_empty();
(!empty).then(|| (Identifier::Transcript(tx_id.clone()), reason))
})
.collect_vec();
for (id, reason) in discarded {
loader.mark_discarded(&id, reason)?;
}
let discarded = loader
.gene_id_to_transcript_ids
.iter()
.filter_map(|(_gene_id, tx_ids)| {
let txs = tx_ids
.iter()
.map(|tx_id| loader.transcript_id_to_transcript.get(tx_id).unwrap())
.collect_vec();
let tx_reasons = tx_ids
.iter()
.enumerate()
.filter_map(|(idx, tx)| {
let p = GroupParams::new(idx, tx_ids, &txs);
let reason = gene_id_grouped_tx_filters
.iter()
.filter_map(|(f, r)| f(&p).then_some(*r))
.fold(BitFlags::<Reason>::default(), |a, b| a | b);
let empty = reason.is_empty();
if !empty {
Some((Identifier::Transcript(tx.clone()), reason))
} else {
None
}
})
.collect_vec();
if tx_reasons.is_empty() {
None
} else {
Some(tx_reasons)
}
})
.flatten()
.collect_vec();
for (id, reason) in discarded {
loader.mark_discarded(&id, reason)?;
}
tracing::info!("… done filtering transcripts in {:?}", start.elapsed());
Ok(())
}
pub(crate) fn filter_transcripts_with_sequence(
loader: &mut TranscriptLoader,
seq_provider: &mut SequenceProvider,
) -> Result<HashMap<TranscriptId, String>, Error> {
tracing::info!("Filtering transcripts with sequences …");
let start = Instant::now();
let five_prime_truncated = |tx: &Transcript| -> bool {
let is_mt = MITOCHONDRIAL_ACCESSIONS.iter().any(|a| tx.is_on_contig(a));
if tx.protein_coding() && !is_mt {
tx.genome_builds.iter().any(|(_release, alignment)| {
let cds_start = alignment.cds_start;
let exons = alignment.exons.clone();
if exons.is_empty() {
return true;
}
let is_reverse = matches!(
alignment.strand,
hgvs::data::cdot::json::models::Strand::Minus
);
let first = if is_reverse {
exons.iter().max_by_key(|e| e.alt_start_i).unwrap()
} else {
exons.iter().min_by_key(|e| e.alt_start_i).unwrap()
};
Some(first.alt_start_i) == cds_start
})
} else {
false
}
};
let three_prime_truncated = |tx: &Transcript| -> bool {
let is_mt = MITOCHONDRIAL_ACCESSIONS.iter().any(|a| tx.is_on_contig(a));
if tx.protein_coding() && !is_mt {
tx.genome_builds.iter().any(|(_release, alignment)| {
let cds_end = alignment.cds_end;
let exons = alignment.exons.clone();
if exons.is_empty() {
return true;
}
let is_reverse = matches!(
alignment.strand,
hgvs::data::cdot::json::models::Strand::Minus
);
let last = if is_reverse {
exons.iter().min_by_key(|e| e.alt_end_i).unwrap()
} else {
exons.iter().max_by_key(|e| e.alt_end_i).unwrap()
};
Some(last.alt_end_i) == cds_end
})
} else {
false
}
};
let invalid_cds_length = |tx: &Transcript| -> bool {
if tx.protein_coding() {
tx.cds_length().is_none_or(|l| l % 3 != 0)
} else {
false
}
};
let (discards, keeps_with_reasons): (Vec<_>, Vec<_>) = loader
.transcript_id_to_transcript
.par_iter()
.partition_map(|(tx_id, tx)| {
if let Some(d) = loader.discards.get(&Identifier::Transcript(tx_id.clone()))
&& d.intersects(Reason::hard())
{
return Either::Left((Identifier::Transcript(tx_id.clone()), *d));
}
let has_invalid_cds_length = if loader.disable_filters {
false
} else {
invalid_cds_length(tx)
};
let namespace: String = if tx_id.starts_with("ENST") {
String::from("Ensembl")
} else {
String::from("NCBI")
};
let seq = seq_provider.fetch_sequence(&AliasOrSeqId::Alias {
value: (*tx_id).to_string(),
namespace: Some(namespace.clone()),
});
if let Ok(seq) = seq {
let is_mt = MITOCHONDRIAL_ACCESSIONS.iter().any(|a| tx.is_on_contig(a));
let is_seleno = tx
.biotype
.as_ref()
.map(|bt| bt.contains(&BioType::Selenoprotein))
.unwrap_or(false);
if seq.is_empty() {
return Either::Left((
Identifier::Transcript(tx_id.clone()),
Reason::MissingSequence.into(),
));
};
let cds = tx
.start_codon
.and_then(|start| tx.stop_codon.map(|stop| (start as usize, stop as usize)));
let cds = if tx.protein_coding() { cds } else { None };
if let Some((cds_start, cds_end)) = cds {
let cds_length = cds_end.saturating_sub(cds_start);
let delta = (3 - (cds_length % 3)).max(cds_end.saturating_sub(seq.len()));
let seq = append_poly_a(seq, delta);
let safe_end = cds_end.min(seq.len());
let safe_start = cds_start.min(safe_end);
let tx_seq_to_translate = &seq[safe_start..safe_end];
let aa_sequence_result = translate_cds(
tx_seq_to_translate,
true,
"*",
if is_mt {
TranslationTable::VertebrateMitochondrial
} else if is_seleno {
TranslationTable::Selenocysteine
} else {
TranslationTable::Standard
},
);
let mut reason = Reason::empty();
if let Ok(aa_sequence) = aa_sequence_result {
let has_missing_stop_codon = (!is_mt && !aa_sequence.ends_with('*'))
|| (is_mt && !aa_sequence.contains('*'));
if has_missing_stop_codon && !loader.disable_filters {
reason |= Reason::MissingStopCodon;
if five_prime_truncated(tx) {
reason |= Reason::FivePrimeEndTruncated;
}
if three_prime_truncated(tx) {
reason |= Reason::ThreePrimeEndTruncated;
}
}
}
if has_invalid_cds_length
|| loader
.fixes
.get(&Identifier::Transcript(tx_id.clone()))
.is_some_and(|f| f.contains(Fix::Cds))
{
reason |= Reason::InvalidCdsLength;
}
if reason.intersects(Reason::hard()) {
Either::Left((Identifier::Transcript(tx_id.clone()), reason))
} else {
Either::Right((tx_id.clone(), seq, reason))
}
} else {
Either::Right((tx_id.clone(), seq, Reason::empty()))
}
} else {
Either::Left((
Identifier::Transcript(tx_id.clone()),
Reason::MissingSequence.into(),
))
}
});
for (id, reason) in discards {
loader.mark_discarded(&id, reason)?;
}
let mut sequence_map = HashMap::new();
for (tx_id, seq, reason) in keeps_with_reasons {
if !reason.is_empty() {
loader.mark_discarded(&Identifier::Transcript(tx_id.clone()), reason)?;
}
sequence_map.insert(tx_id, seq);
}
tracing::info!(
"… done filtering transcripts with sequence provider in {:?}",
start.elapsed()
);
Ok(sequence_map)
}
pub(crate) fn filter_empty_gene_id_mappings(loader: &mut TranscriptLoader) -> Result<(), Error> {
tracing::info!("Removing empty Gene Id mappings …");
let gene_ids_gene: HashSet<_> = loader.gene_id_to_gene.keys().collect();
let gene_ids_tx: HashSet<_> = loader.gene_id_to_transcript_ids.keys().collect();
let gene_id_but_no_tx_id = &gene_ids_gene - &gene_ids_tx;
let tx_id_but_no_gene_id = &gene_ids_tx - &gene_ids_gene;
for gene_id in gene_id_but_no_tx_id {
*loader
.discards
.entry(Identifier::Gene(gene_id.clone()))
.or_default() |= Reason::NoTranscripts;
}
for gene_id in tx_id_but_no_gene_id {
*loader
.discards
.entry(Identifier::Gene(gene_id.clone()))
.or_default() |= Reason::MissingGene;
}
for (gene_id, txs) in loader.gene_id_to_transcript_ids.iter() {
if txs.is_empty() {
*loader
.discards
.entry(Identifier::Gene(gene_id.clone()))
.or_default() |= Reason::NoTranscriptLeft;
}
}
tracing::info!("… done removing empty HGNC mappings");
Ok(())
}
pub const MITOCHONDRIAL_ACCESSIONS: &[&str] = &["NC_012920.1", "NC_001807.4"];
static DISCARD_BIOTYPES_TRANSCRIPTS: Lazy<HashSet<BioType>> = Lazy::new(|| {
HashSet::from([
BioType::IgCGene,
BioType::IgDGene,
BioType::IgJGene,
BioType::IgVGene,
BioType::TrCGene,
BioType::TrDGene,
BioType::TrJGene,
BioType::TrVGene,
BioType::AberrantProcessedTranscript,
BioType::UnconfirmedTranscript,
BioType::NmdTranscriptVariant,
])
});
static DISCARD_BIOTYPES_GENES: Lazy<HashSet<BioType>> = Lazy::new(|| {
HashSet::from([
BioType::CGeneSegment,
BioType::DGeneSegment,
BioType::JGeneSegment,
BioType::VGeneSegment,
])
});
fn group_transcripts_by_release_and_version<'a>(
txs: &[&'a Transcript],
) -> HashMap<(String, String), Vec<(u32, &'a Transcript)>> {
let mut versioned = txs
.iter()
.flat_map(|tx| {
let releases = tx.genome_builds.keys().cloned().collect_vec();
let tx_id = TranscriptId::try_new(&tx.id).unwrap();
let (ac, version) = tx_id.split_version();
let ac = ac.to_string();
releases
.into_iter()
.map(move |r| ((r, ac.clone()), (version, *tx)))
})
.into_group_map();
versioned.values_mut().for_each(|txs| {
txs.sort_unstable_by_key(|(version, _)| Reverse(*version));
});
versioned
}
fn append_poly_a(seq: String, length: usize) -> String {
let mut seq = seq.into_bytes();
seq.extend_from_slice(b"A".repeat(length).as_slice());
String::from_utf8(seq).expect("must be valid UTF-8")
}
pub(crate) fn propagate_discard_reasons(loader: &mut TranscriptLoader) -> Result<(), Error> {
for (gene_id, _) in loader.gene_id_to_gene.iter() {
let tx_ids = loader
.gene_id_to_transcript_ids
.get(gene_id)
.map(|v| v.as_slice())
.unwrap_or_default();
if !tx_ids.is_empty()
&& tx_ids.iter().all(|tx_id| {
loader
.discards
.get(&Identifier::Transcript(tx_id.clone()))
.is_some_and(|d| d.intersects(Reason::hard()))
})
{
*loader
.discards
.entry(Identifier::Gene(gene_id.clone()))
.or_default() |= Reason::NoTranscriptLeft;
}
}
Ok(())
}