use crate::annotate::seqvars::csq::Config;
use crate::pbs::txs::TranscriptTag;
use enumflags2::bitflags;
use hgvs::data::cdot::json::models::Tag;
use nom::Parser;
use nom::bytes::complete::take_until;
use nom::character::complete::char;
use nom::combinator::{map_res, opt, rest};
use nom::multi::separated_list1;
use nom::{
IResult,
branch::alt,
bytes::complete::tag,
character::complete::{alphanumeric1, digit1},
combinator::{all_consuming, map},
};
use parse_display::{Display, FromStr};
use std::collections::BTreeMap;
use std::str::FromStr;
use strum::IntoEnumIterator;
// Names of optional extra annotation columns. The test module passes the
// `*_SEQ_*` names as entries of `Config::custom_columns`.
/// Column name `tx_sequence_ref` (presumably the reference transcript sequence — confirm with producer).
pub const ANN_TX_SEQ_REF: &str = "tx_sequence_ref";
/// Column name `tx_sequence_alt` (presumably the altered transcript sequence — confirm with producer).
pub const ANN_TX_SEQ_ALT: &str = "tx_sequence_alt";
/// Column name `aa_sequence_ref` (presumably the reference amino-acid sequence — confirm with producer).
pub const ANN_AA_SEQ_REF: &str = "aa_sequence_ref";
/// Column name `aa_sequence_alt` (presumably the altered amino-acid sequence — confirm with producer).
pub const ANN_AA_SEQ_ALT: &str = "aa_sequence_alt";
/// Column name `compound_ids`; not referenced elsewhere in this file.
pub const ANN_COMPOUND_IDS: &str = "compound_ids";
/// Column name `compound_variants`; not referenced elsewhere in this file.
pub const ANN_COMPOUND_VARIANTS: &str = "compound_variants";
/// Putative impact class of a [`Consequence`].
///
/// `Display`/`FromStr` use the upper-case variant name (`HIGH`, `MODERATE`,
/// `LOW`, `MODIFIER`); serde uses `snake_case`. The derived ordering follows
/// declaration order, so `High` compares smallest.
#[derive(
Debug,
PartialEq,
Eq,
PartialOrd,
Ord,
Hash,
Clone,
Copy,
Display,
FromStr,
serde::Deserialize,
serde::Serialize,
strum::EnumIter,
)]
#[cfg_attr(feature = "server", derive(utoipa::ToSchema))]
#[display(style = "UPPERCASE")]
#[serde(rename_all = "snake_case")]
pub enum PutativeImpact {
High,
Moderate,
Low,
Modifier,
}
/// Consequence term stored in `AnnField::consequences`.
///
/// `Display`/`FromStr` and serde both use the `snake_case` variant name unless
/// a variant overrides it below (terms containing digits or upper-case
/// acronyms such as `UTR`, `miRNA`, `TFBS`). Variants are declared grouped by
/// the impact class that `From<Consequence> for PutativeImpact` assigns them;
/// the derived `Ord` follows this declaration order.
#[bitflags]
#[repr(u64)]
#[derive(
Debug,
PartialEq,
Eq,
PartialOrd,
Ord,
Hash,
Clone,
Copy,
Display,
FromStr,
serde::Deserialize,
serde::Serialize,
strum::EnumIter,
)]
#[cfg_attr(feature = "server", derive(utoipa::ToSchema))]
#[display(style = "snake_case")]
#[serde(rename_all = "snake_case")]
pub enum Consequence {
// --- mapped to `PutativeImpact::High` ---
TranscriptAblation,
ExonLossVariant,
SpliceAcceptorVariant,
SpliceDonorVariant,
StopGained,
FrameshiftVariant,
FrameshiftElongation,
FrameshiftTruncation,
StopLost,
StartLost,
TranscriptAmplification,
FeatureElongation,
FeatureTruncation,
// --- mapped to `PutativeImpact::Moderate` ---
DisruptiveInframeInsertion,
DisruptiveInframeDeletion,
ConservativeInframeInsertion,
ConservativeInframeDeletion,
InframeDeletion,
InframeInsertion,
MissenseVariant,
RareAminoAcidVariant,
SelenocysteineGain,
SelenocysteineLoss,
ProteinAlteringVariant,
// --- mapped to `PutativeImpact::Low` ---
#[display("splice_donor_5th_base_variant")]
#[serde(rename = "splice_donor_5th_base_variant")]
SpliceDonorFifthBaseVariant,
SpliceRegionVariant,
ExonicSpliceRegionVariant,
SpliceDonorRegionVariant,
SplicePolypyrimidineTractVariant,
IncompleteTerminalCodonVariant,
StartRetainedVariant,
StopRetainedVariant,
SynonymousVariant,
// --- mapped to `PutativeImpact::Modifier` ---
CodingSequenceVariant,
#[display("mature_miRNA_variant")]
#[serde(rename = "mature_miRNA_variant")]
MatureMirnaVariant,
#[display("5_prime_UTR_exon_variant")]
#[serde(rename = "5_prime_UTR_exon_variant")]
FivePrimeUtrExonVariant,
#[display("5_prime_UTR_intron_variant")]
#[serde(rename = "5_prime_UTR_intron_variant")]
FivePrimeUtrIntronVariant,
#[display("5_prime_UTR_variant")]
#[serde(rename = "5_prime_UTR_variant")]
FivePrimeUtrVariant,
#[display("3_prime_UTR_exon_variant")]
#[serde(rename = "3_prime_UTR_exon_variant")]
ThreePrimeUtrExonVariant,
#[display("3_prime_UTR_intron_variant")]
#[serde(rename = "3_prime_UTR_intron_variant")]
ThreePrimeUtrIntronVariant,
#[display("3_prime_UTR_variant")]
#[serde(rename = "3_prime_UTR_variant")]
ThreePrimeUtrVariant,
NonCodingTranscriptVariant,
NonCodingTranscriptExonVariant,
NonCodingTranscriptIntronVariant,
CodingTranscriptIntronVariant,
UpstreamGeneVariant,
DownstreamGeneVariant,
#[display("TFBS_ablation")]
#[serde(rename = "TFBS_ablation")]
TfbsAblation,
#[display("TFBS_amplification")]
#[serde(rename = "TFBS_amplification")]
TfbsAmplification,
#[display("TF_binding_site_variant")]
#[serde(rename = "TF_binding_site_variant")]
TfBindingSiteVariant,
RegulatoryRegionAblation,
RegulatoryRegionAmplification,
RegulatoryRegionVariant,
IntergenicVariant,
IntronVariant,
GeneVariant,
}
impl From<Consequence> for PutativeImpact {
fn from(val: Consequence) -> Self {
use Consequence::*;
match val {
TranscriptAblation
| ExonLossVariant
| SpliceAcceptorVariant
| SpliceDonorVariant
| StopGained
| FrameshiftVariant
| FrameshiftElongation
| FrameshiftTruncation
| StopLost
| StartLost
| TranscriptAmplification
| FeatureElongation
| FeatureTruncation => PutativeImpact::High,
DisruptiveInframeInsertion
| DisruptiveInframeDeletion
| ConservativeInframeInsertion
| ConservativeInframeDeletion
| InframeInsertion
| InframeDeletion
| ProteinAlteringVariant
| MissenseVariant
| RareAminoAcidVariant
| SelenocysteineGain
| SelenocysteineLoss => PutativeImpact::Moderate,
SpliceDonorFifthBaseVariant
| SpliceRegionVariant
| ExonicSpliceRegionVariant
| SpliceDonorRegionVariant
| SplicePolypyrimidineTractVariant
| StartRetainedVariant
| StopRetainedVariant
| SynonymousVariant
| IncompleteTerminalCodonVariant => PutativeImpact::Low,
CodingSequenceVariant
| MatureMirnaVariant
| FivePrimeUtrExonVariant
| FivePrimeUtrIntronVariant
| FivePrimeUtrVariant
| ThreePrimeUtrExonVariant
| ThreePrimeUtrIntronVariant
| ThreePrimeUtrVariant
| NonCodingTranscriptVariant
| NonCodingTranscriptExonVariant
| NonCodingTranscriptIntronVariant
| CodingTranscriptIntronVariant
| UpstreamGeneVariant
| DownstreamGeneVariant
| TfbsAblation
| TfbsAmplification
| TfBindingSiteVariant
| RegulatoryRegionAblation
| RegulatoryRegionAmplification
| RegulatoryRegionVariant
| IntergenicVariant
| IntronVariant
| GeneVariant => PutativeImpact::Modifier,
}
}
}
impl Consequence {
pub fn all() -> Vec<Self> {
Self::iter().collect()
}
pub fn impact(&self) -> PutativeImpact {
PutativeImpact::from(*self)
}
}
/// Allele column of an annotation record.
///
/// The `#[display(...)]` strings below define the textual form of each
/// variant; `Allele::parse` accepts the same shapes.
#[derive(
Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Display, serde::Deserialize, serde::Serialize,
)]
pub enum Allele {
/// Alternative allele only, e.g. `A`.
#[display("{alternative}")]
Alt { alternative: String },
/// Alternative and reference allele joined by `-`, e.g. `A-C`.
#[display("{alternative}-{reference}")]
AltRef {
alternative: String,
reference: String,
},
/// Alternative allele plus a second variant, e.g. `A-chr1:123_C>T`.
#[display("{alternative}-{other_chrom}:{other_pos}_{other_ref}>{other_alt}")]
Compound {
alternative: String,
other_chrom: String,
other_pos: u32,
other_ref: String,
other_alt: String,
},
/// Several alleles reported together, written as `grouped(A+C+...)`.
#[display("grouped({0})")]
Grouped(GroupedAlleles),
}
/// Payload of `Allele::Grouped`.
///
/// Only `alternatives` takes part in the textual form (see the `Display` and
/// `FromStr` impls); `references` is not written out and is empty after parsing.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Deserialize, serde::Serialize)]
pub struct GroupedAlleles {
/// Reference alleles; not part of the textual representation.
pub references: Vec<String>,
/// Alternative alleles; joined with `+` when displayed.
pub alternatives: Vec<String>,
}
/// Writes the alternative alleles joined by `+`; `references` is not emitted.
impl std::fmt::Display for GroupedAlleles {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let joined = self.alternatives.join("+");
        f.write_str(&joined)
    }
}
/// Splits the input on `+` into alternative alleles; never fails.
///
/// `references` is always left empty because the textual form carries none.
impl std::str::FromStr for GroupedAlleles {
    type Err = std::convert::Infallible;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let alternatives: Vec<String> = s.split('+').map(String::from).collect();
        Ok(GroupedAlleles {
            references: Vec::new(),
            alternatives,
        })
    }
}
/// Low-level nom helpers shared by the allele parsers.
mod parse {
    use nom::Parser;
    use nom::bytes::complete::take_while1;

    /// Characters accepted as nucleic-acid symbols, in upper and lower case.
    pub static NA_IUPAC: &str = "ACGTURYMKWSBDHVNacgturymkwsbdhvn";

    /// Consumes one or more characters contained in [`NA_IUPAC`].
    pub fn na1(input: &str) -> Result<(&str, &str), nom::Err<nom::error::Error<&str>>> {
        let is_na_symbol = |symbol: char| NA_IUPAC.contains(symbol);
        take_while1(is_na_symbol).parse(input)
    }
}
impl Allele {
pub fn parse(input: &str) -> IResult<&str, Self> {
all_consuming(alt((
Self::parse_grouped,
Self::parse_compound,
Self::parse_alt_ref,
Self::parse_alt,
)))
.parse(input)
}
fn parse_grouped(input: &str) -> IResult<&str, Self> {
map(
(
tag("grouped("),
separated_list1(tag("+"), parse::na1),
tag(")"),
),
|(_, alternatives, _)| {
Allele::Grouped(GroupedAlleles {
references: Vec::new(),
alternatives: alternatives.into_iter().map(|s| s.to_string()).collect(),
})
},
)
.parse(input)
}
fn parse_compound(input: &str) -> IResult<&str, Self> {
map(
(
parse::na1,
tag("-"),
alphanumeric1,
tag(":"),
digit1,
tag("_"),
parse::na1,
tag(">"),
parse::na1,
),
|(alternative, _, other_chrom, _, other_pos, _, other_ref, _, other_alt)| {
Allele::Compound {
alternative: alternative.to_string(),
other_chrom: other_chrom.to_string(),
other_pos: other_pos.parse::<u32>().unwrap(),
other_ref: other_ref.to_string(),
other_alt: other_alt.to_string(),
}
},
)
.parse(input)
}
fn parse_alt_ref(input: &str) -> IResult<&str, Self> {
map(
(parse::na1, tag("-"), parse::na1),
|(alternative, _, reference)| Allele::AltRef {
alternative: alternative.to_string(),
reference: reference.to_string(),
},
)
.parse(input)
}
fn parse_alt(input: &str) -> IResult<&str, Self> {
map(parse::na1, |alternative| Allele::Alt {
alternative: alternative.to_string(),
})
.parse(input)
}
}
/// Parses an allele via [`Allele::parse`], converting the nom error into `anyhow::Error`.
impl FromStr for Allele {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match Self::parse(s) {
            Ok((_, allele)) => Ok(allele),
            Err(err) => Err(anyhow::anyhow!("{}", err)),
        }
    }
}
/// Known feature terms; displayed and parsed in `snake_case` (`transcript`).
#[derive(
Debug,
Clone,
Copy,
PartialEq,
Eq,
PartialOrd,
Ord,
Hash,
Display,
FromStr,
serde::Deserialize,
serde::Serialize,
)]
#[cfg_attr(feature = "server", derive(utoipa::ToSchema))]
#[display(style = "snake_case")]
pub enum SoFeature {
Transcript,
}
/// Feature type column: either a known [`SoFeature`] term or free text.
///
/// Both variants display as their inner value only.
#[derive(
Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Display, serde::Deserialize, serde::Serialize,
)]
#[cfg_attr(feature = "server", derive(utoipa::ToSchema))]
#[serde(rename_all = "snake_case")]
pub enum FeatureType {
/// A term recognised by [`SoFeature`].
#[display("{term}")]
SoTerm { term: SoFeature },
/// Any other value, kept verbatim.
#[display("{value}")]
Custom { value: String },
}
/// Parses a feature type; anything that is not a known [`SoFeature`] term
/// becomes `FeatureType::Custom`, so this never returns an error.
impl FromStr for FeatureType {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let feature_type = match SoFeature::from_str(s) {
            Ok(term) => FeatureType::SoTerm { term },
            Err(_) => FeatureType::Custom {
                value: s.to_string(),
            },
        };
        Ok(feature_type)
    }
}
/// Biotype of a feature.
///
/// `Display`/`FromStr` use the variant name as written (`Coding`,
/// `Noncoding`); serde uses `snake_case`.
#[derive(
Debug,
Clone,
Copy,
PartialEq,
Eq,
PartialOrd,
Ord,
Hash,
Display,
FromStr,
serde::Deserialize,
serde::Serialize,
strum::EnumIter,
)]
#[cfg_attr(feature = "server", derive(utoipa::ToSchema))]
#[serde(rename_all = "snake_case")]
pub enum FeatureBiotype {
Coding,
Noncoding,
}
impl FeatureBiotype {
    /// Returns `true` only for `FeatureBiotype::Coding`.
    pub fn is_coding(&self) -> bool {
        match self {
            FeatureBiotype::Coding => true,
            FeatureBiotype::Noncoding => false,
        }
    }
}
/// Tag attached to a feature.
///
/// Each named tag has a `*Backport` counterpart (see
/// `FeatureTag::to_backported`). Tags without a dedicated variant are carried
/// as text in `Other` / `OtherBackport`.
#[derive(
Debug,
Clone,
PartialEq,
Eq,
PartialOrd,
Ord,
Hash,
Display,
FromStr,
serde::Deserialize,
serde::Serialize,
strum::EnumIter,
)]
#[cfg_attr(feature = "server", derive(utoipa::ToSchema))]
#[serde(rename_all = "snake_case")]
pub enum FeatureTag {
Unknown,
Basic,
EnsemblCanonical,
ManeSelect,
ManePlusClinical,
RefSeqSelect,
Selenoprotein,
GencodePrimary,
// Excluded from serde (de)serialization.
#[serde(skip)]
EnsemblGraft,
BasicBackport,
EnsemblCanonicalBackport,
ManeSelectBackport,
ManePlusClinicalBackport,
RefSeqSelectBackport,
SelenoproteinBackport,
GencodePrimaryBackport,
/// Backported free-text tag; displayed with a `-backport` suffix.
#[display("{0}-backport")]
OtherBackport(String),
/// Free-text tag; displayed verbatim.
#[display("{0}")]
Other(String),
}
impl FeatureTag {
pub fn to_backported(&self) -> Self {
match self {
FeatureTag::Basic => FeatureTag::BasicBackport,
FeatureTag::EnsemblCanonical => FeatureTag::EnsemblCanonicalBackport,
FeatureTag::ManeSelect => FeatureTag::ManeSelectBackport,
FeatureTag::ManePlusClinical => FeatureTag::ManePlusClinicalBackport,
FeatureTag::RefSeqSelect => FeatureTag::RefSeqSelectBackport,
FeatureTag::Selenoprotein => FeatureTag::SelenoproteinBackport,
FeatureTag::GencodePrimary => FeatureTag::GencodePrimaryBackport,
FeatureTag::Unknown => FeatureTag::Unknown,
FeatureTag::EnsemblGraft => FeatureTag::OtherBackport("ensembl_graft".to_string()),
FeatureTag::Other(s) => FeatureTag::OtherBackport(s.clone()),
_ => self.clone(),
}
}
}
impl From<TranscriptTag> for FeatureTag {
fn from(value: TranscriptTag) -> Self {
match value {
TranscriptTag::Unknown => FeatureTag::Unknown,
TranscriptTag::Basic => FeatureTag::Basic,
TranscriptTag::EnsemblCanonical => FeatureTag::EnsemblCanonical,
TranscriptTag::ManeSelect => FeatureTag::ManeSelect,
TranscriptTag::ManePlusClinical => FeatureTag::ManePlusClinical,
TranscriptTag::RefSeqSelect => FeatureTag::RefSeqSelect,
TranscriptTag::Selenoprotein => FeatureTag::Selenoprotein,
TranscriptTag::GencodePrimary => FeatureTag::GencodePrimary,
TranscriptTag::EnsemblGraft => FeatureTag::EnsemblGraft,
TranscriptTag::Other => FeatureTag::Other("Other".to_string()),
TranscriptTag::BasicBackport => FeatureTag::BasicBackport,
TranscriptTag::EnsemblCanonicalBackport => FeatureTag::EnsemblCanonicalBackport,
TranscriptTag::ManeSelectBackport => FeatureTag::ManeSelectBackport,
TranscriptTag::ManePlusClinicalBackport => FeatureTag::ManePlusClinicalBackport,
TranscriptTag::RefSeqSelectBackport => FeatureTag::RefSeqSelectBackport,
TranscriptTag::SelenoproteinBackport => FeatureTag::SelenoproteinBackport,
TranscriptTag::GencodePrimaryBackport => FeatureTag::GencodePrimaryBackport,
TranscriptTag::OtherBackport => FeatureTag::OtherBackport("Other".to_string()),
}
}
}
/// Converts a `Tag` into a `FeatureTag`.
///
/// Of the free-text `Tag::Other` values only `"EnsemblGraft"` is mapped back
/// to a dedicated variant; every other text stays in `FeatureTag::Other`.
impl From<Tag> for FeatureTag {
    fn from(value: Tag) -> Self {
        match value {
            Tag::Other(label) if label == "EnsemblGraft" => FeatureTag::EnsemblGraft,
            Tag::Other(label) => FeatureTag::Other(label),
            Tag::Basic => FeatureTag::Basic,
            Tag::EnsemblCanonical => FeatureTag::EnsemblCanonical,
            Tag::ManeSelect => FeatureTag::ManeSelect,
            Tag::ManePlusClinical => FeatureTag::ManePlusClinical,
            Tag::RefSeqSelect => FeatureTag::RefSeqSelect,
            Tag::GencodePrimary => FeatureTag::GencodePrimary,
        }
    }
}
impl From<FeatureTag> for Tag {
fn from(value: FeatureTag) -> Self {
match value {
FeatureTag::Basic => Tag::Basic,
FeatureTag::EnsemblCanonical => Tag::EnsemblCanonical,
FeatureTag::ManeSelect => Tag::ManeSelect,
FeatureTag::ManePlusClinical => Tag::ManePlusClinical,
FeatureTag::RefSeqSelect => Tag::RefSeqSelect,
FeatureTag::GencodePrimary => Tag::GencodePrimary,
FeatureTag::Selenoprotein => Tag::Other("Selenoprotein".to_string()),
FeatureTag::EnsemblGraft => Tag::Other("EnsemblGraft".to_string()),
FeatureTag::Unknown => Tag::Other("Unknown".to_string()),
FeatureTag::Other(v) => Tag::Other(v),
FeatureTag::BasicBackport => Tag::Other("basic-backport".to_string()),
FeatureTag::ManeSelectBackport => Tag::Other("mane_select-backport".to_string()),
FeatureTag::EnsemblCanonicalBackport => {
Tag::Other("ensembl_canonical-backport".to_string())
}
FeatureTag::ManePlusClinicalBackport => {
Tag::Other("mane_plus_clinical-backport".to_string())
}
FeatureTag::RefSeqSelectBackport => Tag::Other("ref_seq_select-backport".to_string()),
FeatureTag::SelenoproteinBackport => Tag::Other("selenoprotein-backport".to_string()),
FeatureTag::GencodePrimaryBackport => {
Tag::Other("gencode_primary-backport".to_string())
}
FeatureTag::OtherBackport(v) => Tag::Other(format!("{}-backport", v)),
}
}
}
/// Rank written and parsed as `ord/total`, e.g. `1/2`.
#[derive(
Clone,
Debug,
PartialEq,
Eq,
PartialOrd,
Ord,
Hash,
Display,
FromStr,
Default,
serde::Deserialize,
serde::Serialize,
)]
#[cfg_attr(feature = "server", derive(utoipa::ToSchema))]
#[display("{ord}/{total}")]
pub struct Rank {
/// Ordinal; `is_first` compares it against 1.
pub ord: i32,
/// Total count; `is_last` compares `ord` against it.
pub total: i32,
}
impl Rank {
    /// Returns `true` when `ord` is 1.
    #[inline]
    pub fn is_first(&self) -> bool {
        matches!(self.ord, 1)
    }

    /// Returns `true` when `ord` equals `total`.
    #[inline]
    pub fn is_last(&self) -> bool {
        self.total == self.ord
    }
}
/// Position with an optional total, written as `ord` or `ord/total`.
#[derive(
Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Default, serde::Deserialize, serde::Serialize,
)]
#[cfg_attr(feature = "server", derive(utoipa::ToSchema))]
pub struct Pos {
/// Position value; the parser accepts a leading `-`.
pub ord: i32,
/// Total length, when known.
pub total: Option<i32>,
}
/// Formats as `ord/total` when a total is present, otherwise as `ord`.
impl std::fmt::Display for Pos {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self.total {
            Some(total) => write!(f, "{}/{}", self.ord, total),
            None => write!(f, "{}", self.ord),
        }
    }
}
impl Pos {
fn parse_number_neg(input: &str) -> IResult<&str, i32> {
map((tag("-"), digit1::<&str, _>), |(sign, num)| {
let num = num.parse::<i32>().unwrap();
if sign == "-" { -num } else { num }
})
.parse(input)
}
fn parse_number_nosign(input: &str) -> IResult<&str, i32> {
map(digit1::<&str, _>, |num| num.parse::<i32>().unwrap()).parse(input)
}
fn parse_number(input: &str) -> IResult<&str, i32> {
alt((Self::parse_number_neg, Self::parse_number_nosign)).parse(input)
}
fn parse_with_total(input: &str) -> IResult<&str, Self> {
map((Self::parse_number, tag("/"), digit1), |(num, _, total)| {
Pos {
ord: num,
total: Some(total.parse::<i32>().unwrap()),
}
})
.parse(input)
}
fn parse_no_total(input: &str) -> IResult<&str, Self> {
map(Self::parse_number, |num| Pos {
ord: num,
total: None,
})
.parse(input)
}
}
/// Parses `ord/total` or `ord`; the whole input must be consumed.
impl FromStr for Pos {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Try the longer `ord/total` form first so the `/total` suffix is not
        // left over for `all_consuming` to reject.
        let mut whole_pos = all_consuming(alt((Self::parse_with_total, Self::parse_no_total)));
        match whole_pos.parse(s) {
            Ok((_, pos)) => Ok(pos),
            Err(err) => Err(anyhow::anyhow!("{}", err)),
        }
    }
}
/// Entries of the messages column of an annotation record.
///
/// `Display`/`FromStr` use the upper-case snake form of the variant name
/// (e.g. `ERROR_CHROMOSOME_NOT_FOUND`); serde uses lower-case `snake_case`.
/// The variant prefix (`Error`, `Warning`, `Info`) names the severity.
#[derive(
Debug,
Clone,
Copy,
PartialEq,
Eq,
PartialOrd,
Ord,
Hash,
Display,
FromStr,
serde::Deserialize,
serde::Serialize,
)]
#[cfg_attr(feature = "server", derive(utoipa::ToSchema))]
#[display(style = "SNAKE_CASE")]
#[serde(rename_all = "snake_case")]
pub enum Message {
ErrorChromosomeNotFound,
ErrorOutOfChromosomeRange,
WarningRefDoesNotMatchGenome,
WarningSequenceNotAvailable,
WarningTranscriptIncomplete,
WarningTranscriptMultipleStopCodons,
WarningTranscriptsNoStartCodon,
InfoRealignThreePrime,
InfoCompoundAnnotation,
InfoNonReferenceAnnotation,
}
/// One pipe-separated annotation record.
///
/// Fields are declared in the order in which `AnnField::parse_nom` reads and
/// `AnnField::format` writes them. Each `#[serde(alias = ...)]` is the column
/// name that `AnnField::ann_field_names` reports for that field.
#[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct AnnField {
#[serde(alias = "Allele")]
pub allele: Allele,
/// Written as an `&`-separated list.
#[serde(alias = "Annotation")]
pub consequences: Vec<Consequence>,
#[serde(alias = "Annotation_Impact")]
pub putative_impact: PutativeImpact,
#[serde(alias = "Gene_Name")]
pub gene_symbol: String,
#[serde(alias = "Gene_ID")]
pub gene_id: String,
#[serde(alias = "Feature_Type")]
pub feature_type: FeatureType,
#[serde(alias = "Feature_ID")]
pub feature_id: String,
/// Written as an `&`-separated list.
#[serde(alias = "Transcript_BioType")]
pub feature_biotype: Vec<FeatureBiotype>,
/// Written as an `&`-separated list.
#[serde(alias = "Feature_Tags")]
pub feature_tags: Vec<FeatureTag>,
// The `Option` fields below are written as an empty column when `None`.
#[serde(alias = "Rank")]
pub rank: Option<Rank>,
#[serde(alias = "HGVS.g")]
pub hgvs_g: Option<String>,
#[serde(alias = "HGVS.n")]
pub hgvs_n: Option<String>,
#[serde(alias = "HGVS.c")]
pub hgvs_c: Option<String>,
#[serde(alias = "HGVS.p")]
pub hgvs_p: Option<String>,
#[serde(alias = "cDNA.pos / cDNA.length")]
pub cdna_pos: Option<Pos>,
#[serde(alias = "CDS.pos / CDS.length")]
pub cds_pos: Option<Pos>,
#[serde(alias = "AA.pos / AA.length")]
pub protein_pos: Option<Pos>,
#[serde(alias = "Distance")]
pub distance: Option<i32>,
#[serde(alias = "Strand")]
pub strand: i32,
/// Written as an `&`-separated list; `None` and an empty column are equivalent on output.
#[serde(alias = "ERRORS / WARNINGS / INFO")]
pub messages: Option<Vec<Message>>,
/// Extra columns keyed by name; which ones are read and written, and in
/// which order, is decided by `Config::custom_columns`, not by this map.
#[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
pub custom_fields: BTreeMap<String, Option<String>>,
}
impl AnnField {
    /// Returns the column names: the built-in ones followed by `config.custom_columns`.
    pub fn ann_field_names(config: &Config) -> Vec<String> {
        let introspected = serde_aux::serde_introspection::serde_introspect::<Self>();
        // NOTE(review): `step_by(2)` assumes introspection yields two names
        // per aliased field with the alias first — confirm against serde's
        // ordering. `custom_fields` has no alias and is dropped because its
        // columns come from `config` instead.
        introspected
            .iter()
            .step_by(2)
            .filter(|name| **name != "custom_fields")
            .map(|name| name.to_string())
            .chain(config.custom_columns.iter().cloned())
            .collect()
    }
}
impl Default for AnnField {
fn default() -> Self {
Self {
allele: Allele::Alt {
alternative: Default::default(),
},
consequences: vec![],
putative_impact: PutativeImpact::Modifier,
gene_symbol: Default::default(),
gene_id: Default::default(),
feature_type: FeatureType::SoTerm {
term: SoFeature::Transcript,
},
feature_id: Default::default(),
feature_biotype: vec![FeatureBiotype::Coding],
feature_tags: Default::default(),
rank: Default::default(),
hgvs_g: Default::default(),
hgvs_n: Default::default(),
hgvs_c: Default::default(),
hgvs_p: Default::default(),
cdna_pos: Default::default(),
cds_pos: Default::default(),
protein_pos: Default::default(),
distance: Default::default(),
strand: Default::default(),
messages: Default::default(),
custom_fields: Default::default(),
}
}
}
/// Splits off the text up to the next `|` (or the end of input) and consumes
/// that `|` if present. Returns `(remaining input, field text)`; never fails.
fn parse_pipe_field(i: &str) -> IResult<&str, &str> {
    let (tail, (value, _separator)) =
        (alt((take_until("|"), rest)), opt(char('|'))).parse(i)?;
    Ok((tail, value))
}
fn parse_string(i: &str) -> IResult<&str, String> {
map(parse_pipe_field, String::from).parse(i)
}
fn parse_opt_string(i: &str) -> IResult<&str, Option<String>> {
let (i, val) = parse_pipe_field(i)?;
Ok((i, (!val.is_empty()).then(|| val.to_string())))
}
/// Reads one pipe-delimited field and converts it with `T::from_str`.
///
/// A conversion failure (including on an empty field) is a parse error.
fn parse_parsed<T: FromStr>(i: &str) -> IResult<&str, T> {
    let mut typed_field = map_res(parse_pipe_field, str::parse::<T>);
    typed_field.parse(i)
}
fn parse_opt_parsed<T: FromStr>(i: &str) -> IResult<&str, Option<T>> {
let (i, val) = parse_pipe_field(i)?;
let parsed = (!val.is_empty())
.then(|| {
val.parse::<T>().map_err(|_| {
nom::Err::Error(nom::error::Error::new(i, nom::error::ErrorKind::Verify))
})
})
.transpose()?;
Ok((i, parsed))
}
fn parse_item_list<T: FromStr>(i: &str) -> IResult<&str, Vec<T>> {
let (i, field) = parse_pipe_field(i)?;
if field.is_empty() {
return Ok((i, vec![]));
}
let (_, items) = separated_list1(
char('&'),
map_res(alt((take_until("&"), rest)), |s: &str| s.parse::<T>()),
)
.parse(field)?;
Ok((i, items))
}
fn parse_opt_list<T: FromStr>(i: &str) -> IResult<&str, Option<Vec<T>>> {
let (i, field) = parse_pipe_field(i)?;
if field.is_empty() {
return Ok((i, None));
}
let (_, items) = separated_list1(
char('&'),
map_res(alt((take_until("&"), rest)), |s: &str| s.parse::<T>()),
)
.parse(field)?;
Ok((i, Some(items)))
}
impl AnnField {
fn parse_nom<'a>(i: &'a str, config: &Config) -> IResult<&'a str, Self> {
let (i, allele) = parse_parsed(i)?;
let (i, consequences) = parse_item_list(i)?;
let (i, putative_impact) = parse_parsed(i)?;
let (i, gene_symbol) = parse_string(i)?;
let (i, gene_id) = parse_string(i)?;
let (i, feature_type) = parse_parsed(i)?;
let (i, feature_id) = parse_string(i)?;
let (i, feature_biotype) = parse_item_list(i)?;
let (i, feature_tags) = parse_item_list(i)?;
let (i, rank) = parse_opt_parsed(i)?;
let (i, hgvs_g) = parse_opt_string(i)?;
let (i, hgvs_n) = parse_opt_string(i)?;
let (i, hgvs_c) = parse_opt_string(i)?;
let (i, hgvs_p) = parse_opt_string(i)?;
let (i, cdna_pos) = parse_opt_parsed(i)?;
let (i, cds_pos) = parse_opt_parsed(i)?;
let (i, protein_pos) = parse_opt_parsed(i)?;
let (i, distance) = parse_opt_parsed(i)?;
let (i, strand) = parse_parsed(i)?;
let (i, messages) = parse_opt_list(i)?;
let mut custom_fields = BTreeMap::new();
let mut current_i = i;
for col_name in &config.custom_columns {
let (next_i, val) = parse_opt_string(current_i)?;
current_i = next_i;
custom_fields.insert(col_name.clone(), val);
}
Ok((
current_i,
Self {
allele,
consequences,
putative_impact,
gene_symbol,
gene_id,
feature_type,
feature_id,
feature_biotype,
feature_tags,
rank,
hgvs_g,
hgvs_n,
hgvs_c,
hgvs_p,
cdna_pos,
cds_pos,
protein_pos,
distance,
strand,
messages,
custom_fields,
},
))
}
pub fn parse(s: &str, config: &Config) -> Result<Self, anyhow::Error> {
match Self::parse_nom(s, config) {
Ok((_, ann)) => Ok(ann),
Err(nom::Err::Error(e)) | Err(nom::Err::Failure(e)) => {
Err(anyhow::anyhow!("nom parsing error: {:?}", e.code))
}
Err(nom::Err::Incomplete(_)) => {
Err(anyhow::anyhow!("nom parsing error: incomplete string"))
}
}
}
pub fn format(&self, config: &Config) -> String {
use std::fmt::Write;
let mut buf = String::with_capacity(256);
let _ = write!(buf, "{}|", self.allele);
let mut first = true;
for c in &self.consequences {
if !first {
buf.push('&');
}
let _ = write!(buf, "{}", c);
first = false;
}
let _ = write!(
buf,
"|{}|{}|{}|{}|{}|",
self.putative_impact,
self.gene_symbol,
self.gene_id,
self.feature_type,
self.feature_id
);
first = true;
for b in &self.feature_biotype {
if !first {
buf.push('&');
}
let _ = write!(buf, "{}", b);
first = false;
}
buf.push('|');
first = true;
for t in &self.feature_tags {
if !first {
buf.push('&');
}
let _ = write!(buf, "{}", t);
first = false;
}
buf.push('|');
if let Some(rank) = &self.rank {
let _ = write!(buf, "{}", rank);
}
buf.push('|');
if let Some(g) = &self.hgvs_g {
buf.push_str(g);
}
buf.push('|');
if let Some(n) = &self.hgvs_n {
buf.push_str(n);
}
buf.push('|');
if let Some(c) = &self.hgvs_c {
buf.push_str(c);
}
buf.push('|');
if let Some(p) = &self.hgvs_p {
buf.push_str(p);
}
buf.push('|');
if let Some(pos) = &self.cdna_pos {
let _ = write!(buf, "{}", pos);
}
buf.push('|');
if let Some(pos) = &self.cds_pos {
let _ = write!(buf, "{}", pos);
}
buf.push('|');
if let Some(pos) = &self.protein_pos {
let _ = write!(buf, "{}", pos);
}
buf.push('|');
if let Some(distance) = &self.distance {
let _ = write!(buf, "{}", distance);
}
let _ = write!(buf, "|{}|", self.strand);
if let Some(messages) = &self.messages {
first = true;
for m in messages {
if !first {
buf.push('&');
}
let _ = write!(buf, "{}", m);
first = false;
}
}
for col_name in &config.custom_columns {
buf.push('|');
if let Some(Some(val)) = self.custom_fields.get(col_name) {
buf.push_str(val);
}
}
buf
}
}
/// Parses a record using the default `Config`.
impl FromStr for AnnField {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let config = Config::default();
        Self::parse(s, &config)
    }
}
/// Formats the record using the default `Config`.
impl std::fmt::Display for AnnField {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let rendered = self.format(&Config::default());
        f.write_str(&rendered)
    }
}
/// Unit tests for the textual forms and the parse/format round trips of the
/// annotation types in this file.
#[cfg(test)]
mod test {
use pretty_assertions::assert_eq;
use std::str::FromStr;
use super::*;
/// `PutativeImpact` displays as the upper-case variant name.
#[test]
fn putative_impact_display() {
assert_eq!(format!("{}", PutativeImpact::High), "HIGH");
assert_eq!(format!("{}", PutativeImpact::Moderate), "MODERATE");
assert_eq!(format!("{}", PutativeImpact::Low), "LOW");
assert_eq!(format!("{}", PutativeImpact::Modifier), "MODIFIER");
}
/// `PutativeImpact` parses from the upper-case variant name.
#[test]
fn putative_impact_from_str() -> Result<(), anyhow::Error> {
assert_eq!(PutativeImpact::from_str("HIGH")?, PutativeImpact::High);
assert_eq!(
PutativeImpact::from_str("MODERATE")?,
PutativeImpact::Moderate
);
assert_eq!(PutativeImpact::from_str("LOW")?, PutativeImpact::Low);
assert_eq!(
PutativeImpact::from_str("MODIFIER")?,
PutativeImpact::Modifier
);
Ok(())
}
/// Consequences with explicit `#[display(...)]` overrides display as overridden.
#[test]
fn consequence_display() {
assert_eq!(format!("{}", Consequence::TfbsAblation), "TFBS_ablation");
assert_eq!(
format!("{}", Consequence::ThreePrimeUtrExonVariant),
"3_prime_UTR_exon_variant"
);
assert_eq!(
format!("{}", Consequence::FivePrimeUtrIntronVariant),
"5_prime_UTR_intron_variant"
);
assert_eq!(
format!("{}", Consequence::MatureMirnaVariant),
"mature_miRNA_variant"
);
assert_eq!(
format!("{}", Consequence::TfBindingSiteVariant),
"TF_binding_site_variant"
);
assert_eq!(
format!("{}", Consequence::TfbsAmplification),
"TFBS_amplification"
);
}
/// Consequences with explicit `#[display(...)]` overrides parse from the overridden text.
#[test]
fn consequence_from_str() -> Result<(), anyhow::Error> {
assert_eq!(
Consequence::from_str("TFBS_ablation")?,
Consequence::TfbsAblation,
);
assert_eq!(
Consequence::from_str("3_prime_UTR_exon_variant")?,
Consequence::ThreePrimeUtrExonVariant,
);
assert_eq!(
Consequence::from_str("5_prime_UTR_intron_variant")?,
Consequence::FivePrimeUtrIntronVariant,
);
assert_eq!(
Consequence::from_str("mature_miRNA_variant")?,
Consequence::MatureMirnaVariant,
);
assert_eq!(
Consequence::from_str("TF_binding_site_variant")?,
Consequence::TfBindingSiteVariant,
);
assert_eq!(
Consequence::from_str("TFBS_amplification")?,
Consequence::TfbsAmplification,
);
Ok(())
}
/// Spot-checks the consequence-to-impact mapping for three impact classes.
#[test]
fn consequence_to_impact() {
{
let p: PutativeImpact = Consequence::MissenseVariant.into();
assert_eq!(p, PutativeImpact::Moderate,);
}
{
let p: PutativeImpact = Consequence::SynonymousVariant.into();
assert_eq!(p, PutativeImpact::Low,);
}
{
let p: PutativeImpact = Consequence::UpstreamGeneVariant.into();
assert_eq!(p, PutativeImpact::Modifier,);
}
}
/// Display forms of the `Alt`, `AltRef` and `Compound` alleles.
#[test]
fn allele_display() {
assert_eq!(
format!(
"{}",
Allele::Alt {
alternative: String::from("A")
}
),
"A",
);
assert_eq!(
format!(
"{}",
Allele::AltRef {
alternative: String::from("A"),
reference: String::from("C"),
}
),
"A-C",
);
assert_eq!(
format!(
"{}",
Allele::Compound {
alternative: String::from("A"),
other_chrom: String::from("chr1"),
other_pos: 123,
other_ref: String::from("C"),
other_alt: String::from("T"),
}
),
"A-chr1:123_C>T",
);
}
/// Parsing of the `Alt`, `AltRef` and `Compound` allele forms.
#[test]
fn allele_from_str() -> Result<(), anyhow::Error> {
assert_eq!(
Allele::from_str("A")?,
Allele::Alt {
alternative: String::from("A")
},
);
assert_eq!(
Allele::from_str("A-C")?,
Allele::AltRef {
alternative: String::from("A"),
reference: String::from("C"),
}
);
assert_eq!(
Allele::from_str("A-chr1:123_C>T")?,
Allele::Compound {
alternative: String::from("A"),
other_chrom: String::from("chr1"),
other_pos: 123,
other_ref: String::from("C"),
other_alt: String::from("T"),
},
);
Ok(())
}
/// `SoFeature` round-trips through its snake_case text.
#[test]
fn so_feature_display_from_str() -> Result<(), anyhow::Error> {
assert_eq!(format!("{}", SoFeature::Transcript), "transcript",);
assert_eq!(SoFeature::from_str("transcript")?, SoFeature::Transcript,);
Ok(())
}
/// `FeatureType` displays as its inner term or custom value.
#[test]
fn feature_type_display() -> Result<(), anyhow::Error> {
assert_eq!(
format!(
"{}",
FeatureType::SoTerm {
term: SoFeature::Transcript
}
),
"transcript",
);
assert_eq!(
format!(
"{}",
FeatureType::Custom {
value: String::from("foo")
}
),
"foo",
);
Ok(())
}
/// Despite its name, this exercises `FeatureType::from_str`: known terms
/// become `SoTerm`, anything else becomes `Custom`.
#[test]
fn so_feature_from_str() -> Result<(), anyhow::Error> {
assert_eq!(
FeatureType::from_str("transcript")?,
FeatureType::SoTerm {
term: SoFeature::Transcript
},
);
assert_eq!(
FeatureType::from_str("foo")?,
FeatureType::Custom {
value: String::from("foo")
},
);
Ok(())
}
/// `FeatureBiotype` round-trips through the variant name as written.
#[test]
fn feature_biotype() -> Result<(), anyhow::Error> {
assert_eq!(FeatureBiotype::from_str("Coding")?, FeatureBiotype::Coding);
assert_eq!(
FeatureBiotype::from_str("Noncoding")?,
FeatureBiotype::Noncoding
);
assert_eq!(format!("{}", FeatureBiotype::Coding), "Coding");
assert_eq!(format!("{}", FeatureBiotype::Noncoding), "Noncoding");
Ok(())
}
/// `Rank` displays as `ord/total`.
#[test]
fn rank_display() -> Result<(), anyhow::Error> {
assert_eq!(format!("{}", Rank { ord: 1, total: 2 }), "1/2",);
Ok(())
}
/// `Rank` parses from `ord/total`.
#[test]
fn rank_from_str() -> Result<(), anyhow::Error> {
assert_eq!(Rank::from_str("1/2")?, Rank { ord: 1, total: 2 },);
Ok(())
}
/// `Pos` displays with and without a total.
#[test]
fn pos_display() -> Result<(), anyhow::Error> {
assert_eq!(
format!(
"{}",
Pos {
ord: 1,
total: None
}
),
"1",
);
assert_eq!(
format!(
"{}",
Pos {
ord: 1,
total: Some(2)
}
),
"1/2",
);
Ok(())
}
/// `Pos` parses with and without a total.
#[test]
fn pos_from_str() -> Result<(), anyhow::Error> {
assert_eq!(
Pos::from_str("1")?,
Pos {
ord: 1,
total: None
},
);
assert_eq!(
Pos::from_str("1/2")?,
Pos {
ord: 1,
total: Some(2)
},
);
Ok(())
}
/// `Message` round-trips through its upper-case snake form.
#[test]
fn message() -> Result<(), anyhow::Error> {
assert_eq!(
format!("{}", Message::ErrorChromosomeNotFound),
"ERROR_CHROMOSOME_NOT_FOUND",
);
assert_eq!(
Message::from_str("ERROR_CHROMOSOME_NOT_FOUND")?,
Message::ErrorChromosomeNotFound,
);
Ok(())
}
/// A fully populated record formats to the expected pipe-separated string.
#[test]
fn ann_field_display() {
let value = AnnField {
allele: Allele::Alt {
alternative: String::from("A"),
},
consequences: vec![Consequence::MissenseVariant],
putative_impact: PutativeImpact::Moderate,
gene_symbol: String::from("GENE"),
gene_id: String::from("HGNC:gene_id"),
feature_type: FeatureType::SoTerm {
term: SoFeature::Transcript,
},
feature_id: String::from("feature_id"),
feature_biotype: vec![FeatureBiotype::Coding],
feature_tags: vec![FeatureTag::Other("Other".to_string())],
rank: Some(Rank { ord: 1, total: 2 }),
hgvs_g: Some(String::from("HGVS.g")),
hgvs_n: Some(String::from("HGVS.n")),
hgvs_c: Some(String::from("HGVS.c")),
hgvs_p: Some(String::from("HGVS.p")),
cdna_pos: Some(Pos {
ord: 1,
total: None,
}),
cds_pos: Some(Pos {
ord: 1,
total: Some(2),
}),
protein_pos: Some(Pos {
ord: 1,
total: None,
}),
distance: Some(1),
strand: 0,
messages: Some(vec![Message::ErrorChromosomeNotFound]),
custom_fields: BTreeMap::new(),
};
assert_eq!(
format!("{}", &value),
"A|missense_variant|MODERATE|GENE|HGNC:gene_id|transcript|feature_id|Coding|Other|1/2|HGVS.g|\
HGVS.n|HGVS.c|HGVS.p|1|1/2|1|1|0|ERROR_CHROMOSOME_NOT_FOUND"
);
}
/// Parsing then formatting a full record reproduces the input.
#[test]
fn ann_field_from_str() -> Result<(), anyhow::Error> {
let value = "A|missense_variant|MODERATE|GENE|HGNC:gene_id|transcript|feature_id|\
Coding|Other|1/2|HGVS.g|HGVS.n|HGVS.c|HGVS.p|1|1/2|1|1|0|ERROR_CHROMOSOME_NOT_FOUND";
let field = AnnField::from_str(value)?;
assert_eq!(format!("{}", &field), value);
Ok(())
}
/// Empty biotype and tag columns survive a parse/format round trip.
#[test]
fn ann_field_from_str_with_empty_fields() -> Result<(), anyhow::Error> {
let value = "A|missense_variant|MODERATE|GENE|HGNC:gene_id|transcript|feature_id|\
||1/2|HGVS.g|HGVS.n|HGVS.c|HGVS.p|1|1/2|1|1|0|ERROR_CHROMOSOME_NOT_FOUND";
let field = AnnField::from_str(value)?;
assert_eq!(format!("{}", &field), value);
Ok(())
}
/// Record string with all built-in columns and no custom columns.
fn dummy_ann_string() -> &'static str {
"A|missense_variant|MODERATE|GENE|HGNC:gene_id|transcript|feature_id|\
Coding|Other|1/2|HGVS.g|HGVS.n|HGVS.c|HGVS.p|1|1/2|1|1|0|ERROR_CHROMOSOME_NOT_FOUND"
}
/// Config enabling the four sequence columns as custom columns.
fn full_sequence_config() -> Config {
Config {
custom_columns: vec![
ANN_TX_SEQ_REF.to_string(),
ANN_TX_SEQ_ALT.to_string(),
ANN_AA_SEQ_REF.to_string(),
ANN_AA_SEQ_ALT.to_string(),
],
..Default::default()
}
}
/// Custom columns present in the input are parsed and written back unchanged.
#[test]
fn parse_and_format_with_all_sequences_present() -> Result<(), anyhow::Error> {
let config = full_sequence_config();
let value = format!("{}|ACG|TCG|T|S", dummy_ann_string());
let field = AnnField::parse(&value, &config)?;
assert_eq!(
field.custom_fields.get(ANN_TX_SEQ_REF),
Some(&Some("ACG".to_string()))
);
assert_eq!(
field.custom_fields.get(ANN_TX_SEQ_ALT),
Some(&Some("TCG".to_string()))
);
assert_eq!(
field.custom_fields.get(ANN_AA_SEQ_REF),
Some(&Some("T".to_string()))
);
assert_eq!(
field.custom_fields.get(ANN_AA_SEQ_ALT),
Some(&Some("S".to_string()))
);
assert_eq!(field.format(&config), value);
Ok(())
}
/// Configured custom columns missing from the input parse as `None` and are
/// written as empty trailing columns.
#[test]
fn parse_fallback_to_empty_when_expected_columns_are_missing() -> Result<(), anyhow::Error> {
let config = full_sequence_config();
let value = dummy_ann_string();
let field = AnnField::parse(value, &config)?;
assert_eq!(field.custom_fields.get(ANN_TX_SEQ_REF), Some(&None));
assert_eq!(field.custom_fields.get(ANN_TX_SEQ_ALT), Some(&None));
assert_eq!(field.custom_fields.get(ANN_AA_SEQ_REF), Some(&None));
assert_eq!(field.custom_fields.get(ANN_AA_SEQ_ALT), Some(&None));
let expected_output = format!("{}||||", dummy_ann_string());
assert_eq!(field.format(&config), expected_output);
Ok(())
}
/// Empty custom columns round-trip as empty trailing columns.
#[test]
fn parse_and_format_with_empty_trailing_pipes() -> Result<(), anyhow::Error> {
let config = full_sequence_config();
let value = format!("{}||||", dummy_ann_string());
let field = AnnField::parse(&value, &config)?;
assert_eq!(field.custom_fields.get(ANN_TX_SEQ_REF), Some(&None));
assert_eq!(field.custom_fields.get(ANN_TX_SEQ_ALT), Some(&None));
assert_eq!(field.custom_fields.get(ANN_AA_SEQ_REF), Some(&None));
assert_eq!(field.custom_fields.get(ANN_AA_SEQ_ALT), Some(&None));
assert_eq!(field.format(&config), value);
Ok(())
}
/// Only columns named in the config are written, even if `custom_fields`
/// holds additional entries.
#[test]
fn format_follows_config_schema() -> Result<(), anyhow::Error> {
let config = Config {
custom_columns: vec![ANN_TX_SEQ_REF.to_string()],
..Default::default()
};
let mut field = AnnField::parse(dummy_ann_string(), &config)?;
field
.custom_fields
.insert(ANN_TX_SEQ_REF.into(), Some("ACG".to_string()));
field
.custom_fields
.insert(ANN_TX_SEQ_ALT.into(), Some("TCG".to_string()));
field
.custom_fields
.insert("CustomScore".into(), Some("99".to_string()));
let formatted = field.format(&config);
let names = AnnField::ann_field_names(&config);
let parts: Vec<&str> = formatted.split('|').collect();
assert_eq!(
parts.len(),
names.len(),
"Formatted field count must match header name count"
);
assert_eq!(parts.last(), Some(&"ACG"));
Ok(())
}
/// Header names contain exactly the configured custom columns, in order, at the end.
#[test]
fn dynamic_header_names_match_config() {
let config = Config {
custom_columns: vec![
ANN_TX_SEQ_ALT.to_string(),
ANN_AA_SEQ_REF.to_string(),
ANN_AA_SEQ_ALT.to_string(),
],
..Default::default()
};
let names = AnnField::ann_field_names(&config);
assert!(!names.contains(&ANN_TX_SEQ_REF.to_string()));
assert!(names.contains(&ANN_TX_SEQ_ALT.to_string()));
assert!(names.contains(&ANN_AA_SEQ_REF.to_string()));
assert!(names.contains(&ANN_AA_SEQ_ALT.to_string()));
assert_eq!(names.last(), Some(&ANN_AA_SEQ_ALT.to_string()));
}
/// With the default config, header names are the serde aliases in field order.
#[test]
fn automatic_ann_header_names() -> Result<(), anyhow::Error> {
let ann_names = AnnField::ann_field_names(&Config::default());
assert_eq!(
ann_names,
[
"Allele",
"Annotation",
"Annotation_Impact",
"Gene_Name",
"Gene_ID",
"Feature_Type",
"Feature_ID",
"Transcript_BioType",
"Feature_Tags",
"Rank",
"HGVS.g",
"HGVS.n",
"HGVS.c",
"HGVS.p",
"cDNA.pos / cDNA.length",
"CDS.pos / CDS.length",
"AA.pos / AA.length",
"Distance",
"Strand",
"ERRORS / WARNINGS / INFO"
]
);
Ok(())
}
}