use super::LegacyFormat;
use crate::hgvs::location::AminoAcid;
/// Legacy (non-HGVS) protein substitution notations distinguished by
/// [`parse_legacy_protein`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LegacyProteinFormat {
/// Residue, position, `>`, replacement residue — e.g. `V600>E` or `Val600>Glu`.
ArrowSubstitution,
/// Position written before the reference residue — e.g. `600V>E`.
NumberFirst,
}
/// Protein-specific view of [`convert_legacy_protein`].
///
/// Returns the detected [`LegacyProteinFormat`] together with the rewritten
/// variant string, or `None` when `input` is not a recognised legacy notation.
// `dead_code` is allowed because no caller is visible inside this module.
#[allow(dead_code)]
pub fn parse_legacy_protein(input: &str) -> Option<(LegacyProteinFormat, String)> {
    let (format, converted) = convert_legacy_protein(input)?;
    let protein_format = match format {
        super::LegacyFormat::NumberFirstProtein => LegacyProteinFormat::NumberFirst,
        // The arrow form, and any other variant of the wider enum, is reported
        // as an arrow substitution.
        _ => LegacyProteinFormat::ArrowSubstitution,
    };
    Some((protein_format, converted))
}
/// Rewrites a legacy protein substitution into `<Ref><position><Alt>` form.
///
/// The number-first notation (`600V>E`) is attempted first, then the arrow
/// notation (`V600>E`, `Val600>Glu`). Returns the matched [`LegacyFormat`]
/// with the converted string, or `None` if neither notation applies.
pub fn convert_legacy_protein(input: &str) -> Option<(LegacyFormat, String)> {
    try_number_first(input)
        .map(|converted| (LegacyFormat::NumberFirstProtein, converted))
        .or_else(|| {
            try_arrow_format(input)
                .map(|converted| (LegacyFormat::ArrowProteinSubstitution, converted))
        })
}
fn try_number_first(input: &str) -> Option<String> {
if !input.starts_with(|c: char| c.is_ascii_digit()) {
return None;
}
let num_end = input.find(|c: char| !c.is_ascii_digit())?;
let position: u64 = input[..num_end].parse().ok()?;
let rest = &input[num_end..];
if rest.len() < 2 {
return None;
}
if !rest.contains('>') {
return None;
}
let ref_char = rest.chars().next()?;
let ref_aa = AminoAcid::from_one_letter(ref_char)?;
let arrow_pos = rest.find('>')?;
let alt_part = &rest[arrow_pos + 1..];
let (alt_aa, suffix) = parse_amino_acid_with_suffix(alt_part)?;
Some(format!("{}{}{}{}", ref_aa, position, alt_aa, suffix))
}
/// Converts the arrow notation (`V600>E`, `Val600>Glu`) into `Val600Glu` form.
///
/// Expected layout: `<ref><digits>'>'<alt>[suffix]`. `<ref>` is a three-letter
/// code when one matches at the start of the input, otherwise a one-letter
/// code. The digits are copied through unchanged and any text after `<alt>`
/// is appended verbatim.
///
/// Returns `None` when the input does not start with an ASCII letter or `*`,
/// contains no `>`, has no digits after the reference residue, is not followed
/// by `>` right after the digits, or names an unknown residue.
fn try_arrow_format(input: &str) -> Option<String> {
    let leading = input.chars().next()?;
    if !(leading.is_ascii_alphabetic() || leading == '*') {
        return None;
    }
    if !input.contains('>') {
        return None;
    }

    // Three-letter codes win over one-letter codes. `leading` is ASCII here,
    // so skipping it is a one-byte step.
    let (ref_aa, after_ref) = match try_parse_three_letter(input) {
        Some(three_letter) => (three_letter, &input[3..]),
        None => (AminoAcid::from_one_letter(leading)?, &input[1..]),
    };

    // Split the run of leading ASCII digits (the position) from what follows.
    let digits_len = after_ref
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(after_ref.len());
    let (position, tail) = after_ref.split_at(digits_len);
    if position.is_empty() {
        return None;
    }

    // The arrow must come immediately after the position.
    let remaining = tail.strip_prefix('>')?;
    let (alt_aa, suffix) = parse_amino_acid_with_suffix(remaining)?;
    Some(format!("{}{}{}{}", ref_aa, position, alt_aa, suffix))
}
/// Recognises a three-letter amino-acid code at the start of `input`.
///
/// Matching is ASCII case-insensitive (`Val`, `val` and `VAL` are equivalent)
/// and only the first three bytes are inspected; anything after them is
/// ignored.
///
/// Returns `None` when `input` is shorter than three bytes or does not begin
/// with a known code. It never panics, including on multi-byte UTF-8 input.
/// When it returns `Some`, the first three bytes are ASCII letters, so callers
/// may safely slice `input` at byte offset 3.
fn try_parse_three_letter(input: &str) -> Option<AminoAcid> {
    // Inspect bytes instead of slicing `&input[..3]`: slicing a `str` in the
    // middle of a multi-byte character panics. Non-ASCII bytes are left
    // untouched by the ASCII case helpers and can never equal a code below.
    let key = match input.as_bytes() {
        [first, second, third, ..] => [
            first.to_ascii_uppercase(),
            second.to_ascii_lowercase(),
            third.to_ascii_lowercase(),
        ],
        _ => return None,
    };

    let aa = match &key {
        b"Ala" => AminoAcid::Ala,
        b"Arg" => AminoAcid::Arg,
        b"Asn" => AminoAcid::Asn,
        b"Asp" => AminoAcid::Asp,
        b"Cys" => AminoAcid::Cys,
        b"Gln" => AminoAcid::Gln,
        b"Glu" => AminoAcid::Glu,
        b"Gly" => AminoAcid::Gly,
        b"His" => AminoAcid::His,
        b"Ile" => AminoAcid::Ile,
        b"Leu" => AminoAcid::Leu,
        b"Lys" => AminoAcid::Lys,
        b"Met" => AminoAcid::Met,
        b"Phe" => AminoAcid::Phe,
        b"Pro" => AminoAcid::Pro,
        b"Sec" => AminoAcid::Sec,
        b"Ser" => AminoAcid::Ser,
        b"Thr" => AminoAcid::Thr,
        b"Trp" => AminoAcid::Trp,
        b"Tyr" => AminoAcid::Tyr,
        b"Val" => AminoAcid::Val,
        b"Ter" => AminoAcid::Ter,
        b"Xaa" => AminoAcid::Xaa,
        _ => return None,
    };
    Some(aa)
}
/// Parses one amino acid from the front of `input` and returns it together
/// with the unparsed remainder.
///
/// A three-letter code is preferred; if none matches, the first character is
/// tried as a one-letter code. Returns `None` for empty input or when neither
/// form is recognised.
fn parse_amino_acid_with_suffix(input: &str) -> Option<(AminoAcid, &str)> {
    match try_parse_three_letter(input) {
        Some(three_letter) => Some((three_letter, &input[3..])),
        None => {
            // `next()` is `None` for empty input, which covers the empty case.
            let first = input.chars().next()?;
            AminoAcid::from_one_letter(first).map(|one_letter| (one_letter, &input[1..]))
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `input` is detected as `expected_format` and rewritten to
    /// `expected`.
    #[track_caller]
    fn assert_converts(input: &str, expected_format: LegacyFormat, expected: &str) {
        let (format, converted) = convert_legacy_protein(input).unwrap();
        assert_eq!(format, expected_format);
        assert_eq!(converted, expected);
    }

    #[test]
    fn test_arrow_substitution() {
        assert_converts("V600>E", LegacyFormat::ArrowProteinSubstitution, "Val600Glu");
    }

    #[test]
    fn test_arrow_substitution_single_letter() {
        assert_converts("M1>I", LegacyFormat::ArrowProteinSubstitution, "Met1Ile");
    }

    #[test]
    fn test_arrow_to_stop() {
        assert_converts("W288>*", LegacyFormat::ArrowProteinSubstitution, "Trp288Ter");
    }

    #[test]
    fn test_number_first() {
        assert_converts("600V>E", LegacyFormat::NumberFirstProtein, "Val600Glu");
    }

    #[test]
    fn test_number_first_to_stop() {
        assert_converts("288W>*", LegacyFormat::NumberFirstProtein, "Trp288Ter");
    }

    #[test]
    fn test_three_letter_arrow() {
        assert_converts("Val600>Glu", LegacyFormat::ArrowProteinSubstitution, "Val600Glu");
    }

    #[test]
    fn test_no_arrow_returns_none() {
        // Already-compact one-letter HGVS has no arrow and must be left alone.
        assert!(convert_legacy_protein("V600E").is_none());
    }

    #[test]
    fn test_normal_hgvs_returns_none() {
        assert!(convert_legacy_protein("Val600Glu").is_none());
    }

    #[test]
    fn test_try_parse_three_letter() {
        // Any ASCII casing of the code is accepted.
        for input in &["Val600", "val600", "VAL600"] {
            assert_eq!(try_parse_three_letter(input), Some(AminoAcid::Val));
        }
        assert_eq!(try_parse_three_letter("Ter"), Some(AminoAcid::Ter));

        // Too short, or three letters that are not a known code.
        for input in &["Xy", "Xyz"] {
            assert_eq!(try_parse_three_letter(input), None);
        }
    }
}