use crate::hgvs::location::{AminoAcid, CdsPos, GenomePos, ProtPos, RnaPos, TxPos};
use nom::{
branch::alt,
character::complete::{char, digit1},
combinator::opt,
IResult, Parser,
};
#[inline]
pub fn parse_genome_pos(input: &str) -> IResult<&str, GenomePos> {
let bytes = input.as_bytes();
if bytes.len() >= 4 {
if bytes[0..4] == *b"pter" {
return Ok((&input[4..], GenomePos::pter()));
}
if bytes[0..4] == *b"qter" {
return Ok((&input[4..], GenomePos::qter()));
}
}
if bytes.len() >= 3 && bytes[0..3] == *b"cen" {
return Ok((&input[3..], GenomePos::cen()));
}
let (remaining, s) = digit1.parse(input)?;
let base: u64 = s.parse().map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
if base == 0 {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Verify,
)));
}
let (remaining, offset) = opt(parse_offset).parse(remaining)?;
match offset {
Some(off) => Ok((remaining, GenomePos::with_offset(base, off))),
None => Ok((remaining, GenomePos::new(base))),
}
}
/// Sentinel offset returned by `parse_offset` when the offset is written as `+?`
/// (sign known to be positive, magnitude not given).
pub const OFFSET_UNKNOWN_POSITIVE: i64 = i64::MAX;
/// Sentinel offset returned by `parse_offset` when the offset is written as `-?`
/// (sign known to be negative, magnitude not given).
pub const OFFSET_UNKNOWN_NEGATIVE: i64 = i64::MIN;
#[inline]
fn parse_offset(input: &str) -> IResult<&str, i64> {
let (input, sign) = alt((char('+'), char('-'))).parse(input)?;
if let Ok((remaining, _)) = char::<_, nom::error::Error<&str>>('?').parse(input) {
let offset = if sign == '+' {
OFFSET_UNKNOWN_POSITIVE
} else {
OFFSET_UNKNOWN_NEGATIVE
};
return Ok((remaining, offset));
}
let (input, num) = digit1.parse(input)?;
let value: i64 = num.parse().map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
let signed_value = if sign == '-' { -value } else { value };
Ok((input, signed_value))
}
#[inline]
pub fn parse_cds_pos(input: &str) -> IResult<&str, CdsPos> {
if let Some(rest) = input.strip_prefix('?') {
let (remaining, offset) = opt(parse_offset).parse(rest)?;
return Ok((remaining, CdsPos::unknown(offset)));
}
if let Some(rest) = input.strip_prefix('*') {
let (remaining, s) = digit1.parse(rest)?;
let base: i64 = s.parse().map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
if base == 0 {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Verify,
)));
}
let (remaining, offset) = opt(parse_offset).parse(remaining)?;
return Ok((
remaining,
CdsPos {
base,
offset,
utr3: true,
},
));
}
if let Some(rest) = input.strip_prefix('-') {
let (remaining, s) = digit1.parse(rest)?;
let parsed: i64 = s.parse().map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
let base: i64 = -parsed;
if base == 0 {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Verify,
)));
}
let (remaining, offset) = opt(parse_offset).parse(remaining)?;
return Ok((
remaining,
CdsPos {
base,
offset,
utr3: false,
},
));
}
let (remaining, s) = digit1.parse(input)?;
let base: i64 = s.parse().map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
if base == 0 {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Verify,
)));
}
let (remaining, offset) = opt(parse_offset).parse(remaining)?;
Ok((
remaining,
CdsPos {
base,
offset,
utr3: false,
},
))
}
#[inline]
pub fn parse_tx_pos(input: &str) -> IResult<&str, TxPos> {
if let Some(rest) = input.strip_prefix('*') {
let (remaining, s) = digit1.parse(rest)?;
let base: i64 = s.parse().map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
if base == 0 {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Verify,
)));
}
let (remaining, offset) = opt(parse_offset).parse(remaining)?;
return Ok((
remaining,
TxPos {
base,
offset,
downstream: true,
},
));
}
if let Some(rest) = input.strip_prefix('-') {
let (remaining, s) = digit1.parse(rest)?;
let parsed: i64 = s.parse().map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
let base: i64 = -parsed;
if base == 0 {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Verify,
)));
}
let (remaining, offset) = opt(parse_offset).parse(remaining)?;
return Ok((
remaining,
TxPos {
base,
offset,
downstream: false,
},
));
}
let (remaining, s) = digit1.parse(input)?;
let base: i64 = s.parse().map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
if base == 0 {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Verify,
)));
}
let (remaining, offset) = opt(parse_offset).parse(remaining)?;
Ok((
remaining,
TxPos {
base,
offset,
downstream: false,
},
))
}
#[inline]
pub fn parse_rna_pos(input: &str) -> IResult<&str, RnaPos> {
if let Some(rest) = input.strip_prefix('*') {
let (remaining, s) = digit1.parse(rest)?;
let base: i64 = s.parse().map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
if base == 0 {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Verify,
)));
}
let (remaining, offset) = opt(parse_offset).parse(remaining)?;
return Ok((
remaining,
RnaPos {
base,
offset,
utr3: true,
},
));
}
if let Some(rest) = input.strip_prefix('-') {
let (remaining, s) = digit1.parse(rest)?;
let parsed: i64 = s.parse().map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
let base: i64 = -parsed;
if base == 0 {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Verify,
)));
}
let (remaining, offset) = opt(parse_offset).parse(remaining)?;
return Ok((
remaining,
RnaPos {
base,
offset,
utr3: false,
},
));
}
let (remaining, s) = digit1.parse(input)?;
let base: i64 = s.parse().map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
if base == 0 {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Verify,
)));
}
let (remaining, offset) = opt(parse_offset).parse(remaining)?;
Ok((
remaining,
RnaPos {
base,
offset,
utr3: false,
},
))
}
#[inline]
fn parse_amino_acid_three_letter(input: &str) -> IResult<&str, AminoAcid> {
let bytes = input.as_bytes();
if bytes.len() < 3 {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)));
}
let normalized: [u8; 3] = [
bytes[0].to_ascii_uppercase(),
bytes[1].to_ascii_lowercase(),
bytes[2].to_ascii_lowercase(),
];
let aa = match &normalized {
b"Ala" => AminoAcid::Ala,
b"Arg" => AminoAcid::Arg,
b"Asn" => AminoAcid::Asn,
b"Asp" => AminoAcid::Asp,
b"Cys" => AminoAcid::Cys,
b"Gln" => AminoAcid::Gln,
b"Glu" => AminoAcid::Glu,
b"Gly" => AminoAcid::Gly,
b"His" => AminoAcid::His,
b"Ile" => AminoAcid::Ile,
b"Leu" => AminoAcid::Leu,
b"Lys" => AminoAcid::Lys,
b"Met" => AminoAcid::Met,
b"Phe" => AminoAcid::Phe,
b"Pro" => AminoAcid::Pro,
b"Pyl" => AminoAcid::Pyl,
b"Sec" => AminoAcid::Sec,
b"Ser" => AminoAcid::Ser,
b"Thr" => AminoAcid::Thr,
b"Trp" => AminoAcid::Trp,
b"Tyr" => AminoAcid::Tyr,
b"Val" => AminoAcid::Val,
b"Ter" => AminoAcid::Ter,
b"Xaa" => AminoAcid::Xaa,
_ => {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)))
}
};
Ok((&input[3..], aa))
}
#[inline]
pub fn parse_amino_acid_one_letter(input: &str) -> IResult<&str, AminoAcid> {
if input.is_empty() {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Eof,
)));
}
let c = input.chars().next().unwrap();
if let Some(aa) = AminoAcid::from_one_letter(c) {
Ok((&input[1..], aa))
} else {
Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)))
}
}
/// Parses an amino acid, preferring the three-letter code over the one-letter code.
///
/// The three-letter parser is tried first; only if it fails is the one-letter
/// parser run on the same `input`. The error returned on total failure is the
/// one produced by the one-letter parser.
#[inline]
pub fn parse_amino_acid(input: &str) -> IResult<&str, AminoAcid> {
    // The three-letter parser already rejects inputs shorter than three bytes,
    // so no separate length check is needed before trying it.
    parse_amino_acid_three_letter(input).or_else(|_| parse_amino_acid_one_letter(input))
}
#[inline]
pub fn parse_prot_pos(input: &str) -> IResult<&str, ProtPos> {
let (remaining, aa) = parse_amino_acid.parse(input)?;
let (remaining, s) = digit1.parse(remaining)?;
let num: u64 = s.parse().map_err(|_| {
nom::Err::Error(nom::error::Error::new(input, nom::error::ErrorKind::Digit))
})?;
if num == 0 {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Verify,
)));
}
Ok((remaining, ProtPos::new(aa, num)))
}
// Unit tests for the position parsers in this module. Each test feeds a
// position followed by trailing text and checks both the parsed fields and the
// unconsumed remainder.
#[cfg(test)]
mod tests {
    use super::*;

    // --- genome positions ---------------------------------------------------

    #[test]
    fn test_parse_genome_pos() {
        let (remaining, pos) = parse_genome_pos("12345del").unwrap();
        assert_eq!(remaining, "del");
        assert_eq!(pos.base, 12345);
    }

    // The `pter` / `qter` / `cen` landmarks consume exactly their own letters.
    #[test]
    fn test_parse_genome_pos_landmarks() {
        let (remaining, _) = parse_genome_pos("pter_100del").unwrap();
        assert_eq!(remaining, "_100del");
        let (remaining, _) = parse_genome_pos("qter").unwrap();
        assert_eq!(remaining, "");
        let (remaining, _) = parse_genome_pos("cen_qter").unwrap();
        assert_eq!(remaining, "_qter");
    }

    // --- CDS positions ------------------------------------------------------

    #[test]
    fn test_parse_cds_pos_simple() {
        let (remaining, pos) = parse_cds_pos("100del").unwrap();
        assert_eq!(remaining, "del");
        assert_eq!(pos.base, 100);
        assert_eq!(pos.offset, None);
        assert!(!pos.utr3);
    }

    #[test]
    fn test_parse_cds_pos_with_positive_offset() {
        let (remaining, pos) = parse_cds_pos("100+5G>A").unwrap();
        assert_eq!(remaining, "G>A");
        assert_eq!(pos.base, 100);
        assert_eq!(pos.offset, Some(5));
    }

    #[test]
    fn test_parse_cds_pos_with_negative_offset() {
        let (remaining, pos) = parse_cds_pos("100-10G>A").unwrap();
        assert_eq!(remaining, "G>A");
        assert_eq!(pos.base, 100);
        assert_eq!(pos.offset, Some(-10));
    }

    // A leading `-` yields a negative base and leaves `utr3` unset.
    #[test]
    fn test_parse_cds_pos_5utr() {
        let (remaining, pos) = parse_cds_pos("-20G>A").unwrap();
        assert_eq!(remaining, "G>A");
        assert_eq!(pos.base, -20);
        assert!(!pos.utr3);
    }

    // A leading `*` keeps the base positive and sets `utr3`.
    #[test]
    fn test_parse_cds_pos_3utr() {
        let (remaining, pos) = parse_cds_pos("*50G>A").unwrap();
        assert_eq!(remaining, "G>A");
        assert_eq!(pos.base, 50);
        assert!(pos.utr3);
    }

    // --- amino acids and protein positions ----------------------------------

    #[test]
    fn test_parse_amino_acid() {
        let (remaining, aa) = parse_amino_acid("Met1").unwrap();
        assert_eq!(remaining, "1");
        assert_eq!(aa, AminoAcid::Met);
    }

    #[test]
    fn test_parse_prot_pos() {
        let (remaining, pos) = parse_prot_pos("Val600Glu").unwrap();
        assert_eq!(remaining, "Glu");
        assert_eq!(pos.aa, AminoAcid::Val);
        assert_eq!(pos.number, 600);
    }

    #[test]
    fn test_parse_amino_acid_single_letter() {
        let (remaining, aa) = parse_amino_acid("V600").unwrap();
        assert_eq!(remaining, "600");
        assert_eq!(aa, AminoAcid::Val);
        let (remaining, aa) = parse_amino_acid("E").unwrap();
        assert_eq!(remaining, "");
        assert_eq!(aa, AminoAcid::Glu);
        let (remaining, aa) = parse_amino_acid("M1").unwrap();
        assert_eq!(remaining, "1");
        assert_eq!(aa, AminoAcid::Met);
        let (remaining, aa) = parse_amino_acid("*").unwrap();
        assert_eq!(remaining, "");
        assert_eq!(aa, AminoAcid::Ter);
    }

    #[test]
    fn test_parse_amino_acid_prefers_three_letter() {
        let (remaining, aa) = parse_amino_acid("Val600").unwrap();
        assert_eq!(remaining, "600");
        assert_eq!(aa, AminoAcid::Val);
        let (remaining, aa) = parse_amino_acid("Met1").unwrap();
        assert_eq!(remaining, "1");
        assert_eq!(aa, AminoAcid::Met);
    }

    #[test]
    fn test_parse_prot_pos_single_letter() {
        let (remaining, pos) = parse_prot_pos("V600E").unwrap();
        assert_eq!(remaining, "E");
        assert_eq!(pos.aa, AminoAcid::Val);
        assert_eq!(pos.number, 600);
        let (remaining, pos) = parse_prot_pos("M1I").unwrap();
        assert_eq!(remaining, "I");
        assert_eq!(pos.aa, AminoAcid::Met);
        assert_eq!(pos.number, 1);
    }

    // Empty input and a missing residue number are both parse errors.
    #[test]
    fn test_parse_amino_acid_and_prot_pos_failures() {
        assert!(parse_amino_acid("").is_err());
        assert!(parse_prot_pos("").is_err());
        assert!(parse_prot_pos("Val").is_err());
    }

    // --- unknown (`?`) bases and offsets ------------------------------------

    #[test]
    fn test_parse_cds_pos_unknown() {
        use crate::hgvs::location::CDS_BASE_UNKNOWN;
        // Bare `?`: unknown base, no offset.
        let (remaining, pos) = parse_cds_pos("?dup").unwrap();
        assert_eq!(remaining, "dup");
        assert_eq!(pos.base, CDS_BASE_UNKNOWN);
        assert!(pos.is_unknown());
        assert_eq!(pos.offset, None);
        // Unknown base with an explicit negative offset.
        let (remaining, pos) = parse_cds_pos("?-232_4484+?del").unwrap();
        assert_eq!(remaining, "_4484+?del");
        assert_eq!(pos.base, CDS_BASE_UNKNOWN);
        assert!(pos.is_unknown());
        assert_eq!(pos.offset, Some(-232));
        // Unknown base with an explicit positive offset.
        let (remaining, pos) = parse_cds_pos("?+10del").unwrap();
        assert_eq!(remaining, "del");
        assert_eq!(pos.base, CDS_BASE_UNKNOWN);
        assert!(pos.is_unknown());
        assert_eq!(pos.offset, Some(10));
        // Known base with an unknown negative offset (`-?`).
        let (remaining, pos) = parse_cds_pos("148-?_228+?").unwrap();
        assert_eq!(remaining, "_228+?");
        assert_eq!(pos.base, 148);
        assert!(!pos.is_unknown());
        assert_eq!(pos.offset, Some(OFFSET_UNKNOWN_NEGATIVE));
        // Known base with an unknown positive offset (`+?`).
        let (remaining, pos) = parse_cds_pos("228+?").unwrap();
        assert_eq!(remaining, "");
        assert_eq!(pos.base, 228);
        assert!(!pos.is_unknown());
        assert_eq!(pos.offset, Some(OFFSET_UNKNOWN_POSITIVE));
    }

    #[test]
    fn test_cds_pos_unknown_display() {
        let pos = CdsPos::unknown(None);
        assert_eq!(pos.to_string(), "?");
        let pos = CdsPos::unknown(Some(-232));
        assert_eq!(pos.to_string(), "?-232");
        let pos = CdsPos::unknown(Some(10));
        assert_eq!(pos.to_string(), "?+10");
    }

    // --- transcript positions -----------------------------------------------

    #[test]
    fn test_parse_tx_pos_simple() {
        let (remaining, pos) = parse_tx_pos("100del").unwrap();
        assert_eq!(remaining, "del");
        assert_eq!(pos.base, 100);
        assert_eq!(pos.offset, None);
        assert!(!pos.downstream);
    }

    #[test]
    fn test_parse_tx_pos_upstream() {
        let (remaining, pos) = parse_tx_pos("-30C>G").unwrap();
        assert_eq!(remaining, "C>G");
        assert_eq!(pos.base, -30);
        assert!(!pos.downstream);
    }

    #[test]
    fn test_parse_tx_pos_downstream() {
        let (remaining, pos) = parse_tx_pos("*5C>G").unwrap();
        assert_eq!(remaining, "C>G");
        assert_eq!(pos.base, 5);
        assert!(pos.downstream);
        assert_eq!(pos.to_string(), "*5");
    }

    #[test]
    fn test_parse_tx_pos_downstream_with_offset() {
        let (remaining, pos) = parse_tx_pos("*5+10C>G").unwrap();
        assert_eq!(remaining, "C>G");
        assert_eq!(pos.base, 5);
        assert_eq!(pos.offset, Some(10));
        assert!(pos.downstream);
        assert_eq!(pos.to_string(), "*5+10");
    }

    #[test]
    fn test_parse_tx_pos_with_offset() {
        let (remaining, pos) = parse_tx_pos("100+5G>A").unwrap();
        assert_eq!(remaining, "G>A");
        assert_eq!(pos.base, 100);
        assert_eq!(pos.offset, Some(5));
        assert!(!pos.downstream);
    }

    // --- RNA positions ------------------------------------------------------

    // Covers the plain, `*`, `-` and offset forms of `parse_rna_pos`.
    #[test]
    fn test_parse_rna_pos() {
        let (remaining, pos) = parse_rna_pos("76a>c").unwrap();
        assert_eq!(remaining, "a>c");
        assert_eq!(pos.base, 76);
        assert_eq!(pos.offset, None);
        assert!(!pos.utr3);
        let (remaining, pos) = parse_rna_pos("*14g>c").unwrap();
        assert_eq!(remaining, "g>c");
        assert_eq!(pos.base, 14);
        assert_eq!(pos.offset, None);
        assert!(pos.utr3);
        let (remaining, pos) = parse_rna_pos("-14g>c").unwrap();
        assert_eq!(remaining, "g>c");
        assert_eq!(pos.base, -14);
        assert_eq!(pos.offset, None);
        assert!(!pos.utr3);
        let (remaining, pos) = parse_rna_pos("123+45a>g").unwrap();
        assert_eq!(remaining, "a>g");
        assert_eq!(pos.base, 123);
        assert_eq!(pos.offset, Some(45));
        assert!(!pos.utr3);
    }

    // --- shared validation --------------------------------------------------

    // Every parser rejects a base of zero, with or without a leading marker.
    #[test]
    fn test_parse_rejects_zero_base() {
        assert!(parse_genome_pos("0del").is_err());
        assert!(parse_cds_pos("0del").is_err());
        assert!(parse_cds_pos("*0del").is_err());
        assert!(parse_cds_pos("-0del").is_err());
        assert!(parse_tx_pos("0del").is_err());
        assert!(parse_tx_pos("*0del").is_err());
        assert!(parse_tx_pos("-0del").is_err());
        assert!(parse_rna_pos("0del").is_err());
        assert!(parse_rna_pos("*0del").is_err());
        assert!(parse_rna_pos("-0del").is_err());
        assert!(parse_prot_pos("Val0Glu").is_err());
    }
}