use crate::hgvs::edit::{
AminoAcidSeq, Base, ExtDirection, InsertedPart, InsertedSequence, MethylationStatus, NaEdit,
ProteinEdit, RepeatCount, RepeatUnit, Sequence,
};
use crate::hgvs::location::AminoAcid;
use crate::hgvs::parser::position::{parse_amino_acid, parse_amino_acid_one_letter};
use nom::{
branch::alt,
bytes::complete::{tag, take_while1},
character::complete::{char, digit1},
combinator::{map, opt},
multi::many1,
sequence::preceded,
IResult, Parser,
};
use std::borrow::Cow;
use std::str::FromStr;
/// Returns `true` when `b` is one of the nucleotide codes this parser accepts:
/// `A C G T U N R Y S W K M B D H V`, in upper or lower case.
const fn is_iupac_base(b: u8) -> bool {
    // Fold to upper case first so every code only has to be listed once.
    matches!(
        b.to_ascii_uppercase(),
        b'A' | b'B'
            | b'C'
            | b'D'
            | b'G'
            | b'H'
            | b'K'
            | b'M'
            | b'N'
            | b'R'
            | b'S'
            | b'T'
            | b'U'
            | b'V'
            | b'W'
            | b'Y'
    )
}
#[inline]
fn parse_base(input: &str) -> IResult<&str, Base> {
let bytes = input.as_bytes();
if bytes.is_empty() || !is_iupac_base(bytes[0]) {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::OneOf,
)));
}
let c = bytes[0] as char;
Ok((&input[1..], Base::from_char(c).unwrap()))
}
/// Parses the longest non-empty run of nucleotide codes at the start of `input`.
///
/// The matched text is upper-cased before it is passed to `Sequence::from_str`.
/// Returns a recoverable `TakeWhile1` error when `input` does not start with a
/// nucleotide code.
#[inline]
fn parse_sequence(input: &str) -> IResult<&str, Sequence> {
    let run_len = input.bytes().take_while(|&b| is_iupac_base(b)).count();
    if run_len == 0 {
        return Err(nom::Err::Error(nom::error::Error::new(
            input,
            nom::error::ErrorKind::TakeWhile1,
        )));
    }
    // Only ASCII bytes were counted, so `run_len` is a valid char boundary.
    let (run, rest) = input.split_at(run_len);
    // Allocate a copy only when there is something to upper-case.
    let normalized: Cow<'_, str> = if run.bytes().any(|b| !b.is_ascii_uppercase()) {
        Cow::Owned(run.to_ascii_uppercase())
    } else {
        Cow::Borrowed(run)
    };
    Ok((rest, Sequence::from_str(&normalized).unwrap()))
}
fn parse_opt_sequence(input: &str) -> IResult<&str, Option<Sequence>> {
opt(parse_sequence).parse(input)
}
/// Parses a single-base substitution of the form `A>G`.
///
/// Errors (all recoverable, all reported at the original `input`):
/// - `Tag` when the input is not `<base>><base>`;
/// - `Verify` when a further nucleotide code follows the alternative base, so
///   that multi-base forms such as `A>GT` are not half-consumed here.
#[inline]
fn parse_substitution(input: &str) -> IResult<&str, NaEdit> {
    let (reference, alternative, tail) = match input.as_bytes() {
        [reference, b'>', alternative, tail @ ..]
            if is_iupac_base(*reference) && is_iupac_base(*alternative) =>
        {
            (*reference, *alternative, tail)
        }
        _ => {
            return Err(nom::Err::Error(nom::error::Error::new(
                input,
                nom::error::ErrorKind::Tag,
            )))
        }
    };
    if tail.first().is_some_and(|&next| is_iupac_base(next)) {
        return Err(nom::Err::Error(nom::error::Error::new(
            input,
            nom::error::ErrorKind::Verify,
        )));
    }
    // The three matched bytes are ASCII, so offset 3 is a valid char boundary.
    Ok((
        &input[3..],
        NaEdit::Substitution {
            reference: Base::from_char(reference as char).unwrap(),
            alternative: Base::from_char(alternative as char).unwrap(),
        },
    ))
}
/// Parses a substitution written without a reference base, e.g. `>G`.
///
/// Errors (recoverable, reported at the original `input`):
/// - `Tag` when the input is not `>` followed by a nucleotide code;
/// - `Verify` when another nucleotide code follows the alternative base.
#[inline]
fn parse_substitution_no_ref(input: &str) -> IResult<&str, NaEdit> {
    let (alternative, tail) = match input.as_bytes() {
        [b'>', alternative, tail @ ..] if is_iupac_base(*alternative) => (*alternative, tail),
        _ => {
            return Err(nom::Err::Error(nom::error::Error::new(
                input,
                nom::error::ErrorKind::Tag,
            )))
        }
    };
    if tail.first().is_some_and(|&next| is_iupac_base(next)) {
        return Err(nom::Err::Error(nom::error::Error::new(
            input,
            nom::error::ErrorKind::Verify,
        )));
    }
    // Both matched bytes are ASCII, so offset 2 is a valid char boundary.
    Ok((
        &input[2..],
        NaEdit::SubstitutionNoRef {
            alternative: Base::from_char(alternative as char).unwrap(),
        },
    ))
}
/// Parses `<seq>><seq>` where at least one side is longer than one base and
/// represents it as a `Delins` carrying the right-hand sequence.
///
/// The left-hand (reference) sequence is parsed but not stored. When both
/// sides are at most one base long a recoverable `Verify` error is returned,
/// positioned at the text following the parsed substitution.
fn parse_multibase_substitution(input: &str) -> IResult<&str, NaEdit> {
    let (after_ref, ref_seq) = parse_sequence(input)?;
    let (after_arrow, _) = tag(">").parse(after_ref)?;
    let (rest, alt_seq) = parse_sequence(after_arrow)?;

    let is_multibase = ref_seq.len() > 1 || alt_seq.len() > 1;
    if !is_multibase {
        return Err(nom::Err::Error(nom::error::Error::new(
            rest,
            nom::error::ErrorKind::Verify,
        )));
    }
    Ok((
        rest,
        NaEdit::Delins {
            sequence: InsertedSequence::Literal(alt_seq),
        },
    ))
}
#[inline]
fn parse_deletion(input: &str) -> IResult<&str, NaEdit> {
let (input, _) = tag("del").parse(input)?;
let bytes = input.as_bytes();
if bytes.is_empty() {
return Ok((
input,
NaEdit::Deletion {
sequence: None,
length: None,
},
));
}
match bytes[0] {
b'0'..=b'9' => {
let (remaining, len_str) = digit1::<&str, nom::error::Error<&str>>(input)?;
Ok((
remaining,
NaEdit::Deletion {
sequence: None,
length: Some(len_str.parse().unwrap_or(0)),
},
))
}
c if is_iupac_base(c) => {
let (remaining, seq) = parse_sequence(input)?;
Ok((
remaining,
NaEdit::Deletion {
sequence: Some(seq),
length: None,
},
))
}
_ => {
Ok((
input,
NaEdit::Deletion {
sequence: None,
length: None,
},
))
}
}
}
fn parse_insertion(input: &str) -> IResult<&str, NaEdit> {
let (input, _) = tag("ins").parse(input)?;
let (input, sequence) = parse_inserted_sequence(input)?;
Ok((input, NaEdit::Insertion { sequence }))
}
/// Parses the payload that follows `ins` / `delins` / `dupins`.
///
/// The function never fails on unrecognised input: empty input, or a first
/// byte that matches none of the cases below, yields `InsertedSequence::Empty`
/// with nothing consumed. Errors can still propagate from the delegated
/// sub-parsers (`[`, `(`, repeat and count forms).
///
/// Dispatch order:
/// 1. `accession:x.position` (see `parse_reference_location`) -> `Reference`;
/// 2. `[` -> `parse_bracketed_inserted_sequence`;
/// 3. `(` -> `parse_parenthesized_count`;
/// 4. a digit -> a position range if `parse_cds_position_range` accepts it,
///    otherwise `parse_simple_count`;
/// 5. a nucleotide code -> repeat, named element, sequence repeat or literal;
/// 6. any other upper-case letter -> `Named` (alphanumeric run).
#[inline]
fn parse_inserted_sequence(input: &str) -> IResult<&str, InsertedSequence> {
    let bytes = input.as_bytes();
    if bytes.is_empty() {
        return Ok((input, InsertedSequence::Empty));
    }
    // A reference location is tried before the first-byte dispatch because an
    // accession may itself start with a nucleotide letter or a name-like prefix.
    if let Some(reference) = parse_reference_location(input) {
        let ref_len = reference.len();
        return Ok((&input[ref_len..], InsertedSequence::Reference(reference)));
    }
    match bytes[0] {
        b'[' => {
            parse_bracketed_inserted_sequence(input)
        }
        b'(' => {
            parse_parenthesized_count(input)
        }
        b'0'..=b'9' => {
            if let Ok((remaining, part)) = parse_cds_position_range(input) {
                use crate::hgvs::edit::InsertedPart;
                // Wrap the single parsed range as a one-element `Complex`.
                // `parse_cds_position_range` only produces the three variants
                // matched explicitly; the catch-all arm is a defensive fallback.
                let seq = match part {
                    InsertedPart::PositionRange { start, end } => {
                        InsertedSequence::Complex(vec![InsertedPart::PositionRange { start, end }])
                    }
                    InsertedPart::PositionRangeInv { start, end } => {
                        InsertedSequence::Complex(vec![InsertedPart::PositionRangeInv {
                            start,
                            end,
                        }])
                    }
                    InsertedPart::CdsPositionRange(s) => {
                        InsertedSequence::Complex(vec![InsertedPart::CdsPositionRange(s)])
                    }
                    _ => InsertedSequence::Empty,
                };
                Ok((remaining, seq))
            } else {
                parse_simple_count(input)
            }
        }
        c if is_iupac_base(c) => {
            if bytes.len() > 1 && bytes[1] == b'[' {
                // Single base immediately followed by a repeat count, e.g. `A[10]`.
                parse_repeated_base_insertion(input)
            } else {
                // Scan the leading ASCII-alphanumeric run. If it contains any
                // byte that is not a nucleotide code (another letter, or a
                // digit), the whole run is treated as a name rather than as a
                // literal sequence.
                let mut has_non_iupac = false;
                let mut i = 0;
                while i < bytes.len() {
                    let b = bytes[i];
                    if is_iupac_base(b) {
                        i += 1;
                    } else if b.is_ascii_lowercase() {
                        has_non_iupac = true;
                        i += 1;
                    } else if b.is_ascii_digit() && i > 0 {
                        has_non_iupac = true;
                        i += 1;
                    } else if b.is_ascii_uppercase() {
                        has_non_iupac = true;
                        i += 1;
                    } else {
                        break;
                    }
                }
                if has_non_iupac && i > 0 {
                    let name = &input[..i];
                    return Ok((&input[i..], InsertedSequence::Named(name.to_string())));
                }
                // Pure nucleotide run: a literal, optionally followed by a
                // bracketed repeat count. If the bracket does not parse as a
                // repeat count it is left unconsumed.
                let (remaining, seq) = parse_sequence(input)?;
                if remaining.starts_with('[') {
                    if let Ok((remaining2, count)) = parse_repeat_count(remaining) {
                        return Ok((
                            remaining2,
                            InsertedSequence::SequenceRepeat {
                                sequence: seq,
                                count,
                            },
                        ));
                    }
                }
                Ok((remaining, InsertedSequence::Literal(seq)))
            }
        }
        c if c.is_ascii_uppercase() => {
            // Upper-case letter that is not a nucleotide code: consume the
            // ASCII-alphanumeric run as a name.
            let mut i = 0;
            while i < bytes.len() {
                let b = bytes[i];
                if b.is_ascii_alphanumeric() {
                    i += 1;
                } else {
                    break;
                }
            }
            if i > 0 {
                let name = &input[..i];
                Ok((&input[i..], InsertedSequence::Named(name.to_string())))
            } else {
                Ok((input, InsertedSequence::Empty))
            }
        }
        _ => {
            Ok((input, InsertedSequence::Empty))
        }
    }
}
fn parse_bracketed_inserted_sequence(input: &str) -> IResult<&str, InsertedSequence> {
let (input, _) = char('[').parse(input)?;
if is_reference_accession_prefix(input) {
if let Some(close_pos) = input.find(']') {
let reference = &input[..close_pos];
let remaining = &input[close_pos + 1..];
return Ok((
remaining,
InsertedSequence::Reference(reference.to_string()),
));
}
}
if let Ok((remaining, _)) = char::<_, nom::error::Error<&str>>('(').parse(input) {
let (remaining, min) = digit1.parse(remaining)?;
let (remaining, _) = char('_').parse(remaining)?;
let (remaining, max) = digit1.parse(remaining)?;
let (remaining, _) = char(')').parse(remaining)?;
let (remaining, _) = char(']').parse(remaining)?;
return Ok((
remaining,
InsertedSequence::Range(min.parse().unwrap_or(0), max.parse().unwrap_or(0)),
));
}
let mut parts = Vec::with_capacity(4);
let mut remaining_input = input;
while let Ok((remaining, part)) = parse_inserted_part(remaining_input) {
parts.push(part);
remaining_input = remaining;
if let Ok((remaining, _)) = char::<_, nom::error::Error<&str>>(';').parse(remaining_input) {
remaining_input = remaining;
} else {
break;
}
}
let (input, _) = char(']').parse(remaining_input)?;
if parts.is_empty() {
Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)))
} else {
Ok((input, InsertedSequence::Complex(parts)))
}
}
/// Heuristically decides whether `input` starts like a reference accession.
///
/// Returns `true` when `input` starts with one of the listed database
/// prefixes, or when it is at least three bytes long and starts with either
/// an upper-case letter followed by a digit, or (for inputs of at least four
/// bytes) two upper-case letters followed by a digit.
fn is_reference_accession_prefix(input: &str) -> bool {
    const KNOWN_PREFIXES: [&str; 14] = [
        "NC_", "NG_", "NM_", "NP_", "NR_", "NT_", "NW_", "XM_", "XP_", "XR_", "ENST", "ENSP",
        "ENSG", "LRG_",
    ];
    if KNOWN_PREFIXES
        .iter()
        .any(|prefix| input.starts_with(*prefix))
    {
        return true;
    }

    match input.as_bytes() {
        [first, second, third, rest @ ..] if first.is_ascii_uppercase() => {
            second.is_ascii_digit()
                || (second.is_ascii_uppercase() && !rest.is_empty() && third.is_ascii_digit())
        }
        _ => false,
    }
}
/// Extracts a leading `accession:x.position` reference from `input`.
///
/// Requirements checked here:
/// - the text before the first `:` is non-empty, starts with an ASCII letter
///   and contains only ASCII alphanumerics, `_` and `.`;
/// - the colon is followed by one of `g c m n r p o` and then `.`;
/// - at least one position character (digit, `_`, `+`, `-`, `?`) follows.
///
/// On success the returned `String` is the matched prefix of `input`, ending
/// at the last position character; otherwise `None`.
fn parse_reference_location(input: &str) -> Option<String> {
    let (accession, after_colon) = input.split_once(':')?;

    let has_coordinate_prefix = match after_colon.as_bytes() {
        [kind, b'.', ..] => matches!(*kind, b'g' | b'c' | b'm' | b'n' | b'r' | b'p' | b'o'),
        _ => false,
    };
    if !has_coordinate_prefix {
        return None;
    }

    let starts_with_letter = accession
        .as_bytes()
        .first()
        .is_some_and(|b| b.is_ascii_alphabetic());
    let only_accession_chars = accession
        .bytes()
        .all(|b| b.is_ascii_alphanumeric() || b == b'_' || b == b'.');
    if !(starts_with_letter && only_accession_chars) {
        return None;
    }

    // Accession, the colon, the coordinate letter and the dot.
    let header_len = accession.len() + 3;
    let position_len = input[header_len..]
        .bytes()
        .take_while(|b| b.is_ascii_digit() || matches!(*b, b'_' | b'+' | b'-' | b'?'))
        .count();
    if position_len == 0 {
        return None;
    }
    Some(input[..header_len + position_len].to_string())
}
fn parse_external_ref_part(input: &str) -> IResult<&str, crate::hgvs::edit::InsertedPart> {
use crate::hgvs::edit::InsertedPart;
if !is_reference_accession_prefix(input) {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)));
}
let end_pos = input.find([';', ']']).unwrap_or(input.len());
if end_pos == 0 {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)));
}
let reference = &input[..end_pos];
if !reference.contains(':') {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)));
}
Ok((
&input[end_pos..],
InsertedPart::ExternalRef(reference.to_string()),
))
}
fn parse_cds_position_range(input: &str) -> IResult<&str, crate::hgvs::edit::InsertedPart> {
use crate::hgvs::edit::InsertedPart;
let mut end_pos = 0;
let bytes = input.as_bytes();
while end_pos < bytes.len() && bytes[end_pos].is_ascii_digit() {
end_pos += 1;
}
if end_pos == 0 {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Digit,
)));
}
if end_pos < bytes.len() && (bytes[end_pos] == b'+' || bytes[end_pos] == b'-') {
end_pos += 1;
while end_pos < bytes.len() && bytes[end_pos].is_ascii_digit() {
end_pos += 1;
}
}
if end_pos >= bytes.len() || bytes[end_pos] != b'_' {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)));
}
end_pos += 1;
let second_start = end_pos;
while end_pos < bytes.len() && bytes[end_pos].is_ascii_digit() {
end_pos += 1;
}
if end_pos == second_start {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Digit,
)));
}
if end_pos < bytes.len() && (bytes[end_pos] == b'+' || bytes[end_pos] == b'-') {
end_pos += 1;
while end_pos < bytes.len() && bytes[end_pos].is_ascii_digit() {
end_pos += 1;
}
}
let range_str = &input[..end_pos];
let mut remaining = &input[end_pos..];
let has_offset = range_str.contains('+') || range_str.contains('-');
let has_inv = if remaining.starts_with("inv") {
remaining = &remaining[3..];
true
} else {
false
};
if has_offset {
let range_with_inv = if has_inv {
format!("{}inv", range_str)
} else {
range_str.to_string()
};
Ok((remaining, InsertedPart::CdsPositionRange(range_with_inv)))
} else {
let parts: Vec<&str> = range_str.split('_').collect();
if parts.len() == 2 {
let start: u64 = parts[0].parse().unwrap_or(0);
let end: u64 = parts[1].parse().unwrap_or(0);
if has_inv {
Ok((remaining, InsertedPart::PositionRangeInv { start, end }))
} else {
Ok((remaining, InsertedPart::PositionRange { start, end }))
}
} else {
Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)))
}
}
}
fn parse_inserted_part(input: &str) -> IResult<&str, crate::hgvs::edit::InsertedPart> {
use crate::hgvs::edit::InsertedPart;
alt((
parse_external_ref_part,
|input| {
let (remaining, base) = parse_base(input)?;
let (remaining, count) = parse_repeat_count(remaining)?;
Ok((remaining, InsertedPart::Repeat { base, count }))
},
parse_cds_position_range,
map(parse_sequence, InsertedPart::Literal),
))
.parse(input)
}
fn parse_parenthesized_count(input: &str) -> IResult<&str, InsertedSequence> {
let (input, _) = char('(').parse(input)?;
if let Ok((remaining, _)) = tag::<_, _, nom::error::Error<&str>>("?)").parse(input) {
return Ok((remaining, InsertedSequence::Uncertain));
}
if let Ok((remaining, min_str)) = digit1::<_, nom::error::Error<&str>>.parse(input) {
if let Ok((remaining, _)) = tag::<_, _, nom::error::Error<&str>>("_?)").parse(remaining) {
return Ok((
remaining,
InsertedSequence::Range(min_str.parse().unwrap_or(0), u64::MAX),
));
}
if let Ok((remaining, _)) = char::<_, nom::error::Error<&str>>('_').parse(remaining) {
let (remaining, max_str) = digit1.parse(remaining)?;
let (remaining, _) = char(')').parse(remaining)?;
let start1: u64 = min_str.parse().unwrap_or(0);
let end1: u64 = max_str.parse().unwrap_or(0);
if let Some(rest) = remaining.strip_prefix("_(") {
if let Ok((rest, start2_str)) = digit1::<_, nom::error::Error<&str>>.parse(rest) {
if let Ok((rest, _)) = char::<_, nom::error::Error<&str>>('_').parse(rest) {
if let Ok((rest, end2_str)) =
digit1::<_, nom::error::Error<&str>>.parse(rest)
{
if let Ok((rest, _)) =
char::<_, nom::error::Error<&str>>(')').parse(rest)
{
let start2: u64 = start2_str.parse().unwrap_or(0);
let end2: u64 = end2_str.parse().unwrap_or(0);
return Ok((
rest,
InsertedSequence::Complex(vec![
InsertedPart::PositionRange {
start: start1,
end: end1,
},
InsertedPart::PositionRange {
start: start2,
end: end2,
},
]),
));
}
}
}
}
}
return Ok((remaining, InsertedSequence::Range(start1, end1)));
}
let (remaining, _) = char(')').parse(remaining)?;
return Ok((
remaining,
InsertedSequence::Count(min_str.parse().unwrap_or(0)),
));
}
Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Digit,
)))
}
/// Parses a single base followed by a bracketed repeat count, e.g. `A[10]`.
fn parse_repeated_base_insertion(input: &str) -> IResult<&str, InsertedSequence> {
    parse_base(input).and_then(|(after_base, base)| {
        parse_repeat_count(after_base)
            .map(|(rest, count)| (rest, InsertedSequence::Repeat { base, count }))
    })
}
#[inline]
fn parse_simple_count(input: &str) -> IResult<&str, InsertedSequence> {
let (remaining, count_str) = digit1.parse(input)?;
let start: u64 = count_str.parse().unwrap_or(0);
if let Ok((remaining, _)) = char::<_, nom::error::Error<&str>>('_').parse(remaining) {
let (remaining, end_str) = digit1.parse(remaining)?;
let end: u64 = end_str.parse().unwrap_or(0);
if let Ok((remaining, _)) = tag::<_, _, nom::error::Error<&str>>("inv").parse(remaining) {
return Ok((remaining, InsertedSequence::PositionRangeInv { start, end }));
}
return Ok((remaining, InsertedSequence::PositionRange { start, end }));
}
if remaining
.as_bytes()
.first()
.is_some_and(|&b| b.is_ascii_alphabetic())
{
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Verify,
)));
}
Ok((remaining, InsertedSequence::Count(start)))
}
fn parse_delins(input: &str) -> IResult<&str, NaEdit> {
let (input, _) = tag("delins").parse(input)?;
let (input, sequence) = parse_inserted_sequence(input)?;
Ok((input, NaEdit::Delins { sequence }))
}
fn parse_delins_with_deleted_seq(input: &str) -> IResult<&str, NaEdit> {
let (input, _) = tag("del").parse(input)?;
let (input, _deleted_seq) = parse_sequence(input)?;
let (input, _) = tag("ins").parse(input)?;
let (input, inserted_seq) = parse_inserted_sequence(input)?;
Ok((
input,
NaEdit::Delins {
sequence: inserted_seq,
},
))
}
fn parse_delins_with_deleted_count(input: &str) -> IResult<&str, NaEdit> {
let (input, _) = tag("del").parse(input)?;
let (input, _deleted_count) = digit1.parse(input)?;
let (input, _) = tag("ins").parse(input)?;
let (input, inserted_seq) = parse_inserted_sequence(input)?;
Ok((
input,
NaEdit::Delins {
sequence: inserted_seq,
},
))
}
#[inline]
fn parse_dupins(input: &str) -> IResult<&str, NaEdit> {
let (input, _) = tag("dupins").parse(input)?;
let (input, sequence) = parse_inserted_sequence(input)?;
Ok((input, NaEdit::DupIns { sequence }))
}
#[inline]
fn parse_duplication(input: &str) -> IResult<&str, NaEdit> {
use crate::hgvs::edit::UncertainDupExtent;
let (input, _) = tag("dup").parse(input)?;
let bytes = input.as_bytes();
if bytes.is_empty() {
return Ok((
input,
NaEdit::Duplication {
sequence: None,
length: None,
uncertain_extent: None,
},
));
}
let (input, sequence, length) = match bytes[0] {
c if is_iupac_base(c) => {
let (remaining, seq) = parse_sequence(input)?;
(remaining, Some(seq), None)
}
b'0'..=b'9' => {
let (remaining, len_str) = digit1::<&str, nom::error::Error<&str>>(input)?;
(remaining, None, Some(len_str.parse().unwrap_or(0)))
}
_ => (input, None, None),
};
let bytes = input.as_bytes();
let (input, uncertain_extent) = if bytes.is_empty() {
(input, None)
} else {
match bytes[0] {
b'?' => (&input[1..], Some(UncertainDupExtent::Unknown)),
b'(' => {
let remaining = &input[1..];
if let Some(stripped) = remaining.strip_prefix("?)") {
(stripped, Some(UncertainDupExtent::Unknown))
} else if let Ok((remaining, start_str)) =
digit1::<&str, nom::error::Error<&str>>(remaining)
{
if let Ok((remaining, _)) =
char::<_, nom::error::Error<&str>>('_').parse(remaining)
{
if let Ok((remaining, end_str)) =
digit1::<&str, nom::error::Error<&str>>(remaining)
{
if let Ok((remaining, _)) =
char::<_, nom::error::Error<&str>>(')').parse(remaining)
{
let start = start_str.parse().unwrap_or(0);
let end = end_str.parse().unwrap_or(0);
(remaining, Some(UncertainDupExtent::Range(start, end)))
} else {
(input, None)
}
} else {
(input, None)
}
} else {
(input, None)
}
} else {
(input, None)
}
}
_ => (input, None),
}
};
Ok((
input,
NaEdit::Duplication {
sequence,
length,
uncertain_extent,
},
))
}
fn parse_inversion(input: &str) -> IResult<&str, NaEdit> {
let (input, _) = tag("inv").parse(input)?;
if let Ok((remaining, seq)) = parse_sequence(input) {
return Ok((
remaining,
NaEdit::Inversion {
sequence: Some(seq),
length: None,
},
));
}
if let Ok((remaining, len_str)) = digit1::<&str, nom::error::Error<&str>>(input) {
if let Ok(len) = len_str.parse::<u64>() {
return Ok((
remaining,
NaEdit::Inversion {
sequence: None,
length: Some(len),
},
));
}
}
Ok((
input,
NaEdit::Inversion {
sequence: None,
length: None,
},
))
}
/// Reads the leading run of ASCII digits in `bytes` as a decimal number.
///
/// Returns `(value, consumed)`, where `consumed` is the number of digit bytes
/// read; `(0, 0)` means `bytes` does not start with a digit.
///
/// The value saturates at `u64::MAX` instead of overflowing, so an
/// arbitrarily long digit run in untrusted input can neither panic (debug
/// builds) nor silently wrap to an unrelated number (release builds).
/// `consumed` still covers the whole run, so callers' offset arithmetic is
/// unaffected.
#[inline]
fn scan_digits(bytes: &[u8]) -> (u64, usize) {
    let mut value = 0u64;
    let mut consumed = 0;
    for &digit in bytes.iter().take_while(|b| b.is_ascii_digit()) {
        value = value
            .saturating_mul(10)
            .saturating_add(u64::from(digit - b'0'));
        consumed += 1;
    }
    (value, consumed)
}
/// Parses a bracketed repeat count.
///
/// Accepted forms and their results:
///
/// | input      | result            |
/// |------------|-------------------|
/// | `[N]`      | `Exact(N)`        |
/// | `[N_M]`    | `Range(N, M)`     |
/// | `[N_?]`    | `MinUncertain(N)` |
/// | `[?_N]`    | `MaxUncertain(N)` |
/// | `[?]`      | `Unknown`         |
/// | `[?_?]`    | `Unknown`         |
/// | `[(N_M)]`  | `Range(N, M)`     |
/// | `[(N_?)]`  | `MinUncertain(N)` |
/// | `[(?_N)]`  | `MaxUncertain(N)` |
/// | `[(?_?)]`  | `Unknown`         |
///
/// Every failure is a recoverable error reported at the original `input`:
/// `Digit` where a number is required inside the parenthesised form, `Tag`
/// everywhere else.
///
/// The function works on bytes and computes the length of the consumed prefix
/// by hand. All consumed bytes are ASCII, so slicing `input` at those offsets
/// is always on a char boundary.
#[inline]
fn parse_repeat_count(input: &str) -> IResult<&str, RepeatCount> {
    let bytes = input.as_bytes();
    if bytes.is_empty() || bytes[0] != b'[' {
        return Err(nom::Err::Error(nom::error::Error::new(
            input,
            nom::error::ErrorKind::Tag,
        )));
    }
    // Everything after the opening `[`.
    let inner = &bytes[1..];
    if inner.is_empty() {
        return Err(nom::Err::Error(nom::error::Error::new(
            input,
            nom::error::ErrorKind::Tag,
        )));
    }
    match inner[0] {
        // Parenthesised forms: `[(...)]`.
        b'(' => {
            let after_paren = &inner[1..];
            if !after_paren.is_empty() && after_paren[0] == b'?' {
                if after_paren.len() > 1 && after_paren[1] == b'_' {
                    let after_underscore = &after_paren[2..];
                    if !after_underscore.is_empty() && after_underscore[0] == b'?' {
                        if after_underscore.len() > 1
                            && after_underscore[1] == b')'
                            && after_underscore.len() > 2
                            && after_underscore[2] == b']'
                        {
                            // `[(?_?)]` is exactly 7 bytes.
                            return Ok((&input[7..], RepeatCount::Unknown));
                        }
                    } else {
                        let (num, consumed) = scan_digits(after_underscore);
                        if consumed > 0 {
                            let check_pos = consumed;
                            if after_underscore.len() > check_pos + 1
                                && after_underscore[check_pos] == b')'
                                && after_underscore[check_pos + 1] == b']'
                            {
                                // `[(?_` (4 bytes) + digits + `)]` (2 bytes).
                                return Ok((
                                    &input[6 + consumed..],
                                    RepeatCount::MaxUncertain(num),
                                ));
                            }
                        }
                    }
                }
                // `[(?` followed by anything that did not complete a form above.
                return Err(nom::Err::Error(nom::error::Error::new(
                    input,
                    nom::error::ErrorKind::Tag,
                )));
            }
            let (num1, consumed1) = scan_digits(after_paren);
            if consumed1 == 0 {
                return Err(nom::Err::Error(nom::error::Error::new(
                    input,
                    nom::error::ErrorKind::Digit,
                )));
            }
            let after_num1 = &after_paren[consumed1..];
            if after_num1.is_empty() || after_num1[0] != b'_' {
                return Err(nom::Err::Error(nom::error::Error::new(
                    input,
                    nom::error::ErrorKind::Tag,
                )));
            }
            let after_underscore = &after_num1[1..];
            if !after_underscore.is_empty() && after_underscore[0] == b'?' {
                if after_underscore.len() > 1
                    && after_underscore[1] == b')'
                    && after_underscore.len() > 2
                    && after_underscore[2] == b']'
                {
                    // `[(` (2 bytes) + digits + `_?)]` (4 bytes).
                    let total_consumed = 6 + consumed1;
                    return Ok((&input[total_consumed..], RepeatCount::MinUncertain(num1)));
                }
                return Err(nom::Err::Error(nom::error::Error::new(
                    input,
                    nom::error::ErrorKind::Tag,
                )));
            }
            let (num2, consumed2) = scan_digits(after_underscore);
            if consumed2 == 0 {
                return Err(nom::Err::Error(nom::error::Error::new(
                    input,
                    nom::error::ErrorKind::Digit,
                )));
            }
            let after_num2 = &after_underscore[consumed2..];
            if after_num2.len() < 2 || after_num2[0] != b')' || after_num2[1] != b']' {
                return Err(nom::Err::Error(nom::error::Error::new(
                    input,
                    nom::error::ErrorKind::Tag,
                )));
            }
            // `[(` (2) + digits + `_` (1) + digits + `)]` (2).
            let total_consumed = 5 + consumed1 + consumed2;
            Ok((&input[total_consumed..], RepeatCount::Range(num1, num2)))
        }
        // Unparenthesised forms starting with `?`: `[?]`, `[?_?]`, `[?_N]`.
        b'?' => {
            if inner.len() > 1 && inner[1] == b'_' {
                if inner.len() > 2 && inner[2] == b'?' {
                    if inner.len() > 3 && inner[3] == b']' {
                        // `[?_?]` is exactly 5 bytes.
                        return Ok((&input[5..], RepeatCount::Unknown));
                    }
                } else {
                    let (value, consumed) = scan_digits(&inner[2..]);
                    if consumed > 0 {
                        // Index of the expected `]` within `inner`.
                        let bracket_pos = 2 + consumed;
                        if inner.len() > bracket_pos && inner[bracket_pos] == b']' {
                            // +1 for the leading `[`, +1 to step past the `]`.
                            return Ok((
                                &input[bracket_pos + 2..],
                                RepeatCount::MaxUncertain(value),
                            ));
                        }
                    }
                }
            } else if inner.len() > 1 && inner[1] == b']' {
                // `[?]` is exactly 3 bytes.
                return Ok((&input[3..], RepeatCount::Unknown));
            }
            Err(nom::Err::Error(nom::error::Error::new(
                input,
                nom::error::ErrorKind::Tag,
            )))
        }
        // Unparenthesised forms starting with a number: `[N]`, `[N_?]`, `[N_M]`.
        b'0'..=b'9' => {
            let (num1, consumed1) = scan_digits(inner);
            if consumed1 == 0 {
                return Err(nom::Err::Error(nom::error::Error::new(
                    input,
                    nom::error::ErrorKind::Digit,
                )));
            }
            let after_num1 = &inner[consumed1..];
            if after_num1.is_empty() {
                return Err(nom::Err::Error(nom::error::Error::new(
                    input,
                    nom::error::ErrorKind::Tag,
                )));
            }
            if after_num1[0] == b']' {
                // `[` + digits + `]`.
                return Ok((&input[consumed1 + 2..], RepeatCount::Exact(num1)));
            } else if after_num1[0] == b'_' {
                if after_num1.len() > 1 && after_num1[1] == b'?' {
                    if after_num1.len() > 2 && after_num1[2] == b']' {
                        // `[` + digits + `_?]` (3 bytes).
                        return Ok((&input[consumed1 + 4..], RepeatCount::MinUncertain(num1)));
                    }
                } else {
                    let (num2, consumed2) = scan_digits(&after_num1[1..]);
                    if consumed2 > 0 {
                        // Index of the expected `]` within `inner`.
                        let bracket_pos = consumed1 + 1 + consumed2;
                        if inner.len() > bracket_pos && inner[bracket_pos] == b']' {
                            // +1 for the leading `[`, +1 to step past the `]`.
                            return Ok((&input[bracket_pos + 2..], RepeatCount::Range(num1, num2)));
                        }
                    }
                }
            }
            Err(nom::Err::Error(nom::error::Error::new(
                input,
                nom::error::ErrorKind::Tag,
            )))
        }
        _ => Err(nom::Err::Error(nom::error::Error::new(
            input,
            nom::error::ErrorKind::Tag,
        ))),
    }
}
/// Parses a repeat edit: an optional sequence followed by a bracketed count.
///
/// Two shapes are produced:
/// - `MultiRepeat` when at least one further `<sequence>[count]` unit follows
///   the first count (e.g. `A[3]C[4]`); a missing first sequence is stored as
///   an empty `Sequence`;
/// - `Repeat` otherwise, collecting any directly following extra counts
///   (e.g. `[3][4]`) and an optional trailing sequence.
#[inline]
fn parse_repeat(input: &str) -> IResult<&str, NaEdit> {
    let (after_first_seq, first_sequence) = parse_opt_sequence(input)?;
    let (after_first, first_count) = parse_repeat_count(after_first_seq)?;

    // Multi-unit form: only taken when a full second unit is present.
    if let Ok((after_seq, second_sequence)) = parse_sequence(after_first) {
        if let Ok((after_count, second_count)) = parse_repeat_count(after_seq) {
            let mut units = vec![
                RepeatUnit {
                    sequence: first_sequence.unwrap_or_else(|| Sequence::new(vec![])),
                    count: first_count,
                },
                RepeatUnit {
                    sequence: second_sequence,
                    count: second_count,
                },
            ];
            let mut cursor = after_count;
            loop {
                let Ok((after_seq, sequence)) = parse_sequence(cursor) else {
                    break;
                };
                // A sequence without a count is not part of the repeat; leave
                // it unconsumed.
                let Ok((after_count, count)) = parse_repeat_count(after_seq) else {
                    break;
                };
                units.push(RepeatUnit { sequence, count });
                cursor = after_count;
            }
            return Ok((cursor, NaEdit::MultiRepeat { units }));
        }
    }

    // Single-unit form.
    let mut additional_counts = Vec::new();
    let mut cursor = after_first;
    while let Ok((next, extra)) = parse_repeat_count(cursor) {
        additional_counts.push(extra);
        cursor = next;
    }
    let (rest, trailing) = parse_opt_sequence(cursor)?;
    Ok((
        rest,
        NaEdit::Repeat {
            sequence: first_sequence,
            count: first_count,
            additional_counts,
            trailing,
        },
    ))
}
#[inline]
fn parse_identity(input: &str) -> IResult<&str, NaEdit> {
let bytes = input.as_bytes();
if bytes.is_empty() {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)));
}
if bytes[0] == b'=' {
let (remaining, sequence) = parse_opt_sequence(&input[1..])?;
Ok((
remaining,
NaEdit::Identity {
sequence,
whole_entity: false,
},
))
} else if is_iupac_base(bytes[0]) {
let (remaining, sequence) = parse_sequence(input)?;
let remaining_bytes = remaining.as_bytes();
if remaining_bytes.is_empty() || remaining_bytes[0] != b'=' {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)));
}
Ok((
&remaining[1..],
NaEdit::Identity {
sequence: Some(sequence),
whole_entity: false,
},
))
} else {
Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)))
}
}
fn parse_conversion(input: &str) -> IResult<&str, NaEdit> {
let (input, _) = tag("con").parse(input)?;
let (input, source) = take_while1(|c: char| !c.is_whitespace()).parse(input)?;
Ok((
input,
NaEdit::Conversion {
source: source.to_string(),
},
))
}
/// Parses a methylation annotation: `|gom`, `|lom` or `|met=`.
fn parse_methylation(input: &str) -> IResult<&str, NaEdit> {
    let (after_bar, _) = char('|').parse(input)?;
    let (rest, keyword) = alt((tag("gom"), tag("lom"), tag("met="))).parse(after_bar)?;
    let status = match keyword {
        "gom" => MethylationStatus::GainOfMethylation,
        "lom" => MethylationStatus::LossOfMethylation,
        // The only remaining keyword the `alt` above can return is `met=`.
        _ => MethylationStatus::Unchanged,
    };
    Ok((rest, NaEdit::Methylation { status }))
}
fn parse_copy_number(input: &str) -> IResult<&str, NaEdit> {
let (input, _) = tag("copy").parse(input)?;
let (input, count_str) = digit1.parse(input)?;
let count = count_str.parse::<u64>().unwrap_or(2);
Ok((input, NaEdit::CopyNumber { count }))
}
/// Parses a parenthesised repeat count with no sequence attached.
///
/// - `(N)` -> `Exact(N)`
/// - `(N_M)` -> `Range(N, M)`
/// - `(N_?)` -> `MinUncertain(N)`
///
/// The result is a `Repeat` with no sequence, no additional counts and no
/// trailing sequence. Numbers that do not fit the target integer type are
/// stored as `0`.
fn parse_parenthesized_repeat(input: &str) -> IResult<&str, NaEdit> {
    let (after_open, _) = char('(').parse(input)?;
    let (after_min, min_str) = digit1.parse(after_open)?;

    let (before_close, count) = match after_min.strip_prefix('_') {
        Some(after_separator) => match after_separator.strip_prefix('?') {
            Some(after_unknown) => (
                after_unknown,
                RepeatCount::MinUncertain(min_str.parse().unwrap_or(0)),
            ),
            None => {
                let (after_max, max_str) = digit1.parse(after_separator)?;
                (
                    after_max,
                    RepeatCount::Range(
                        min_str.parse().unwrap_or(0),
                        max_str.parse().unwrap_or(0),
                    ),
                )
            }
        },
        None => (
            after_min,
            RepeatCount::Exact(min_str.parse().unwrap_or(0)),
        ),
    };

    // Every accepted form ends with the closing parenthesis.
    let (rest, _) = char(')').parse(before_close)?;
    Ok((
        rest,
        NaEdit::Repeat {
            sequence: None,
            count,
            additional_counts: Vec::new(),
            trailing: None,
        },
    ))
}
#[inline]
fn parse_reference_sequence(input: &str) -> IResult<&str, NaEdit> {
let (remaining, sequence) = parse_sequence(input)?;
if sequence.len() < 2 {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Verify,
)));
}
Ok((
remaining,
NaEdit::Identity {
sequence: Some(sequence),
whole_entity: false,
},
))
}
/// Parses any nucleotide-level edit.
///
/// The alternatives are tried in order and the first one that succeeds wins,
/// so the order below is significant: several later parsers would accept a
/// prefix of what an earlier, more specific parser is meant to handle.
pub fn parse_na_edit(input: &str) -> IResult<&str, NaEdit> {
    alt((
        // Single-base `A>G`. It rejects input where more bases follow, which
        // lets `parse_multibase_substitution` below handle the longer forms.
        parse_substitution,
        parse_substitution_no_ref,
        parse_multibase_substitution,
        // All `del...ins...` forms must come before `parse_deletion`, which
        // would otherwise accept the leading `del` (and an optional sequence
        // or length) and leave the `ins...` part unconsumed.
        parse_delins_with_deleted_seq,
        parse_delins_with_deleted_count,
        parse_delins,
        parse_deletion,
        parse_insertion,
        // `dupins` before `dup`, for the same reason as `delins` before `del`.
        parse_dupins,
        parse_duplication,
        parse_inversion,
        parse_conversion,
        parse_methylation,
        parse_copy_number,
        // Sequence-led forms, most specific first: a sequence followed by a
        // bracketed count, then one followed by `=`, then a bare sequence.
        parse_repeat,
        parse_parenthesized_repeat,
        parse_identity,
        parse_reference_sequence,
    ))
    .parse(input)
}
/// Parses the alternative amino acid of a protein substitution.
///
/// Any ASCII digits directly after the amino acid are consumed and ignored.
/// The `reference` field is always filled with `AminoAcid::Xaa` here.
/// NOTE(review): presumably the caller replaces that placeholder with the
/// residue taken from the position - confirm at the call site.
fn parse_protein_substitution(input: &str) -> IResult<&str, ProteinEdit> {
    let (after_residue, alternative) = parse_amino_acid.parse(input)?;
    // Skip an optional run of decimal digits.
    let rest = after_residue.trim_start_matches(|c: char| c.is_ascii_digit());
    Ok((
        rest,
        ProteinEdit::Substitution {
            reference: AminoAcid::Xaa,
            alternative,
        },
    ))
}
fn parse_protein_deletion(input: &str) -> IResult<&str, ProteinEdit> {
if let Ok((remaining, ext)) = parse_protein_del_extension(input) {
return Ok((remaining, ext));
}
let (remaining, _) = tag("del").parse(input)?;
if let Ok((remaining2, count_str)) = digit1::<_, nom::error::Error<&str>>.parse(remaining) {
if let Ok(count) = count_str.parse::<u64>() {
return Ok((
remaining2,
ProteinEdit::Deletion {
sequence: None,
count: Some(count),
},
));
}
}
if let Ok((remaining2, seq)) = parse_amino_acid_seq(remaining) {
return Ok((
remaining2,
ProteinEdit::Deletion {
sequence: Some(seq),
count: None,
},
));
}
Ok((
remaining,
ProteinEdit::Deletion {
sequence: None,
count: None,
},
))
}
/// Parses `delext` followed by an optional new stop-codon position.
///
/// Suffixes tried in order after `delext`:
/// - `Ter?` / `*?`: count unknown;
/// - `Ter<N>` / `*<N>`: count `N` (`0` if it does not fit `i64`);
/// - `?`: count unknown;
/// - nothing: count unknown, nothing further consumed.
///
/// The result is always a C-terminal `Extension` with `new_aa: None`.
fn parse_protein_del_extension(input: &str) -> IResult<&str, ProteinEdit> {
    let (after_tag, _) = tag("delext").parse(input)?;
    let (rest, count) = alt((
        map(alt((tag("Ter?"), tag("*?"))), |_| None::<i64>),
        map(
            preceded(alt((tag("Ter"), tag("*"))), digit1),
            |digits: &str| Some(digits.parse::<i64>().unwrap_or(0)),
        ),
        // `tag("")` always matches, so this last alternative cannot fail.
        map(alt((tag("?"), tag(""))), |_| None::<i64>),
    ))
    .parse(after_tag)?;
    Ok((
        rest,
        ProteinEdit::Extension {
            new_aa: None,
            direction: ExtDirection::CTerminal,
            count,
        },
    ))
}
/// Parses the protein duplication marker `dup`.
fn parse_protein_duplication(input: &str) -> IResult<&str, ProteinEdit> {
    let (rest, _) = tag("dup").parse(input)?;
    Ok((rest, ProteinEdit::Duplication))
}
/// Parses a frameshift: an optional new amino acid, `fs`, and an optional
/// stop-codon position.
///
/// After `fs`:
/// - `Ter?` / `*?` is consumed and leaves `ter_pos` as `None`;
/// - `Ter<N>` / `*<N>` sets `ter_pos` to `N` (or `None` if `N` does not fit
///   `u64`);
/// - anything else is left unconsumed with `ter_pos` as `None`.
fn parse_protein_frameshift(input: &str) -> IResult<&str, ProteinEdit> {
    let (after_residue, new_aa) = opt(parse_amino_acid).parse(input)?;
    let (after_tag, _) = tag("fs").parse(after_residue)?;
    let (rest, ter_pos) = opt(alt((
        map(alt((tag("Ter?"), tag("*?"))), |_| None::<u64>),
        map(
            preceded(alt((tag("Ter"), tag("*"))), digit1),
            |digits: &str| digits.parse::<u64>().ok(),
        ),
    )))
    .parse(after_tag)?;
    Ok((
        rest,
        ProteinEdit::Frameshift {
            new_aa,
            // Outer `Option`: was a suffix present; inner: was the position known.
            ter_pos: ter_pos.flatten(),
        },
    ))
}
/// Parses a protein identity marker: `(=)` (predicted) or `=` (not predicted).
///
/// `whole_protein` is always `false` in the result.
fn parse_protein_identity(input: &str) -> IResult<&str, ProteinEdit> {
    // `(=)` is tried first; the two forms cannot be confused because they
    // start with different characters.
    let (rest, marker) = alt((tag("(=)"), tag("="))).parse(input)?;
    Ok((
        rest,
        ProteinEdit::Identity {
            predicted: marker == "(=)",
            whole_protein: false,
        },
    ))
}
/// Parses `?` and returns the value built by `ProteinEdit::position_unknown()`.
fn parse_protein_unknown(input: &str) -> IResult<&str, ProteinEdit> {
    let (rest, _) = tag("?").parse(input)?;
    Ok((rest, ProteinEdit::position_unknown()))
}
/// Parses the "no protein" marker: `0?` (predicted) or `0` (not predicted).
fn parse_protein_no_protein(input: &str) -> IResult<&str, ProteinEdit> {
    // The longer form must be tried first, otherwise `0` would match `0?`
    // and leave the `?` behind.
    let (rest, marker) = alt((tag("0?"), tag("0"))).parse(input)?;
    Ok((
        rest,
        ProteinEdit::NoProtein {
            predicted: marker == "0?",
        },
    ))
}
/// Parses one or more consecutive amino acids into an `AminoAcidSeq`.
fn parse_amino_acid_seq(input: &str) -> IResult<&str, AminoAcidSeq> {
    let (rest, residues) = many1(parse_amino_acid).parse(input)?;
    Ok((rest, AminoAcidSeq::new(residues)))
}
fn parse_protein_insertion(input: &str) -> IResult<&str, ProteinEdit> {
map(preceded(tag("ins"), parse_amino_acid_seq), |sequence| {
ProteinEdit::Insertion { sequence }
})
.parse(input)
}
fn parse_protein_delins(input: &str) -> IResult<&str, ProteinEdit> {
map(preceded(tag("delins"), parse_amino_acid_seq), |sequence| {
ProteinEdit::Delins { sequence }
})
.parse(input)
}
/// Parses a protein extension: an optional new amino acid, `ext`, and an
/// optional position.
///
/// Suffixes tried in order after `ext`:
/// - `-<N>`: N-terminal extension with count `-N`;
/// - `Ter?` / `*?`: C-terminal, count unknown;
/// - `Ter<N>` / `*<N>`: C-terminal with count `N`;
/// - `?`: C-terminal, count unknown;
/// - nothing: C-terminal, count unknown, nothing further consumed.
///
/// A number that does not fit `i64` is treated as `0`.
fn parse_protein_extension(input: &str) -> IResult<&str, ProteinEdit> {
    let (after_residue, new_aa) = opt(parse_amino_acid).parse(input)?;
    let (after_tag, _) = tag("ext").parse(after_residue)?;
    let (rest, (direction, count)) = alt((
        map(preceded(char('-'), digit1), |digits: &str| {
            (
                ExtDirection::NTerminal,
                Some(-(digits.parse::<i64>().unwrap_or(0))),
            )
        }),
        map(alt((tag("Ter?"), tag("*?"))), |_| {
            (ExtDirection::CTerminal, None)
        }),
        map(
            preceded(alt((tag("Ter"), tag("*"))), digit1),
            |digits: &str| {
                (
                    ExtDirection::CTerminal,
                    Some(digits.parse::<i64>().unwrap_or(0)),
                )
            },
        ),
        // `tag("")` always matches, so this last alternative cannot fail.
        map(alt((tag("?"), tag(""))), |_| {
            (ExtDirection::CTerminal, None)
        }),
    ))
    .parse(after_tag)?;
    Ok((
        rest,
        ProteinEdit::Extension {
            new_aa,
            direction,
            count,
        },
    ))
}
fn parse_uncertain_extension_annotation(input: &str) -> IResult<&str, ProteinEdit> {
let bytes = input.as_bytes();
if bytes.is_empty() || bytes[0] != b'(' {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)));
}
let inner = &bytes[1..];
if inner.is_empty() {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)));
}
if inner[0].is_ascii_digit() {
let (num, consumed) = scan_digits(inner);
if consumed > 0 {
let after_num = &inner[consumed..];
if after_num.len() >= 3
&& after_num[0] == b'_'
&& after_num[1] == b'?'
&& after_num[2] == b')'
{
let total_consumed = 1 + consumed + 3; return Ok((
&input[total_consumed..],
ProteinEdit::Extension {
new_aa: None,
direction: ExtDirection::CTerminal,
count: Some(num as i64), },
));
}
}
}
if inner.len() >= 2 && inner[0] == b'?' && inner[1] == b'_' {
let (num, consumed) = scan_digits(&inner[2..]);
if consumed > 0 {
let after_num = &inner[2 + consumed..];
if !after_num.is_empty() && after_num[0] == b')' {
let total_consumed = 1 + 2 + consumed + 1; return Ok((
&input[total_consumed..],
ProteinEdit::Extension {
new_aa: None,
direction: ExtDirection::CTerminal,
count: Some(num as i64), },
));
}
}
}
Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)))
}
/// Parses one protein repeat unit: a run of one-letter amino acids followed
/// by a repeat count, e.g. `Q[10]`.
///
/// Residues are collected with `parse_amino_acid_one_letter` until a `[` is
/// seen, the input ends, or a residue fails to parse. At least one residue is
/// required (otherwise a recoverable `Tag` error is returned); the remainder
/// is then handed to `parse_repeat_count`.
#[inline]
fn parse_single_protein_repeat_unit(input: &str) -> IResult<&str, (AminoAcidSeq, RepeatCount)> {
    let mut residues = Vec::new();
    let mut rest = input;
    while !rest.is_empty() && !rest.starts_with('[') {
        match parse_amino_acid_one_letter(rest) {
            Ok((tail, residue)) => {
                residues.push(residue);
                rest = tail;
            }
            Err(_) => break,
        }
    }
    if residues.is_empty() {
        return Err(nom::Err::Error(nom::error::Error::new(
            input,
            nom::error::ErrorKind::Tag,
        )));
    }
    let (rest, count) = parse_repeat_count(rest)?;
    Ok((rest, (AminoAcidSeq::new(residues), count)))
}
/// Parses one or more consecutive protein repeat units.
///
/// The first unit is mandatory. Further units are read for as long as the
/// next byte is an ASCII uppercase letter and the unit parses. A single unit
/// yields [`ProteinEdit::Repeat`]; two or more yield
/// [`ProteinEdit::MultiRepeat`] with the units in input order.
#[inline]
fn parse_protein_repeat(input: &str) -> IResult<&str, ProteinEdit> {
    let (mut rest, (sequence, count)) = parse_single_protein_repeat_unit(input)?;
    let mut more_units = Vec::new();
    while matches!(rest.as_bytes().first(), Some(byte) if byte.is_ascii_uppercase()) {
        match parse_single_protein_repeat_unit(rest) {
            Ok((tail, unit)) => {
                more_units.push(unit);
                rest = tail;
            }
            Err(_) => break,
        }
    }
    if more_units.is_empty() {
        return Ok((rest, ProteinEdit::Repeat { sequence, count }));
    }
    let mut units = Vec::with_capacity(more_units.len() + 1);
    units.push((sequence, count));
    units.append(&mut more_units);
    Ok((rest, ProteinEdit::MultiRepeat { units }))
}
#[inline]
pub fn parse_protein_edit(input: &str) -> IResult<&str, ProteinEdit> {
let bytes = input.as_bytes();
if bytes.is_empty() {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Eof,
)));
}
match bytes[0] {
b'f' => {
if bytes.len() >= 2 && bytes[1] == b's' {
return parse_protein_frameshift(input);
}
Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)))
}
b'd' => {
if bytes.len() >= 6 && &bytes[..6] == b"delins" {
return parse_protein_delins(input);
}
if bytes.len() >= 3 && &bytes[..3] == b"del" {
return parse_protein_deletion(input);
}
if bytes.len() >= 3 && &bytes[..3] == b"dup" {
return parse_protein_duplication(input);
}
Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)))
}
b'i' => {
if bytes.len() >= 3 && &bytes[..3] == b"ins" {
return parse_protein_insertion(input);
}
Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)))
}
b'e' => {
if bytes.len() >= 3 && &bytes[..3] == b"ext" {
return parse_protein_extension(input);
}
Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)))
}
b'=' => parse_protein_identity(input),
b'(' => {
if bytes.len() >= 3 && bytes[1] == b'=' && bytes[2] == b')' {
return parse_protein_identity(input);
}
if let Ok(result) = parse_uncertain_extension_annotation(input) {
return Ok(result);
}
Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)))
}
b'0' => parse_protein_no_protein(input),
b'?' => parse_protein_unknown(input),
b'[' => {
let (remaining, count) = parse_repeat_count(input)?;
Ok((
remaining,
ProteinEdit::Repeat {
sequence: AminoAcidSeq::new(Vec::new()),
count,
},
))
}
_ => {
if let Ok(result) = parse_protein_frameshift(input) {
return Ok(result);
}
if let Ok(result) = parse_protein_extension(input) {
return Ok(result);
}
if bytes.len() >= 3 && &bytes[..3] == b"dup" {
return parse_protein_duplication(input);
}
if let Ok(result) = parse_protein_repeat(input) {
return Ok(result);
}
parse_protein_substitution(input)
}
}
}
// Unit tests for the nucleic-acid (`parse_na_edit`) and protein
// (`parse_protein_edit`) edit parsers. Each test feeds a literal edit string
// to the parser, checks that all input was consumed, and inspects the
// resulting variant.
#[cfg(test)]
mod tests {
use super::*;
use crate::hgvs::edit::InsertedPart;
// `A>G` parses as a substitution.
#[test]
fn test_parse_substitution() {
let (remaining, edit) = parse_na_edit("A>G").unwrap();
assert_eq!(remaining, "");
assert!(matches!(edit, NaEdit::Substitution { .. }));
}
// IUPAC ambiguity codes (`H`, `R`, `Y`) are accepted as substitution bases.
// NOTE(review): the second and third checks have no `else` branch, so they
// pass silently if the edit is not a `Substitution`.
#[test]
fn test_parse_substitution_iupac_ambiguity() {
let (remaining, edit) = parse_na_edit("G>H").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Substitution {
reference,
alternative,
} = edit
{
assert_eq!(reference, Base::G);
assert_eq!(alternative, Base::H);
} else {
panic!("Expected substitution");
}
let (remaining, edit) = parse_na_edit("A>R").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Substitution { alternative, .. } = edit {
assert_eq!(alternative, Base::R);
}
let (remaining, edit) = parse_na_edit("C>Y").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Substitution { alternative, .. } = edit {
assert_eq!(alternative, Base::Y);
}
}
// Bare `del` carries neither a sequence nor a length.
#[test]
fn test_parse_deletion_simple() {
let (remaining, edit) = parse_na_edit("del").unwrap();
assert_eq!(remaining, "");
assert!(matches!(
edit,
NaEdit::Deletion {
sequence: None,
length: None
}
));
}
// `delATG` records the three deleted bases as a sequence.
#[test]
fn test_parse_deletion_with_seq() {
let (remaining, edit) = parse_na_edit("delATG").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Deletion {
sequence: Some(seq),
length: None,
} = edit
{
assert_eq!(seq.len(), 3);
} else {
panic!("Expected deletion with sequence");
}
}
// `del101` records a numeric length instead of a sequence.
#[test]
fn test_parse_deletion_with_length() {
let (remaining, edit) = parse_na_edit("del101").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Deletion {
sequence: None,
length: Some(len),
} = edit
{
assert_eq!(len, 101);
} else {
panic!("Expected deletion with length");
}
}
// `insATG` is an insertion whose sequence reports a length of 3.
#[test]
fn test_parse_insertion() {
let (remaining, edit) = parse_na_edit("insATG").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Insertion { sequence } = edit {
assert_eq!(sequence.len(), Some(3));
} else {
panic!("Expected insertion");
}
}
// `delinsATG` parses as a deletion-insertion.
#[test]
fn test_parse_delins() {
let (remaining, edit) = parse_na_edit("delinsATG").unwrap();
assert_eq!(remaining, "");
assert!(matches!(edit, NaEdit::Delins { .. }));
}
// Bare `dup` carries neither a sequence nor a length.
#[test]
fn test_parse_duplication() {
let (remaining, edit) = parse_na_edit("dup").unwrap();
assert_eq!(remaining, "");
assert!(matches!(
edit,
NaEdit::Duplication {
sequence: None,
length: None,
..
}
));
}
// `dup101` records a numeric length.
#[test]
fn test_parse_duplication_with_length() {
let (remaining, edit) = parse_na_edit("dup101").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Duplication {
sequence: None,
length: Some(len),
..
} = edit
{
assert_eq!(len, 101);
} else {
panic!("Expected duplication with length");
}
}
// Bare `inv` carries neither a sequence nor a length.
#[test]
fn test_parse_inversion() {
let (remaining, edit) = parse_na_edit("inv").unwrap();
assert_eq!(remaining, "");
assert!(matches!(
edit,
NaEdit::Inversion {
sequence: None,
length: None
}
));
}
// `inv3` records a numeric length.
#[test]
fn test_parse_inversion_with_length() {
let (remaining, edit) = parse_na_edit("inv3").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Inversion {
sequence: None,
length: Some(len),
} = edit
{
assert_eq!(len, 3);
} else {
panic!("Expected inversion with length");
}
}
// `invATG` records the inverted bases as a sequence.
#[test]
fn test_parse_inversion_with_sequence() {
let (remaining, edit) = parse_na_edit("invATG").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Inversion {
sequence: Some(seq),
length: None,
} = edit
{
assert_eq!(seq.len(), 3);
} else {
panic!("Expected inversion with sequence");
}
}
// `CAG[12]` is a repeat with an exact count and no additional counts.
#[test]
fn test_parse_repeat_exact() {
let (remaining, edit) = parse_na_edit("CAG[12]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Repeat {
count,
additional_counts,
..
} = edit
{
assert!(matches!(count, RepeatCount::Exact(12)));
assert!(additional_counts.is_empty());
} else {
panic!("Expected repeat");
}
}
// `[10_15]` is a repeat with a ranged count and no leading sequence.
#[test]
fn test_parse_repeat_range() {
let (remaining, edit) = parse_na_edit("[10_15]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Repeat {
count,
additional_counts,
..
} = edit
{
assert!(matches!(count, RepeatCount::Range(10, 15)));
assert!(additional_counts.is_empty());
} else {
panic!("Expected repeat");
}
}
// Several bracketed counts in a row: the first becomes `count`, the rest
// land in `additional_counts` in input order.
#[test]
fn test_parse_repeat_genotype() {
let (remaining, edit) = parse_na_edit("A[6][1]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Repeat {
sequence,
count,
additional_counts,
trailing,
} = edit
{
assert!(sequence.is_some());
assert!(matches!(count, RepeatCount::Exact(6)));
assert_eq!(additional_counts.len(), 1);
assert!(matches!(additional_counts[0], RepeatCount::Exact(1)));
assert!(trailing.is_none());
} else {
panic!("Expected repeat");
}
let (remaining, edit) = parse_na_edit("[4][5][6]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Repeat {
count,
additional_counts,
..
} = edit
{
assert!(matches!(count, RepeatCount::Exact(4)));
assert_eq!(additional_counts.len(), 2);
assert!(matches!(additional_counts[0], RepeatCount::Exact(5)));
assert!(matches!(additional_counts[1], RepeatCount::Exact(6)));
} else {
panic!("Expected repeat");
}
}
// Bare `=` parses as an identity edit.
#[test]
fn test_parse_identity() {
let (remaining, edit) = parse_na_edit("=").unwrap();
assert_eq!(remaining, "");
assert!(matches!(edit, NaEdit::Identity { .. }));
}
// `G=` and `ATG=` are identities that keep the stated reference bases and
// are not flagged as whole-entity.
#[test]
fn test_parse_identity_with_ref_base() {
let (remaining, edit) = parse_na_edit("G=").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Identity {
sequence: Some(seq),
whole_entity: false,
} = edit
{
assert_eq!(seq.len(), 1);
} else {
panic!("Expected identity with sequence");
}
let (remaining, edit) = parse_na_edit("ATG=").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Identity {
sequence: Some(seq),
whole_entity: false,
} = edit
{
assert_eq!(seq.len(), 3);
} else {
panic!("Expected identity with sequence");
}
}
// A bare base string with no operator (`TCACA`) is parsed as an identity
// carrying that sequence.
#[test]
fn test_parse_reference_sequence() {
let (remaining, edit) = parse_na_edit("TCACA").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Identity {
sequence: Some(seq),
whole_entity: false,
} = edit
{
assert_eq!(seq.len(), 5);
assert_eq!(seq.to_string(), "TCACA");
} else {
panic!("Expected identity with sequence, got {:?}", edit);
}
let (remaining, edit) = parse_na_edit("ATGCATGC").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Identity {
sequence: Some(seq),
..
} = edit
{
assert_eq!(seq.len(), 8);
} else {
panic!("Expected identity with sequence");
}
}
// A lone three-letter amino acid (`Glu`) parses as a protein substitution.
#[test]
fn test_parse_protein_substitution() {
let (remaining, edit) = parse_protein_edit("Glu").unwrap();
assert_eq!(remaining, "");
assert!(matches!(edit, ProteinEdit::Substitution { .. }));
}
// Frameshift with and without a leading amino acid; the `Ter<N>` suffix
// supplies the termination position.
#[test]
fn test_parse_protein_frameshift() {
let (remaining, edit) = parse_protein_edit("fsTer12").unwrap();
assert_eq!(remaining, "");
if let ProteinEdit::Frameshift { new_aa, ter_pos } = edit {
assert_eq!(new_aa, None);
assert_eq!(ter_pos, Some(12));
} else {
panic!("Expected frameshift");
}
let (remaining, edit) = parse_protein_edit("ProfsTer23").unwrap();
assert_eq!(remaining, "");
if let ProteinEdit::Frameshift { new_aa, ter_pos } = edit {
assert_eq!(new_aa, Some(crate::hgvs::location::AminoAcid::Pro));
assert_eq!(ter_pos, Some(23));
} else {
panic!("Expected frameshift");
}
}
// Protein insertions of one and two amino acids.
#[test]
fn test_parse_protein_insertion() {
let (remaining, edit) = parse_protein_edit("insGln").unwrap();
assert_eq!(remaining, "");
if let ProteinEdit::Insertion { sequence } = edit {
assert_eq!(sequence.len(), 1);
} else {
panic!("Expected insertion");
}
let (remaining, edit) = parse_protein_edit("insGlyPro").unwrap();
assert_eq!(remaining, "");
if let ProteinEdit::Insertion { sequence } = edit {
assert_eq!(sequence.len(), 2);
} else {
panic!("Expected insertion");
}
}
// Protein deletion-insertion of two amino acids.
#[test]
fn test_parse_protein_delins() {
let (remaining, edit) = parse_protein_edit("delinsTrpVal").unwrap();
assert_eq!(remaining, "");
if let ProteinEdit::Delins { sequence } = edit {
assert_eq!(sequence.len(), 2);
} else {
panic!("Expected delins");
}
}
// Protein extensions: N-terminal (`ext-5`, negative count), C-terminal with
// a count (`extTer17`, `Glnext*17`) and C-terminal with unknown length
// (`ext*?`).
#[test]
fn test_parse_protein_extension() {
let (remaining, edit) = parse_protein_edit("ext-5").unwrap();
assert_eq!(remaining, "");
if let ProteinEdit::Extension {
new_aa,
direction,
count,
} = edit
{
assert_eq!(new_aa, None);
assert_eq!(direction, ExtDirection::NTerminal);
assert_eq!(count, Some(-5));
} else {
panic!("Expected extension");
}
let (remaining, edit) = parse_protein_edit("extTer17").unwrap();
assert_eq!(remaining, "");
if let ProteinEdit::Extension {
new_aa,
direction,
count,
} = edit
{
assert_eq!(new_aa, None);
assert_eq!(direction, ExtDirection::CTerminal);
assert_eq!(count, Some(17));
} else {
panic!("Expected extension");
}
let (remaining, edit) = parse_protein_edit("ext*?").unwrap();
assert_eq!(remaining, "");
if let ProteinEdit::Extension {
new_aa,
direction,
count,
} = edit
{
assert_eq!(new_aa, None);
assert_eq!(direction, ExtDirection::CTerminal);
assert_eq!(count, None);
} else {
panic!("Expected extension");
}
let (remaining, edit) = parse_protein_edit("Glnext*17").unwrap();
assert_eq!(remaining, "");
if let ProteinEdit::Extension {
new_aa,
direction,
count,
} = edit
{
assert_eq!(new_aa, Some(crate::hgvs::location::AminoAcid::Gln));
assert_eq!(direction, ExtDirection::CTerminal);
assert_eq!(count, Some(17));
} else {
panic!("Expected extension");
}
}
// `ins10` is an insertion described only by a base count.
#[test]
fn test_parse_insertion_count() {
let (remaining, edit) = parse_na_edit("ins10").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Insertion { sequence } = edit {
assert!(matches!(sequence, InsertedSequence::Count(10)));
} else {
panic!("Expected insertion");
}
}
// `ins(10)` yields the same `Count(10)` as the unparenthesised form.
#[test]
fn test_parse_insertion_parenthesized_count() {
let (remaining, edit) = parse_na_edit("ins(10)").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Insertion { sequence } = edit {
assert!(matches!(sequence, InsertedSequence::Count(10)));
} else {
panic!("Expected insertion");
}
}
// `ins(10_20)` is an insertion with a ranged count.
#[test]
fn test_parse_insertion_range() {
let (remaining, edit) = parse_na_edit("ins(10_20)").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Insertion { sequence } = edit {
assert!(matches!(sequence, InsertedSequence::Range(10, 20)));
} else {
panic!("Expected insertion");
}
}
// Single-base repeat insertions with exact and ranged counts.
#[test]
fn test_parse_insertion_repeat() {
let (remaining, edit) = parse_na_edit("insA[10]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Insertion { sequence } = edit {
if let InsertedSequence::Repeat { base, count } = sequence {
assert_eq!(base, Base::A);
assert!(matches!(count, RepeatCount::Exact(10)));
} else {
panic!("Expected repeat insertion");
}
} else {
panic!("Expected insertion");
}
let (remaining, edit) = parse_na_edit("insN[15]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Insertion { sequence } = edit {
if let InsertedSequence::Repeat { base, count } = sequence {
assert_eq!(base, Base::N);
assert!(matches!(count, RepeatCount::Exact(15)));
} else {
panic!("Expected repeat insertion");
}
} else {
panic!("Expected insertion");
}
let (remaining, edit) = parse_na_edit("insN[15_30]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Insertion { sequence } = edit {
if let InsertedSequence::Repeat { base, count } = sequence {
assert_eq!(base, Base::N);
assert!(matches!(count, RepeatCount::Range(15, 30)));
} else {
panic!("Expected repeat insertion");
}
} else {
panic!("Expected insertion");
}
}
// A bracketed, semicolon-separated insertion yields one part per element.
#[test]
fn test_parse_insertion_complex() {
let (remaining, edit) = parse_na_edit("ins[A[10];T]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Insertion { sequence } = edit {
if let InsertedSequence::Complex(parts) = sequence {
assert_eq!(parts.len(), 2);
} else {
panic!("Expected complex insertion");
}
} else {
panic!("Expected insertion");
}
}
// `delins10` is a deletion-insertion described only by a base count.
#[test]
fn test_parse_delins_count() {
let (remaining, edit) = parse_na_edit("delins10").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Delins { sequence } = edit {
assert!(matches!(sequence, InsertedSequence::Count(10)));
} else {
panic!("Expected delins");
}
}
// `delinsN[12]` is a deletion-insertion of a repeated single base.
#[test]
fn test_parse_delins_n_repeat() {
let (remaining, edit) = parse_na_edit("delinsN[12]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Delins { sequence } = edit {
if let InsertedSequence::Repeat { base, count } = sequence {
assert_eq!(base, Base::N);
assert!(matches!(count, RepeatCount::Exact(12)));
} else {
panic!("Expected repeat delins");
}
} else {
panic!("Expected delins");
}
}
// `|gom` maps to gain of methylation.
#[test]
fn test_parse_methylation_gom() {
let (remaining, edit) = parse_na_edit("|gom").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Methylation { status } = edit {
assert_eq!(status, MethylationStatus::GainOfMethylation);
} else {
panic!("Expected methylation");
}
}
// `|lom` maps to loss of methylation.
#[test]
fn test_parse_methylation_lom() {
let (remaining, edit) = parse_na_edit("|lom").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Methylation { status } = edit {
assert_eq!(status, MethylationStatus::LossOfMethylation);
} else {
panic!("Expected methylation");
}
}
// `|met=` maps to unchanged methylation.
#[test]
fn test_parse_methylation_unchanged() {
let (remaining, edit) = parse_na_edit("|met=").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Methylation { status } = edit {
assert_eq!(status, MethylationStatus::Unchanged);
} else {
panic!("Expected methylation");
}
}
// A bracketed insertion holding a single `accession:g.range` is kept
// verbatim as an external reference string.
#[test]
fn test_parse_insertion_external_sequence_reference() {
let (remaining, edit) = parse_na_edit("ins[PQ998981.1:g.1_6057]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Insertion { sequence } = edit {
if let InsertedSequence::Reference(ref_str) = sequence {
assert_eq!(ref_str, "PQ998981.1:g.1_6057");
} else {
panic!("Expected reference insertion, got {:?}", sequence);
}
} else {
panic!("Expected insertion");
}
let (remaining, edit) = parse_na_edit("ins[MF045863.1:g.1_36978]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Insertion { sequence } = edit {
if let InsertedSequence::Reference(ref_str) = sequence {
assert_eq!(ref_str, "MF045863.1:g.1_36978");
} else {
panic!("Expected reference insertion, got {:?}", sequence);
}
} else {
panic!("Expected insertion");
}
let (remaining, edit) = parse_na_edit("ins[MT113356.1:g.1_2409]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Insertion { sequence } = edit {
if let InsertedSequence::Reference(ref_str) = sequence {
assert_eq!(ref_str, "MT113356.1:g.1_2409");
} else {
panic!("Expected reference insertion, got {:?}", sequence);
}
} else {
panic!("Expected insertion");
}
}
// Accession-prefix detection: a range of accession styles is accepted;
// a string starting with digits and one starting with `:` are rejected.
#[test]
fn test_is_reference_accession_prefix() {
assert!(is_reference_accession_prefix("NC_000001.11:g.123"));
assert!(is_reference_accession_prefix("NM_000001.1:c.123"));
assert!(is_reference_accession_prefix("NG_000001.1:g.123"));
assert!(is_reference_accession_prefix("ENST00000123456:c.123"));
assert!(is_reference_accession_prefix("ENSP00000123456:p.123"));
assert!(is_reference_accession_prefix("LRG_1:g.123"));
assert!(is_reference_accession_prefix("PQ998981.1:g.1"));
assert!(is_reference_accession_prefix("MF045863.1:g.1"));
assert!(is_reference_accession_prefix("MT113356.1:g.1"));
assert!(is_reference_accession_prefix("KT192064.1:1"));
assert!(is_reference_accession_prefix("KY923049.1:g.1"));
assert!(is_reference_accession_prefix("DQ831669.1:1"));
assert!(is_reference_accession_prefix("AC010542.7:g.1"));
assert!(is_reference_accession_prefix("AB191243.1:g.1"));
assert!(is_reference_accession_prefix("PP887427.1:g.1"));
assert!(is_reference_accession_prefix("U12345.1:g.1"));
assert!(!is_reference_accession_prefix("123ABC:g.1"));
assert!(!is_reference_accession_prefix(":g.1"));
}
// A complex insertion mixing a literal sequence with an external reference.
#[test]
fn test_parse_complex_insertion_with_external_ref() {
let (remaining, edit) = parse_na_edit("ins[TCTT;KT192064.1:1_310]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Insertion { sequence } = edit {
if let InsertedSequence::Complex(parts) = sequence {
assert_eq!(parts.len(), 2);
assert!(matches!(&parts[0], InsertedPart::Literal(_)));
if let InsertedPart::ExternalRef(ref_str) = &parts[1] {
assert_eq!(ref_str, "KT192064.1:1_310");
} else {
panic!("Expected external reference, got {:?}", parts[1]);
}
} else {
panic!("Expected complex insertion, got {:?}", sequence);
}
} else {
panic!("Expected insertion");
}
}
// A complex delins with a literal, a plain position range and an inverted
// position range (`…inv` suffix).
#[test]
fn test_parse_complex_delins_with_inversion() {
let (remaining, edit) =
parse_na_edit("delins[AGAAGGAAATTT;45310743_46521014;45043709_45310738inv]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Delins { sequence } = edit {
if let InsertedSequence::Complex(parts) = sequence {
assert_eq!(parts.len(), 3);
assert!(matches!(&parts[0], InsertedPart::Literal(_)));
assert!(matches!(
&parts[1],
InsertedPart::PositionRange {
start: 45310743,
end: 46521014
}
));
assert!(matches!(
&parts[2],
InsertedPart::PositionRangeInv {
start: 45043709,
end: 45310738
}
));
} else {
panic!("Expected complex delins, got {:?}", sequence);
}
} else {
panic!("Expected delins");
}
}
// A complex insertion combining a position range, an external reference
// and a literal sequence, in that order.
#[test]
fn test_parse_insertion_with_multiple_external_refs() {
let (remaining, edit) =
parse_na_edit("ins[80114172_80114186;NC_000020.11:g.2823027_2826302;AAA]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Insertion { sequence } = edit {
if let InsertedSequence::Complex(parts) = sequence {
assert_eq!(parts.len(), 3);
assert!(matches!(
&parts[0],
InsertedPart::PositionRange {
start: 80114172,
end: 80114186
}
));
if let InsertedPart::ExternalRef(ref_str) = &parts[1] {
assert_eq!(ref_str, "NC_000020.11:g.2823027_2826302");
} else {
panic!("Expected external reference, got {:?}", parts[1]);
}
assert!(matches!(&parts[2], InsertedPart::Literal(_)));
} else {
panic!("Expected complex insertion, got {:?}", sequence);
}
} else {
panic!("Expected insertion");
}
}
// Substitutions written without a reference base (`>A`, `>G`, `>C`, `>T`)
// parse as `SubstitutionNoRef`.
#[test]
fn test_parse_substitution_no_ref() {
let (remaining, edit) = parse_na_edit(">A").unwrap();
assert_eq!(remaining, "");
if let NaEdit::SubstitutionNoRef { alternative } = edit {
assert_eq!(alternative, Base::A);
} else {
panic!("Expected substitution without ref, got {:?}", edit);
}
let (remaining, edit) = parse_na_edit(">G").unwrap();
assert_eq!(remaining, "");
if let NaEdit::SubstitutionNoRef { alternative } = edit {
assert_eq!(alternative, Base::G);
} else {
panic!("Expected substitution without ref");
}
let (remaining, edit) = parse_na_edit(">C").unwrap();
assert_eq!(remaining, "");
if let NaEdit::SubstitutionNoRef { alternative } = edit {
assert_eq!(alternative, Base::C);
} else {
panic!("Expected substitution without ref");
}
let (remaining, edit) = parse_na_edit(">T").unwrap();
assert_eq!(remaining, "");
if let NaEdit::SubstitutionNoRef { alternative } = edit {
assert_eq!(alternative, Base::T);
} else {
panic!("Expected substitution without ref");
}
}
// Deletion-insertions of a repeated multi-base sequence, with exact and
// ranged repeat counts.
#[test]
fn test_parse_delins_sequence_repeat() {
let (remaining, edit) = parse_na_edit("delinsTCGGCAGCGGCACAGCGAGG[13]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Delins { sequence } = edit {
if let InsertedSequence::SequenceRepeat {
sequence: seq,
count,
} = sequence
{
assert_eq!(seq.len(), 20);
assert!(matches!(count, RepeatCount::Exact(13)));
} else {
panic!("Expected sequence repeat delins, got {:?}", sequence);
}
} else {
panic!("Expected delins");
}
let (remaining, edit) = parse_na_edit("delinsAAAGG[400_2000]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Delins { sequence } = edit {
if let InsertedSequence::SequenceRepeat {
sequence: seq,
count,
} = sequence
{
assert_eq!(seq.len(), 5);
assert!(matches!(count, RepeatCount::Range(400, 2000)));
} else {
panic!("Expected sequence repeat delins, got {:?}", sequence);
}
} else {
panic!("Expected delins");
}
}
// Insertion of a repeated multi-base sequence with an exact repeat count.
#[test]
fn test_parse_insertion_sequence_repeat() {
let (remaining, edit) = parse_na_edit("insTCGGCAGCGGCACAGCGAGG[13]").unwrap();
assert_eq!(remaining, "");
if let NaEdit::Insertion { sequence } = edit {
if let InsertedSequence::SequenceRepeat {
sequence: seq,
count,
} = sequence
{
assert_eq!(seq.len(), 20);
assert!(matches!(count, RepeatCount::Exact(13)));
} else {
panic!("Expected sequence repeat insertion, got {:?}", sequence);
}
} else {
panic!("Expected insertion");
}
}
}