use super::context::MaveContext;
use crate::hgvs::parser::parse_hgvs;
use crate::hgvs::variant::HgvsVariant;
use std::fmt;
/// Errors reported while parsing MAVE-style HGVS strings.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MaveParseError {
/// The input was empty once surrounding whitespace was trimmed.
EmptyInput,
/// The input did not start with a recognised `<letter>.` coordinate prefix;
/// carries the offending input.
UnknownCoordinateType(String),
/// The context supplied no accession for the detected coordinate type.
/// `coord_type` is the detected prefix letter and `input` the trimmed input.
MissingAccession { coord_type: char, input: String },
/// The underlying HGVS parser rejected the string. `input` is the trimmed
/// input as given by the caller and `message` the parser error rendered as text.
HgvsParseError { input: String, message: String },
/// The input already carries an accession; carries that input.
/// NOTE(review): not constructed by the parsing functions visible alongside
/// this definition, which pass such input through to `parse_hgvs` instead.
AlreadyHasAccession(String),
}
impl fmt::Display for MaveParseError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::EmptyInput => write!(f, "Empty input"),
Self::UnknownCoordinateType(input) => {
write!(f, "Cannot determine coordinate type from '{}'", input)
}
Self::MissingAccession { coord_type, input } => {
write!(
f,
"No accession in context for '{}.' variants: '{}'",
coord_type, input
)
}
Self::HgvsParseError { input, message } => {
write!(f, "Failed to parse '{}': {}", input, message)
}
Self::AlreadyHasAccession(input) => {
write!(
f,
"Input already has accession, use parse_hgvs directly: '{}'",
input
)
}
}
}
}
// Empty impl: the trait's default methods are used unchanged, which is enough
// for `MaveParseError` to be usable as a `dyn std::error::Error`.
impl std::error::Error for MaveParseError {}
/// Parses a MAVE-style HGVS string, supplying the accession from `context`
/// when the input is a bare short form such as `p.Glu6Val`.
///
/// Surrounding whitespace is trimmed first. Input containing a `:` is treated
/// as already carrying an accession and is handed to `parse_hgvs` unchanged.
/// Otherwise the coordinate prefix is detected, the matching accession is
/// looked up in `context`, and `"<accession>:<input>"` is parsed.
///
/// # Errors
///
/// * [`MaveParseError::EmptyInput`] if the trimmed input is empty.
/// * [`MaveParseError::UnknownCoordinateType`] if a short form has no
///   recognised coordinate prefix.
/// * [`MaveParseError::MissingAccession`] if `context` has no accession for
///   the detected coordinate type.
/// * [`MaveParseError::HgvsParseError`] if `parse_hgvs` rejects the string.
///   The `input` field always holds the trimmed caller input, not the
///   accession-expanded string.
pub fn parse_mave_hgvs(input: &str, context: &MaveContext) -> Result<HgvsVariant, MaveParseError> {
    let trimmed = input.trim();
    if trimmed.is_empty() {
        return Err(MaveParseError::EmptyInput);
    }

    // `None` means the input is already accession-qualified and is parsed
    // as-is; `Some` holds the short form prefixed with the context accession.
    let expanded = if trimmed.contains(':') {
        None
    } else {
        let kind = detect_coordinate_type(trimmed)?;
        match context.accession_for_coordinate_type(kind) {
            Some(accession) => Some(format!("{}:{}", accession, trimmed)),
            None => {
                return Err(MaveParseError::MissingAccession {
                    coord_type: kind,
                    input: trimmed.to_string(),
                });
            }
        }
    };

    let target = expanded.as_deref().unwrap_or(trimmed);
    parse_hgvs(target).map_err(|err| MaveParseError::HgvsParseError {
        input: trimmed.to_string(),
        message: err.to_string(),
    })
}
/// Parses either a full HGVS string (with accession) or a MAVE short form,
/// resolving short forms against `context`.
///
/// # Errors
///
/// Returns the same [`MaveParseError`] values as [`parse_mave_hgvs`]:
/// `EmptyInput` for blank input, `HgvsParseError` when the HGVS parser rejects
/// the string, and `UnknownCoordinateType` / `MissingAccession` for short
/// forms that cannot be resolved.
pub fn parse_mave_hgvs_lenient(
    input: &str,
    context: &MaveContext,
) -> Result<HgvsVariant, MaveParseError> {
    // `parse_mave_hgvs` already trims, rejects empty input and passes
    // accession-qualified input straight to `parse_hgvs`, so delegating
    // yields exactly the same results for every input.
    parse_mave_hgvs(input, context)
}
/// Detects the HGVS coordinate-type prefix of a short-form variant string.
///
/// The input must start with a single letter followed by `.`; the letter is
/// compared case-insensitively (ASCII) against the known prefixes
/// `p`, `c`, `g`, `n`, `r`, `m` and `o`.
///
/// Returns the prefix letter in lowercase.
///
/// # Errors
///
/// Returns [`MaveParseError::UnknownCoordinateType`] carrying the input when
/// it is shorter than two characters, its second character is not `.`, or the
/// first character is not a known prefix.
fn detect_coordinate_type(input: &str) -> Result<char, MaveParseError> {
    const KNOWN_PREFIXES: [char; 7] = ['p', 'c', 'g', 'n', 'r', 'm', 'o'];

    // Only the first two characters matter, so read them straight from the
    // iterator instead of collecting the whole input into a Vec.
    let mut chars = input.chars();
    if let (Some(first), Some('.')) = (chars.next(), chars.next()) {
        let coord = first.to_ascii_lowercase();
        if KNOWN_PREFIXES.contains(&coord) {
            return Ok(coord);
        }
    }
    Err(MaveParseError::UnknownCoordinateType(input.to_string()))
}
/// Reports whether `input` looks like a MAVE short-form variant: non-empty
/// after trimming, no `:` (i.e. no accession part), and starting with a
/// coordinate prefix that `detect_coordinate_type` accepts.
pub fn is_mave_short_form(input: &str) -> bool {
    let candidate = input.trim();
    if candidate.is_empty() || candidate.contains(':') {
        return false;
    }
    detect_coordinate_type(candidate).is_ok()
}
// Unit tests for the MAVE short-form parsing helpers in this module.
#[cfg(test)]
mod tests {
use super::*;
// Context with protein, coding and genomic accessions plus a gene symbol;
// shared by most tests below.
fn test_context() -> MaveContext {
MaveContext::new()
.with_protein_accession("NP_000509.1")
.with_coding_accession("NM_000518.5")
.with_genomic_accession("NC_000011.10")
.with_gene_symbol("HBB")
}
// --- Short forms: the accession is taken from the context ---
#[test]
fn test_parse_protein_variant() {
let ctx = test_context();
let result = parse_mave_hgvs("p.Glu6Val", &ctx);
assert!(result.is_ok());
let variant = result.unwrap();
assert_eq!(variant.to_string(), "NP_000509.1:p.Glu6Val");
}
#[test]
fn test_parse_protein_missense() {
let ctx = test_context();
let result = parse_mave_hgvs("p.Leu11Pro", &ctx);
assert!(result.is_ok());
let variant = result.unwrap();
assert_eq!(variant.to_string(), "NP_000509.1:p.Leu11Pro");
}
#[test]
fn test_parse_coding_variant() {
let ctx = test_context();
let result = parse_mave_hgvs("c.20A>T", &ctx);
assert!(result.is_ok());
let variant = result.unwrap();
assert_eq!(variant.to_string(), "NM_000518.5:c.20A>T");
}
#[test]
fn test_parse_genomic_variant() {
let ctx = test_context();
let result = parse_mave_hgvs("g.12345A>G", &ctx);
assert!(result.is_ok());
let variant = result.unwrap();
assert_eq!(variant.to_string(), "NC_000011.10:g.12345A>G");
}
// Input that already carries an accession is parsed as given.
#[test]
fn test_parse_full_form_passthrough() {
let ctx = test_context();
let result = parse_mave_hgvs("NP_000509.1:p.Glu6Val", &ctx);
assert!(result.is_ok());
let variant = result.unwrap();
assert_eq!(variant.to_string(), "NP_000509.1:p.Glu6Val");
}
// --- Error variants returned by parse_mave_hgvs ---
#[test]
fn test_parse_empty_input() {
let ctx = test_context();
let result = parse_mave_hgvs("", &ctx);
assert!(matches!(result, Err(MaveParseError::EmptyInput)));
}
#[test]
fn test_parse_unknown_coordinate_type() {
let ctx = test_context();
let result = parse_mave_hgvs("x.123", &ctx);
assert!(matches!(
result,
Err(MaveParseError::UnknownCoordinateType(_))
));
}
// The context only has a protein accession, so a `c.` variant cannot be resolved.
#[test]
fn test_parse_missing_accession() {
let ctx = MaveContext::new().with_protein_accession("NP_000509.1");
let result = parse_mave_hgvs("c.20A>T", &ctx);
assert!(matches!(
result,
Err(MaveParseError::MissingAccession { .. })
));
}
// --- Other protein edit types ---
#[test]
fn test_parse_protein_deletion() {
let ctx = test_context();
let result = parse_mave_hgvs("p.Leu11del", &ctx);
assert!(result.is_ok());
let variant = result.unwrap();
assert!(variant.to_string().contains("p.Leu11del"));
}
#[test]
fn test_parse_protein_insertion() {
let ctx = test_context();
let result = parse_mave_hgvs("p.Leu11_Pro12insAla", &ctx);
assert!(result.is_ok());
}
#[test]
fn test_parse_protein_frameshift() {
let ctx = test_context();
let result = parse_mave_hgvs("p.Glu6fs", &ctx);
assert!(result.is_ok());
}
#[test]
fn test_parse_protein_extension() {
let ctx = test_context();
let result = parse_mave_hgvs("p.Ter147Tyrext*30", &ctx);
assert!(result.is_ok());
}
// --- detect_coordinate_type ---
#[test]
fn test_detect_coordinate_type_protein() {
assert_eq!(detect_coordinate_type("p.Glu6Val"), Ok('p'));
}
#[test]
fn test_detect_coordinate_type_coding() {
assert_eq!(detect_coordinate_type("c.20A>T"), Ok('c'));
}
#[test]
fn test_detect_coordinate_type_genomic() {
assert_eq!(detect_coordinate_type("g.12345A>G"), Ok('g'));
}
#[test]
fn test_detect_coordinate_type_noncoding() {
assert_eq!(detect_coordinate_type("n.100A>G"), Ok('n'));
}
#[test]
fn test_detect_coordinate_type_invalid() {
assert!(detect_coordinate_type("invalid").is_err());
assert!(detect_coordinate_type("x.123").is_err());
}
// --- is_mave_short_form ---
#[test]
fn test_is_mave_short_form() {
assert!(is_mave_short_form("p.Glu6Val"));
assert!(is_mave_short_form("c.20A>T"));
assert!(!is_mave_short_form("NP_000509.1:p.Glu6Val"));
assert!(!is_mave_short_form(""));
assert!(!is_mave_short_form("invalid"));
}
// --- parse_mave_hgvs_lenient accepts both short and full forms ---
#[test]
fn test_lenient_parse_short_form() {
let ctx = test_context();
let result = parse_mave_hgvs_lenient("p.Glu6Val", &ctx);
assert!(result.is_ok());
}
#[test]
fn test_lenient_parse_full_form() {
let ctx = test_context();
let result = parse_mave_hgvs_lenient("NP_000509.1:p.Glu6Val", &ctx);
assert!(result.is_ok());
}
// Tolerates a parse failure; only requires that a failure, if any, is not
// MissingAccession. A successful parse is accepted without further checks.
#[test]
fn test_allele_variant() {
let ctx = test_context();
let result = parse_mave_hgvs("c.[32T>C;39G>A]", &ctx);
if let Err(e) = &result {
assert!(
!matches!(e, MaveParseError::MissingAccession { .. }),
"Should have injected accession"
);
}
}
#[test]
fn test_error_display() {
let err = MaveParseError::EmptyInput;
assert_eq!(err.to_string(), "Empty input");
let err = MaveParseError::MissingAccession {
coord_type: 'c',
input: "c.20A>T".to_string(),
};
assert!(err.to_string().contains("No accession"));
}
// --- Allele (bracketed, semicolon-separated) notation ---
#[test]
fn test_allele_simple_coding() {
let ctx = test_context();
let result = parse_mave_hgvs("c.[20A>T;30G>C]", &ctx);
assert!(result.is_ok(), "Failed to parse: {:?}", result.err());
let variant = result.unwrap();
let output = variant.to_string();
assert!(
output.contains("NM_000518.5"),
"Output should contain accession: {}",
output
);
assert!(
output.contains("c.[") || output.contains("c.20"),
"Output should contain allele notation: {}",
output
);
}
#[test]
fn test_allele_multiple_variants() {
let ctx = test_context();
let result = parse_mave_hgvs("c.[32T>C;39G>A;42A>G]", &ctx);
assert!(result.is_ok(), "Failed to parse multi-variant allele");
}
// The accession check only runs when parsing succeeds; a parse failure does
// not fail this test.
#[test]
fn test_allele_protein() {
let ctx = test_context();
let result = parse_mave_hgvs("p.[Glu6Val;Lys17Arg]", &ctx);
if let Ok(variant) = result {
let output = variant.to_string();
assert!(
output.contains("NP_000509.1"),
"Output should contain protein accession: {}",
output
);
}
}
// Accepts either outcome: a failure must not be MissingAccession, and a
// success must render with the genomic accession.
#[test]
fn test_allele_genomic() {
let ctx = test_context();
let result = parse_mave_hgvs("g.[12345A>G;12350C>T]", &ctx);
if let Err(e) = &result {
assert!(
!matches!(e, MaveParseError::MissingAccession { .. }),
"Accession should be injected: {:?}",
e
);
} else {
let variant = result.unwrap();
let output = variant.to_string();
assert!(
output.contains("NC_000011.10"),
"Output should contain genomic accession: {}",
output
);
}
}
// When parsing succeeds, the rendered string must itself be accepted by parse_hgvs.
// A parse failure skips all checks.
#[test]
fn test_allele_roundtrip_structure() {
let ctx = test_context();
let input = "c.[20A>T;30G>C]";
let result = parse_mave_hgvs(input, &ctx);
if let Ok(variant) = result {
let output = variant.to_string();
assert!(
output.contains("NM_000518.5"),
"Should contain accession: {}",
output
);
let reparsed = crate::hgvs::parser::parse_hgvs(&output);
assert!(
reparsed.is_ok(),
"Reparsed variant should be valid HGVS: {}",
output
);
}
}
#[test]
fn test_single_variant_in_brackets() {
let ctx = test_context();
let result = parse_mave_hgvs("c.[20A>T]", &ctx);
assert!(result.is_ok(), "Should parse single variant in brackets");
}
#[test]
fn test_mavedb_real_pattern_1() {
let ctx = test_context();
let result = parse_mave_hgvs("c.[32T>C;39G>A;42A>G;81C>T;99T>C;105G>T]", &ctx);
assert!(result.is_ok(), "Should parse real MaveDB allele pattern");
}
#[test]
fn test_mavedb_real_pattern_2() {
let ctx = test_context();
let result = parse_mave_hgvs("c.[16G>C;17A>C;18A>G;39G>A;42A>G]", &ctx);
assert!(result.is_ok(), "Should parse real MaveDB allele pattern");
}
// Only requires that a failure, if any, is not MissingAccession.
#[test]
fn test_protein_complex_allele() {
let ctx = test_context();
let result = parse_mave_hgvs("p.[Glu6Val;Lys17del]", &ctx);
if let Err(e) = &result {
assert!(
!matches!(e, MaveParseError::MissingAccession { .. }),
"Accession should be injected even for complex alleles"
);
}
}
#[test]
fn test_allele_with_whitespace() {
let ctx = test_context();
let result = parse_mave_hgvs(" c.[20A>T;30G>C] ", &ctx);
assert!(result.is_ok(), "Should handle whitespace around allele");
}
// --- Malformed input: whitespace-only, bad prefixes, missing accessions ---
#[test]
fn test_error_recovery_whitespace_only() {
let ctx = test_context();
let result = parse_mave_hgvs(" ", &ctx);
assert!(matches!(result, Err(MaveParseError::EmptyInput)));
}
#[test]
fn test_error_recovery_tab_and_newline() {
let ctx = test_context();
let result = parse_mave_hgvs("\t\n", &ctx);
assert!(matches!(result, Err(MaveParseError::EmptyInput)));
}
// Uppercase prefix: only requires that a failure, if any, is not
// MissingAccession; whether parsing ultimately succeeds is not asserted.
#[test]
fn test_error_recovery_invalid_coord_type_uppercase() {
let ctx = test_context();
let result = parse_mave_hgvs("P.Glu6Val", &ctx);
if let Err(e) = &result {
assert!(
!matches!(e, MaveParseError::MissingAccession { .. }),
"Should inject accession even with uppercase coord type"
);
}
}
#[test]
fn test_error_recovery_invalid_coord_type_z() {
let ctx = test_context();
let result = parse_mave_hgvs("z.123A>G", &ctx);
assert!(matches!(
result,
Err(MaveParseError::UnknownCoordinateType(_))
));
if let Err(e) = result {
let msg = e.to_string();
assert!(msg.contains("z.123A>G"), "Error should include input");
}
}
#[test]
fn test_error_recovery_numeric_coord_type() {
let ctx = test_context();
let result = parse_mave_hgvs("1.123A>G", &ctx);
assert!(matches!(
result,
Err(MaveParseError::UnknownCoordinateType(_))
));
}
#[test]
fn test_error_recovery_no_dot_after_coord() {
let ctx = test_context();
let result = parse_mave_hgvs("cGlu6Val", &ctx);
assert!(matches!(
result,
Err(MaveParseError::UnknownCoordinateType(_))
));
}
// Also checks the payload of the error: the detected prefix and the original input.
#[test]
fn test_error_recovery_missing_protein_accession() {
let ctx = MaveContext::new().with_coding_accession("NM_000518.5");
let result = parse_mave_hgvs("p.Glu6Val", &ctx);
assert!(matches!(
result,
Err(MaveParseError::MissingAccession { .. })
));
if let Err(MaveParseError::MissingAccession { coord_type, input }) = result {
assert_eq!(coord_type, 'p');
assert_eq!(input, "p.Glu6Val");
}
}
#[test]
fn test_error_recovery_missing_genomic_accession() {
let ctx = MaveContext::new().with_protein_accession("NP_000509.1");
let result = parse_mave_hgvs("g.12345A>G", &ctx);
assert!(matches!(
result,
Err(MaveParseError::MissingAccession { .. })
));
}
#[test]
fn test_error_recovery_missing_noncoding_accession() {
let ctx = MaveContext::new().with_protein_accession("NP_000509.1");
let result = parse_mave_hgvs("n.100A>G", &ctx);
assert!(matches!(
result,
Err(MaveParseError::MissingAccession { .. })
));
}
// The reported `input` is the short form as typed, without the injected accession.
#[test]
fn test_error_recovery_invalid_hgvs_after_accession() {
let ctx = test_context();
let result = parse_mave_hgvs("c.invalid_syntax", &ctx);
assert!(matches!(result, Err(MaveParseError::HgvsParseError { .. })));
if let Err(MaveParseError::HgvsParseError { input, message }) = result {
assert_eq!(input, "c.invalid_syntax");
assert!(!message.is_empty());
}
}
#[test]
fn test_error_recovery_invalid_position() {
let ctx = test_context();
let result = parse_mave_hgvs("c.XXX>G", &ctx);
assert!(matches!(result, Err(MaveParseError::HgvsParseError { .. })));
}
// Only requires that a failure, if any, is not MissingAccession.
#[test]
fn test_error_recovery_invalid_protein_aa() {
let ctx = test_context();
let result = parse_mave_hgvs("p.Xxx6Val", &ctx);
if let Err(e) = &result {
assert!(!matches!(e, MaveParseError::MissingAccession { .. }));
}
}
#[test]
fn test_error_recovery_full_form_invalid() {
let ctx = test_context();
let result = parse_mave_hgvs("NP_000509.1:p.invalid", &ctx);
assert!(matches!(result, Err(MaveParseError::HgvsParseError { .. })));
}
// Smoke test: the result is discarded, so this only checks that the call
// returns without panicking.
#[test]
fn test_error_recovery_partial_accession() {
let ctx = test_context();
let result = parse_mave_hgvs("NP:p.Glu6Val", &ctx);
let _ = result;
}
// --- Display output of each error variant ---
#[test]
fn test_error_message_empty_input() {
let err = MaveParseError::EmptyInput;
assert_eq!(format!("{}", err), "Empty input");
}
#[test]
fn test_error_message_unknown_coordinate() {
let err = MaveParseError::UnknownCoordinateType("x.123".to_string());
let msg = format!("{}", err);
assert!(msg.contains("Cannot determine coordinate type"));
assert!(msg.contains("x.123"));
}
#[test]
fn test_error_message_missing_accession() {
let err = MaveParseError::MissingAccession {
coord_type: 'c',
input: "c.20A>T".to_string(),
};
let msg = format!("{}", err);
assert!(msg.contains("No accession"));
assert!(msg.contains("c."));
assert!(msg.contains("c.20A>T"));
}
#[test]
fn test_error_message_hgvs_parse_error() {
let err = MaveParseError::HgvsParseError {
input: "c.invalid".to_string(),
message: "expected number".to_string(),
};
let msg = format!("{}", err);
assert!(msg.contains("Failed to parse"));
assert!(msg.contains("c.invalid"));
assert!(msg.contains("expected number"));
}
#[test]
fn test_error_message_already_has_accession() {
let err = MaveParseError::AlreadyHasAccession("NP_000509.1:p.Glu6Val".to_string());
let msg = format!("{}", err);
assert!(msg.contains("already has accession"));
assert!(msg.contains("NP_000509.1:p.Glu6Val"));
}
// Compile-time check that the error coerces to `dyn std::error::Error`.
#[test]
fn test_error_is_std_error() {
let err = MaveParseError::EmptyInput;
let _: &dyn std::error::Error = &err;
}
// --- Further lenient-parser checks ---
#[test]
fn test_lenient_parse_recovers_from_missing_accession() {
let ctx = test_context();
let short_result = parse_mave_hgvs_lenient("p.Glu6Val", &ctx);
assert!(short_result.is_ok());
let full_result = parse_mave_hgvs_lenient("NP_000509.1:p.Glu6Val", &ctx);
assert!(full_result.is_ok());
let invalid_result = parse_mave_hgvs_lenient("x.123", &ctx);
assert!(invalid_result.is_err());
}
// With an empty context the lenient parser still reports MissingAccession for short forms.
#[test]
fn test_lenient_parse_empty_context() {
let ctx = MaveContext::new();
let result = parse_mave_hgvs_lenient("p.Glu6Val", &ctx);
assert!(matches!(
result,
Err(MaveParseError::MissingAccession { .. })
));
}
// Positive cases first; the final line holds the negative cases (full form,
// empty, whitespace-only, and strings without a known prefix).
#[test]
fn test_is_short_form_edge_cases() {
assert!(is_mave_short_form("p.Glu6Val"));
assert!(is_mave_short_form("c.20A>T"));
assert!(is_mave_short_form("g.12345del"));
assert!(is_mave_short_form("n.100A>G"));
assert!(is_mave_short_form("r.100a>g"));
assert!(is_mave_short_form("m.100A>G"));
assert!(!is_mave_short_form("NP_000509.1:p.Glu6Val")); assert!(!is_mave_short_form("")); assert!(!is_mave_short_form(" ")); assert!(!is_mave_short_form("invalid")); assert!(!is_mave_short_form("x.123")); assert!(!is_mave_short_form("12345")); assert!(!is_mave_short_form("Glu6Val")); }
#[test]
fn test_coordinate_detection_all_types() {
assert_eq!(detect_coordinate_type("p.Glu6Val"), Ok('p'));
assert_eq!(detect_coordinate_type("c.20A>T"), Ok('c'));
assert_eq!(detect_coordinate_type("g.12345A>G"), Ok('g'));
assert_eq!(detect_coordinate_type("n.100A>G"), Ok('n'));
assert_eq!(detect_coordinate_type("r.100a>g"), Ok('r'));
assert_eq!(detect_coordinate_type("m.100A>G"), Ok('m'));
assert_eq!(detect_coordinate_type("o.100A>G"), Ok('o'));
}
// An uppercase prefix is accepted and reported in lowercase.
#[test]
fn test_coordinate_detection_case_insensitive() {
assert_eq!(detect_coordinate_type("P.Glu6Val"), Ok('p'));
assert_eq!(detect_coordinate_type("C.20A>T"), Ok('c'));
assert_eq!(detect_coordinate_type("G.12345A>G"), Ok('g'));
}
// Expects 'r' to resolve to the coding accession and 'm' to the genomic
// accession, in addition to the direct p/c/g/n mappings.
#[test]
fn test_context_accession_mapping() {
let ctx = MaveContext::new()
.with_protein_accession("NP_001")
.with_coding_accession("NM_001")
.with_genomic_accession("NC_001")
.with_noncoding_accession("NR_001");
assert_eq!(ctx.accession_for_coordinate_type('p'), Some("NP_001"));
assert_eq!(ctx.accession_for_coordinate_type('c'), Some("NM_001"));
assert_eq!(ctx.accession_for_coordinate_type('g'), Some("NC_001"));
assert_eq!(ctx.accession_for_coordinate_type('n'), Some("NR_001"));
assert_eq!(ctx.accession_for_coordinate_type('r'), Some("NM_001"));
assert_eq!(ctx.accession_for_coordinate_type('m'), Some("NC_001"));
}
// --- Repeated calls with one context ---
#[test]
fn test_multiple_parse_attempts() {
let ctx = test_context();
let inputs = vec![
"p.Glu6Val",
"c.20A>T",
"p.Lys17Arg",
"c.30G>C",
"g.12345A>G",
];
for input in inputs {
let result = parse_mave_hgvs(input, &ctx);
assert!(result.is_ok(), "Failed to parse: {}", input);
}
}
// The same failing call must report the same error on every attempt.
#[test]
fn test_consecutive_errors_dont_accumulate() {
let ctx = MaveContext::new();
for _ in 0..5 {
let result = parse_mave_hgvs("p.Glu6Val", &ctx);
assert!(matches!(
result,
Err(MaveParseError::MissingAccession { .. })
));
}
}
// Smoke test: inputs with a NUL byte, trailing newline, stray `<>` and a
// quote; results are discarded, so this only checks for absence of panics.
#[test]
fn test_special_characters_in_input() {
let ctx = test_context();
let inputs = vec![
"p.Glu6Val\0", "c.20A>G\n", "g.123<>", "c.20A>G\"", ];
for input in inputs {
let result = parse_mave_hgvs(input, &ctx);
let _ = result;
}
}
// Smoke test with a very large position value; the result is discarded.
#[test]
fn test_very_long_input() {
let ctx = test_context();
let result = parse_mave_hgvs("c.999999999999A>G", &ctx);
let _ = result;
}
// Non-ASCII characters in the variant description must be rejected.
#[test]
fn test_unicode_in_input() {
let ctx = test_context();
let result = parse_mave_hgvs("p.Glu6Välñ", &ctx);
assert!(result.is_err());
}
}