use super::corrections::{
correct_accession_prefix_case, correct_amino_acid_case_in_protein, correct_dash_characters,
correct_deprecated_con, correct_deprecated_protein_forms, correct_empty_delins,
correct_missing_coordinate_prefix, correct_old_allele_format, correct_old_substitution_syntax,
correct_protein_arrow, correct_quote_characters, correct_redundant_repeat_label,
correct_single_letter_aa_in_protein, correct_single_position_range, correct_whitespace,
detect_del_size_suffix, detect_deprecated_ivs, detect_missing_versions, detect_position_zero,
strip_trailing_annotation, DetectedCorrection,
};
use super::types::{ErrorType, ResolvedAction};
use super::ErrorConfig;
use crate::error::{Diagnostic, ErrorCode, FerroError, SourceSpan};
/// One finding reported to the caller while preprocessing an input string.
///
/// Instances are created either directly through [`CorrectionWarning::new`] or
/// by copying the fields of a `DetectedCorrection` via
/// [`CorrectionWarning::from_correction`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CorrectionWarning {
/// Category of the issue that was found.
pub error_type: ErrorType,
/// Human-readable description of the finding.
pub message: String,
/// Optional `(start, end)` pair locating the finding.
///
/// NOTE(review): when filled by `from_correction` these are the correction's
/// `start`/`end`, which look like offsets into the text as it stood when the
/// producing stage ran, not necessarily into the caller's original input —
/// confirm before using them to index the original string.
pub span: Option<(usize, usize)>,
/// The text that triggered the finding.
pub original: String,
/// The replacement text associated with the finding.
pub corrected: String,
}
impl CorrectionWarning {
    /// Assembles a warning from its individual parts.
    ///
    /// `message`, `original` and `corrected` accept anything convertible into
    /// a `String`; `span` is stored as given.
    pub fn new(
        error_type: ErrorType,
        message: impl Into<String>,
        span: Option<(usize, usize)>,
        original: impl Into<String>,
        corrected: impl Into<String>,
    ) -> Self {
        let message = message.into();
        let original = original.into();
        let corrected = corrected.into();
        Self {
            error_type,
            message,
            span,
            original,
            corrected,
        }
    }

    /// Derives a warning from a detected correction.
    ///
    /// The message comes from `DetectedCorrection::warning_message`, the span
    /// is the correction's `(start, end)`, and the original/corrected texts
    /// are cloned from the correction.
    pub fn from_correction(correction: &DetectedCorrection) -> Self {
        Self::new(
            correction.error_type,
            correction.warning_message(),
            Some((correction.start, correction.end)),
            correction.original.clone(),
            correction.corrected.clone(),
        )
    }
}
/// Outcome of running an input string through the preprocessor.
///
/// Built through the `unchanged`, `corrected` and `failed` constructors.
#[derive(Debug, Clone)]
pub struct PreprocessResult {
/// The input exactly as it was handed to the preprocessor.
pub original: String,
/// The text after corrections; the `unchanged` and `failed` constructors
/// set it to the same value as `original`.
pub preprocessed: String,
/// Warnings collected along the way (empty for `unchanged` and `failed`).
pub warnings: Vec<CorrectionWarning>,
/// `true` for `unchanged`/`corrected` results, `false` for `failed` ones.
pub success: bool,
/// The error that caused a `failed` result; `None` otherwise.
pub error: Option<FerroError>,
}
impl PreprocessResult {
    /// Successful result for an input that needed no rewriting: the
    /// preprocessed text equals the input and there are no warnings.
    pub fn unchanged(input: String) -> Self {
        let original = input.clone();
        Self::corrected(original, input, Vec::new())
    }

    /// Successful result carrying the rewritten text and any warnings that
    /// were collected while producing it.
    pub fn corrected(
        original: String,
        preprocessed: String,
        warnings: Vec<CorrectionWarning>,
    ) -> Self {
        Self {
            original,
            preprocessed,
            warnings,
            success: true,
            error: None,
        }
    }

    /// Failed result: keeps the input as both `original` and `preprocessed`,
    /// carries no warnings, and stores `error`.
    pub fn failed(original: String, error: FerroError) -> Self {
        let preprocessed = original.clone();
        Self {
            original,
            preprocessed,
            warnings: Vec::new(),
            success: false,
            error: Some(error),
        }
    }

    /// Whether the preprocessed text differs from the original input.
    pub fn has_corrections(&self) -> bool {
        self.preprocessed != self.original
    }

    /// Whether at least one warning was recorded.
    pub fn has_warnings(&self) -> bool {
        !self.warnings.is_empty()
    }
}
/// Runs input text through a series of correction/detection steps, deciding
/// per finding what to do by asking its `ErrorConfig`.
#[derive(Debug, Clone)]
pub struct InputPreprocessor {
// Consulted via `action_for` to resolve each `ErrorType` into a `ResolvedAction`.
config: ErrorConfig,
}
impl InputPreprocessor {
pub fn new(config: ErrorConfig) -> Self {
Self { config }
}
pub fn strict() -> Self {
Self::new(ErrorConfig::strict())
}
pub fn lenient() -> Self {
Self::new(ErrorConfig::lenient())
}
pub fn silent() -> Self {
Self::new(ErrorConfig::silent())
}
fn action_for(&self, error_type: ErrorType) -> ResolvedAction {
self.config.action_for(error_type)
}
pub fn preprocess(&self, input: &str) -> PreprocessResult {
let mut current = input.to_string();
let mut all_warnings = Vec::new();
if let Some(pos) = detect_position_zero(¤t) {
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
pos,
"Position 0 is not valid in HGVS notation",
Diagnostic::new()
.with_code(ErrorCode::InvalidPosition)
.with_span(SourceSpan::new(pos, pos + 1))
.with_source(input)
.with_hint("HGVS positions start at 1, not 0"),
),
);
}
let (corrected, corrections) = correct_dash_characters(¤t);
if !corrections.is_empty() {
let action = self.action_for(ErrorType::WrongDashCharacter);
match action {
ResolvedAction::Reject => {
let first = &corrections[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
format!("Invalid dash character '{}', expected '-'", first.original),
Diagnostic::new()
.with_code(ErrorCode::UnexpectedChar)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_suggestion(corrected.clone()),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &corrections {
all_warnings.push(CorrectionWarning::from_correction(c));
}
current = corrected;
}
ResolvedAction::SilentCorrect => {
current = corrected;
}
ResolvedAction::Accept => {
}
}
}
let (corrected, corrections) = correct_quote_characters(¤t);
if !corrections.is_empty() {
let action = self.action_for(ErrorType::WrongQuoteCharacter);
match action {
ResolvedAction::Reject => {
let first = &corrections[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
format!(
"Invalid quote character '{}', expected regular quotes",
first.original
),
Diagnostic::new()
.with_code(ErrorCode::UnexpectedChar)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_suggestion(corrected.clone()),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &corrections {
all_warnings.push(CorrectionWarning::from_correction(c));
}
current = corrected;
}
ResolvedAction::SilentCorrect => {
current = corrected;
}
ResolvedAction::Accept => {}
}
}
let (corrected, corrections) = correct_whitespace(¤t);
if !corrections.is_empty() {
let action = self.action_for(ErrorType::ExtraWhitespace);
match action {
ResolvedAction::Reject => {
let first = &corrections[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
"Extra whitespace in HGVS description",
Diagnostic::new()
.with_code(ErrorCode::UnexpectedChar)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_suggestion(corrected.clone()),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &corrections {
all_warnings.push(CorrectionWarning::from_correction(c));
}
current = corrected;
}
ResolvedAction::SilentCorrect => {
current = corrected;
}
ResolvedAction::Accept => {}
}
}
let (corrected, corrections) = correct_accession_prefix_case(¤t);
if !corrections.is_empty() {
let action = self.action_for(ErrorType::LowercaseAccessionPrefix);
match action {
ResolvedAction::Reject => {
let first = &corrections[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
format!(
"Lowercase accession prefix '{}', expected uppercase",
first.original
),
Diagnostic::new()
.with_code(ErrorCode::InvalidAccession)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_suggestion(corrected.clone()),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &corrections {
all_warnings.push(CorrectionWarning::from_correction(c));
}
current = corrected;
}
ResolvedAction::SilentCorrect => {
current = corrected;
}
ResolvedAction::Accept => {}
}
}
let missing_versions = detect_missing_versions(¤t);
if !missing_versions.is_empty() {
let action = self.action_for(ErrorType::MissingVersion);
match action {
ResolvedAction::Reject => {
let first = &missing_versions[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
format!(
"Accession '{}' is missing a version suffix",
first.original
),
Diagnostic::new()
.with_code(ErrorCode::InvalidAccession)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_hint(
"RefSeq accessions require a `.<version>` suffix (e.g. NM_000088.3, NC_000023.11)",
),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &missing_versions {
all_warnings.push(CorrectionWarning::from_correction(c));
}
}
ResolvedAction::SilentCorrect | ResolvedAction::Accept => {}
}
}
let (corrected, corrections) = correct_protein_arrow(¤t);
if !corrections.is_empty() {
let action = self.action_for(ErrorType::ProteinSubstitutionArrow);
match action {
ResolvedAction::Reject => {
let first = &corrections[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
"Arrow '>' in protein substitution is not standard HGVS",
Diagnostic::new()
.with_code(ErrorCode::InvalidEdit)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_suggestion(corrected.clone())
.with_hint("Use p.Val600Glu instead of p.Val600>Glu"),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &corrections {
all_warnings.push(CorrectionWarning::from_correction(c));
}
current = corrected;
}
ResolvedAction::SilentCorrect => {
current = corrected;
}
ResolvedAction::Accept => {}
}
}
let (corrected, corrections) = correct_amino_acid_case_in_protein(¤t);
if !corrections.is_empty() {
let action = self.action_for(ErrorType::LowercaseAminoAcid);
match action {
ResolvedAction::Reject => {
let first = &corrections[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
format!(
"Lowercase or mis-cased amino-acid code '{}', expected '{}'",
first.original, first.corrected
),
Diagnostic::new()
.with_code(ErrorCode::InvalidAminoAcid)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_suggestion(corrected.clone())
.with_hint(
"HGVS prefers three-letter amino-acid codes with a leading capital (e.g. Val, Glu, Ala)",
),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &corrections {
all_warnings.push(CorrectionWarning::from_correction(c));
}
current = corrected;
}
ResolvedAction::SilentCorrect => {
current = corrected;
}
ResolvedAction::Accept => {}
}
}
let (corrected_full, corrections) = correct_deprecated_protein_forms(¤t);
if !corrections.is_empty() {
for c in &corrections {
if matches!(self.action_for(c.error_type), ResolvedAction::Reject) {
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
c.start,
format!(
"Deprecated protein notation '{}', use '{}'",
c.original, c.corrected
),
Diagnostic::new()
.with_code(ErrorCode::InvalidEdit)
.with_span(SourceSpan::new(c.start, c.end))
.with_source(input)
.with_suggestion(corrected_full.clone())
.with_hint(
"HGVS uses 'Ter' (or 'fsTerN') for translation termination; \
'X' and '*' are deprecated alternatives.",
),
),
);
}
}
let mut rebuilt = String::with_capacity(current.len());
let mut cursor = 0usize;
for c in &corrections {
rebuilt.push_str(¤t[cursor..c.start]);
match self.action_for(c.error_type) {
ResolvedAction::Accept => {
rebuilt.push_str(¤t[c.start..c.end]);
}
ResolvedAction::WarnCorrect => {
all_warnings.push(CorrectionWarning::from_correction(c));
rebuilt.push_str(&c.corrected);
}
ResolvedAction::SilentCorrect => {
rebuilt.push_str(&c.corrected);
}
ResolvedAction::Reject => unreachable!("rejected above"),
}
cursor = c.end;
}
rebuilt.push_str(¤t[cursor..]);
current = rebuilt;
}
let (corrected, corrections) = correct_single_letter_aa_in_protein(¤t);
if !corrections.is_empty() {
let action = self.action_for(ErrorType::SingleLetterAminoAcid);
match action {
ResolvedAction::Reject => {
let first = &corrections[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
format!(
"Single-letter amino-acid code '{}', expected three-letter '{}'",
first.original, first.corrected
),
Diagnostic::new()
.with_code(ErrorCode::InvalidAminoAcid)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_suggestion(corrected.clone())
.with_hint(
"HGVS recommends three-letter amino-acid codes (e.g. Val, Glu, Ala) over one-letter abbreviations",
),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &corrections {
all_warnings.push(CorrectionWarning::from_correction(c));
}
current = corrected;
}
ResolvedAction::SilentCorrect => {
current = corrected;
}
ResolvedAction::Accept => {}
}
}
let (corrected, corrections) = correct_missing_coordinate_prefix(¤t);
if !corrections.is_empty() {
let action = self.action_for(ErrorType::MissingCoordinatePrefix);
match action {
ResolvedAction::Reject => {
let first = &corrections[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
"Missing coordinate type prefix (e.g., 'g.' for genomic)",
Diagnostic::new()
.with_code(ErrorCode::InvalidAccession)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_suggestion(corrected.clone())
.with_hint("For genomic accessions (NC_, NG_), add 'g.' before the position"),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &corrections {
all_warnings.push(CorrectionWarning::from_correction(c));
}
current = corrected;
}
ResolvedAction::SilentCorrect => {
current = corrected;
}
ResolvedAction::Accept => {}
}
}
let (corrected, corrections) = strip_trailing_annotation(¤t);
if !corrections.is_empty() {
let action = self.action_for(ErrorType::TrailingAnnotation);
match action {
ResolvedAction::Reject => {
let first = &corrections[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
format!(
"Trailing annotation '{}' is not valid HGVS syntax",
first.original
),
Diagnostic::new()
.with_code(ErrorCode::UnexpectedChar)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_suggestion(corrected.clone())
.with_hint("Protein consequence annotations should be separate from the HGVS expression"),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &corrections {
all_warnings.push(CorrectionWarning::from_correction(c));
}
current = corrected;
}
ResolvedAction::SilentCorrect => {
current = corrected;
}
ResolvedAction::Accept => {}
}
}
let (corrected, corrections) = correct_old_allele_format(¤t);
if !corrections.is_empty() {
let action = self.action_for(ErrorType::OldAlleleFormat);
match action {
ResolvedAction::Reject => {
let first = &corrections[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
"Old/deprecated allele format with coordinate type inside brackets",
Diagnostic::new()
.with_code(ErrorCode::InvalidEdit)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_suggestion(corrected.clone())
.with_hint(
"Use c.[edit1;edit2] format instead of [c.edit1;c.edit2]",
),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &corrections {
all_warnings.push(CorrectionWarning::from_correction(c));
}
current = corrected;
}
ResolvedAction::SilentCorrect => {
current = corrected;
}
ResolvedAction::Accept => {}
}
}
let (corrected, corrections) = correct_single_position_range(¤t);
if !corrections.is_empty() {
let action = self.action_for(ErrorType::SinglePositionRange);
match action {
ResolvedAction::Reject => {
let first = &corrections[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
format!(
"Single-position range '{}' is non-canonical, expected '{}'",
first.original, first.corrected
),
Diagnostic::new()
.with_code(ErrorCode::InvalidPosition)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_suggestion(corrected.clone())
.with_hint(
"HGVS recommends a single position for del/dup/inv when the range collapses to one base",
),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &corrections {
all_warnings.push(CorrectionWarning::from_correction(c));
}
current = corrected;
}
ResolvedAction::SilentCorrect => {
current = corrected;
}
ResolvedAction::Accept => {}
}
}
let (corrected, corrections) = correct_empty_delins(¤t);
if !corrections.is_empty() {
let action = self.action_for(ErrorType::EmptyDelinsInsert);
match action {
ResolvedAction::Reject => {
let first = &corrections[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
"Deletion-insertion has empty inserted sequence",
Diagnostic::new()
.with_code(ErrorCode::InvalidEdit)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_suggestion(corrected.clone())
.with_hint(
"An empty `delins` is semantically equivalent to a plain deletion (`del`)",
),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &corrections {
all_warnings.push(CorrectionWarning::from_correction(c));
}
current = corrected;
}
ResolvedAction::SilentCorrect => {
current = corrected;
}
ResolvedAction::Accept => {}
}
}
let (corrected, corrections) = correct_redundant_repeat_label(¤t);
if !corrections.is_empty() {
let action = self.action_for(ErrorType::RedundantRepeatLabel);
match action {
ResolvedAction::Reject => {
let first = &corrections[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
format!(
"Repeat description has redundant base label '{}'",
first.original
),
Diagnostic::new()
.with_code(ErrorCode::InvalidEdit)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_suggestion(corrected.clone())
.with_hint(
"RNA repeat descriptions should omit the base label when positions already define the unit",
),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &corrections {
all_warnings.push(CorrectionWarning::from_correction(c));
}
current = corrected;
}
ResolvedAction::SilentCorrect => {
current = corrected;
}
ResolvedAction::Accept => {}
}
}
let del_size_hits = detect_del_size_suffix(¤t);
if !del_size_hits.is_empty() {
let action = self.action_for(ErrorType::DelSizeSuffix);
match action {
ResolvedAction::Reject => {
let first = &del_size_hits[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
format!(
"Deletion '{}' uses size-count suffix; canonical form names both endpoints",
first.original
),
Diagnostic::new()
.with_code(ErrorCode::InvalidEdit)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_hint(
"Write `g.<start>_<end>del` instead of `g.<start>del<size>`",
),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &del_size_hits {
all_warnings.push(CorrectionWarning::from_correction(c));
}
}
ResolvedAction::SilentCorrect | ResolvedAction::Accept => {}
}
}
let (corrected, corrections) = correct_old_substitution_syntax(¤t);
if !corrections.is_empty() {
let action = self.action_for(ErrorType::OldSubstitutionSyntax);
match action {
ResolvedAction::Reject => {
let first = &corrections[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
"Deprecated multi-base substitution syntax",
Diagnostic::new()
.with_code(ErrorCode::InvalidEdit)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_suggestion(corrected.clone())
.with_hint(
"HGVS reserves '>' for single-base substitutions; use 'delins' for multi-base changes",
),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &corrections {
all_warnings.push(CorrectionWarning::from_correction(c));
}
current = corrected;
}
ResolvedAction::SilentCorrect => {
current = corrected;
}
ResolvedAction::Accept => {}
}
}
let (corrected, corrections) = correct_deprecated_con(¤t);
if !corrections.is_empty() {
let action = self.action_for(ErrorType::DeprecatedConSyntax);
match action {
ResolvedAction::Reject => {
let first = &corrections[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
"Deprecated 'con' (conversion) edit syntax",
Diagnostic::new()
.with_code(ErrorCode::InvalidEdit)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_suggestion(corrected.clone())
.with_hint(
"HGVS retired the 'con' edit type; describe conversions as 'delins<source>'",
),
),
);
}
ResolvedAction::WarnCorrect => {
for c in &corrections {
all_warnings.push(CorrectionWarning::from_correction(c));
}
current = corrected;
}
ResolvedAction::SilentCorrect => {
current = corrected;
}
ResolvedAction::Accept => {}
}
}
let detections = detect_deprecated_ivs(¤t);
if !detections.is_empty() {
let action = self.action_for(ErrorType::DeprecatedIvsNotation);
match action {
ResolvedAction::Reject
| ResolvedAction::WarnCorrect
| ResolvedAction::SilentCorrect => {
let first = &detections[0];
return PreprocessResult::failed(
input.to_string(),
FerroError::parse_with_diagnostic(
first.start,
"Retracted c.IVS intronic notation",
Diagnostic::new()
.with_code(ErrorCode::UnexpectedChar)
.with_span(SourceSpan::new(first.start, first.end))
.with_source(input)
.with_hint(
"Use the canonical intronic-offset form (e.g. c.88+2T>G) — IVS notation has been retracted by HGVS and is ambiguous without genomic context",
),
),
);
}
ResolvedAction::Accept => {}
}
}
if current == input && all_warnings.is_empty() {
PreprocessResult::unchanged(input.to_string())
} else {
PreprocessResult::corrected(input.to_string(), current, all_warnings)
}
}
}
impl Default for InputPreprocessor {
fn default() -> Self {
Self::strict()
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::error_handling::ErrorOverride;
#[test]
fn test_preprocess_result_unchanged() {
let result = PreprocessResult::unchanged("c.100A>G".to_string());
assert!(result.success);
assert!(!result.has_corrections());
assert!(!result.has_warnings());
assert_eq!(result.original, "c.100A>G");
assert_eq!(result.preprocessed, "c.100A>G");
}
#[test]
fn test_preprocess_result_corrected() {
let result = PreprocessResult::corrected(
"c.100\u{2013}200del".to_string(),
"c.100-200del".to_string(),
vec![CorrectionWarning::new(
ErrorType::WrongDashCharacter,
"test warning",
Some((5, 8)),
"\u{2013}",
"-",
)],
);
assert!(result.success);
assert!(result.has_corrections());
assert!(result.has_warnings());
}
#[test]
fn test_preprocessor_strict_valid_input() {
let preprocessor = InputPreprocessor::strict();
let result = preprocessor.preprocess("c.100A>G");
assert!(result.success);
assert!(!result.has_corrections());
}
#[test]
fn test_preprocessor_strict_rejects_en_dash() {
let preprocessor = InputPreprocessor::strict();
let result = preprocessor.preprocess("c.100\u{2013}200del");
assert!(!result.success);
assert!(result.error.is_some());
}
#[test]
fn test_preprocessor_strict_rejects_whitespace() {
let preprocessor = InputPreprocessor::strict();
let result = preprocessor.preprocess(" c.100A>G ");
assert!(!result.success);
}
#[test]
fn test_preprocessor_strict_rejects_position_zero() {
let preprocessor = InputPreprocessor::strict();
let result = preprocessor.preprocess("c.0A>G");
assert!(!result.success);
assert!(result.error.is_some());
}
#[test]
fn test_preprocessor_lenient_corrects_en_dash() {
let preprocessor = InputPreprocessor::lenient();
let result = preprocessor.preprocess("c.100\u{2013}200del");
assert!(result.success);
assert_eq!(result.preprocessed, "c.100-200del");
assert!(result.has_warnings());
}
#[test]
fn test_preprocessor_lenient_corrects_whitespace() {
let preprocessor = InputPreprocessor::lenient();
let result = preprocessor.preprocess(" c.100A>G ");
assert!(result.success);
assert_eq!(result.preprocessed, "c.100A>G");
assert!(result.has_warnings());
}
#[test]
fn test_preprocessor_lenient_corrects_protein_arrow() {
let preprocessor = InputPreprocessor::lenient();
let result = preprocessor.preprocess("p.Val600>Glu");
assert!(result.success);
assert_eq!(result.preprocessed, "p.Val600Glu");
assert!(result.has_warnings());
}
#[test]
fn test_preprocessor_lenient_rejects_position_zero() {
let preprocessor = InputPreprocessor::lenient();
let result = preprocessor.preprocess("c.0A>G");
assert!(!result.success);
}
#[test]
fn test_preprocessor_silent_corrects_without_warnings() {
let preprocessor = InputPreprocessor::silent();
let result = preprocessor.preprocess("c.100\u{2013}200del");
assert!(result.success);
assert_eq!(result.preprocessed, "c.100-200del");
assert!(!result.has_warnings());
}
#[test]
fn test_preprocessor_silent_corrects_multiple() {
let preprocessor = InputPreprocessor::silent();
let result = preprocessor.preprocess(" nm_000088.3:c.100\u{2013}200del ");
assert!(result.success);
assert_eq!(result.preprocessed, "NM_000088.3:c.100-200del");
assert!(!result.has_warnings());
}
#[test]
fn test_preprocessor_override_reject_in_lenient() {
let config = ErrorConfig::lenient()
.with_override(ErrorType::WrongDashCharacter, ErrorOverride::Reject);
let preprocessor = InputPreprocessor::new(config);
let result = preprocessor.preprocess("c.100\u{2013}200del");
assert!(!result.success);
}
#[test]
fn test_preprocessor_override_silent_in_lenient() {
let config = ErrorConfig::lenient()
.with_override(ErrorType::WrongDashCharacter, ErrorOverride::SilentCorrect);
let preprocessor = InputPreprocessor::new(config);
let result = preprocessor.preprocess("c.100\u{2013}200del");
assert!(result.success);
assert!(!result.has_warnings()); }
#[test]
fn test_preprocessor_override_correct_in_strict() {
let config = ErrorConfig::strict()
.with_override(ErrorType::WrongDashCharacter, ErrorOverride::WarnCorrect);
let preprocessor = InputPreprocessor::new(config);
let result = preprocessor.preprocess("c.100\u{2013}200del");
assert!(result.success);
assert!(result.has_warnings());
assert_eq!(result.preprocessed, "c.100-200del");
}
#[test]
fn test_correction_warning_from_correction() {
let correction =
DetectedCorrection::new(ErrorType::WrongDashCharacter, "\u{2013}", "-", 5, 8);
let warning = CorrectionWarning::from_correction(&correction);
assert_eq!(warning.error_type, ErrorType::WrongDashCharacter);
assert!(warning.message.contains("dash"));
assert_eq!(warning.span, Some((5, 8)));
}
#[test]
fn test_preprocessor_strict_rejects_trailing_annotation() {
let preprocessor = InputPreprocessor::strict();
let result = preprocessor.preprocess("NM_000088.3:c.459A>G (p.Lys153=)");
assert!(!result.success);
assert!(result.error.is_some());
}
#[test]
fn test_preprocessor_lenient_strips_trailing_annotation() {
let preprocessor = InputPreprocessor::lenient();
let result = preprocessor.preprocess("NM_000088.3:c.459A>G (p.Lys153=)");
assert!(result.success);
assert_eq!(result.preprocessed, "NM_000088.3:c.459A>G");
assert!(result.has_warnings());
}
#[test]
fn test_preprocessor_silent_strips_trailing_annotation() {
let preprocessor = InputPreprocessor::silent();
let result = preprocessor.preprocess("NM_000088.3:c.459A>G (p.Lys153=)");
assert!(result.success);
assert_eq!(result.preprocessed, "NM_000088.3:c.459A>G");
assert!(!result.has_warnings());
}
#[test]
fn test_preprocessor_lenient_clinvar_pattern() {
let preprocessor = InputPreprocessor::lenient();
let result = preprocessor.preprocess("NM_003467.3(CXCR4):c.708G>A (p.Lys236=)");
assert!(result.success);
assert_eq!(result.preprocessed, "NM_003467.3(CXCR4):c.708G>A");
}
#[test]
fn test_preprocessor_override_accept_trailing_annotation() {
let config = ErrorConfig::strict()
.with_override(ErrorType::ExtraWhitespace, ErrorOverride::SilentCorrect)
.with_override(ErrorType::TrailingAnnotation, ErrorOverride::WarnCorrect);
let preprocessor = InputPreprocessor::new(config);
let result = preprocessor.preprocess("NM_000088.3:c.459A>G (p.Lys153=)");
assert!(result.success);
assert_eq!(result.preprocessed, "NM_000088.3:c.459A>G");
assert!(result.has_warnings());
}
#[test]
fn test_preprocessor_override_trailing_annotation_no_space() {
let config = ErrorConfig::strict()
.with_override(ErrorType::TrailingAnnotation, ErrorOverride::WarnCorrect);
let preprocessor = InputPreprocessor::new(config);
let result = preprocessor.preprocess("NM_000088.3:c.459A>G(p.Lys153=)");
assert!(result.success);
assert_eq!(result.preprocessed, "NM_000088.3:c.459A>G");
assert!(result.has_warnings());
}
#[test]
fn test_preprocessor_strict_rejects_missing_prefix() {
let preprocessor = InputPreprocessor::strict();
let result = preprocessor.preprocess("NC_000017.11:12345A>G");
assert!(!result.success);
assert!(result.error.is_some());
}
#[test]
fn test_preprocessor_lenient_adds_missing_prefix() {
let preprocessor = InputPreprocessor::lenient();
let result = preprocessor.preprocess("NC_000017.11:12345A>G");
assert!(result.success);
assert_eq!(result.preprocessed, "NC_000017.11:g.12345A>G");
assert!(result.has_warnings());
}
#[test]
fn test_preprocessor_lenient_adds_missing_prefix_uncertain() {
let preprocessor = InputPreprocessor::lenient();
let result = preprocessor.preprocess("NC_000017.11:(?_31094927)_(31377677_?)del");
assert!(result.success);
assert_eq!(
result.preprocessed,
"NC_000017.11:g.(?_31094927)_(31377677_?)del"
);
}
#[test]
fn test_preprocessor_silent_adds_missing_prefix() {
let preprocessor = InputPreprocessor::silent();
let result = preprocessor.preprocess("NC_000017.11:12345A>G");
assert!(result.success);
assert_eq!(result.preprocessed, "NC_000017.11:g.12345A>G");
assert!(!result.has_warnings());
}
#[test]
fn test_preprocessor_strict_rejects_deprecated_stop_star() {
let preprocessor = InputPreprocessor::strict();
let result = preprocessor.preprocess("NP_000079.2:p.Arg97*");
assert!(!result.success);
assert!(result.error.is_some());
}
#[test]
fn test_preprocessor_strict_rejects_deprecated_stop_x() {
let preprocessor = InputPreprocessor::strict();
let result = preprocessor.preprocess("NP_000079.2:p.Arg97X");
assert!(!result.success);
}
#[test]
fn test_preprocessor_strict_rejects_deprecated_frameshift_star() {
let preprocessor = InputPreprocessor::strict();
let result = preprocessor.preprocess("NP_000079.2:p.Arg97fs*23");
assert!(!result.success);
}
#[test]
fn test_preprocessor_strict_rejects_deprecated_frameshift_x() {
let preprocessor = InputPreprocessor::strict();
let result = preprocessor.preprocess("NP_000079.2:p.Arg97fsX23");
assert!(!result.success);
}
#[test]
fn test_preprocessor_lenient_corrects_deprecated_stop_star() {
let preprocessor = InputPreprocessor::lenient();
let result = preprocessor.preprocess("NP_000079.2:p.Arg97*");
assert!(result.success);
assert_eq!(result.preprocessed, "NP_000079.2:p.Arg97Ter");
assert_eq!(result.warnings.len(), 1);
assert_eq!(
result.warnings[0].error_type,
ErrorType::DeprecatedStopCodonStar
);
}
#[test]
fn test_preprocessor_lenient_corrects_deprecated_stop_x() {
let preprocessor = InputPreprocessor::lenient();
let result = preprocessor.preprocess("NP_000079.2:p.Arg97X");
assert!(result.success);
assert_eq!(result.preprocessed, "NP_000079.2:p.Arg97Ter");
assert_eq!(result.warnings.len(), 1);
assert_eq!(
result.warnings[0].error_type,
ErrorType::DeprecatedStopCodonX
);
}
#[test]
fn test_preprocessor_lenient_corrects_deprecated_frameshift_star() {
let preprocessor = InputPreprocessor::lenient();
let result = preprocessor.preprocess("NP_000079.2:p.Arg97fs*23");
assert!(result.success);
assert_eq!(result.preprocessed, "NP_000079.2:p.Arg97fsTer23");
assert_eq!(result.warnings.len(), 1);
assert_eq!(
result.warnings[0].error_type,
ErrorType::DeprecatedFrameshiftStar
);
}
#[test]
fn test_preprocessor_lenient_corrects_deprecated_frameshift_x() {
let preprocessor = InputPreprocessor::lenient();
let result = preprocessor.preprocess("NP_000079.2:p.Arg97fsX23");
assert!(result.success);
assert_eq!(result.preprocessed, "NP_000079.2:p.Arg97fsTer23");
assert_eq!(result.warnings.len(), 1);
assert_eq!(
result.warnings[0].error_type,
ErrorType::DeprecatedFrameshiftX
);
}
#[test]
fn test_preprocessor_silent_corrects_deprecated_without_warnings() {
let preprocessor = InputPreprocessor::silent();
for input in [
"NP_000079.2:p.Arg97*",
"NP_000079.2:p.Arg97X",
"NP_000079.2:p.Arg97fs*23",
"NP_000079.2:p.Arg97fsX23",
] {
let result = preprocessor.preprocess(input);
assert!(result.success, "expected success for {}", input);
assert!(!result.has_warnings(), "expected no warnings for {}", input);
assert!(
result.preprocessed.contains("Ter"),
"expected Ter in {}",
result.preprocessed
);
}
}
#[test]
fn test_preprocessor_lenient_canonical_no_warnings() {
let preprocessor = InputPreprocessor::lenient();
for input in [
"NP_000079.2:p.Arg97Ter",
"NP_000079.2:p.Arg97ProfsTer23",
"NP_000079.2:p.Tyr180fs",
"NP_000079.2:p.Val600Glu",
"NP_000079.2:p.Arg782Xaa",
] {
let result = preprocessor.preprocess(input);
assert!(result.success, "expected success for {}", input);
assert_eq!(
result.preprocessed, input,
"expected unchanged for {}",
input
);
assert!(
!result.has_warnings(),
"expected no warnings for {}, got {:?}",
input,
result.warnings
);
}
}
#[test]
fn test_preprocessor_lenient_compound_protein_allele_two_warnings() {
let preprocessor = InputPreprocessor::lenient();
let result = preprocessor.preprocess("NP_000079.2:p.[Arg97*;Arg100X]");
assert!(result.success);
assert_eq!(result.preprocessed, "NP_000079.2:p.[Arg97Ter;Arg100Ter]");
assert_eq!(result.warnings.len(), 2);
assert_eq!(
result.warnings[0].error_type,
ErrorType::DeprecatedStopCodonStar
);
assert_eq!(
result.warnings[1].error_type,
ErrorType::DeprecatedStopCodonX
);
}
/// Running the lenient preprocessor on its own corrected output is expected
/// to be a no-op: same text, no further warnings.
///
/// The first pass is checked explicitly so the idempotence assertions cannot
/// pass vacuously (for example if the first pass failed or corrected nothing).
#[test]
fn test_preprocessor_lenient_idempotent_on_corrected_output() {
    let preprocessor = InputPreprocessor::lenient();

    // First pass: the deprecated `fs*23` spelling must actually be corrected.
    let first = preprocessor.preprocess("NP_000079.2:p.Arg97fs*23");
    assert!(first.success);
    assert_eq!(first.preprocessed, "NP_000079.2:p.Arg97fsTer23");

    // Second pass: the corrected output must be accepted and left untouched.
    let second = preprocessor.preprocess(&first.preprocessed);
    assert!(second.success);
    assert_eq!(second.preprocessed, first.preprocessed);
    assert!(!second.has_warnings());
}
/// With an `Accept` override for `DeprecatedStopCodonStar` layered on the
/// lenient config, `p.Arg97*` is expected to be left as-is with no warnings.
#[test]
fn test_preprocessor_override_accept_keeps_deprecated_form() {
    let input = "NP_000079.2:p.Arg97*";
    let accepting = InputPreprocessor::new(
        ErrorConfig::lenient()
            .with_override(ErrorType::DeprecatedStopCodonStar, ErrorOverride::Accept),
    );

    let outcome = accepting.preprocess(input);

    assert!(outcome.success);
    assert_eq!(outcome.preprocessed, input);
    assert!(!outcome.has_warnings());
}
/// A `SilentCorrect` override for `DeprecatedFrameshiftStar` on top of the
/// strict config is expected to rewrite `fs*23` to `fsTer23` and succeed
/// without emitting a warning.
#[test]
fn test_preprocessor_override_silent_in_strict_mode() {
    let strict_with_silent_fix = ErrorConfig::strict().with_override(
        ErrorType::DeprecatedFrameshiftStar,
        ErrorOverride::SilentCorrect,
    );

    let outcome =
        InputPreprocessor::new(strict_with_silent_fix).preprocess("NP_000079.2:p.Arg97fs*23");

    assert!(outcome.success);
    assert_eq!(outcome.preprocessed, "NP_000079.2:p.Arg97fsTer23");
    assert!(!outcome.has_warnings());
}
/// The `*` inside the coding-DNA position `c.*5A>G` is expected to be left
/// alone by the lenient preprocessor: output equals input and no warnings.
#[test]
fn test_preprocessor_lenient_does_not_affect_cds_utr_position() {
    let input = "NM_000088.3:c.*5A>G";

    let outcome = InputPreprocessor::lenient().preprocess(input);

    assert!(outcome.success);
    assert_eq!(outcome.preprocessed, input);
    assert!(!outcome.has_warnings());
}
/// When only `DeprecatedStopCodonStar` is overridden to `Accept`, the `*`
/// member of the allele is expected to stay untouched while the `X` member is
/// still rewritten to `Ter` and reported as the single warning.
#[test]
fn test_preprocessor_partial_accept_only_rewrites_non_accept_codes() {
    let accept_star_only = ErrorConfig::lenient()
        .with_override(ErrorType::DeprecatedStopCodonStar, ErrorOverride::Accept);

    let outcome =
        InputPreprocessor::new(accept_star_only).preprocess("NP_000079.2:p.[Arg97*;Arg100X]");

    assert!(outcome.success);
    assert_eq!(outcome.preprocessed, "NP_000079.2:p.[Arg97*;Arg100Ter]");

    let warnings = &outcome.warnings;
    assert_eq!(warnings.len(), 1);
    assert_eq!(warnings[0].error_type, ErrorType::DeprecatedStopCodonX);
}
/// Strict mode is expected to reject the multi-base `GC>TT` substitution
/// spelling and to attach an error to the result.
#[test]
fn test_preprocessor_strict_rejects_old_substitution_with_refs() {
    let outcome = InputPreprocessor::strict().preprocess("NM_000088.3:c.79_80GC>TT");

    assert!(!outcome.success);
    assert!(outcome.error.is_some());
}
/// Lenient mode is expected to rewrite `c.79_80GC>TT` to `c.79_80delinsTT`
/// and to include an `OldSubstitutionSyntax` warning among those reported.
#[test]
fn test_preprocessor_lenient_corrects_old_substitution_with_refs() {
    let outcome = InputPreprocessor::lenient().preprocess("NM_000088.3:c.79_80GC>TT");

    assert!(outcome.success, "lenient should rewrite to delins");
    assert_eq!(outcome.preprocessed, "NM_000088.3:c.79_80delinsTT");
    assert!(outcome.has_warnings());

    // Other warnings may be present; only require that this one is.
    let flagged_old_substitution = outcome
        .warnings
        .iter()
        .any(|warning| warning.error_type == ErrorType::OldSubstitutionSyntax);
    assert!(flagged_old_substitution);
}
/// Lenient mode is expected to rewrite the reference-less range substitution
/// `c.100_102>ATG` to `c.100_102delinsATG` and to emit at least one warning.
#[test]
fn test_preprocessor_lenient_corrects_old_substitution_no_refs() {
    let outcome = InputPreprocessor::lenient().preprocess("NM_000088.3:c.100_102>ATG");

    assert!(outcome.success);
    assert_eq!(outcome.preprocessed, "NM_000088.3:c.100_102delinsATG");
    assert!(outcome.has_warnings());
}
/// Silent mode is expected to apply the same `GC>TT` to `delinsTT` rewrite as
/// lenient mode, but without recording a warning.
#[test]
fn test_preprocessor_silent_rewrites_old_substitution_no_warning() {
    let outcome = InputPreprocessor::silent().preprocess("NM_000088.3:c.79_80GC>TT");

    assert!(outcome.success);
    assert_eq!(outcome.preprocessed, "NM_000088.3:c.79_80delinsTT");
    assert!(!outcome.has_warnings());
}
/// A plain single-base substitution (`c.100A>G`) is expected to succeed under
/// strict mode with no corrections reported.
#[test]
fn test_preprocessor_canonical_substitution_unchanged() {
    let outcome = InputPreprocessor::strict().preprocess("NM_000088.3:c.100A>G");

    assert!(outcome.success);
    assert!(!outcome.has_corrections());
}
/// Strict mode is expected to reject the `con` (conversion) spelling and to
/// attach an error to the result.
///
/// The `error` check mirrors the other strict-rejection tests in this module
/// so a failure that carries no diagnostic is caught here as well.
#[test]
fn test_preprocessor_strict_rejects_con_syntax() {
    let preprocessor = InputPreprocessor::strict();
    let result = preprocessor.preprocess("NM_004006.2:c.100_200conNM_001.1:c.5_105");
    assert!(!result.success);
    assert!(result.error.is_some());
}
/// Lenient mode is expected to replace `con` with `delins` and to include a
/// `DeprecatedConSyntax` warning among those reported.
#[test]
fn test_preprocessor_lenient_corrects_con_to_delins() {
    let outcome =
        InputPreprocessor::lenient().preprocess("NM_004006.2:c.100_200conNM_001.1:c.5_105");

    assert!(outcome.success);
    assert_eq!(
        outcome.preprocessed,
        "NM_004006.2:c.100_200delinsNM_001.1:c.5_105"
    );
    assert!(outcome.has_warnings());

    // Other warnings may be present; only require that this one is.
    let flagged_con_syntax = outcome
        .warnings
        .iter()
        .any(|warning| warning.error_type == ErrorType::DeprecatedConSyntax);
    assert!(flagged_con_syntax);
}
/// Silent mode is expected to perform the `con` to `delins` rewrite without
/// recording a warning.
#[test]
fn test_preprocessor_silent_corrects_con_no_warning() {
    let outcome =
        InputPreprocessor::silent().preprocess("NM_004006.2:c.100_200conNM_001.1:c.5_105");

    assert!(outcome.success);
    assert_eq!(
        outcome.preprocessed,
        "NM_004006.2:c.100_200delinsNM_001.1:c.5_105"
    );
    assert!(!outcome.has_warnings());
}
/// Strict mode is expected to reject `IVS` intron notation and to attach an
/// error to the result.
#[test]
fn test_preprocessor_strict_rejects_ivs_notation() {
    let outcome = InputPreprocessor::strict().preprocess("NM_000088.3:c.IVS2+2T>G");

    assert!(!outcome.success);
    assert!(outcome.error.is_some());
}
/// `IVS` intron notation is expected to be rejected even in lenient mode
/// rather than corrected.
#[test]
fn test_preprocessor_lenient_rejects_ivs_notation() {
    let outcome = InputPreprocessor::lenient().preprocess("NM_000088.3:c.IVS2+2T>G");

    assert!(!outcome.success);
}
/// `IVS` intron notation is expected to be rejected in silent mode as well;
/// it is not silently corrected.
#[test]
fn test_preprocessor_silent_rejects_ivs_notation() {
    let outcome = InputPreprocessor::silent().preprocess("NM_000088.3:c.IVS2+2T>G");

    assert!(!outcome.success);
}
/// With an `Accept` override for `DeprecatedIvsNotation` on top of the strict
/// config, the `IVS` input is expected to succeed and pass through unchanged.
#[test]
fn test_preprocessor_override_accept_keeps_ivs_notation() {
    let input = "NM_000088.3:c.IVS2+2T>G";
    let accepting = InputPreprocessor::new(
        ErrorConfig::strict()
            .with_override(ErrorType::DeprecatedIvsNotation, ErrorOverride::Accept),
    );

    let outcome = accepting.preprocess(input);

    assert!(outcome.success);
    assert_eq!(outcome.preprocessed, input);
}
/// An offset-style intronic position (`c.88+2T>G`) is expected to succeed
/// under strict mode with no corrections reported.
#[test]
fn test_preprocessor_canonical_intronic_unchanged() {
    let outcome = InputPreprocessor::strict().preprocess("NM_000088.3:c.88+2T>G");

    assert!(outcome.success);
    assert!(!outcome.has_corrections());
}
}