1use super::corrections::{
7 correct_accession_prefix_case, correct_dash_characters, correct_missing_coordinate_prefix,
8 correct_old_allele_format, correct_protein_arrow, correct_quote_characters, correct_whitespace,
9 detect_position_zero, strip_trailing_annotation, DetectedCorrection,
10};
11use super::types::{ErrorType, ResolvedAction};
12use super::ErrorConfig;
13use crate::error::{Diagnostic, ErrorCode, FerroError, SourceSpan};
14
15#[derive(Debug, Clone, PartialEq, Eq)]
17pub struct CorrectionWarning {
18 pub error_type: ErrorType,
20 pub message: String,
22 pub span: Option<(usize, usize)>,
24 pub original: String,
26 pub corrected: String,
28}
29
30impl CorrectionWarning {
31 pub fn new(
33 error_type: ErrorType,
34 message: impl Into<String>,
35 span: Option<(usize, usize)>,
36 original: impl Into<String>,
37 corrected: impl Into<String>,
38 ) -> Self {
39 Self {
40 error_type,
41 message: message.into(),
42 span,
43 original: original.into(),
44 corrected: corrected.into(),
45 }
46 }
47
48 pub fn from_correction(correction: &DetectedCorrection) -> Self {
50 Self {
51 error_type: correction.error_type,
52 message: correction.warning_message(),
53 span: Some((correction.start, correction.end)),
54 original: correction.original.clone(),
55 corrected: correction.corrected.clone(),
56 }
57 }
58}
59
60#[derive(Debug, Clone)]
62pub struct PreprocessResult {
63 pub original: String,
65 pub preprocessed: String,
67 pub warnings: Vec<CorrectionWarning>,
69 pub success: bool,
71 pub error: Option<FerroError>,
73}
74
75impl PreprocessResult {
76 pub fn unchanged(input: String) -> Self {
78 Self {
79 original: input.clone(),
80 preprocessed: input,
81 warnings: Vec::new(),
82 success: true,
83 error: None,
84 }
85 }
86
87 pub fn corrected(
89 original: String,
90 preprocessed: String,
91 warnings: Vec<CorrectionWarning>,
92 ) -> Self {
93 Self {
94 original,
95 preprocessed,
96 warnings,
97 success: true,
98 error: None,
99 }
100 }
101
102 pub fn failed(original: String, error: FerroError) -> Self {
104 Self {
105 original: original.clone(),
106 preprocessed: original,
107 warnings: Vec::new(),
108 success: false,
109 error: Some(error),
110 }
111 }
112
113 pub fn has_corrections(&self) -> bool {
115 self.original != self.preprocessed
116 }
117
118 pub fn has_warnings(&self) -> bool {
120 !self.warnings.is_empty()
121 }
122}
123
124#[derive(Debug, Clone)]
126pub struct InputPreprocessor {
127 config: ErrorConfig,
129}
130
131impl InputPreprocessor {
132 pub fn new(config: ErrorConfig) -> Self {
134 Self { config }
135 }
136
137 pub fn strict() -> Self {
139 Self::new(ErrorConfig::strict())
140 }
141
142 pub fn lenient() -> Self {
144 Self::new(ErrorConfig::lenient())
145 }
146
147 pub fn silent() -> Self {
149 Self::new(ErrorConfig::silent())
150 }
151
152 fn action_for(&self, error_type: ErrorType) -> ResolvedAction {
154 self.config.action_for(error_type)
155 }
156
157 pub fn preprocess(&self, input: &str) -> PreprocessResult {
161 let mut current = input.to_string();
163 let mut all_warnings = Vec::new();
164
165 if let Some(pos) = detect_position_zero(¤t) {
169 return PreprocessResult::failed(
170 input.to_string(),
171 FerroError::parse_with_diagnostic(
172 pos,
173 "Position 0 is not valid in HGVS notation",
174 Diagnostic::new()
175 .with_code(ErrorCode::InvalidPosition)
176 .with_span(SourceSpan::new(pos, pos + 1))
177 .with_source(input)
178 .with_hint("HGVS positions start at 1, not 0"),
179 ),
180 );
181 }
182
183 let (corrected, corrections) = correct_dash_characters(¤t);
185 if !corrections.is_empty() {
186 let action = self.action_for(ErrorType::WrongDashCharacter);
187 match action {
188 ResolvedAction::Reject => {
189 let first = &corrections[0];
190 return PreprocessResult::failed(
191 input.to_string(),
192 FerroError::parse_with_diagnostic(
193 first.start,
194 format!("Invalid dash character '{}', expected '-'", first.original),
195 Diagnostic::new()
196 .with_code(ErrorCode::UnexpectedChar)
197 .with_span(SourceSpan::new(first.start, first.end))
198 .with_source(input)
199 .with_suggestion(corrected.clone()),
200 ),
201 );
202 }
203 ResolvedAction::WarnCorrect => {
204 for c in &corrections {
205 all_warnings.push(CorrectionWarning::from_correction(c));
206 }
207 current = corrected;
208 }
209 ResolvedAction::SilentCorrect => {
210 current = corrected;
211 }
212 ResolvedAction::Accept => {
213 }
215 }
216 }
217
218 let (corrected, corrections) = correct_quote_characters(¤t);
220 if !corrections.is_empty() {
221 let action = self.action_for(ErrorType::WrongQuoteCharacter);
222 match action {
223 ResolvedAction::Reject => {
224 let first = &corrections[0];
225 return PreprocessResult::failed(
226 input.to_string(),
227 FerroError::parse_with_diagnostic(
228 first.start,
229 format!(
230 "Invalid quote character '{}', expected regular quotes",
231 first.original
232 ),
233 Diagnostic::new()
234 .with_code(ErrorCode::UnexpectedChar)
235 .with_span(SourceSpan::new(first.start, first.end))
236 .with_source(input)
237 .with_suggestion(corrected.clone()),
238 ),
239 );
240 }
241 ResolvedAction::WarnCorrect => {
242 for c in &corrections {
243 all_warnings.push(CorrectionWarning::from_correction(c));
244 }
245 current = corrected;
246 }
247 ResolvedAction::SilentCorrect => {
248 current = corrected;
249 }
250 ResolvedAction::Accept => {}
251 }
252 }
253
254 let (corrected, corrections) = correct_whitespace(¤t);
256 if !corrections.is_empty() {
257 let action = self.action_for(ErrorType::ExtraWhitespace);
258 match action {
259 ResolvedAction::Reject => {
260 let first = &corrections[0];
261 return PreprocessResult::failed(
262 input.to_string(),
263 FerroError::parse_with_diagnostic(
264 first.start,
265 "Extra whitespace in HGVS description",
266 Diagnostic::new()
267 .with_code(ErrorCode::UnexpectedChar)
268 .with_span(SourceSpan::new(first.start, first.end))
269 .with_source(input)
270 .with_suggestion(corrected.clone()),
271 ),
272 );
273 }
274 ResolvedAction::WarnCorrect => {
275 for c in &corrections {
276 all_warnings.push(CorrectionWarning::from_correction(c));
277 }
278 current = corrected;
279 }
280 ResolvedAction::SilentCorrect => {
281 current = corrected;
282 }
283 ResolvedAction::Accept => {}
284 }
285 }
286
287 let (corrected, corrections) = correct_accession_prefix_case(¤t);
289 if !corrections.is_empty() {
290 let action = self.action_for(ErrorType::LowercaseAccessionPrefix);
291 match action {
292 ResolvedAction::Reject => {
293 let first = &corrections[0];
294 return PreprocessResult::failed(
295 input.to_string(),
296 FerroError::parse_with_diagnostic(
297 first.start,
298 format!(
299 "Lowercase accession prefix '{}', expected uppercase",
300 first.original
301 ),
302 Diagnostic::new()
303 .with_code(ErrorCode::InvalidAccession)
304 .with_span(SourceSpan::new(first.start, first.end))
305 .with_source(input)
306 .with_suggestion(corrected.clone()),
307 ),
308 );
309 }
310 ResolvedAction::WarnCorrect => {
311 for c in &corrections {
312 all_warnings.push(CorrectionWarning::from_correction(c));
313 }
314 current = corrected;
315 }
316 ResolvedAction::SilentCorrect => {
317 current = corrected;
318 }
319 ResolvedAction::Accept => {}
320 }
321 }
322
323 let (corrected, corrections) = correct_protein_arrow(¤t);
325 if !corrections.is_empty() {
326 let action = self.action_for(ErrorType::ProteinSubstitutionArrow);
327 match action {
328 ResolvedAction::Reject => {
329 let first = &corrections[0];
330 return PreprocessResult::failed(
331 input.to_string(),
332 FerroError::parse_with_diagnostic(
333 first.start,
334 "Arrow '>' in protein substitution is not standard HGVS",
335 Diagnostic::new()
336 .with_code(ErrorCode::InvalidEdit)
337 .with_span(SourceSpan::new(first.start, first.end))
338 .with_source(input)
339 .with_suggestion(corrected.clone())
340 .with_hint("Use p.Val600Glu instead of p.Val600>Glu"),
341 ),
342 );
343 }
344 ResolvedAction::WarnCorrect => {
345 for c in &corrections {
346 all_warnings.push(CorrectionWarning::from_correction(c));
347 }
348 current = corrected;
349 }
350 ResolvedAction::SilentCorrect => {
351 current = corrected;
352 }
353 ResolvedAction::Accept => {}
354 }
355 }
356
357 let (corrected, corrections) = correct_missing_coordinate_prefix(¤t);
359 if !corrections.is_empty() {
360 let action = self.action_for(ErrorType::MissingCoordinatePrefix);
361 match action {
362 ResolvedAction::Reject => {
363 let first = &corrections[0];
364 return PreprocessResult::failed(
365 input.to_string(),
366 FerroError::parse_with_diagnostic(
367 first.start,
368 "Missing coordinate type prefix (e.g., 'g.' for genomic)",
369 Diagnostic::new()
370 .with_code(ErrorCode::InvalidAccession)
371 .with_span(SourceSpan::new(first.start, first.end))
372 .with_source(input)
373 .with_suggestion(corrected.clone())
374 .with_hint("For genomic accessions (NC_, NG_), add 'g.' before the position"),
375 ),
376 );
377 }
378 ResolvedAction::WarnCorrect => {
379 for c in &corrections {
380 all_warnings.push(CorrectionWarning::from_correction(c));
381 }
382 current = corrected;
383 }
384 ResolvedAction::SilentCorrect => {
385 current = corrected;
386 }
387 ResolvedAction::Accept => {}
388 }
389 }
390
391 let (corrected, corrections) = strip_trailing_annotation(¤t);
393 if !corrections.is_empty() {
394 let action = self.action_for(ErrorType::TrailingAnnotation);
395 match action {
396 ResolvedAction::Reject => {
397 let first = &corrections[0];
398 return PreprocessResult::failed(
399 input.to_string(),
400 FerroError::parse_with_diagnostic(
401 first.start,
402 format!(
403 "Trailing annotation '{}' is not valid HGVS syntax",
404 first.original
405 ),
406 Diagnostic::new()
407 .with_code(ErrorCode::UnexpectedChar)
408 .with_span(SourceSpan::new(first.start, first.end))
409 .with_source(input)
410 .with_suggestion(corrected.clone())
411 .with_hint("Protein consequence annotations should be separate from the HGVS expression"),
412 ),
413 );
414 }
415 ResolvedAction::WarnCorrect => {
416 for c in &corrections {
417 all_warnings.push(CorrectionWarning::from_correction(c));
418 }
419 current = corrected;
420 }
421 ResolvedAction::SilentCorrect => {
422 current = corrected;
423 }
424 ResolvedAction::Accept => {}
425 }
426 }
427
428 let (corrected, corrections) = correct_old_allele_format(¤t);
430 if !corrections.is_empty() {
431 let action = self.action_for(ErrorType::OldAlleleFormat);
432 match action {
433 ResolvedAction::Reject => {
434 let first = &corrections[0];
435 return PreprocessResult::failed(
436 input.to_string(),
437 FerroError::parse_with_diagnostic(
438 first.start,
439 "Old/deprecated allele format with coordinate type inside brackets",
440 Diagnostic::new()
441 .with_code(ErrorCode::InvalidEdit)
442 .with_span(SourceSpan::new(first.start, first.end))
443 .with_source(input)
444 .with_suggestion(corrected.clone())
445 .with_hint(
446 "Use c.[edit1;edit2] format instead of [c.edit1;c.edit2]",
447 ),
448 ),
449 );
450 }
451 ResolvedAction::WarnCorrect => {
452 for c in &corrections {
453 all_warnings.push(CorrectionWarning::from_correction(c));
454 }
455 current = corrected;
456 }
457 ResolvedAction::SilentCorrect => {
458 current = corrected;
459 }
460 ResolvedAction::Accept => {}
461 }
462 }
463
464 if current == input && all_warnings.is_empty() {
466 PreprocessResult::unchanged(input.to_string())
467 } else {
468 PreprocessResult::corrected(input.to_string(), current, all_warnings)
469 }
470 }
471}
472
473impl Default for InputPreprocessor {
474 fn default() -> Self {
475 Self::strict()
476 }
477}
478
#[cfg(test)]
mod tests {
    use super::*;
    use crate::error_handling::ErrorOverride;

    /// Deletion range written with an en dash (U+2013) instead of a hyphen.
    const EN_DASH_RANGE: &str = "c.100\u{2013}200del";
    /// The same deletion range written with a plain hyphen.
    const HYPHEN_RANGE: &str = "c.100-200del";
    /// Substitution surrounded by extra whitespace.
    const PADDED_SUBSTITUTION: &str = " c.100A>G ";
    /// Variant followed by a parenthesised protein annotation.
    const ANNOTATED: &str = "NM_000088.3:c.459A>G (p.Lys153=)";
    /// `ANNOTATED` without its trailing annotation.
    const ANNOTATION_STRIPPED: &str = "NM_000088.3:c.459A>G";
    /// Genomic accession whose description lacks the `g.` prefix.
    const UNPREFIXED_GENOMIC: &str = "NC_000017.11:12345A>G";
    /// `UNPREFIXED_GENOMIC` with the `g.` prefix in place.
    const PREFIXED_GENOMIC: &str = "NC_000017.11:g.12345A>G";

    // ---- PreprocessResult constructors ----

    #[test]
    fn test_preprocess_result_unchanged() {
        let outcome = PreprocessResult::unchanged("c.100A>G".to_string());
        assert!(outcome.success);
        assert!(!outcome.has_corrections());
        assert!(!outcome.has_warnings());
        assert_eq!(outcome.original, "c.100A>G");
        assert_eq!(outcome.preprocessed, "c.100A>G");
    }

    #[test]
    fn test_preprocess_result_corrected() {
        let warning = CorrectionWarning::new(
            ErrorType::WrongDashCharacter,
            "test warning",
            Some((5, 8)),
            "\u{2013}",
            "-",
        );
        let outcome = PreprocessResult::corrected(
            EN_DASH_RANGE.to_string(),
            HYPHEN_RANGE.to_string(),
            vec![warning],
        );
        assert!(outcome.success);
        assert!(outcome.has_corrections());
        assert!(outcome.has_warnings());
    }

    // ---- strict configuration ----

    #[test]
    fn test_preprocessor_strict_valid_input() {
        let outcome = InputPreprocessor::strict().preprocess("c.100A>G");
        assert!(outcome.success);
        assert!(!outcome.has_corrections());
    }

    #[test]
    fn test_preprocessor_strict_rejects_en_dash() {
        let outcome = InputPreprocessor::strict().preprocess(EN_DASH_RANGE);
        assert!(!outcome.success);
        assert!(outcome.error.is_some());
    }

    #[test]
    fn test_preprocessor_strict_rejects_whitespace() {
        let outcome = InputPreprocessor::strict().preprocess(PADDED_SUBSTITUTION);
        assert!(!outcome.success);
    }

    #[test]
    fn test_preprocessor_strict_rejects_position_zero() {
        let outcome = InputPreprocessor::strict().preprocess("c.0A>G");
        assert!(!outcome.success);
        assert!(outcome.error.is_some());
    }

    // ---- lenient configuration ----

    #[test]
    fn test_preprocessor_lenient_corrects_en_dash() {
        let outcome = InputPreprocessor::lenient().preprocess(EN_DASH_RANGE);
        assert!(outcome.success);
        assert_eq!(outcome.preprocessed, HYPHEN_RANGE);
        assert!(outcome.has_warnings());
    }

    #[test]
    fn test_preprocessor_lenient_corrects_whitespace() {
        let outcome = InputPreprocessor::lenient().preprocess(PADDED_SUBSTITUTION);
        assert!(outcome.success);
        assert_eq!(outcome.preprocessed, "c.100A>G");
        assert!(outcome.has_warnings());
    }

    #[test]
    fn test_preprocessor_lenient_corrects_protein_arrow() {
        let outcome = InputPreprocessor::lenient().preprocess("p.Val600>Glu");
        assert!(outcome.success);
        assert_eq!(outcome.preprocessed, "p.Val600Glu");
        assert!(outcome.has_warnings());
    }

    /// Position zero is rejected even by the lenient preprocessor.
    #[test]
    fn test_preprocessor_lenient_rejects_position_zero() {
        let outcome = InputPreprocessor::lenient().preprocess("c.0A>G");
        assert!(!outcome.success);
    }

    // ---- silent configuration ----

    #[test]
    fn test_preprocessor_silent_corrects_without_warnings() {
        let outcome = InputPreprocessor::silent().preprocess(EN_DASH_RANGE);
        assert!(outcome.success);
        assert_eq!(outcome.preprocessed, HYPHEN_RANGE);
        assert!(!outcome.has_warnings());
    }

    #[test]
    fn test_preprocessor_silent_corrects_multiple() {
        let outcome =
            InputPreprocessor::silent().preprocess(" nm_000088.3:c.100\u{2013}200del ");
        assert!(outcome.success);
        assert_eq!(outcome.preprocessed, "NM_000088.3:c.100-200del");
        assert!(!outcome.has_warnings());
    }

    // ---- per-error-type overrides ----

    #[test]
    fn test_preprocessor_override_reject_in_lenient() {
        let overridden = ErrorConfig::lenient()
            .with_override(ErrorType::WrongDashCharacter, ErrorOverride::Reject);
        let outcome = InputPreprocessor::new(overridden).preprocess(EN_DASH_RANGE);
        assert!(!outcome.success);
    }

    #[test]
    fn test_preprocessor_override_silent_in_lenient() {
        let overridden = ErrorConfig::lenient()
            .with_override(ErrorType::WrongDashCharacter, ErrorOverride::SilentCorrect);
        let outcome = InputPreprocessor::new(overridden).preprocess(EN_DASH_RANGE);
        assert!(outcome.success);
        assert!(!outcome.has_warnings());
    }

    #[test]
    fn test_preprocessor_override_correct_in_strict() {
        let overridden = ErrorConfig::strict()
            .with_override(ErrorType::WrongDashCharacter, ErrorOverride::WarnCorrect);
        let outcome = InputPreprocessor::new(overridden).preprocess(EN_DASH_RANGE);
        assert!(outcome.success);
        assert!(outcome.has_warnings());
        assert_eq!(outcome.preprocessed, HYPHEN_RANGE);
    }

    // ---- CorrectionWarning ----

    #[test]
    fn test_correction_warning_from_correction() {
        let detected =
            DetectedCorrection::new(ErrorType::WrongDashCharacter, "\u{2013}", "-", 5, 8);
        let warning = CorrectionWarning::from_correction(&detected);
        assert_eq!(warning.error_type, ErrorType::WrongDashCharacter);
        assert!(warning.message.contains("dash"));
        assert_eq!(warning.span, Some((5, 8)));
    }

    // ---- trailing annotations ----

    #[test]
    fn test_preprocessor_strict_rejects_trailing_annotation() {
        let outcome = InputPreprocessor::strict().preprocess(ANNOTATED);
        assert!(!outcome.success);
        assert!(outcome.error.is_some());
    }

    #[test]
    fn test_preprocessor_lenient_strips_trailing_annotation() {
        let outcome = InputPreprocessor::lenient().preprocess(ANNOTATED);
        assert!(outcome.success);
        assert_eq!(outcome.preprocessed, ANNOTATION_STRIPPED);
        assert!(outcome.has_warnings());
    }

    #[test]
    fn test_preprocessor_silent_strips_trailing_annotation() {
        let outcome = InputPreprocessor::silent().preprocess(ANNOTATED);
        assert!(outcome.success);
        assert_eq!(outcome.preprocessed, ANNOTATION_STRIPPED);
        assert!(!outcome.has_warnings());
    }

    #[test]
    fn test_preprocessor_lenient_clinvar_pattern() {
        let outcome =
            InputPreprocessor::lenient().preprocess("NM_003467.3(CXCR4):c.708G>A (p.Lys236=)");
        assert!(outcome.success);
        assert_eq!(outcome.preprocessed, "NM_003467.3(CXCR4):c.708G>A");
    }

    /// Strict base with whitespace corrected silently and trailing
    /// annotations corrected with a warning.
    #[test]
    fn test_preprocessor_override_accept_trailing_annotation() {
        let overridden = ErrorConfig::strict()
            .with_override(ErrorType::ExtraWhitespace, ErrorOverride::SilentCorrect)
            .with_override(ErrorType::TrailingAnnotation, ErrorOverride::WarnCorrect);
        let outcome = InputPreprocessor::new(overridden).preprocess(ANNOTATED);
        assert!(outcome.success);
        assert_eq!(outcome.preprocessed, ANNOTATION_STRIPPED);
        assert!(outcome.has_warnings());
    }

    /// Same annotation, but attached to the variant without a separating space.
    #[test]
    fn test_preprocessor_override_trailing_annotation_no_space() {
        let overridden = ErrorConfig::strict()
            .with_override(ErrorType::TrailingAnnotation, ErrorOverride::WarnCorrect);
        let outcome =
            InputPreprocessor::new(overridden).preprocess("NM_000088.3:c.459A>G(p.Lys153=)");
        assert!(outcome.success);
        assert_eq!(outcome.preprocessed, ANNOTATION_STRIPPED);
        assert!(outcome.has_warnings());
    }

    // ---- missing coordinate prefix ----

    #[test]
    fn test_preprocessor_strict_rejects_missing_prefix() {
        let outcome = InputPreprocessor::strict().preprocess(UNPREFIXED_GENOMIC);
        assert!(!outcome.success);
        assert!(outcome.error.is_some());
    }

    #[test]
    fn test_preprocessor_lenient_adds_missing_prefix() {
        let outcome = InputPreprocessor::lenient().preprocess(UNPREFIXED_GENOMIC);
        assert!(outcome.success);
        assert_eq!(outcome.preprocessed, PREFIXED_GENOMIC);
        assert!(outcome.has_warnings());
    }

    #[test]
    fn test_preprocessor_lenient_adds_missing_prefix_uncertain() {
        let outcome = InputPreprocessor::lenient()
            .preprocess("NC_000017.11:(?_31094927)_(31377677_?)del");
        assert!(outcome.success);
        assert_eq!(
            outcome.preprocessed,
            "NC_000017.11:g.(?_31094927)_(31377677_?)del"
        );
    }

    #[test]
    fn test_preprocessor_silent_adds_missing_prefix() {
        let outcome = InputPreprocessor::silent().preprocess(UNPREFIXED_GENOMIC);
        assert!(outcome.success);
        assert_eq!(outcome.preprocessed, PREFIXED_GENOMIC);
        assert!(!outcome.has_warnings());
    }
}