1pub mod boundary;
27pub mod config;
28pub mod rules;
29pub mod shuffle;
30pub mod validate;
31
32use crate::coords::{hgvs_pos_to_index, index_to_hgvs_pos};
33use crate::error::FerroError;
34use crate::hgvs::edit::NaEdit;
35use crate::hgvs::interval::Interval;
36use crate::hgvs::location::{CdsPos, GenomePos, TxPos};
37use crate::hgvs::variant::{CdsVariant, GenomeVariant, HgvsVariant, LocEdit, TxVariant};
38use crate::hgvs::HgvsVariant as HV;
39use crate::reference::ReferenceProvider;
40use boundary::Boundaries;
41pub use config::{NormalizeConfig, ShuffleDirection};
42use rules::{canonicalize_edit, needs_normalization, should_canonicalize};
43use shuffle::shuffle;
44
45#[derive(Debug, Clone)]
47pub struct NormalizationWarning {
48 pub code: String,
50 pub message: String,
52 pub stated_ref: String,
54 pub actual_ref: String,
56 pub position: String,
58 pub corrected: bool,
60}
61
62#[derive(Debug, Clone)]
64pub struct NormalizeResultWithWarnings {
65 pub result: HgvsVariant,
67 pub warnings: Vec<NormalizationWarning>,
69}
70
71impl NormalizeResultWithWarnings {
72 pub fn new(result: HgvsVariant) -> Self {
74 Self {
75 result,
76 warnings: vec![],
77 }
78 }
79
80 pub fn with_warnings(result: HgvsVariant, warnings: Vec<NormalizationWarning>) -> Self {
82 Self { result, warnings }
83 }
84
85 pub fn add_warning(&mut self, warning: NormalizationWarning) {
87 self.warnings.push(warning);
88 }
89
90 pub fn has_warnings(&self) -> bool {
92 !self.warnings.is_empty()
93 }
94
95 pub fn has_ref_mismatch(&self) -> bool {
97 self.warnings.iter().any(|w| w.code == "REFSEQ_MISMATCH")
98 }
99}
100
101pub struct Normalizer<P: ReferenceProvider> {
103 provider: P,
104 config: NormalizeConfig,
105}
106
107impl<P: ReferenceProvider> Normalizer<P> {
108 pub fn new(provider: P) -> Self {
110 Self {
111 provider,
112 config: NormalizeConfig::default(),
113 }
114 }
115
116 pub fn with_config(provider: P, config: NormalizeConfig) -> Self {
118 Self { provider, config }
119 }
120
121 pub fn config(&self) -> &NormalizeConfig {
123 &self.config
124 }
125
126 pub fn normalize(&self, variant: &HgvsVariant) -> Result<HgvsVariant, FerroError> {
131 let result = self.normalize_with_warnings(variant)?;
132
133 if self.config.should_reject_ref_mismatch() && result.has_ref_mismatch() {
135 if let Some(warning) = result.warnings.iter().find(|w| w.code == "REFSEQ_MISMATCH") {
136 return Err(FerroError::ReferenceMismatch {
137 location: warning.position.clone(),
138 expected: warning.stated_ref.clone(),
139 found: warning.actual_ref.clone(),
140 });
141 }
142 }
143
144 Ok(result.result)
145 }
146
147 pub fn normalize_with_warnings(
153 &self,
154 variant: &HgvsVariant,
155 ) -> Result<NormalizeResultWithWarnings, FerroError> {
156 let (result, warnings) = match variant {
157 HV::Genome(v) => self.normalize_genome(v)?,
158 HV::Cds(v) => self.normalize_cds(v)?,
159 HV::Tx(v) => self.normalize_tx(v)?,
160 HV::Protein(v) => self.normalize_protein(v)?,
161 HV::Rna(v) => self.normalize_rna(v)?,
162 HV::Mt(v) => self.normalize_mt(v)?,
163 HV::Allele(a) => self.normalize_allele(a)?,
164 HV::Circular(v) => (
166 HV::Circular(crate::hgvs::variant::CircularVariant {
167 accession: v.accession.clone(),
168 gene_symbol: v.gene_symbol.clone(),
169 loc_edit: v.loc_edit.clone(),
170 }),
171 vec![],
172 ),
173 HV::RnaFusion(v) => (HV::RnaFusion(v.clone()), vec![]),
175 HV::NullAllele => (HV::NullAllele, vec![]),
177 HV::UnknownAllele => (HV::UnknownAllele, vec![]),
178 };
179
180 Ok(NormalizeResultWithWarnings::with_warnings(result, warnings))
181 }
182
183 fn normalize_allele(
189 &self,
190 allele: &crate::hgvs::variant::AlleleVariant,
191 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
192 let mut all_warnings = Vec::new();
194 let mut normalized = Vec::new();
195
196 for v in &allele.variants {
197 let result = self.normalize_with_warnings(v)?;
198 all_warnings.extend(result.warnings);
199 normalized.push(result.result);
200 }
201
202 if self.config.prevent_overlap {
204 normalized = self.resolve_overlaps(normalized)?;
205 }
206
207 Ok((
208 HgvsVariant::Allele(crate::hgvs::variant::AlleleVariant::new(
209 normalized,
210 allele.phase,
211 )),
212 all_warnings,
213 ))
214 }
215
216 fn resolve_overlaps(&self, variants: Vec<HgvsVariant>) -> Result<Vec<HgvsVariant>, FerroError> {
221 if variants.len() < 2 {
222 return Ok(variants);
223 }
224
225 let mut positions: Vec<(usize, Option<(u64, u64)>)> = variants
227 .iter()
228 .enumerate()
229 .map(|(i, v)| (i, self.extract_position_range(v)))
230 .collect();
231
232 positions.sort_by(|a, b| match (&a.1, &b.1) {
234 (Some((s1, _)), Some((s2, _))) => s1.cmp(s2),
235 (Some(_), None) => std::cmp::Ordering::Less,
236 (None, Some(_)) => std::cmp::Ordering::Greater,
237 (None, None) => std::cmp::Ordering::Equal,
238 });
239
240 let mut needs_adjustment = vec![false; variants.len()];
242
243 for i in 0..positions.len() - 1 {
244 let (_idx_a, pos_a) = &positions[i];
245 let (idx_b, pos_b) = &positions[i + 1];
246
247 if let (Some((_, end_a)), Some((start_b, _))) = (pos_a, pos_b) {
248 if end_a >= start_b {
250 needs_adjustment[*idx_b] = true;
252 }
253 }
254 }
255
256 for (i, needs_adj) in needs_adjustment.iter().enumerate() {
260 if *needs_adj {
261 let _ = i; }
266 }
267
268 Ok(variants)
269 }
270
271 fn extract_position_range(&self, variant: &HgvsVariant) -> Option<(u64, u64)> {
273 match variant {
274 HV::Genome(v) => {
275 let start = v.loc_edit.location.start.inner()?.base;
276 let end = v.loc_edit.location.end.inner()?.base;
277 Some((start, end))
278 }
279 HV::Cds(v) => {
280 let start_pos = v.loc_edit.location.start.inner()?;
281 let end_pos = v.loc_edit.location.end.inner()?;
282 if start_pos.is_intronic() || end_pos.is_intronic() {
284 None
285 } else {
286 Some((start_pos.base as u64, end_pos.base as u64))
287 }
288 }
289 HV::Tx(v) => {
290 let start = v.loc_edit.location.start.inner()?.base;
291 let end = v.loc_edit.location.end.inner()?.base;
292 if start < 1 || end < 1 {
294 None
295 } else {
296 Some((start as u64, end as u64))
297 }
298 }
299 _ => None,
300 }
301 }
302
303 fn normalize_genome(
305 &self,
306 variant: &GenomeVariant,
307 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
308 let edit = match variant.loc_edit.edit.inner() {
310 Some(e) => e,
311 None => return Ok((HV::Genome(variant.clone()), vec![])),
312 };
313
314 if !needs_normalization(edit) {
316 return Ok((HV::Genome(variant.clone()), vec![]));
317 }
318
319 let accession = variant.accession.full();
321 let start = match variant.loc_edit.location.start.inner() {
322 Some(pos) => pos.base,
323 None => {
324 return Ok((
325 HV::Genome(self.canonicalize_genome_variant(variant)),
326 vec![],
327 ))
328 }
329 };
330 let end = match variant.loc_edit.location.end.inner() {
331 Some(pos) => pos.base,
332 None => {
333 return Ok((
334 HV::Genome(self.canonicalize_genome_variant(variant)),
335 vec![],
336 ))
337 }
338 };
339
340 let window_start = start.saturating_sub(self.config.window_size);
342 let seq_result = self.provider.get_sequence(
343 &accession,
344 window_start,
345 end.saturating_add(self.config.window_size),
346 );
347
348 let ref_seq = match seq_result {
349 Ok(s) => s,
350 Err(_) => {
352 return Ok((
353 HV::Genome(self.canonicalize_genome_variant(variant)),
354 vec![],
355 ))
356 }
357 };
358
359 let rel_start = start - window_start;
361 let rel_end = end - window_start;
362
363 let (new_rel_start, new_rel_end, new_edit, warnings) = self.normalize_na_edit(
365 ref_seq.as_bytes(),
366 edit,
367 rel_start,
368 rel_end,
369 &Boundaries::new(1, ref_seq.len() as u64),
370 )?;
371
372 let new_start = new_rel_start + window_start;
374 let new_end = new_rel_end + window_start;
375
376 let new_variant = GenomeVariant {
378 accession: variant.accession.clone(),
379 gene_symbol: variant.gene_symbol.clone(),
380 loc_edit: LocEdit::new(
381 Interval::new(GenomePos::new(new_start), GenomePos::new(new_end)),
382 new_edit,
383 ),
384 };
385
386 Ok((HV::Genome(new_variant), warnings))
387 }
388
389 fn normalize_cds(
391 &self,
392 variant: &CdsVariant,
393 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
394 let edit = match variant.loc_edit.edit.inner() {
396 Some(e) => e,
397 None => return Ok((HV::Cds(variant.clone()), vec![])),
398 };
399
400 if !needs_normalization(edit) {
402 return Ok((HV::Cds(variant.clone()), vec![]));
403 }
404
405 let start_pos = match variant.loc_edit.location.start.inner() {
407 Some(pos) => pos,
408 None => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
409 };
410 let end_pos = match variant.loc_edit.location.end.inner() {
411 Some(pos) => pos,
412 None => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
413 };
414
415 let accession = variant.accession.full();
417 let transcript = match self.provider.get_transcript(&accession) {
418 Ok(t) => t,
419 Err(_) => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
421 };
422
423 if start_pos.is_intronic() || end_pos.is_intronic() {
425 if start_pos.is_intronic() && end_pos.is_intronic() {
427 return self.normalize_intronic_cds(variant, &transcript, start_pos, end_pos, edit);
428 }
429 return self.normalize_boundary_spanning_cds(
431 variant,
432 &transcript,
433 start_pos,
434 end_pos,
435 edit,
436 );
437 }
438
439 let cds_start = transcript
441 .cds_start
442 .ok_or_else(|| FerroError::ConversionError {
443 msg: "Transcript has no CDS".to_string(),
444 })?;
445
446 let tx_start = self.cds_to_tx_pos(start_pos, cds_start, transcript.cds_end)?;
448 let tx_end = self.cds_to_tx_pos(end_pos, cds_start, transcript.cds_end)?;
449
450 let boundaries = if self.config.cross_boundaries {
452 Boundaries::new(1, transcript.sequence.len() as u64)
453 } else {
454 boundary::get_cds_boundaries(&transcript, tx_start, &self.config)?
455 };
456
457 let seq = transcript.sequence.as_bytes();
459 let (new_tx_start, new_tx_end, new_edit, warnings) =
460 self.normalize_na_edit(seq, edit, tx_start, tx_end, &boundaries)?;
461
462 let new_start = self.tx_to_cds_pos(new_tx_start, cds_start, transcript.cds_end)?;
464 let new_end = self.tx_to_cds_pos(new_tx_end, cds_start, transcript.cds_end)?;
465
466 let new_variant = CdsVariant {
467 accession: variant.accession.clone(),
468 gene_symbol: variant.gene_symbol.clone(),
469 loc_edit: LocEdit::new(Interval::new(new_start, new_end), new_edit),
470 };
471
472 Ok((HV::Cds(new_variant), warnings))
473 }
474
475 fn normalize_tx(
477 &self,
478 variant: &TxVariant,
479 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
480 let edit = match variant.loc_edit.edit.inner() {
482 Some(e) => e,
483 None => return Ok((HV::Tx(variant.clone()), vec![])),
484 };
485
486 if !needs_normalization(edit) {
488 return Ok((HV::Tx(variant.clone()), vec![]));
489 }
490
491 let start_pos = match variant.loc_edit.location.start.inner() {
493 Some(pos) => pos,
494 None => return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![])),
495 };
496 let end_pos = match variant.loc_edit.location.end.inner() {
497 Some(pos) => pos,
498 None => return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![])),
499 };
500
501 if start_pos.is_intronic() || end_pos.is_intronic() {
502 return Err(FerroError::IntronicVariant {
503 variant: format!("{}", variant),
504 });
505 }
506
507 let accession = variant.accession.full();
509 let transcript = match self.provider.get_transcript(&accession) {
510 Ok(t) => t,
511 Err(_) => return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![])),
513 };
514
515 if start_pos.base < 1 || end_pos.base < 1 {
518 return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![]));
519 }
520
521 let tx_start = start_pos.base as u64;
522 let tx_end = end_pos.base as u64;
523
524 let boundaries = Boundaries::new(1, transcript.sequence.len() as u64);
526
527 let seq = transcript.sequence.as_bytes();
529 let (new_start, new_end, new_edit, warnings) =
530 self.normalize_na_edit(seq, edit, tx_start, tx_end, &boundaries)?;
531
532 let new_variant = TxVariant {
533 accession: variant.accession.clone(),
534 gene_symbol: variant.gene_symbol.clone(),
535 loc_edit: LocEdit::new(
536 Interval::new(TxPos::new(new_start as i64), TxPos::new(new_end as i64)),
537 new_edit,
538 ),
539 };
540
541 Ok((HV::Tx(new_variant), warnings))
542 }
543
544 fn normalize_protein(
558 &self,
559 variant: &crate::hgvs::variant::ProteinVariant,
560 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
561 use crate::hgvs::edit::ProteinEdit;
562 use crate::hgvs::variant::{LocEdit, ProteinVariant};
563
564 if self.provider.has_protein_data() {
566 self.validate_protein_reference(variant)?;
567 }
568
569 let edit = match variant.loc_edit.edit.inner() {
571 Some(e) => e,
572 None => return Ok((HV::Protein(variant.clone()), vec![])),
573 };
574
575 let normalized_edit = match edit {
577 ProteinEdit::Deletion { sequence, count } => {
578 if let Some(seq) = sequence {
580 if self.is_redundant_protein_deletion_sequence(&variant.loc_edit.location, seq)
581 {
582 ProteinEdit::Deletion {
584 sequence: None,
585 count: *count,
586 }
587 } else {
588 edit.clone()
589 }
590 } else {
591 edit.clone()
592 }
593 }
594 _ => edit.clone(),
596 };
597
598 if &normalized_edit != edit {
600 let new_variant = ProteinVariant {
601 accession: variant.accession.clone(),
602 gene_symbol: variant.gene_symbol.clone(),
603 loc_edit: LocEdit::with_uncertainty(
604 variant.loc_edit.location.clone(),
605 variant.loc_edit.edit.map_ref(|_| normalized_edit),
606 ),
607 };
608 Ok((HV::Protein(new_variant), vec![]))
609 } else {
610 Ok((HV::Protein(variant.clone()), vec![]))
611 }
612 }
613
614 fn validate_protein_reference(
619 &self,
620 variant: &crate::hgvs::variant::ProteinVariant,
621 ) -> Result<(), FerroError> {
622 use crate::hgvs::edit::ProteinEdit;
623
624 let accession = variant.accession.full();
625
626 let start_pos = match variant.loc_edit.location.start.inner() {
628 Some(pos) => pos,
629 None => return Ok(()), };
631 let end_pos = match variant.loc_edit.location.end.inner() {
632 Some(pos) => pos,
633 None => return Ok(()),
634 };
635
636 let edit = match variant.loc_edit.edit.inner() {
638 Some(e) => e,
639 None => return Ok(()),
640 };
641
642 match edit {
644 ProteinEdit::Substitution { .. } => {
645 self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
648 }
649 ProteinEdit::Deletion { .. } => {
650 self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
652 if end_pos.number != start_pos.number {
654 self.validate_protein_position(&accession, end_pos.number, &end_pos.aa)?;
655 }
656 }
657 ProteinEdit::Duplication => {
658 self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
660 if end_pos.number != start_pos.number {
661 self.validate_protein_position(&accession, end_pos.number, &end_pos.aa)?;
662 }
663 }
664 ProteinEdit::Insertion { .. } => {
665 self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
667 self.validate_protein_position(&accession, end_pos.number, &end_pos.aa)?;
668 }
669 ProteinEdit::Delins { .. } => {
670 self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
672 if end_pos.number != start_pos.number {
673 self.validate_protein_position(&accession, end_pos.number, &end_pos.aa)?;
674 }
675 }
676 ProteinEdit::Frameshift { .. } => {
677 self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
679 }
680 ProteinEdit::Extension { .. } => {
681 self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
683 }
684 _ => {
685 }
687 }
688
689 Ok(())
690 }
691
692 fn validate_protein_position(
694 &self,
695 accession: &str,
696 position: u64,
697 expected_aa: &crate::hgvs::location::AminoAcid,
698 ) -> Result<(), FerroError> {
699 let start = hgvs_pos_to_index(position) as u64;
702 let end = position; match self.provider.get_protein_sequence(accession, start, end) {
706 Ok(ref_seq) => {
707 if ref_seq.len() != 1 {
708 return Ok(()); }
710
711 let ref_aa_char = ref_seq.chars().next().unwrap();
712 let expected_char = expected_aa.to_one_letter();
713
714 if ref_aa_char != expected_char {
715 return Err(FerroError::AminoAcidMismatch {
716 accession: accession.to_string(),
717 position,
718 expected: expected_aa.to_string(),
719 found: ref_aa_char.to_string(),
720 });
721 }
722 }
723 Err(_) => {
724 }
726 }
727
728 Ok(())
729 }
730
731 fn is_redundant_protein_deletion_sequence(
738 &self,
739 interval: &crate::hgvs::interval::ProtInterval,
740 sequence: &crate::hgvs::edit::AminoAcidSeq,
741 ) -> bool {
742 let start_pos = match interval.start.inner() {
744 Some(pos) => pos,
745 None => return false,
746 };
747 let end_pos = match interval.end.inner() {
748 Some(pos) => pos,
749 None => return false,
750 };
751
752 let interval_len = if end_pos.number >= start_pos.number {
754 (end_pos.number - start_pos.number + 1) as usize
755 } else {
756 return false;
757 };
758
759 if sequence.len() != interval_len {
761 return false;
762 }
763
764 if interval_len == 1 {
766 return sequence.0.len() == 1 && sequence.0[0] == start_pos.aa;
767 }
768
769 if let (Some(first), Some(last)) = (sequence.0.first(), sequence.0.last()) {
772 return *first == start_pos.aa && *last == end_pos.aa;
773 }
774
775 false
776 }
777
778 fn normalize_rna(
784 &self,
785 variant: &crate::hgvs::variant::RnaVariant,
786 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
787 use crate::hgvs::interval::RnaInterval;
788 use crate::hgvs::location::RnaPos;
789 use crate::hgvs::variant::{LocEdit, RnaVariant};
790
791 let edit = match variant.loc_edit.edit.inner() {
793 Some(e) => e,
794 None => return Ok((HV::Rna(variant.clone()), vec![])),
795 };
796
797 if !needs_normalization(edit) {
799 return Ok((HV::Rna(variant.clone()), vec![]));
800 }
801
802 let start_pos = match variant.loc_edit.location.start.inner() {
804 Some(pos) => pos,
805 None => return Ok((HV::Rna(variant.clone()), vec![])),
806 };
807 let end_pos = match variant.loc_edit.location.end.inner() {
808 Some(pos) => pos,
809 None => return Ok((HV::Rna(variant.clone()), vec![])),
810 };
811
812 if start_pos.is_intronic() || end_pos.is_intronic() {
813 return Err(FerroError::IntronicVariant {
814 variant: format!("{}", variant),
815 });
816 }
817
818 let accession = variant.accession.full();
820 let transcript = match self.provider.get_transcript(&accession) {
821 Ok(t) => t,
822 Err(_) => return Ok((HV::Rna(variant.clone()), vec![])),
823 };
824
825 if start_pos.base < 1 || end_pos.base < 1 {
828 return Ok((HV::Rna(variant.clone()), vec![]));
829 }
830
831 let rna_start = start_pos.base as u64;
832 let rna_end = end_pos.base as u64;
833
834 let boundaries = Boundaries::new(1, transcript.sequence.len() as u64);
836
837 let seq = transcript.sequence.as_bytes();
839 let (new_start, new_end, new_edit, warnings) =
840 self.normalize_na_edit(seq, edit, rna_start, rna_end, &boundaries)?;
841
842 let new_variant = RnaVariant {
843 accession: variant.accession.clone(),
844 gene_symbol: variant.gene_symbol.clone(),
845 loc_edit: LocEdit::new(
846 RnaInterval::new(RnaPos::new(new_start as i64), RnaPos::new(new_end as i64)),
847 new_edit,
848 ),
849 };
850
851 Ok((HV::Rna(new_variant), warnings))
852 }
853
854 fn normalize_mt(
856 &self,
857 variant: &crate::hgvs::variant::MtVariant,
858 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
859 Ok((HV::Mt(variant.clone()), vec![]))
862 }
863
864 fn normalize_intronic_cds(
869 &self,
870 variant: &CdsVariant,
871 transcript: &crate::reference::transcript::Transcript,
872 start_pos: &CdsPos,
873 end_pos: &CdsPos,
874 edit: &NaEdit,
875 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
876 use crate::convert::CoordinateMapper;
877
878 if !self.provider.has_genomic_data() {
880 return Err(FerroError::IntronicVariant {
881 variant: format!("{}", variant),
882 });
883 }
884
885 let chromosome =
887 transcript
888 .chromosome
889 .as_ref()
890 .ok_or_else(|| FerroError::ConversionError {
891 msg: "Transcript has no chromosome mapping for intronic normalization"
892 .to_string(),
893 })?;
894
895 let mapper = CoordinateMapper::new(transcript);
897
898 let g_start = mapper.cds_to_genomic_with_intron(start_pos)?;
900 let g_end = mapper.cds_to_genomic_with_intron(end_pos)?;
901
902 let (g_start, g_end) = if g_start <= g_end {
904 (g_start, g_end)
905 } else {
906 (g_end, g_start)
907 };
908
909 let window = self.config.window_size;
912 let seq_start = g_start.saturating_sub(window);
913 let seq_end = g_end.saturating_add(window);
914
915 let genomic_seq = self
917 .provider
918 .get_genomic_sequence(chromosome, seq_start, seq_end)?;
919
920 let rel_start = (g_start - seq_start) + 1; let rel_end = (g_end - seq_start) + 1;
923
924 let tx_pos = mapper.cds_to_tx(start_pos)?;
929 let tx_start = tx_pos.base as u64;
930
931 let intron = transcript
932 .find_intron_at_tx_boundary(tx_start, start_pos.offset.unwrap_or(0))
933 .ok_or_else(|| FerroError::ConversionError {
934 msg: "Could not find intron for normalization".to_string(),
935 })?;
936
937 let (intron_g_start, intron_g_end) = match (intron.genomic_start, intron.genomic_end) {
939 (Some(s), Some(e)) => (s, e),
940 _ => {
941 return Err(FerroError::ConversionError {
942 msg: "Intron has no genomic coordinates".to_string(),
943 })
944 }
945 };
946
947 let intron_rel_start = intron_g_start.saturating_sub(seq_start) + 1;
949 let intron_rel_end = intron_g_end.saturating_sub(seq_start) + 1;
950 let boundaries = Boundaries::new(intron_rel_start, intron_rel_end);
951
952 let seq_bytes = genomic_seq.as_bytes();
954 let (new_rel_start, new_rel_end, new_edit, warnings) =
955 self.normalize_na_edit(seq_bytes, edit, rel_start, rel_end, &boundaries)?;
956
957 let new_g_start = seq_start + new_rel_start - 1;
959 let new_g_end = seq_start + new_rel_end - 1;
960
961 let new_start = mapper.genomic_to_cds_intronic(new_g_start)?;
963 let new_end = mapper.genomic_to_cds_intronic(new_g_end)?;
964
965 let new_variant = CdsVariant {
966 accession: variant.accession.clone(),
967 gene_symbol: variant.gene_symbol.clone(),
968 loc_edit: LocEdit::new(Interval::new(new_start, new_end), new_edit),
969 };
970
971 Ok((HV::Cds(new_variant), warnings))
972 }
973
974 fn normalize_boundary_spanning_cds(
982 &self,
983 variant: &CdsVariant,
984 transcript: &crate::reference::transcript::Transcript,
985 start_pos: &CdsPos,
986 end_pos: &CdsPos,
987 edit: &NaEdit,
988 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
989 use crate::convert::CoordinateMapper;
990
991 if !self.provider.has_genomic_data() {
993 return Err(FerroError::ExonIntronBoundary {
994 exon: 0,
995 variant: format!("{}", variant),
996 });
997 }
998
999 let chromosome =
1000 transcript
1001 .chromosome
1002 .as_ref()
1003 .ok_or_else(|| FerroError::ConversionError {
1004 msg: "Transcript has no chromosome for boundary normalization".to_string(),
1005 })?;
1006
1007 let mapper = CoordinateMapper::new(transcript);
1008
1009 let g_start = self.cds_pos_to_genomic(&mapper, start_pos)?;
1013 let g_end = self.cds_pos_to_genomic(&mapper, end_pos)?;
1014
1015 let (g_start, g_end) = if g_start <= g_end {
1017 (g_start, g_end)
1018 } else {
1019 (g_end, g_start)
1020 };
1021
1022 let window = self.config.window_size;
1024 let seq_start = g_start.saturating_sub(window);
1025 let seq_end = g_end.saturating_add(window);
1026 let genomic_seq = self
1027 .provider
1028 .get_genomic_sequence(chromosome, seq_start, seq_end)?;
1029
1030 let rel_start = (g_start - seq_start) + 1;
1032 let rel_end = (g_end - seq_start) + 1;
1033
1034 let boundaries =
1037 self.get_boundary_spanning_limits(transcript, &mapper, start_pos, end_pos, seq_start)?;
1038
1039 let seq_bytes = genomic_seq.as_bytes();
1041 let (new_rel_start, new_rel_end, new_edit, warnings) =
1042 self.normalize_na_edit(seq_bytes, edit, rel_start, rel_end, &boundaries)?;
1043
1044 let new_g_start = seq_start + new_rel_start - 1;
1046 let new_g_end = seq_start + new_rel_end - 1;
1047
1048 let new_start = mapper.genomic_to_cds_intronic(new_g_start)?;
1054 let new_end = mapper.genomic_to_cds_intronic(new_g_end)?;
1055
1056 let new_variant = CdsVariant {
1057 accession: variant.accession.clone(),
1058 gene_symbol: variant.gene_symbol.clone(),
1059 loc_edit: LocEdit::new(Interval::new(new_start, new_end), new_edit),
1060 };
1061
1062 Ok((HV::Cds(new_variant), warnings))
1063 }
1064
1065 fn cds_pos_to_genomic(
1067 &self,
1068 mapper: &crate::convert::CoordinateMapper,
1069 pos: &CdsPos,
1070 ) -> Result<u64, FerroError> {
1071 if pos.is_intronic() {
1072 mapper.cds_to_genomic_with_intron(pos)
1073 } else {
1074 mapper
1075 .cds_to_genomic(pos)?
1076 .ok_or_else(|| FerroError::ConversionError {
1077 msg: format!("CDS position {} not in exons", pos.base),
1078 })
1079 }
1080 }
1081
1082 fn get_boundary_spanning_limits(
1087 &self,
1088 transcript: &crate::reference::transcript::Transcript,
1089 mapper: &crate::convert::CoordinateMapper,
1090 start_pos: &CdsPos,
1091 end_pos: &CdsPos,
1092 seq_start: u64,
1093 ) -> Result<Boundaries, FerroError> {
1094 let (exonic_pos, intronic_pos) = if start_pos.is_intronic() {
1101 (end_pos, start_pos)
1102 } else {
1103 (start_pos, end_pos)
1104 };
1105
1106 let tx_pos = mapper.cds_to_tx(exonic_pos)?;
1108 let exon =
1109 transcript
1110 .exon_at(tx_pos.base as u64)
1111 .ok_or_else(|| FerroError::ConversionError {
1112 msg: "Could not find exon for boundary normalization".to_string(),
1113 })?;
1114
1115 let tx_boundary = mapper.cds_to_tx(intronic_pos)?;
1117 let offset = intronic_pos.offset.unwrap_or(0);
1118 let intron = transcript
1119 .find_intron_at_tx_boundary(tx_boundary.base as u64, offset)
1120 .ok_or_else(|| FerroError::ConversionError {
1121 msg: "Could not find intron for boundary normalization".to_string(),
1122 })?;
1123
1124 let (exon_g_start, exon_g_end): (u64, u64) = match (exon.genomic_start, exon.genomic_end) {
1126 (Some(s), Some(e)) => (s, e),
1127 _ => {
1128 return Err(FerroError::ConversionError {
1129 msg: "Exon has no genomic coordinates".to_string(),
1130 })
1131 }
1132 };
1133
1134 let (intron_g_start, intron_g_end): (u64, u64) =
1135 match (intron.genomic_start, intron.genomic_end) {
1136 (Some(s), Some(e)) => (s, e),
1137 _ => {
1138 return Err(FerroError::ConversionError {
1139 msg: "Intron has no genomic coordinates".to_string(),
1140 })
1141 }
1142 };
1143
1144 let combined_start = exon_g_start.min(intron_g_start);
1146 let combined_end = exon_g_end.max(intron_g_end);
1147
1148 let rel_start = combined_start.saturating_sub(seq_start) + 1;
1150 let rel_end = combined_end.saturating_sub(seq_start) + 1;
1151
1152 Ok(Boundaries::new(rel_start, rel_end))
1153 }
1154
1155 fn normalize_na_edit(
1159 &self,
1160 ref_seq: &[u8],
1161 edit: &NaEdit,
1162 start: u64,
1163 end: u64,
1164 boundaries: &Boundaries,
1165 ) -> Result<(u64, u64, NaEdit, Vec<NormalizationWarning>), FerroError> {
1166 let mut warnings = Vec::new();
1167
1168 let validation = validate::validate_reference(edit, ref_seq, start, end);
1170 if !validation.valid {
1171 warnings.push(NormalizationWarning {
1172 code: "REFSEQ_MISMATCH".to_string(),
1173 message: validation.warning.unwrap_or_default(),
1174 stated_ref: validation.stated_ref.unwrap_or_default(),
1175 actual_ref: validation.actual_ref.unwrap_or_default(),
1176 position: format!("{}-{}", start, end),
1177 corrected: true,
1178 });
1179 }
1180
1181 let alt_seq = match edit {
1183 NaEdit::Deletion { .. } => vec![],
1184 NaEdit::Insertion { sequence } => {
1185 if let Some(bases) = sequence.bases() {
1187 bases.iter().map(|b| b.to_u8()).collect()
1188 } else {
1189 return Ok((start, end, edit.clone(), warnings.clone()));
1191 }
1192 }
1193 NaEdit::Duplication { sequence, .. } => {
1194 if let Some(seq) = sequence {
1195 seq.bases().iter().map(|b| b.to_u8()).collect()
1196 } else {
1197 let s = hgvs_pos_to_index(start);
1200 let e = end as usize;
1201 if e <= ref_seq.len() {
1202 ref_seq[s..e].to_vec()
1203 } else {
1204 vec![]
1205 }
1206 }
1207 }
1208 NaEdit::Delins { sequence } => {
1209 use crate::hgvs::edit::InsertedSequence;
1210
1211 if let InsertedSequence::Literal(seq) = sequence {
1215 let seq_bytes: Vec<u8> = seq.bases().iter().map(|b| *b as u8).collect();
1216 let start_idx = hgvs_pos_to_index(start);
1217 let end_idx = end as usize;
1218
1219 if rules::delins_is_duplication(ref_seq, start_idx, end_idx, &seq_bytes) {
1220 return Ok((
1222 start,
1223 end,
1224 NaEdit::Duplication {
1225 sequence: None,
1226 length: None,
1227 uncertain_extent: None,
1228 },
1229 warnings.clone(),
1230 ));
1231 }
1232 }
1233 return Ok((start, end, edit.clone(), warnings.clone()));
1235 }
1236 NaEdit::Inversion { sequence, length } => {
1237 let start_idx = hgvs_pos_to_index(start); let end_idx = end as usize; if let Some((new_s, new_e)) = rules::shorten_inversion(ref_seq, start_idx, end_idx)
1243 {
1244 let shortened_edit = NaEdit::Inversion {
1246 sequence: sequence.clone(),
1247 length: *length,
1248 };
1249 return Ok((
1250 index_to_hgvs_pos(new_s),
1251 new_e as u64,
1252 shortened_edit,
1253 warnings,
1254 ));
1255 } else {
1256 return Ok((
1258 start,
1259 end,
1260 NaEdit::Identity {
1261 sequence: None,
1262 whole_entity: false,
1263 },
1264 warnings,
1265 ));
1266 }
1267 }
1268 NaEdit::Repeat {
1269 sequence,
1270 count,
1271 additional_counts,
1272 trailing,
1273 } => {
1274 use crate::hgvs::edit::RepeatCount;
1275
1276 let Some(seq) = sequence else {
1278 return Ok((start, end, edit.clone(), warnings.clone()));
1279 };
1280
1281 let RepeatCount::Exact(specified_count) = count else {
1282 return Ok((start, end, edit.clone(), warnings.clone()));
1283 };
1284
1285 if !additional_counts.is_empty() || trailing.is_some() {
1287 return Ok((start, end, edit.clone(), warnings.clone()));
1288 }
1289
1290 let repeat_unit: Vec<u8> = seq.bases().iter().map(|b| b.to_u8()).collect();
1292 let pos_idx = hgvs_pos_to_index(start); match rules::normalize_repeat(ref_seq, pos_idx, &repeat_unit, *specified_count) {
1296 rules::RepeatNormResult::Deletion {
1297 start: del_start,
1298 end: del_end,
1299 } => {
1300 let del_edit = NaEdit::Deletion {
1302 sequence: None,
1303 length: None,
1304 };
1305 return Ok((del_start, del_end, del_edit, warnings));
1306 }
1307 rules::RepeatNormResult::Duplication {
1308 start: dup_start,
1309 end: dup_end,
1310 sequence: _dup_seq,
1311 } => {
1312 let dup_edit = NaEdit::Duplication {
1314 sequence: None,
1315 length: None,
1316 uncertain_extent: None,
1317 };
1318 return Ok((dup_start, dup_end, dup_edit, warnings));
1319 }
1320 rules::RepeatNormResult::Repeat {
1321 start: rep_start,
1322 end: rep_end,
1323 sequence: rep_seq,
1324 count: rep_count,
1325 } => {
1326 use crate::hgvs::edit::{Base, RepeatCount, Sequence};
1327 let bases: Vec<Base> = rep_seq
1328 .iter()
1329 .filter_map(|&b| Base::from_char(b as char))
1330 .collect();
1331 let rep_edit = NaEdit::Repeat {
1332 sequence: Some(Sequence::new(bases)),
1333 count: RepeatCount::Exact(rep_count),
1334 additional_counts: vec![],
1335 trailing: None,
1336 };
1337 return Ok((rep_start, rep_end, rep_edit, warnings));
1338 }
1339 rules::RepeatNormResult::Unchanged => {
1340 return Ok((start, end, edit.clone(), warnings.clone()));
1341 }
1342 }
1343 }
1344 _ => return Ok((start, end, edit.clone(), warnings.clone())), };
1346
1347 let shuffle_end = match edit {
1361 NaEdit::Insertion { .. } => end.saturating_sub(1), _ => end,
1363 };
1364 let start_idx = hgvs_pos_to_index(start); let shuffle_start = match edit {
1366 NaEdit::Insertion { .. } => start_idx as u64 + 1, _ => start_idx as u64,
1368 };
1369 let result = shuffle(
1370 ref_seq,
1371 &alt_seq,
1372 shuffle_start,
1373 shuffle_end, boundaries,
1375 self.config.shuffle_direction,
1376 );
1377
1378 let shuffle_result_start = match edit {
1381 NaEdit::Insertion { .. } => result.start.saturating_sub(1), _ => result.start,
1383 };
1384 let new_start = index_to_hgvs_pos(shuffle_result_start as usize);
1385 let new_end = match edit {
1387 NaEdit::Insertion { .. } => index_to_hgvs_pos(result.end as usize), _ => result.end,
1389 };
1390
1391 let (final_start, final_end, new_edit) = match edit {
1397 NaEdit::Insertion { sequence } => {
1398 use crate::hgvs::edit::{InsertedSequence, RepeatCount, Sequence};
1399
1400 if let InsertedSequence::Literal(seq) = sequence {
1401 let seq_bytes: Vec<u8> = seq.bases().iter().map(|b| *b as u8).collect();
1402
1403 if seq_bytes.len() > 1 {
1407 let original_pos_idx = hgvs_pos_to_index(start) as u64; if let Some((base, count, rep_start, rep_end)) =
1409 rules::insertion_to_repeat(ref_seq, original_pos_idx, &seq_bytes)
1410 {
1411 use crate::hgvs::edit::Base;
1413 if let Some(base_enum) = Base::from_char(base as char) {
1414 let repeat_seq = Sequence::new(vec![base_enum]);
1415 let repeat_edit = NaEdit::Repeat {
1416 sequence: Some(repeat_seq),
1417 count: RepeatCount::Exact(count),
1418 additional_counts: vec![],
1419 trailing: None,
1420 };
1421 return Ok((rep_start, rep_end, repeat_edit, warnings));
1422 }
1423 }
1424 }
1425
1426 let shift_amount =
1430 (result.start as usize).saturating_sub(shuffle_start as usize);
1431 let rotation = shift_amount % seq_bytes.len();
1432 let rotated_seq: Vec<u8> = if rotation > 0 {
1433 seq_bytes[rotation..]
1434 .iter()
1435 .chain(seq_bytes[..rotation].iter())
1436 .copied()
1437 .collect()
1438 } else {
1439 seq_bytes.clone()
1440 };
1441
1442 if rules::insertion_is_duplication(ref_seq, result.start, &rotated_seq) {
1443 let dup_len = rotated_seq.len() as u64;
1447 let (dup_start, dup_end) = if dup_len == 1 {
1448 (new_start, new_start) } else {
1450 (new_start - dup_len + 1, new_start)
1452 };
1453 (
1454 dup_start,
1455 dup_end,
1456 NaEdit::Duplication {
1457 sequence: None, length: None,
1459 uncertain_extent: None,
1460 },
1461 )
1462 } else {
1463 if rotation > 0 {
1465 use crate::hgvs::edit::{Base, InsertedSequence, Sequence};
1466 let rotated_bases: Vec<Base> = rotated_seq
1467 .iter()
1468 .filter_map(|&b| Base::from_char(b as char))
1469 .collect();
1470 let new_sequence =
1471 InsertedSequence::Literal(Sequence::new(rotated_bases));
1472 (
1473 new_start,
1474 new_end,
1475 NaEdit::Insertion {
1476 sequence: new_sequence,
1477 },
1478 )
1479 } else {
1480 (new_start, new_end, edit.clone())
1481 }
1482 }
1483 } else {
1484 (new_start, new_end, edit.clone())
1485 }
1486 }
1487 NaEdit::Duplication { .. } => {
1488 use crate::hgvs::edit::{Base, RepeatCount, Sequence};
1489
1490 if let Some(dup_result) =
1494 rules::duplication_to_repeat(ref_seq, result.start, result.end)
1495 {
1496 match dup_result {
1497 rules::DupToRepeatResult::Homopolymer {
1498 base,
1499 count,
1500 start: rep_start,
1501 end: rep_end,
1502 } => {
1503 if let Some(base_enum) = Base::from_char(base as char) {
1504 let repeat_seq = Sequence::new(vec![base_enum]);
1505 let repeat_edit = NaEdit::Repeat {
1506 sequence: Some(repeat_seq),
1507 count: RepeatCount::Exact(count),
1508 additional_counts: vec![],
1509 trailing: None,
1510 };
1511 return Ok((rep_start, rep_end, repeat_edit, warnings));
1512 }
1513 }
1514 rules::DupToRepeatResult::TandemRepeat {
1515 unit,
1516 count,
1517 start: rep_start,
1518 end: rep_end,
1519 } => {
1520 let bases: Vec<Base> = unit
1521 .iter()
1522 .filter_map(|&b| Base::from_char(b as char))
1523 .collect();
1524 if bases.len() == unit.len() {
1525 let repeat_seq = Sequence::new(bases);
1526 let repeat_edit = NaEdit::Repeat {
1527 sequence: Some(repeat_seq),
1528 count: RepeatCount::Exact(count),
1529 additional_counts: vec![],
1530 trailing: None,
1531 };
1532 return Ok((rep_start, rep_end, repeat_edit, warnings));
1533 }
1534 }
1535 }
1536 }
1537 (
1539 new_start,
1540 new_end,
1541 NaEdit::Duplication {
1542 sequence: None,
1543 length: None,
1544 uncertain_extent: None,
1545 },
1546 )
1547 }
1548 NaEdit::Delins { sequence } => {
1549 use crate::hgvs::edit::{InsertedSequence, Sequence};
1550
1551 if let InsertedSequence::Literal(seq) = sequence {
1554 let seq_bytes: Vec<u8> = seq.bases().iter().map(|b| *b as u8).collect();
1555 let start_0 = result.start as usize;
1556 let end_0 = result.end as usize;
1557
1558 if rules::delins_is_duplication(ref_seq, start_0, end_0, &seq_bytes) {
1559 let dup_len = end_0 - start_0;
1561 let dup_seq_bytes = &ref_seq[start_0..end_0];
1562 let dup_bases: Vec<crate::hgvs::edit::Base> = dup_seq_bytes
1563 .iter()
1564 .filter_map(|&b| crate::hgvs::edit::Base::from_char(b as char))
1565 .collect();
1566 let dup_seq = Sequence::new(dup_bases);
1567
1568 (
1569 new_start,
1570 new_end,
1571 NaEdit::Duplication {
1572 sequence: Some(dup_seq),
1573 length: Some(dup_len as u64),
1574 uncertain_extent: None,
1575 },
1576 )
1577 } else {
1578 (new_start, new_end, edit.clone())
1579 }
1580 } else {
1581 (new_start, new_end, edit.clone())
1582 }
1583 }
1584 NaEdit::Deletion { .. } => (
1586 new_start,
1587 new_end,
1588 NaEdit::Deletion {
1589 sequence: None,
1590 length: None,
1591 },
1592 ),
1593 _ => (new_start, new_end, edit.clone()),
1595 };
1596
1597 Ok((final_start, final_end, new_edit, warnings))
1598 }
1599
1600 fn cds_to_tx_pos(
1602 &self,
1603 pos: &CdsPos,
1604 cds_start: u64,
1605 cds_end: Option<u64>,
1606 ) -> Result<u64, FerroError> {
1607 if pos.utr3 {
1608 let end = cds_end.ok_or_else(|| FerroError::ConversionError {
1609 msg: "No CDS end".to_string(),
1610 })?;
1611 Ok(end + pos.base as u64)
1612 } else if pos.base < 1 {
1613 Ok((cds_start as i64 + pos.base - 1) as u64)
1614 } else {
1615 Ok(cds_start + pos.base as u64 - 1)
1616 }
1617 }
1618
1619 fn tx_to_cds_pos(
1621 &self,
1622 pos: u64,
1623 cds_start: u64,
1624 cds_end: Option<u64>,
1625 ) -> Result<CdsPos, FerroError> {
1626 let end = cds_end.ok_or_else(|| FerroError::ConversionError {
1627 msg: "No CDS end".to_string(),
1628 })?;
1629
1630 if pos < cds_start {
1631 Ok(CdsPos {
1634 base: pos as i64 - cds_start as i64 + 1,
1635 offset: None,
1636 utr3: false,
1637 })
1638 } else if pos > end {
1639 Ok(CdsPos {
1640 base: (pos - end) as i64,
1641 offset: None,
1642 utr3: true,
1643 })
1644 } else {
1645 Ok(CdsPos {
1646 base: (pos - cds_start + 1) as i64,
1647 offset: None,
1648 utr3: false,
1649 })
1650 }
1651 }
1652
1653 fn canonicalize_cds_variant(&self, variant: &CdsVariant) -> CdsVariant {
1658 let edit = match variant.loc_edit.edit.inner() {
1659 Some(e) => e,
1660 None => return variant.clone(),
1661 };
1662
1663 if !should_canonicalize(edit) {
1665 return variant.clone();
1666 }
1667
1668 let canonical_edit = canonicalize_edit(edit);
1669
1670 CdsVariant {
1671 accession: variant.accession.clone(),
1672 gene_symbol: variant.gene_symbol.clone(),
1673 loc_edit: LocEdit::with_uncertainty(
1674 variant.loc_edit.location.clone(),
1675 variant.loc_edit.edit.map_ref(|_| canonical_edit),
1676 ),
1677 }
1678 }
1679
1680 fn canonicalize_genome_variant(&self, variant: &GenomeVariant) -> GenomeVariant {
1682 let edit = match variant.loc_edit.edit.inner() {
1683 Some(e) => e,
1684 None => return variant.clone(),
1685 };
1686
1687 if !should_canonicalize(edit) {
1688 return variant.clone();
1689 }
1690
1691 let canonical_edit = canonicalize_edit(edit);
1692
1693 GenomeVariant {
1694 accession: variant.accession.clone(),
1695 gene_symbol: variant.gene_symbol.clone(),
1696 loc_edit: LocEdit::with_uncertainty(
1697 variant.loc_edit.location.clone(),
1698 variant.loc_edit.edit.map_ref(|_| canonical_edit),
1699 ),
1700 }
1701 }
1702
1703 fn canonicalize_tx_variant(&self, variant: &TxVariant) -> TxVariant {
1705 let edit = match variant.loc_edit.edit.inner() {
1706 Some(e) => e,
1707 None => return variant.clone(),
1708 };
1709
1710 if !should_canonicalize(edit) {
1711 return variant.clone();
1712 }
1713
1714 let canonical_edit = canonicalize_edit(edit);
1715
1716 TxVariant {
1717 accession: variant.accession.clone(),
1718 gene_symbol: variant.gene_symbol.clone(),
1719 loc_edit: LocEdit::with_uncertainty(
1720 variant.loc_edit.location.clone(),
1721 variant.loc_edit.edit.map_ref(|_| canonical_edit),
1722 ),
1723 }
1724 }
1725}
1726
1727#[cfg(test)]
1728mod tests {
1729 use super::*;
1730 use crate::hgvs::parser::parse_hgvs;
1731 use crate::reference::MockProvider;
1732
1733 #[test]
1734 fn test_normalize_substitution_unchanged() {
1735 let provider = MockProvider::with_test_data();
1736 let normalizer = Normalizer::new(provider);
1737
1738 let variant = parse_hgvs("NM_000088.3:c.10A>G").unwrap();
1739 let normalized = normalizer.normalize(&variant).unwrap();
1740
1741 assert_eq!(format!("{}", variant), format!("{}", normalized));
1743 }
1744
1745 #[test]
1746 fn test_normalize_with_config() {
1747 let provider = MockProvider::with_test_data();
1748 let config = NormalizeConfig::default().with_direction(ShuffleDirection::FivePrime);
1749 let normalizer = Normalizer::with_config(provider, config);
1750
1751 assert_eq!(
1752 normalizer.config().shuffle_direction,
1753 ShuffleDirection::FivePrime
1754 );
1755 }
1756
1757 #[test]
1758 fn test_normalizer_handles_missing_transcript() {
1759 let provider = MockProvider::new(); let normalizer = Normalizer::new(provider);
1761
1762 let variant = parse_hgvs("NM_MISSING.1:c.100del").unwrap();
1764 let result = normalizer.normalize(&variant);
1765 assert!(result.is_ok());
1766 assert_eq!(
1768 format!("{}", variant),
1769 format!("{}", result.unwrap()),
1770 "Missing transcript should return variant unchanged"
1771 );
1772 }
1773
1774 #[test]
1775 fn test_normalize_deletion_shifts_3prime() {
1776 let provider = MockProvider::with_test_data();
1781 let normalizer = Normalizer::new(provider);
1782
1783 let variant = parse_hgvs("NM_001234.1:c.10del").unwrap();
1784 let result = normalizer.normalize(&variant).unwrap();
1785 let output = format!("{}", result);
1786
1787 assert!(
1789 output.contains("del"),
1790 "Deletion should remain a deletion, got: {}",
1791 output
1792 );
1793 assert!(
1795 output.contains("c.11del"),
1796 "Deletion should shift 3' to exon boundary (c.11), got: {}",
1797 output
1798 );
1799 }
1800
1801 #[test]
1802 fn test_normalize_insertion_becomes_dup() {
1803 let provider = MockProvider::with_test_data();
1806 let normalizer = Normalizer::new(provider);
1807
1808 let variant = parse_hgvs("NM_001234.1:c.10_11insG").unwrap();
1809 let result = normalizer.normalize(&variant).unwrap();
1810 let output = format!("{}", result);
1811
1812 assert!(
1814 output.contains("dup"),
1815 "Insertion of matching base should become dup, got: {}",
1816 output
1817 );
1818 }
1819
1820 #[test]
1821 fn test_normalize_duplication_shifts_3prime() {
1822 let provider = MockProvider::with_test_data();
1825 let normalizer = Normalizer::new(provider);
1826
1827 let variant = parse_hgvs("NM_001234.1:c.10dup").unwrap();
1828 let result = normalizer.normalize(&variant).unwrap();
1829 let output = format!("{}", result);
1830
1831 assert!(
1834 output.contains("dup"),
1835 "Single-base duplication should remain as dup notation, got: {}",
1836 output
1837 );
1838 }
1839
1840 #[test]
1841 fn test_normalize_delins_unchanged() {
1842 let provider = MockProvider::with_test_data();
1845 let normalizer = Normalizer::new(provider);
1846
1847 let variant = parse_hgvs("NM_000088.3:c.10delinsAT").unwrap();
1848 let result = normalizer.normalize(&variant).unwrap();
1849 let output = format!("{}", result);
1850
1851 assert!(
1852 output.contains("delinsAT"),
1853 "Delins should remain unchanged, got: {}",
1854 output
1855 );
1856 }
1857
1858 #[test]
1859 fn test_normalize_protein_substitution_unchanged() {
1860 let provider = MockProvider::with_test_data();
1861 let normalizer = Normalizer::new(provider);
1862
1863 let variant = parse_hgvs("NP_000079.2:p.Val600Glu").unwrap();
1865 let normalized = normalizer.normalize(&variant).unwrap();
1866 assert_eq!(format!("{}", variant), format!("{}", normalized));
1867 }
1868
1869 #[test]
1870 fn test_normalize_protein_deletion_removes_redundant_sequence() {
1871 let provider = MockProvider::with_test_data();
1873 let normalizer = Normalizer::new(provider);
1874
1875 let variant = parse_hgvs("NP_000079.2:p.Val600delVal").unwrap();
1876 let normalized = normalizer.normalize(&variant).unwrap();
1877 let output = format!("{}", normalized);
1878
1879 assert_eq!(
1881 output, "NP_000079.2:p.Val600del",
1882 "Redundant sequence should be removed from deletion"
1883 );
1884 }
1885
1886 #[test]
1887 fn test_normalize_protein_deletion_range_removes_redundant_sequence() {
1888 let provider = MockProvider::with_test_data();
1890 let normalizer = Normalizer::new(provider);
1891
1892 let variant = parse_hgvs("NP_000079.2:p.Lys23_Glu25delLysAlaGlu").unwrap();
1893 let normalized = normalizer.normalize(&variant).unwrap();
1894 let output = format!("{}", normalized);
1895
1896 assert_eq!(
1898 output, "NP_000079.2:p.Lys23_Glu25del",
1899 "Redundant sequence should be removed from range deletion"
1900 );
1901 }
1902
1903 #[test]
1904 fn test_normalize_protein_deletion_non_matching_sequence_unchanged() {
1905 let provider = MockProvider::with_test_data();
1907 let normalizer = Normalizer::new(provider);
1908
1909 let variant = parse_hgvs("NP_000079.2:p.Val600delGlu").unwrap();
1910 let normalized = normalizer.normalize(&variant).unwrap();
1911 let output = format!("{}", normalized);
1912
1913 assert_eq!(
1915 output, "NP_000079.2:p.Val600delGlu",
1916 "Non-matching sequence should not be removed"
1917 );
1918 }
1919
1920 #[test]
1921 fn test_normalize_protein_deletion_no_sequence_unchanged() {
1922 let provider = MockProvider::with_test_data();
1924 let normalizer = Normalizer::new(provider);
1925
1926 let variant = parse_hgvs("NP_000079.2:p.Val600del").unwrap();
1927 let normalized = normalizer.normalize(&variant).unwrap();
1928 let output = format!("{}", normalized);
1929
1930 assert_eq!(
1931 output, "NP_000079.2:p.Val600del",
1932 "Deletion without sequence should remain unchanged"
1933 );
1934 }
1935
1936 #[test]
1937 fn test_normalize_protein_duplication_unchanged() {
1938 let provider = MockProvider::with_test_data();
1940 let normalizer = Normalizer::new(provider);
1941
1942 let variant = parse_hgvs("NP_000079.2:p.Val600dup").unwrap();
1943 let normalized = normalizer.normalize(&variant).unwrap();
1944 assert_eq!(format!("{}", variant), format!("{}", normalized));
1945 }
1946
1947 #[test]
1948 fn test_normalize_protein_frameshift_unchanged() {
1949 let provider = MockProvider::with_test_data();
1951 let normalizer = Normalizer::new(provider);
1952
1953 let variant = parse_hgvs("NP_000079.2:p.Arg97ProfsTer23").unwrap();
1954 let normalized = normalizer.normalize(&variant).unwrap();
1955 assert_eq!(format!("{}", variant), format!("{}", normalized));
1956 }
1957
1958 #[test]
1959 fn test_normalize_protein_reference_validation_match() {
1960 let provider = MockProvider::with_test_data();
1963 let normalizer = Normalizer::new(provider);
1964
1965 let variant = parse_hgvs("NP_TEST.1:p.Met1Val").unwrap();
1967 let result = normalizer.normalize(&variant);
1968 assert!(
1969 result.is_ok(),
1970 "Validation should pass for matching amino acid"
1971 );
1972 }
1973
1974 #[test]
1975 fn test_normalize_protein_reference_validation_mismatch() {
1976 let provider = MockProvider::with_test_data();
1979 let normalizer = Normalizer::new(provider);
1980
1981 let variant = parse_hgvs("NP_TEST.1:p.Val1Glu").unwrap();
1983 let result = normalizer.normalize(&variant);
1984 assert!(
1985 result.is_err(),
1986 "Validation should fail for mismatched amino acid"
1987 );
1988
1989 if let Err(crate::error::FerroError::AminoAcidMismatch {
1990 position,
1991 expected,
1992 found,
1993 ..
1994 }) = result
1995 {
1996 assert_eq!(position, 1);
1997 assert_eq!(expected, "Val");
1998 assert_eq!(found, "M");
1999 } else {
2000 panic!("Expected AminoAcidMismatch error");
2001 }
2002 }
2003
2004 #[test]
2005 fn test_normalize_protein_reference_validation_deletion() {
2006 let provider = MockProvider::with_test_data();
2009 let normalizer = Normalizer::new(provider);
2010
2011 let variant = parse_hgvs("NP_TEST.1:p.Val2del").unwrap();
2013 let result = normalizer.normalize(&variant);
2014 assert!(
2015 result.is_ok(),
2016 "Validation should pass for matching deletion position"
2017 );
2018 }
2019
2020 #[test]
2021 fn test_normalize_protein_reference_validation_missing_protein() {
2022 let provider = MockProvider::with_test_data();
2024 let normalizer = Normalizer::new(provider);
2025
2026 let variant = parse_hgvs("NP_MISSING.1:p.Val600Glu").unwrap();
2028 let result = normalizer.normalize(&variant);
2029 assert!(
2031 result.is_ok(),
2032 "Missing protein should skip validation, not fail"
2033 );
2034 }
2035
2036 #[test]
2037 fn test_normalize_rna_substitution_unchanged() {
2038 let provider = MockProvider::with_test_data();
2039 let normalizer = Normalizer::new(provider);
2040
2041 let variant = parse_hgvs("NM_000088.3:r.10a>g").unwrap();
2043 let normalized = normalizer.normalize(&variant).unwrap();
2044 assert_eq!(format!("{}", variant), format!("{}", normalized));
2045 }
2046
2047 #[test]
2048 fn test_normalize_rna_deletion_shifts_3prime() {
2049 let provider = MockProvider::with_test_data();
2052 let normalizer = Normalizer::new(provider);
2053
2054 let variant = parse_hgvs("NM_001234.1:r.14del").unwrap();
2055 let result = normalizer.normalize(&variant).unwrap();
2056 let output = format!("{}", result);
2057
2058 assert!(
2060 output.contains("del"),
2061 "Deletion should remain a deletion, got: {}",
2062 output
2063 );
2064 assert!(
2066 output.contains("r.37del"),
2067 "Deletion should shift 3' to position 37, got: {}",
2068 output
2069 );
2070 }
2071
2072 #[test]
2073 fn test_normalize_rna_insertion_becomes_dup() {
2074 let provider = MockProvider::with_test_data();
2077 let normalizer = Normalizer::new(provider);
2078
2079 let variant = parse_hgvs("NM_001234.1:r.14_15insg").unwrap();
2080 let result = normalizer.normalize(&variant).unwrap();
2081 let output = format!("{}", result);
2082
2083 assert!(
2085 output.contains("dup"),
2086 "Insertion of matching base should become dup, got: {}",
2087 output
2088 );
2089 }
2090
2091 #[test]
2092 fn test_normalize_rna_duplication_shifts_3prime() {
2093 let provider = MockProvider::with_test_data();
2095 let normalizer = Normalizer::new(provider);
2096
2097 let variant = parse_hgvs("NM_001234.1:r.14dup").unwrap();
2098 let result = normalizer.normalize(&variant).unwrap();
2099 let output = format!("{}", result);
2100
2101 assert!(
2103 output.contains("dup"),
2104 "Single-base RNA duplication should remain as dup notation, got: {}",
2105 output
2106 );
2107 }
2108
2109 #[test]
2110 fn test_normalize_rna_intronic_returns_error() {
2111 let provider = MockProvider::with_test_data();
2112 let normalizer = Normalizer::new(provider);
2113
2114 let variant = parse_hgvs("NM_001234.1:r.10+5del").unwrap();
2116 let result = normalizer.normalize(&variant);
2117 assert!(result.is_err(), "Intronic RNA variant should return error");
2118 }
2119
2120 #[test]
2121 fn test_normalize_rna_missing_transcript_unchanged() {
2122 let provider = MockProvider::with_test_data();
2123 let normalizer = Normalizer::new(provider);
2124
2125 let variant = parse_hgvs("NM_MISSING.1:r.100del").unwrap();
2127 let normalized = normalizer.normalize(&variant).unwrap();
2128 assert_eq!(format!("{}", variant), format!("{}", normalized));
2129 }
2130
2131 #[test]
2132 fn test_normalize_null_allele() {
2133 let provider = MockProvider::with_test_data();
2134 let normalizer = Normalizer::new(provider);
2135
2136 let variant = HgvsVariant::NullAllele;
2138 let normalized = normalizer.normalize(&variant).unwrap();
2139 assert!(matches!(normalized, HgvsVariant::NullAllele));
2140 }
2141
2142 #[test]
2143 fn test_normalize_unknown_allele() {
2144 let provider = MockProvider::with_test_data();
2145 let normalizer = Normalizer::new(provider);
2146
2147 let variant = HgvsVariant::UnknownAllele;
2149 let normalized = normalizer.normalize(&variant).unwrap();
2150 assert!(matches!(normalized, HgvsVariant::UnknownAllele));
2151 }
2152
2153 #[test]
2154 fn test_normalize_allele_variant() {
2155 let provider = MockProvider::with_test_data();
2156 let normalizer = Normalizer::new(provider);
2157
2158 let variant = parse_hgvs("[NM_000088.3:c.10A>G;NM_000088.3:c.20C>T]").unwrap();
2161 let result = normalizer.normalize(&variant).unwrap();
2162
2163 assert!(matches!(result, HgvsVariant::Allele(_)));
2165
2166 let output = format!("{}", result);
2168 assert!(
2169 output.contains("c.10A>G"),
2170 "Allele should contain first variant, got: {}",
2171 output
2172 );
2173 assert!(
2174 output.contains("c.20C>T"),
2175 "Allele should contain second variant, got: {}",
2176 output
2177 );
2178 }
2179
2180 #[test]
2181 fn test_normalize_5prime_direction() {
2182 let provider = MockProvider::with_test_data();
2188 let config = NormalizeConfig::default().with_direction(ShuffleDirection::FivePrime);
2189 let normalizer = Normalizer::with_config(provider, config);
2190
2191 let variant = parse_hgvs("NM_001234.1:c.20del").unwrap();
2193 let result = normalizer.normalize(&variant).unwrap();
2194 let output = format!("{}", result);
2195
2196 assert!(
2197 output.contains("del"),
2198 "Should remain a deletion, got: {}",
2199 output
2200 );
2201 assert!(
2204 output.contains("c.13del") || output.contains("c.12del"),
2205 "5' direction should shift deletion toward exon start, got: {}",
2206 output
2207 );
2208 }
2209
2210 #[test]
2211 fn test_normalize_3prime_direction() {
2212 let provider = MockProvider::with_test_data();
2213 let config = NormalizeConfig::default().with_direction(ShuffleDirection::ThreePrime);
2214 let normalizer = Normalizer::with_config(provider, config);
2215
2216 let variant = parse_hgvs("NM_000088.3:c.10del").unwrap();
2217 let result = normalizer.normalize(&variant);
2218 assert!(result.is_ok());
2219 }
2220
2221 #[test]
2222 fn test_normalize_with_cross_boundaries() {
2223 let provider = MockProvider::with_test_data();
2224 let config = NormalizeConfig::default().allow_crossing_boundaries();
2225 let normalizer = Normalizer::with_config(provider, config);
2226
2227 let variant = parse_hgvs("NM_000088.3:c.10del").unwrap();
2228 let result = normalizer.normalize(&variant);
2229 assert!(result.is_ok());
2230 }
2231
2232 #[test]
2233 fn test_normalize_genomic_variant() {
2234 let provider = MockProvider::with_test_data();
2235 let normalizer = Normalizer::new(provider);
2236
2237 let variant = parse_hgvs("NC_000001.11:g.12345del").unwrap();
2239 let result = normalizer.normalize(&variant);
2240 assert!(result.is_ok());
2241 }
2242
2243 #[test]
2244 fn test_normalize_tx_variant() {
2245 let provider = MockProvider::with_test_data();
2246 let normalizer = Normalizer::new(provider);
2247
2248 let variant = parse_hgvs("NR_000001.1:n.100del").unwrap();
2249 let result = normalizer.normalize(&variant);
2250 assert!(result.is_ok());
2251 }
2252
2253 #[test]
2254 fn test_normalize_mt_variant() {
2255 let provider = MockProvider::with_test_data();
2256 let normalizer = Normalizer::new(provider);
2257
2258 let variant = parse_hgvs("NC_012920.1:m.100A>G").unwrap();
2260 let normalized = normalizer.normalize(&variant).unwrap();
2261 assert!(matches!(normalized, HgvsVariant::Mt(_)));
2262 }
2263
2264 #[test]
2265 fn test_cds_to_tx_pos_utr5() {
2266 let provider = MockProvider::with_test_data();
2267 let normalizer = Normalizer::new(provider);
2268
2269 let cds_pos = CdsPos {
2271 base: -5,
2272 offset: None,
2273 utr3: false,
2274 };
2275 let result = normalizer.cds_to_tx_pos(&cds_pos, 10, Some(50));
2276 assert!(result.is_ok());
2277 }
2278
2279 #[test]
2280 fn test_cds_to_tx_pos_utr3() {
2281 let provider = MockProvider::with_test_data();
2282 let normalizer = Normalizer::new(provider);
2283
2284 let cds_pos = CdsPos {
2286 base: 5,
2287 offset: None,
2288 utr3: true,
2289 };
2290 let result = normalizer.cds_to_tx_pos(&cds_pos, 10, Some(50));
2291 assert!(result.is_ok());
2292 assert_eq!(result.unwrap(), 55);
2293 }
2294
2295 #[test]
2296 fn test_cds_to_tx_pos_coding() {
2297 let provider = MockProvider::with_test_data();
2298 let normalizer = Normalizer::new(provider);
2299
2300 let cds_pos = CdsPos {
2302 base: 10,
2303 offset: None,
2304 utr3: false,
2305 };
2306 let result = normalizer.cds_to_tx_pos(&cds_pos, 5, Some(50));
2307 assert!(result.is_ok());
2308 assert_eq!(result.unwrap(), 14); }
2310
2311 #[test]
2312 fn test_tx_to_cds_pos_utr5() {
2313 let provider = MockProvider::with_test_data();
2314 let normalizer = Normalizer::new(provider);
2315
2316 let result = normalizer.tx_to_cds_pos(3, 10, Some(50));
2318 assert!(result.is_ok());
2319 let cds_pos = result.unwrap();
2320 assert!(cds_pos.base < 0);
2321 assert!(!cds_pos.utr3);
2322 }
2323
2324 #[test]
2325 fn test_tx_to_cds_pos_utr3() {
2326 let provider = MockProvider::with_test_data();
2327 let normalizer = Normalizer::new(provider);
2328
2329 let result = normalizer.tx_to_cds_pos(55, 10, Some(50));
2331 assert!(result.is_ok());
2332 let cds_pos = result.unwrap();
2333 assert!(cds_pos.utr3);
2334 }
2335
2336 #[test]
2337 fn test_tx_to_cds_pos_coding() {
2338 let provider = MockProvider::with_test_data();
2339 let normalizer = Normalizer::new(provider);
2340
2341 let result = normalizer.tx_to_cds_pos(20, 10, Some(50));
2343 assert!(result.is_ok());
2344 let cds_pos = result.unwrap();
2345 assert!(!cds_pos.utr3);
2346 assert_eq!(cds_pos.base, 11); }
2348
2349 #[test]
2350 fn test_config_default() {
2351 let config = NormalizeConfig::default();
2352 assert_eq!(config.shuffle_direction, ShuffleDirection::ThreePrime);
2353 assert!(!config.cross_boundaries);
2354 }
2355
2356 #[test]
2357 #[allow(deprecated)]
2358 fn test_config_builder() {
2359 let config = NormalizeConfig::default()
2360 .with_direction(ShuffleDirection::FivePrime)
2361 .allow_crossing_boundaries()
2362 .skip_validation();
2363
2364 assert_eq!(config.shuffle_direction, ShuffleDirection::FivePrime);
2365 assert!(config.cross_boundaries);
2366 assert!(!config.should_reject_ref_mismatch());
2368 assert!(!config.should_warn_ref_mismatch());
2369 }
2370
2371 #[test]
2372 fn test_duplication_3prime_shift_two_bases() {
2373 let provider = MockProvider::with_test_data();
2383 let normalizer = Normalizer::new(provider);
2384
2385 let variant = parse_hgvs("NM_888888.1:c.8dup").unwrap();
2386 let result = normalizer.normalize(&variant).unwrap();
2387 let output = format!("{}", result);
2388
2389 assert!(
2391 output.contains("dup"),
2392 "Single-base duplication should remain as dup notation, got: {}",
2393 output
2394 );
2395 }
2396
2397 #[test]
2398 fn test_duplication_3prime_shift_three_bases() {
2399 let provider = MockProvider::with_test_data();
2407 let normalizer = Normalizer::new(provider);
2408
2409 let variant = parse_hgvs("NM_888888.1:c.21dup").unwrap();
2410 let result = normalizer.normalize(&variant).unwrap();
2411 let output = format!("{}", result);
2412
2413 assert!(
2415 output.contains("dup"),
2416 "Single-base duplication should remain as dup notation, got: {}",
2417 output
2418 );
2419 }
2420
2421 #[test]
2422 fn test_duplication_no_shift_when_unique() {
2423 let provider = MockProvider::with_test_data();
2430 let normalizer = Normalizer::new(provider);
2431
2432 let variant = parse_hgvs("NM_888888.1:c.7dup").unwrap();
2433 let result = normalizer.normalize(&variant).unwrap();
2434 let output = format!("{}", result);
2435
2436 assert!(
2437 output.contains("dup"),
2438 "Should remain a duplication, got: {}",
2439 output
2440 );
2441 assert!(
2442 output.contains("c.7dup"),
2443 "Duplication of unique G at c.7 should not shift, got: {}",
2444 output
2445 );
2446 }
2447
2448 fn make_boundary_test_provider() -> MockProvider {
2485 use crate::reference::transcript::{Exon, ManeStatus, Strand, Transcript};
2486 use std::sync::OnceLock;
2487
2488 let mut provider = MockProvider::new();
2489
2490 let tx_seq = "ATGCCCAAAGGGTTTAGGCCAAAGGGTTTAGGCCCAAAAAGGGTTTAGGCCCAAATGA";
2496
2497 let mut genomic_seq = String::new();
2505
2506 for _ in 0..1000 {
2508 genomic_seq.push('N');
2509 }
2510
2511 genomic_seq.push_str("ATGCCCAAAGGGTTTAGGCC");
2513
2514 genomic_seq.push_str("GTAAGCTAAA");
2517
2518 genomic_seq.push_str("AAAGGGTTTAGGCCCAAAAA");
2520
2521 genomic_seq.push_str("AAAGTAAGGG");
2523
2524 genomic_seq.push_str("GGGTTTAGGCCCAAATGA");
2526
2527 for _ in 0..100 {
2529 genomic_seq.push('N');
2530 }
2531
2532 provider.add_genomic_sequence("chr1", genomic_seq);
2534
2535 provider.add_transcript(Transcript {
2537 id: "NM_BOUNDARY.1".to_string(),
2538 gene_symbol: Some("BOUNDARY".to_string()),
2539 strand: Strand::Plus,
2540 sequence: tx_seq.to_string(),
2541 cds_start: Some(1),
2542 cds_end: Some(58),
2543 exons: vec![
2544 Exon::with_genomic(1, 1, 20, 1000, 1019),
2545 Exon::with_genomic(2, 21, 40, 1030, 1049),
2546 Exon::with_genomic(3, 41, 58, 1060, 1077),
2547 ],
2548 chromosome: Some("chr1".to_string()),
2549 genomic_start: Some(1000),
2550 genomic_end: Some(1077),
2551 genome_build: Default::default(),
2552 mane_status: ManeStatus::None,
2553 refseq_match: None,
2554 ensembl_match: None,
2555 cached_introns: OnceLock::new(),
2556 });
2557
2558 provider
2559 }
2560
2561 #[test]
2562 fn test_boundary_spanning_exonic_to_intronic_del() {
2563 let provider = make_boundary_test_provider();
2570 let normalizer = Normalizer::new(provider);
2571
2572 let variant = parse_hgvs("NM_BOUNDARY.1:c.20_20+3del").unwrap();
2573 let result = normalizer.normalize(&variant);
2574
2575 assert!(
2576 result.is_ok(),
2577 "Boundary-spanning deletion should normalize, got error: {:?}",
2578 result.err()
2579 );
2580
2581 let output = format!("{}", result.unwrap());
2582 assert!(
2583 output.contains("del"),
2584 "Should remain a deletion, got: {}",
2585 output
2586 );
2587 }
2588
2589 #[test]
2590 fn test_boundary_spanning_intronic_to_exonic_del() {
2591 let provider = make_boundary_test_provider();
2599 let normalizer = Normalizer::new(provider);
2600
2601 let variant = parse_hgvs("NM_BOUNDARY.1:c.21-3_23del").unwrap();
2602 let result = normalizer.normalize(&variant);
2603
2604 assert!(
2605 result.is_ok(),
2606 "Boundary-spanning deletion should normalize, got error: {:?}",
2607 result.err()
2608 );
2609
2610 let output = format!("{}", result.unwrap());
2611 assert!(
2612 output.contains("del"),
2613 "Should remain a deletion, got: {}",
2614 output
2615 );
2616 }
2617
2618 #[test]
2619 fn test_boundary_spanning_same_base_position() {
2620 let provider = make_boundary_test_provider();
2623 let normalizer = Normalizer::new(provider);
2624
2625 let variant = parse_hgvs("NM_BOUNDARY.1:c.20_20+5del").unwrap();
2626 let result = normalizer.normalize(&variant);
2627
2628 assert!(
2629 result.is_ok(),
2630 "Same-base boundary-spanning deletion should normalize, got error: {:?}",
2631 result.err()
2632 );
2633 }
2634
2635 #[test]
2636 fn test_boundary_splice_site_plus1() {
2637 let provider = make_boundary_test_provider();
2640 let normalizer = Normalizer::new(provider);
2641
2642 let variant = parse_hgvs("NM_BOUNDARY.1:c.20_20+1del").unwrap();
2643 let result = normalizer.normalize(&variant);
2644
2645 assert!(
2646 result.is_ok(),
2647 "Splice site +1 deletion should normalize, got error: {:?}",
2648 result.err()
2649 );
2650 }
2651
2652 #[test]
2653 fn test_boundary_splice_site_minus1() {
2654 let provider = make_boundary_test_provider();
2658 let normalizer = Normalizer::new(provider);
2659
2660 let variant = parse_hgvs("NM_BOUNDARY.1:c.21-1_22del").unwrap();
2661 let result = normalizer.normalize(&variant);
2662
2663 assert!(
2664 result.is_ok(),
2665 "Splice site -1 deletion should normalize, got error: {:?}",
2666 result.err()
2667 );
2668 }
2669
2670 #[test]
2671 fn test_boundary_spanning_dup() {
2672 let provider = make_boundary_test_provider();
2678 let normalizer = Normalizer::new(provider);
2679
2680 let variant = parse_hgvs("NM_BOUNDARY.1:c.40_40+3dup").unwrap();
2681 let result = normalizer.normalize(&variant);
2682
2683 assert!(
2684 result.is_ok(),
2685 "Boundary-spanning duplication should normalize, got error: {:?}",
2686 result.err()
2687 );
2688
2689 let output = format!("{}", result.unwrap());
2690 assert!(
2692 output.contains("dup") || output.contains("["),
2693 "Should remain a duplication or become repeat notation, got: {}",
2694 output
2695 );
2696 }
2697
2698 #[test]
2699 fn test_boundary_spanning_delins() {
2700 let provider = make_boundary_test_provider();
2702 let normalizer = Normalizer::new(provider);
2703
2704 let variant = parse_hgvs("NM_BOUNDARY.1:c.20_20+2delinsTTT").unwrap();
2705 let result = normalizer.normalize(&variant);
2706
2707 assert!(
2708 result.is_ok(),
2709 "Boundary-spanning delins should normalize, got error: {:?}",
2710 result.err()
2711 );
2712
2713 let output = format!("{}", result.unwrap());
2714 assert!(
2715 output.contains("delins") || output.contains(">"),
2716 "Should remain a delins or become substitution, got: {}",
2717 output
2718 );
2719 }
2720
2721 #[test]
2722 fn test_boundary_no_genomic_data_returns_error() {
2723 let provider = MockProvider::with_test_data(); let normalizer = Normalizer::new(provider);
2726
2727 let variant = parse_hgvs("NM_001234.1:c.11_11+3del").unwrap();
2729 let result = normalizer.normalize(&variant);
2730
2731 assert!(
2732 result.is_err(),
2733 "Boundary-spanning without genomic data should return error"
2734 );
2735 }
2736
2737 #[test]
2738 fn test_deletion_3prime_shift_consecutive_bases() {
2739 use crate::reference::transcript::{Exon, ManeStatus, Strand};
2742 use std::sync::OnceLock;
2743
2744 let mut provider = MockProvider::new();
2745
2746 let mut seq = String::new();
2751 for _ in 0..516 {
2752 seq.push('G'); }
2754 seq.push('A'); seq.push('A'); for _ in 519..=600 {
2757 seq.push('G'); }
2759
2760 provider.add_transcript(crate::reference::transcript::Transcript {
2761 id: "NM_777777.1".to_string(),
2762 gene_symbol: Some("SHIFTTEST".to_string()),
2763 strand: Strand::Plus,
2764 sequence: seq.clone(),
2765 cds_start: Some(1),
2766 cds_end: Some(600),
2767 exons: vec![Exon::new(1, 1, 600)], chromosome: None,
2769 genomic_start: None,
2770 genomic_end: None,
2771 genome_build: Default::default(),
2772 mane_status: ManeStatus::None,
2773 refseq_match: None,
2774 ensembl_match: None,
2775 cached_introns: OnceLock::new(),
2776 });
2777
2778 let normalizer = Normalizer::new(provider);
2779
2780 let variant = parse_hgvs("NM_777777.1:c.517delA").unwrap();
2782
2783 println!("Sequence length: {}", seq.len());
2785 println!(
2786 "Position 516 (0-based 515): {}",
2787 seq.chars().nth(515).unwrap()
2788 );
2789 println!(
2790 "Position 517 (0-based 516): {}",
2791 seq.chars().nth(516).unwrap()
2792 );
2793 println!(
2794 "Position 518 (0-based 517): {}",
2795 seq.chars().nth(517).unwrap()
2796 );
2797 println!(
2798 "Position 519 (0-based 518): {}",
2799 seq.chars().nth(518).unwrap()
2800 );
2801
2802 let result = normalizer.normalize(&variant).unwrap();
2803 let output = format!("{}", result);
2804
2805 println!("Input: NM_777777.1:c.517delA");
2806 println!("Output: {}", output);
2807
2808 assert!(
2811 output.contains("c.518del"),
2812 "Deletion at c.517 should shift to c.518 (3' rule), got: {}",
2813 output
2814 );
2815 }
2816
2817 #[test]
2818 fn test_deletion_3prime_shift_with_utr() {
2819 use crate::reference::transcript::{Exon, ManeStatus, Strand};
2822 use std::sync::OnceLock;
2823
2824 let mut provider = MockProvider::new();
2825
2826 let utr_len = 100;
2832 let mut seq = String::new();
2833
2834 for _ in 0..utr_len {
2836 seq.push('T');
2837 }
2838 for _ in 0..516 {
2840 seq.push('G');
2841 }
2842 seq.push('A'); seq.push('A'); for _ in 0..100 {
2845 seq.push('G');
2846 }
2847
2848 let seq_len = seq.len();
2849 println!("Test with UTR:");
2850 println!(" Sequence length: {}", seq_len);
2851 println!(" CDS start (1-based): 101");
2852 println!(" c.517 = tx position 617 (0-based 616)");
2853 println!(" c.518 = tx position 618 (0-based 617)");
2854 println!(
2855 " tx pos 617 (0-based 616): {}",
2856 seq.chars().nth(616).unwrap()
2857 );
2858 println!(
2859 " tx pos 618 (0-based 617): {}",
2860 seq.chars().nth(617).unwrap()
2861 );
2862
2863 provider.add_transcript(crate::reference::transcript::Transcript {
2864 id: "NM_666666.1".to_string(),
2865 gene_symbol: Some("UTRTEST".to_string()),
2866 strand: Strand::Plus,
2867 sequence: seq.clone(),
2868 cds_start: Some(101),
2869 cds_end: Some(seq_len as u64),
2870 exons: vec![Exon::new(1, 1, seq_len as u64)], chromosome: None,
2872 genomic_start: None,
2873 genomic_end: None,
2874 genome_build: Default::default(),
2875 mane_status: ManeStatus::None,
2876 refseq_match: None,
2877 ensembl_match: None,
2878 cached_introns: OnceLock::new(),
2879 });
2880
2881 let normalizer = Normalizer::new(provider);
2882
2883 let variant = parse_hgvs("NM_666666.1:c.517delA").unwrap();
2884 let result = normalizer.normalize(&variant).unwrap();
2885 let output = format!("{}", result);
2886
2887 println!("Input: NM_666666.1:c.517delA");
2888 println!("Output: {}", output);
2889
2890 assert!(
2891 output.contains("c.518del"),
2892 "Deletion at c.517 should shift to c.518 (3' rule) even with UTR, got: {}",
2893 output
2894 );
2895 }
2896
2897 #[test]
2898 fn test_delins_should_not_shift() {
2899 use crate::reference::transcript::{Exon, ManeStatus, Strand};
2902 use std::sync::OnceLock;
2903
2904 let mut provider = MockProvider::new();
2905
2906 let seq = "GGGGGGGGGGAATTCCGGGGGGGGGG".to_string(); provider.add_transcript(crate::reference::transcript::Transcript {
2912 id: "NM_555555.1".to_string(),
2913 gene_symbol: Some("DELINSTEST".to_string()),
2914 strand: Strand::Plus,
2915 sequence: seq,
2916 cds_start: Some(1),
2917 cds_end: Some(26),
2918 exons: vec![Exon::new(1, 1, 26)],
2919 chromosome: None,
2920 genomic_start: None,
2921 genomic_end: None,
2922 genome_build: Default::default(),
2923 mane_status: ManeStatus::None,
2924 refseq_match: None,
2925 ensembl_match: None,
2926 cached_introns: OnceLock::new(),
2927 });
2928
2929 let normalizer = Normalizer::new(provider);
2930
2931 let variant = parse_hgvs("NM_555555.1:c.10_11delinsTT").unwrap();
2933 let result = normalizer.normalize(&variant).unwrap();
2934 let output = format!("{}", result);
2935
2936 assert!(
2937 output.contains("c.10_11delins"),
2938 "Delins should NOT be shifted (HGVS spec), got: {}",
2939 output
2940 );
2941 }
2942}