1pub mod boundary;
27pub mod config;
28pub(crate) mod merge;
29pub mod rules;
30pub mod shuffle;
31pub mod validate;
32
33use crate::coords::{hgvs_pos_to_index, index_to_hgvs_pos};
34use crate::error::FerroError;
35use crate::hgvs::edit::NaEdit;
36use crate::hgvs::interval::Interval;
37use crate::hgvs::location::{CdsPos, GenomePos, TxPos};
38use crate::hgvs::parser::position::{OFFSET_UNKNOWN_NEGATIVE, OFFSET_UNKNOWN_POSITIVE};
39use crate::hgvs::variant::{CdsVariant, GenomeVariant, HgvsVariant, LocEdit, TxVariant};
40use crate::hgvs::HgvsVariant as HV;
41use crate::reference::transcript::Strand;
42use crate::reference::ReferenceProvider;
43use boundary::Boundaries;
44pub use config::{NormalizeConfig, ShuffleDirection};
45use rules::{canonicalize_edit, needs_normalization, should_canonicalize};
46use shuffle::shuffle;
47
48fn has_unknown_offset_cds(pos: &CdsPos) -> bool {
50 matches!(
51 pos.offset,
52 Some(OFFSET_UNKNOWN_POSITIVE) | Some(OFFSET_UNKNOWN_NEGATIVE)
53 )
54}
55
56fn has_unknown_offset_tx(pos: &TxPos) -> bool {
58 matches!(
59 pos.offset,
60 Some(OFFSET_UNKNOWN_POSITIVE) | Some(OFFSET_UNKNOWN_NEGATIVE)
61 )
62}
63
64#[derive(Debug, Clone)]
66pub struct NormalizationWarning {
67 pub code: String,
69 pub message: String,
71 pub stated_ref: String,
73 pub actual_ref: String,
75 pub position: String,
77 pub corrected: bool,
79}
80
81#[derive(Debug, Clone)]
83pub struct NormalizeResultWithWarnings {
84 pub result: HgvsVariant,
86 pub warnings: Vec<NormalizationWarning>,
88}
89
90impl NormalizeResultWithWarnings {
91 pub fn new(result: HgvsVariant) -> Self {
93 Self {
94 result,
95 warnings: vec![],
96 }
97 }
98
99 pub fn with_warnings(result: HgvsVariant, warnings: Vec<NormalizationWarning>) -> Self {
101 Self { result, warnings }
102 }
103
104 pub fn add_warning(&mut self, warning: NormalizationWarning) {
106 self.warnings.push(warning);
107 }
108
109 pub fn has_warnings(&self) -> bool {
111 !self.warnings.is_empty()
112 }
113
114 pub fn has_ref_mismatch(&self) -> bool {
116 self.warnings.iter().any(|w| w.code == "REFSEQ_MISMATCH")
117 }
118}
119
120pub struct Normalizer<P: ReferenceProvider> {
122 provider: P,
123 config: NormalizeConfig,
124}
125
126impl<P: ReferenceProvider> Normalizer<P> {
127 pub fn new(provider: P) -> Self {
129 Self {
130 provider,
131 config: NormalizeConfig::default(),
132 }
133 }
134
135 pub fn with_config(provider: P, config: NormalizeConfig) -> Self {
137 Self { provider, config }
138 }
139
140 pub fn config(&self) -> &NormalizeConfig {
142 &self.config
143 }
144
145 pub fn normalize(&self, variant: &HgvsVariant) -> Result<HgvsVariant, FerroError> {
150 let result = self.normalize_with_warnings(variant)?;
151
152 if self.config.should_reject_ref_mismatch() && result.has_ref_mismatch() {
154 if let Some(warning) = result.warnings.iter().find(|w| w.code == "REFSEQ_MISMATCH") {
155 return Err(FerroError::ReferenceMismatch {
156 location: warning.position.clone(),
157 expected: warning.stated_ref.clone(),
158 found: warning.actual_ref.clone(),
159 });
160 }
161 }
162
163 Ok(result.result)
164 }
165
166 pub fn normalize_with_warnings(
172 &self,
173 variant: &HgvsVariant,
174 ) -> Result<NormalizeResultWithWarnings, FerroError> {
175 let (result, warnings) = match variant {
176 HV::Genome(v) => self.normalize_genome(v)?,
177 HV::Cds(v) => self.normalize_cds(v)?,
178 HV::Tx(v) => self.normalize_tx(v)?,
179 HV::Protein(v) => self.normalize_protein(v)?,
180 HV::Rna(v) => self.normalize_rna(v)?,
181 HV::Mt(v) => self.normalize_mt(v)?,
182 HV::Allele(a) => self.normalize_allele(a)?,
183 HV::Circular(v) => (
185 HV::Circular(crate::hgvs::variant::CircularVariant {
186 accession: v.accession.clone(),
187 gene_symbol: v.gene_symbol.clone(),
188 loc_edit: v.loc_edit.clone(),
189 }),
190 vec![],
191 ),
192 HV::RnaFusion(v) => (HV::RnaFusion(v.clone()), vec![]),
194 HV::NullAllele => (HV::NullAllele, vec![]),
196 HV::UnknownAllele => (HV::UnknownAllele, vec![]),
197 };
198
199 Ok(NormalizeResultWithWarnings::with_warnings(result, warnings))
200 }
201
202 fn normalize_allele(
208 &self,
209 allele: &crate::hgvs::variant::AlleleVariant,
210 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
211 let mut all_warnings = Vec::new();
213 let mut normalized = Vec::new();
214
215 for v in &allele.variants {
216 let result = self.normalize_with_warnings(v)?;
217 all_warnings.extend(result.warnings);
218 normalized.push(result.result);
219 }
220
221 if self.config.prevent_overlap {
223 normalized = self.resolve_overlaps(normalized)?;
224 }
225
226 let original_len = normalized.len();
232 let mut merged = merge::merge_consecutive_edits(normalized, allele.phase, &self.provider);
233
234 let result = if allele.phase == crate::hgvs::variant::AllelePhase::Cis
235 && original_len > 1
236 && merged.len() == 1
237 {
238 merged.pop().unwrap()
239 } else {
240 HgvsVariant::Allele(crate::hgvs::variant::AlleleVariant::new(
241 merged,
242 allele.phase,
243 ))
244 };
245
246 Ok((result, all_warnings))
247 }
248
249 fn resolve_overlaps(&self, variants: Vec<HgvsVariant>) -> Result<Vec<HgvsVariant>, FerroError> {
254 if variants.len() < 2 {
255 return Ok(variants);
256 }
257
258 let mut positions: Vec<(usize, Option<(u64, u64)>)> = variants
260 .iter()
261 .enumerate()
262 .map(|(i, v)| (i, self.extract_position_range(v)))
263 .collect();
264
265 positions.sort_by(|a, b| match (&a.1, &b.1) {
267 (Some((s1, _)), Some((s2, _))) => s1.cmp(s2),
268 (Some(_), None) => std::cmp::Ordering::Less,
269 (None, Some(_)) => std::cmp::Ordering::Greater,
270 (None, None) => std::cmp::Ordering::Equal,
271 });
272
273 let mut needs_adjustment = vec![false; variants.len()];
275
276 for i in 0..positions.len() - 1 {
277 let (_idx_a, pos_a) = &positions[i];
278 let (idx_b, pos_b) = &positions[i + 1];
279
280 if let (Some((_, end_a)), Some((start_b, _))) = (pos_a, pos_b) {
281 if end_a >= start_b {
283 needs_adjustment[*idx_b] = true;
285 }
286 }
287 }
288
289 for (i, needs_adj) in needs_adjustment.iter().enumerate() {
293 if *needs_adj {
294 let _ = i; }
299 }
300
301 Ok(variants)
302 }
303
304 fn extract_position_range(&self, variant: &HgvsVariant) -> Option<(u64, u64)> {
306 match variant {
307 HV::Genome(v) => {
308 let start = v.loc_edit.location.start.inner()?.base;
309 let end = v.loc_edit.location.end.inner()?.base;
310 Some((start, end))
311 }
312 HV::Cds(v) => {
313 let start_pos = v.loc_edit.location.start.inner()?;
314 let end_pos = v.loc_edit.location.end.inner()?;
315 if start_pos.is_intronic()
317 || end_pos.is_intronic()
318 || start_pos.base < 1
319 || end_pos.base < 1
320 {
321 None
322 } else {
323 Some((start_pos.base as u64, end_pos.base as u64))
324 }
325 }
326 HV::Tx(v) => {
327 let start = v.loc_edit.location.start.inner()?.base;
328 let end = v.loc_edit.location.end.inner()?.base;
329 if start < 1 || end < 1 {
331 None
332 } else {
333 Some((start as u64, end as u64))
334 }
335 }
336 _ => None,
337 }
338 }
339
340 fn normalize_genome(
342 &self,
343 variant: &GenomeVariant,
344 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
345 let edit = match variant.loc_edit.edit.inner() {
347 Some(e) => e,
348 None => return Ok((HV::Genome(variant.clone()), vec![])),
349 };
350
351 if !needs_normalization(edit) {
353 return Ok((HV::Genome(variant.clone()), vec![]));
354 }
355
356 let accession = variant.accession.transcript_accession();
358 let start = match variant.loc_edit.location.start.inner() {
359 Some(pos) => pos.base,
360 None => {
361 return Ok((
362 HV::Genome(self.canonicalize_genome_variant(variant)),
363 vec![],
364 ))
365 }
366 };
367 let end = match variant.loc_edit.location.end.inner() {
368 Some(pos) => pos.base,
369 None => {
370 return Ok((
371 HV::Genome(self.canonicalize_genome_variant(variant)),
372 vec![],
373 ))
374 }
375 };
376
377 let window_start = start.saturating_sub(self.config.window_size);
379 let seq_result = self.provider.get_sequence(
380 &accession,
381 window_start,
382 end.saturating_add(self.config.window_size),
383 );
384
385 let ref_seq = match seq_result {
386 Ok(s) => s,
387 Err(_) => {
389 return Ok((
390 HV::Genome(self.canonicalize_genome_variant(variant)),
391 vec![],
392 ))
393 }
394 };
395
396 let rel_start = start - window_start;
398 let rel_end = end - window_start;
399
400 let (new_rel_start, new_rel_end, new_edit, warnings) = self.normalize_na_edit(
402 ref_seq.as_bytes(),
403 edit,
404 rel_start,
405 rel_end,
406 &Boundaries::new(1, ref_seq.len() as u64),
407 )?;
408
409 let new_start = new_rel_start + window_start;
411 let new_end = new_rel_end + window_start;
412
413 let new_variant = GenomeVariant {
415 accession: variant.accession.clone(),
416 gene_symbol: variant.gene_symbol.clone(),
417 loc_edit: LocEdit::new(
418 Interval::new(GenomePos::new(new_start), GenomePos::new(new_end)),
419 new_edit,
420 ),
421 };
422
423 Ok((HV::Genome(new_variant), warnings))
424 }
425
426 fn normalize_cds(
428 &self,
429 variant: &CdsVariant,
430 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
431 let edit = match variant.loc_edit.edit.inner() {
433 Some(e) => e,
434 None => return Ok((HV::Cds(variant.clone()), vec![])),
435 };
436
437 if !needs_normalization(edit) {
439 return Ok((HV::Cds(variant.clone()), vec![]));
440 }
441
442 let start_pos = match variant.loc_edit.location.start.inner() {
444 Some(pos) => pos,
445 None => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
446 };
447 let end_pos = match variant.loc_edit.location.end.inner() {
448 Some(pos) => pos,
449 None => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
450 };
451
452 if has_unknown_offset_cds(start_pos) || has_unknown_offset_cds(end_pos) {
454 return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![]));
455 }
456
457 let accession = variant.accession.transcript_accession();
459 let transcript = match self.provider.get_transcript(&accession) {
460 Ok(t) => t,
461 Err(_) => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
463 };
464
465 if start_pos.is_intronic() || end_pos.is_intronic() {
467 if start_pos.is_intronic() && end_pos.is_intronic() {
469 return self.normalize_intronic_cds(variant, &transcript, start_pos, end_pos, edit);
470 }
471 return self.normalize_boundary_spanning_cds(
473 variant,
474 &transcript,
475 start_pos,
476 end_pos,
477 edit,
478 );
479 }
480
481 let cds_start = match transcript.cds_start {
483 Some(s) => s,
484 None => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
485 };
486
487 let tx_start = match self.cds_to_tx_pos(start_pos, cds_start, transcript.cds_end) {
489 Ok(v) => v,
490 Err(_) => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
491 };
492 let tx_end = match self.cds_to_tx_pos(end_pos, cds_start, transcript.cds_end) {
493 Ok(v) => v,
494 Err(_) => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
495 };
496
497 let boundaries = if self.config.cross_boundaries {
499 Boundaries::new(1, transcript.sequence.len() as u64)
500 } else {
501 match boundary::get_cds_boundaries(&transcript, tx_start, &self.config) {
502 Ok(b) => b,
503 Err(_) => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
504 }
505 };
506
507 let seq = transcript.sequence.as_bytes();
509 let (new_tx_start, new_tx_end, new_edit, warnings) =
510 self.normalize_na_edit(seq, edit, tx_start, tx_end, &boundaries)?;
511
512 let new_start = self.tx_to_cds_pos(new_tx_start, cds_start, transcript.cds_end)?;
514 let new_end = self.tx_to_cds_pos(new_tx_end, cds_start, transcript.cds_end)?;
515
516 let new_variant = CdsVariant {
517 accession: variant.accession.clone(),
518 gene_symbol: variant.gene_symbol.clone(),
519 loc_edit: LocEdit::new(Interval::new(new_start, new_end), new_edit),
520 };
521
522 Ok((HV::Cds(new_variant), warnings))
523 }
524
525 fn normalize_tx(
527 &self,
528 variant: &TxVariant,
529 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
530 let edit = match variant.loc_edit.edit.inner() {
532 Some(e) => e,
533 None => return Ok((HV::Tx(variant.clone()), vec![])),
534 };
535
536 if !needs_normalization(edit) {
538 return Ok((HV::Tx(variant.clone()), vec![]));
539 }
540
541 let start_pos = match variant.loc_edit.location.start.inner() {
543 Some(pos) => pos,
544 None => return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![])),
545 };
546 let end_pos = match variant.loc_edit.location.end.inner() {
547 Some(pos) => pos,
548 None => return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![])),
549 };
550
551 if has_unknown_offset_tx(start_pos) || has_unknown_offset_tx(end_pos) {
553 return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![]));
554 }
555
556 let accession = variant.accession.transcript_accession();
558 let transcript = match self.provider.get_transcript(&accession) {
559 Ok(t) => t,
560 Err(_) => return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![])),
562 };
563
564 if start_pos.is_intronic() || end_pos.is_intronic() {
565 if start_pos.is_intronic() && end_pos.is_intronic() {
567 return self.normalize_intronic_tx(variant, &transcript, start_pos, end_pos, edit);
568 }
569 return Err(FerroError::IntronicVariant {
571 variant: format!("{}", variant),
572 });
573 }
574
575 if start_pos.base < 1 || end_pos.base < 1 {
578 return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![]));
579 }
580
581 let tx_start = start_pos.base as u64;
582 let tx_end = end_pos.base as u64;
583
584 let boundaries = Boundaries::new(1, transcript.sequence.len() as u64);
586
587 let seq = transcript.sequence.as_bytes();
589 let (new_start, new_end, new_edit, warnings) =
590 self.normalize_na_edit(seq, edit, tx_start, tx_end, &boundaries)?;
591
592 let new_variant = TxVariant {
593 accession: variant.accession.clone(),
594 gene_symbol: variant.gene_symbol.clone(),
595 loc_edit: LocEdit::new(
596 Interval::new(TxPos::new(new_start as i64), TxPos::new(new_end as i64)),
597 new_edit,
598 ),
599 };
600
601 Ok((HV::Tx(new_variant), warnings))
602 }
603
604 fn normalize_protein(
618 &self,
619 variant: &crate::hgvs::variant::ProteinVariant,
620 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
621 use crate::hgvs::edit::ProteinEdit;
622 use crate::hgvs::variant::{LocEdit, ProteinVariant};
623
624 if self.provider.has_protein_data() {
626 self.validate_protein_reference(variant)?;
627 }
628
629 let edit = match variant.loc_edit.edit.inner() {
631 Some(e) => e,
632 None => return Ok((HV::Protein(variant.clone()), vec![])),
633 };
634
635 let normalized_edit = match edit {
637 ProteinEdit::Deletion { sequence, count } => {
638 if let Some(seq) = sequence {
640 if self.is_redundant_protein_deletion_sequence(&variant.loc_edit.location, seq)
641 {
642 ProteinEdit::Deletion {
644 sequence: None,
645 count: *count,
646 }
647 } else {
648 edit.clone()
649 }
650 } else {
651 edit.clone()
652 }
653 }
654 _ => edit.clone(),
656 };
657
658 if &normalized_edit != edit {
660 let new_variant = ProteinVariant {
661 accession: variant.accession.clone(),
662 gene_symbol: variant.gene_symbol.clone(),
663 loc_edit: LocEdit::with_uncertainty(
664 variant.loc_edit.location.clone(),
665 variant.loc_edit.edit.map_ref(|_| normalized_edit),
666 ),
667 };
668 Ok((HV::Protein(new_variant), vec![]))
669 } else {
670 Ok((HV::Protein(variant.clone()), vec![]))
671 }
672 }
673
674 fn validate_protein_reference(
679 &self,
680 variant: &crate::hgvs::variant::ProteinVariant,
681 ) -> Result<(), FerroError> {
682 use crate::hgvs::edit::ProteinEdit;
683
684 let accession = variant.accession.transcript_accession();
685
686 let start_pos = match variant.loc_edit.location.start.inner() {
688 Some(pos) => pos,
689 None => return Ok(()), };
691 let end_pos = match variant.loc_edit.location.end.inner() {
692 Some(pos) => pos,
693 None => return Ok(()),
694 };
695
696 let edit = match variant.loc_edit.edit.inner() {
698 Some(e) => e,
699 None => return Ok(()),
700 };
701
702 match edit {
704 ProteinEdit::Substitution { .. } => {
705 self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
708 }
709 ProteinEdit::Deletion { .. } => {
710 self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
712 if end_pos.number != start_pos.number {
714 self.validate_protein_position(&accession, end_pos.number, &end_pos.aa)?;
715 }
716 }
717 ProteinEdit::Duplication => {
718 self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
720 if end_pos.number != start_pos.number {
721 self.validate_protein_position(&accession, end_pos.number, &end_pos.aa)?;
722 }
723 }
724 ProteinEdit::Insertion { .. } => {
725 self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
727 self.validate_protein_position(&accession, end_pos.number, &end_pos.aa)?;
728 }
729 ProteinEdit::Delins { .. } => {
730 self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
732 if end_pos.number != start_pos.number {
733 self.validate_protein_position(&accession, end_pos.number, &end_pos.aa)?;
734 }
735 }
736 ProteinEdit::Frameshift { .. } => {
737 self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
739 }
740 ProteinEdit::Extension { .. } => {
741 self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
743 }
744 _ => {
745 }
747 }
748
749 Ok(())
750 }
751
752 fn validate_protein_position(
754 &self,
755 accession: &str,
756 position: u64,
757 expected_aa: &crate::hgvs::location::AminoAcid,
758 ) -> Result<(), FerroError> {
759 let start = hgvs_pos_to_index(position) as u64;
762 let end = position; match self.provider.get_protein_sequence(accession, start, end) {
766 Ok(ref_seq) => {
767 if ref_seq.len() != 1 {
768 return Ok(()); }
770
771 let ref_aa_char = ref_seq.chars().next().unwrap();
772 let expected_char = expected_aa.to_one_letter();
773
774 if ref_aa_char != expected_char {
775 return Err(FerroError::AminoAcidMismatch {
776 accession: accession.to_string(),
777 position,
778 expected: expected_aa.to_string(),
779 found: ref_aa_char.to_string(),
780 });
781 }
782 }
783 Err(_) => {
784 }
786 }
787
788 Ok(())
789 }
790
791 fn is_redundant_protein_deletion_sequence(
798 &self,
799 interval: &crate::hgvs::interval::ProtInterval,
800 sequence: &crate::hgvs::edit::AminoAcidSeq,
801 ) -> bool {
802 let start_pos = match interval.start.inner() {
804 Some(pos) => pos,
805 None => return false,
806 };
807 let end_pos = match interval.end.inner() {
808 Some(pos) => pos,
809 None => return false,
810 };
811
812 let interval_len = if end_pos.number >= start_pos.number {
814 (end_pos.number - start_pos.number + 1) as usize
815 } else {
816 return false;
817 };
818
819 if sequence.len() != interval_len {
821 return false;
822 }
823
824 if interval_len == 1 {
826 return sequence.0.len() == 1 && sequence.0[0] == start_pos.aa;
827 }
828
829 if let (Some(first), Some(last)) = (sequence.0.first(), sequence.0.last()) {
832 return *first == start_pos.aa && *last == end_pos.aa;
833 }
834
835 false
836 }
837
838 fn normalize_rna(
844 &self,
845 variant: &crate::hgvs::variant::RnaVariant,
846 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
847 use crate::hgvs::interval::RnaInterval;
848 use crate::hgvs::location::RnaPos;
849 use crate::hgvs::variant::{LocEdit, RnaVariant};
850
851 let edit = match variant.loc_edit.edit.inner() {
853 Some(e) => e,
854 None => return Ok((HV::Rna(variant.clone()), vec![])),
855 };
856
857 if !needs_normalization(edit) {
859 return Ok((HV::Rna(variant.clone()), vec![]));
860 }
861
862 let start_pos = match variant.loc_edit.location.start.inner() {
864 Some(pos) => pos,
865 None => return Ok((HV::Rna(variant.clone()), vec![])),
866 };
867 let end_pos = match variant.loc_edit.location.end.inner() {
868 Some(pos) => pos,
869 None => return Ok((HV::Rna(variant.clone()), vec![])),
870 };
871
872 if start_pos.is_intronic() || end_pos.is_intronic() {
873 return Err(FerroError::IntronicVariant {
874 variant: format!("{}", variant),
875 });
876 }
877
878 let accession = variant.accession.transcript_accession();
880 let transcript = match self.provider.get_transcript(&accession) {
881 Ok(t) => t,
882 Err(_) => return Ok((HV::Rna(variant.clone()), vec![])),
883 };
884
885 if start_pos.base < 1 || end_pos.base < 1 {
888 return Ok((HV::Rna(variant.clone()), vec![]));
889 }
890
891 let rna_start = start_pos.base as u64;
892 let rna_end = end_pos.base as u64;
893
894 let boundaries = Boundaries::new(1, transcript.sequence.len() as u64);
896
897 let seq = transcript.sequence.as_bytes();
899 let (new_start, new_end, new_edit, warnings) =
900 self.normalize_na_edit(seq, edit, rna_start, rna_end, &boundaries)?;
901
902 let new_variant = RnaVariant {
903 accession: variant.accession.clone(),
904 gene_symbol: variant.gene_symbol.clone(),
905 loc_edit: LocEdit::new(
906 RnaInterval::new(RnaPos::new(new_start as i64), RnaPos::new(new_end as i64)),
907 new_edit,
908 ),
909 };
910
911 Ok((HV::Rna(new_variant), warnings))
912 }
913
914 fn normalize_mt(
916 &self,
917 variant: &crate::hgvs::variant::MtVariant,
918 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
919 Ok((HV::Mt(variant.clone()), vec![]))
922 }
923
924 fn normalize_intronic_cds(
929 &self,
930 variant: &CdsVariant,
931 transcript: &crate::reference::transcript::Transcript,
932 start_pos: &CdsPos,
933 end_pos: &CdsPos,
934 edit: &NaEdit,
935 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
936 use crate::convert::CoordinateMapper;
937
938 if !self.provider.has_genomic_data() {
940 return Err(FerroError::IntronicVariant {
941 variant: format!("{}", variant),
942 });
943 }
944
945 let chromosome =
947 transcript
948 .chromosome
949 .as_ref()
950 .ok_or_else(|| FerroError::ConversionError {
951 msg: "Transcript has no chromosome mapping for intronic normalization"
952 .to_string(),
953 })?;
954
955 let mapper = CoordinateMapper::new(transcript);
957
958 let g_start = mapper.cds_to_genomic_with_intron(start_pos)?;
960 let g_end = mapper.cds_to_genomic_with_intron(end_pos)?;
961
962 let swapped = g_start > g_end;
965 let (g_start, g_end) = if swapped {
966 (g_end, g_start)
967 } else {
968 (g_start, g_end)
969 };
970
971 let window = self.config.window_size;
974 let seq_start = g_start.saturating_sub(window);
975 let seq_end = g_end.saturating_add(window);
976
977 let genomic_seq = self
979 .provider
980 .get_genomic_sequence(chromosome, seq_start, seq_end)?;
981
982 let rel_start = (g_start - seq_start) + 1; let rel_end = (g_end - seq_start) + 1;
985
986 let tx_pos = mapper.cds_to_tx(start_pos)?;
991 let tx_start = u64::try_from(tx_pos.base).map_err(|_| FerroError::ConversionError {
992 msg: format!(
993 "Negative transcript position {} during intronic normalization",
994 tx_pos.base
995 ),
996 })?;
997
998 let intron = transcript
999 .find_intron_at_tx_boundary(tx_start, start_pos.offset.unwrap_or(0))
1000 .ok_or_else(|| FerroError::ConversionError {
1001 msg: "Could not find intron for normalization".to_string(),
1002 })?;
1003
1004 let (intron_g_start, intron_g_end) = match (intron.genomic_start, intron.genomic_end) {
1006 (Some(s), Some(e)) => (s, e),
1007 _ => {
1008 return Err(FerroError::ConversionError {
1009 msg: "Intron has no genomic coordinates".to_string(),
1010 })
1011 }
1012 };
1013
1014 let intron_rel_start = intron_g_start.saturating_sub(seq_start) + 1;
1016 let intron_rel_end = intron_g_end.saturating_sub(seq_start) + 1;
1017 let boundaries = Boundaries::new(intron_rel_start, intron_rel_end);
1018
1019 let (work_seq, work_rel_start, work_rel_end, work_boundaries) = flip_intronic_for_strand(
1028 transcript.strand,
1029 &genomic_seq,
1030 rel_start,
1031 rel_end,
1032 &boundaries,
1033 );
1034
1035 let seq_bytes = work_seq.as_bytes();
1037 let (work_new_rel_start, work_new_rel_end, new_edit, warnings) = self.normalize_na_edit(
1038 seq_bytes,
1039 edit,
1040 work_rel_start,
1041 work_rel_end,
1042 &work_boundaries,
1043 )?;
1044
1045 let (new_rel_start, new_rel_end) = unflip_intronic_positions(
1047 transcript.strand,
1048 work_seq.len() as u64,
1049 work_new_rel_start,
1050 work_new_rel_end,
1051 );
1052
1053 let new_g_start = seq_start + new_rel_start - 1;
1055 let new_g_end = seq_start + new_rel_end - 1;
1056
1057 let new_start = mapper.genomic_to_cds_intronic(new_g_start)?;
1059 let new_end = mapper.genomic_to_cds_intronic(new_g_end)?;
1060
1061 let (new_start, new_end) = if swapped {
1063 (new_end, new_start)
1064 } else {
1065 (new_start, new_end)
1066 };
1067
1068 let new_variant = CdsVariant {
1069 accession: variant.accession.clone(),
1070 gene_symbol: variant.gene_symbol.clone(),
1071 loc_edit: LocEdit::new(Interval::new(new_start, new_end), new_edit),
1072 };
1073
1074 Ok((HV::Cds(new_variant), warnings))
1075 }
1076
1077 fn normalize_intronic_tx(
1082 &self,
1083 variant: &TxVariant,
1084 transcript: &crate::reference::transcript::Transcript,
1085 start_pos: &TxPos,
1086 end_pos: &TxPos,
1087 edit: &NaEdit,
1088 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
1089 use crate::convert::CoordinateMapper;
1090
1091 if !self.provider.has_genomic_data() {
1093 return Err(FerroError::IntronicVariant {
1094 variant: format!("{}", variant),
1095 });
1096 }
1097
1098 let chromosome =
1100 transcript
1101 .chromosome
1102 .as_ref()
1103 .ok_or_else(|| FerroError::ConversionError {
1104 msg: "Transcript has no chromosome mapping for intronic normalization"
1105 .to_string(),
1106 })?;
1107
1108 let mapper = CoordinateMapper::new(transcript);
1110
1111 let g_start = mapper.tx_to_genomic_with_intron(start_pos)?;
1113 let g_end = mapper.tx_to_genomic_with_intron(end_pos)?;
1114
1115 let swapped = g_start > g_end;
1118 let (g_start, g_end) = if swapped {
1119 (g_end, g_start)
1120 } else {
1121 (g_start, g_end)
1122 };
1123
1124 let window = self.config.window_size;
1126 let seq_start = g_start.saturating_sub(window);
1127 let seq_end = g_end.saturating_add(window);
1128
1129 let genomic_seq = self
1131 .provider
1132 .get_genomic_sequence(chromosome, seq_start, seq_end)?;
1133
1134 let rel_start = (g_start - seq_start) + 1; let rel_end = (g_end - seq_start) + 1;
1137
1138 let tx_start = u64::try_from(start_pos.base).map_err(|_| FerroError::ConversionError {
1140 msg: format!(
1141 "Negative transcript position {} during intronic normalization",
1142 start_pos.base
1143 ),
1144 })?;
1145
1146 let intron = transcript
1147 .find_intron_at_tx_boundary(tx_start, start_pos.offset.unwrap_or(0))
1148 .ok_or_else(|| FerroError::ConversionError {
1149 msg: "Could not find intron for normalization".to_string(),
1150 })?;
1151
1152 let (intron_g_start, intron_g_end) = match (intron.genomic_start, intron.genomic_end) {
1154 (Some(s), Some(e)) => (s, e),
1155 _ => {
1156 return Err(FerroError::ConversionError {
1157 msg: "Intron has no genomic coordinates".to_string(),
1158 })
1159 }
1160 };
1161
1162 let intron_rel_start = intron_g_start.saturating_sub(seq_start) + 1;
1164 let intron_rel_end = intron_g_end.saturating_sub(seq_start) + 1;
1165 let boundaries = Boundaries::new(intron_rel_start, intron_rel_end);
1166
1167 let (work_seq, work_rel_start, work_rel_end, work_boundaries) = flip_intronic_for_strand(
1169 transcript.strand,
1170 &genomic_seq,
1171 rel_start,
1172 rel_end,
1173 &boundaries,
1174 );
1175
1176 let seq_bytes = work_seq.as_bytes();
1178 let (work_new_rel_start, work_new_rel_end, new_edit, warnings) = self.normalize_na_edit(
1179 seq_bytes,
1180 edit,
1181 work_rel_start,
1182 work_rel_end,
1183 &work_boundaries,
1184 )?;
1185
1186 let (new_rel_start, new_rel_end) = unflip_intronic_positions(
1188 transcript.strand,
1189 work_seq.len() as u64,
1190 work_new_rel_start,
1191 work_new_rel_end,
1192 );
1193
1194 let new_g_start = seq_start + new_rel_start - 1;
1196 let new_g_end = seq_start + new_rel_end - 1;
1197
1198 let new_start = mapper.genomic_to_tx_with_intron(new_g_start)?;
1200 let new_end = mapper.genomic_to_tx_with_intron(new_g_end)?;
1201
1202 let (new_start, new_end) = if swapped {
1204 (new_end, new_start)
1205 } else {
1206 (new_start, new_end)
1207 };
1208
1209 let new_variant = TxVariant {
1210 accession: variant.accession.clone(),
1211 gene_symbol: variant.gene_symbol.clone(),
1212 loc_edit: LocEdit::new(Interval::new(new_start, new_end), new_edit),
1213 };
1214
1215 Ok((HV::Tx(new_variant), warnings))
1216 }
1217
1218 fn normalize_boundary_spanning_cds(
1226 &self,
1227 variant: &CdsVariant,
1228 transcript: &crate::reference::transcript::Transcript,
1229 start_pos: &CdsPos,
1230 end_pos: &CdsPos,
1231 edit: &NaEdit,
1232 ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
1233 use crate::convert::CoordinateMapper;
1234
1235 if !self.provider.has_genomic_data() {
1237 return Err(FerroError::ExonIntronBoundary {
1238 exon: 0,
1239 variant: format!("{}", variant),
1240 });
1241 }
1242
1243 let chromosome =
1244 transcript
1245 .chromosome
1246 .as_ref()
1247 .ok_or_else(|| FerroError::ConversionError {
1248 msg: "Transcript has no chromosome for boundary normalization".to_string(),
1249 })?;
1250
1251 let mapper = CoordinateMapper::new(transcript);
1252
1253 let g_start = self.cds_pos_to_genomic(&mapper, start_pos)?;
1257 let g_end = self.cds_pos_to_genomic(&mapper, end_pos)?;
1258
1259 let swapped = g_start > g_end;
1262 let (g_start, g_end) = if swapped {
1263 (g_end, g_start)
1264 } else {
1265 (g_start, g_end)
1266 };
1267
1268 let window = self.config.window_size;
1270 let seq_start = g_start.saturating_sub(window);
1271 let seq_end = g_end.saturating_add(window);
1272 let genomic_seq = self
1273 .provider
1274 .get_genomic_sequence(chromosome, seq_start, seq_end)?;
1275
1276 let rel_start = (g_start - seq_start) + 1;
1278 let rel_end = (g_end - seq_start) + 1;
1279
1280 let boundaries =
1283 self.get_boundary_spanning_limits(transcript, &mapper, start_pos, end_pos, seq_start)?;
1284
1285 let seq_bytes = genomic_seq.as_bytes();
1287 let (new_rel_start, new_rel_end, new_edit, warnings) =
1288 self.normalize_na_edit(seq_bytes, edit, rel_start, rel_end, &boundaries)?;
1289
1290 let new_g_start = seq_start + new_rel_start - 1;
1292 let new_g_end = seq_start + new_rel_end - 1;
1293
1294 let new_start = mapper.genomic_to_cds_intronic(new_g_start)?;
1300 let new_end = mapper.genomic_to_cds_intronic(new_g_end)?;
1301
1302 let (new_start, new_end) = if swapped {
1304 (new_end, new_start)
1305 } else {
1306 (new_start, new_end)
1307 };
1308
1309 let new_variant = CdsVariant {
1310 accession: variant.accession.clone(),
1311 gene_symbol: variant.gene_symbol.clone(),
1312 loc_edit: LocEdit::new(Interval::new(new_start, new_end), new_edit),
1313 };
1314
1315 Ok((HV::Cds(new_variant), warnings))
1316 }
1317
1318 fn cds_pos_to_genomic(
1320 &self,
1321 mapper: &crate::convert::CoordinateMapper,
1322 pos: &CdsPos,
1323 ) -> Result<u64, FerroError> {
1324 if pos.is_intronic() {
1325 mapper.cds_to_genomic_with_intron(pos)
1326 } else {
1327 mapper
1328 .cds_to_genomic(pos)?
1329 .ok_or_else(|| FerroError::ConversionError {
1330 msg: format!("CDS position {} not in exons", pos.base),
1331 })
1332 }
1333 }
1334
1335 fn get_boundary_spanning_limits(
1340 &self,
1341 transcript: &crate::reference::transcript::Transcript,
1342 mapper: &crate::convert::CoordinateMapper,
1343 start_pos: &CdsPos,
1344 end_pos: &CdsPos,
1345 seq_start: u64,
1346 ) -> Result<Boundaries, FerroError> {
1347 let (exonic_pos, intronic_pos) = if start_pos.is_intronic() {
1354 (end_pos, start_pos)
1355 } else {
1356 (start_pos, end_pos)
1357 };
1358
1359 let tx_pos = mapper.cds_to_tx(exonic_pos)?;
1361 let tx_pos_base = u64::try_from(tx_pos.base).map_err(|_| FerroError::ConversionError {
1362 msg: format!(
1363 "Negative transcript position {} during boundary normalization",
1364 tx_pos.base
1365 ),
1366 })?;
1367 let exon = transcript
1368 .exon_at(tx_pos_base)
1369 .ok_or_else(|| FerroError::ConversionError {
1370 msg: "Could not find exon for boundary normalization".to_string(),
1371 })?;
1372
1373 let tx_boundary = mapper.cds_to_tx(intronic_pos)?;
1375 let tx_boundary_base =
1376 u64::try_from(tx_boundary.base).map_err(|_| FerroError::ConversionError {
1377 msg: format!(
1378 "Negative transcript position {} during boundary normalization",
1379 tx_boundary.base
1380 ),
1381 })?;
1382 let offset = intronic_pos.offset.unwrap_or(0);
1383 let intron = transcript
1384 .find_intron_at_tx_boundary(tx_boundary_base, offset)
1385 .ok_or_else(|| FerroError::ConversionError {
1386 msg: "Could not find intron for boundary normalization".to_string(),
1387 })?;
1388
1389 let (exon_g_start, exon_g_end): (u64, u64) = match (exon.genomic_start, exon.genomic_end) {
1391 (Some(s), Some(e)) => (s, e),
1392 _ => {
1393 return Err(FerroError::ConversionError {
1394 msg: "Exon has no genomic coordinates".to_string(),
1395 })
1396 }
1397 };
1398
1399 let (intron_g_start, intron_g_end): (u64, u64) =
1400 match (intron.genomic_start, intron.genomic_end) {
1401 (Some(s), Some(e)) => (s, e),
1402 _ => {
1403 return Err(FerroError::ConversionError {
1404 msg: "Intron has no genomic coordinates".to_string(),
1405 })
1406 }
1407 };
1408
1409 let combined_start = exon_g_start.min(intron_g_start);
1411 let combined_end = exon_g_end.max(intron_g_end);
1412
1413 let rel_start = combined_start.saturating_sub(seq_start) + 1;
1415 let rel_end = combined_end.saturating_sub(seq_start) + 1;
1416
1417 Ok(Boundaries::new(rel_start, rel_end))
1418 }
1419
1420 fn normalize_na_edit(
1424 &self,
1425 ref_seq: &[u8],
1426 edit: &NaEdit,
1427 start: u64,
1428 end: u64,
1429 boundaries: &Boundaries,
1430 ) -> Result<(u64, u64, NaEdit, Vec<NormalizationWarning>), FerroError> {
1431 let mut warnings = Vec::new();
1432
1433 let validation = validate::validate_reference(edit, ref_seq, start, end);
1435 if !validation.valid {
1436 warnings.push(NormalizationWarning {
1437 code: "REFSEQ_MISMATCH".to_string(),
1438 message: validation.warning.unwrap_or_default(),
1439 stated_ref: validation.stated_ref.unwrap_or_default(),
1440 actual_ref: validation.actual_ref.unwrap_or_default(),
1441 position: format!("{}-{}", start, end),
1442 corrected: true,
1443 });
1444 }
1445
1446 let alt_seq = match edit {
1448 NaEdit::Deletion { .. } => vec![],
1449 NaEdit::Insertion { sequence } => {
1450 if let Some(bases) = sequence.bases() {
1452 bases.iter().map(|b| b.to_u8()).collect()
1453 } else {
1454 return Ok((start, end, edit.clone(), warnings.clone()));
1456 }
1457 }
1458 NaEdit::Duplication { sequence, .. } => {
1459 if let Some(seq) = sequence {
1460 seq.bases().iter().map(|b| b.to_u8()).collect()
1461 } else {
1462 let s = hgvs_pos_to_index(start);
1465 let e = end as usize;
1466 if e <= ref_seq.len() {
1467 ref_seq[s..e].to_vec()
1468 } else {
1469 vec![]
1470 }
1471 }
1472 }
1473 NaEdit::Delins { sequence } => {
1474 use crate::hgvs::edit::InsertedSequence;
1475
1476 if let InsertedSequence::Literal(seq) = sequence {
1481 let seq_bytes: Vec<u8> = seq.bases().iter().map(|b| *b as u8).collect();
1482 let start_idx = hgvs_pos_to_index(start);
1483 let end_idx = end as usize;
1484
1485 if rules::delins_is_identity(ref_seq, start_idx, end_idx, &seq_bytes) {
1488 return Ok((start, end, NaEdit::position_identity(), warnings.clone()));
1489 }
1490
1491 if let Some((reference, alternative)) =
1494 rules::delins_is_substitution(ref_seq, start_idx, end_idx, &seq_bytes)
1495 {
1496 return Ok((
1497 start,
1498 start,
1499 NaEdit::Substitution {
1500 reference,
1501 alternative,
1502 },
1503 warnings.clone(),
1504 ));
1505 }
1506
1507 if rules::delins_is_duplication(ref_seq, start_idx, end_idx, &seq_bytes) {
1509 return Ok((
1511 start,
1512 end,
1513 NaEdit::Duplication {
1514 sequence: None,
1515 length: None,
1516 uncertain_extent: None,
1517 },
1518 warnings.clone(),
1519 ));
1520 }
1521 }
1522 return Ok((start, end, edit.clone(), warnings.clone()));
1524 }
1525 NaEdit::Inversion { sequence, length } => {
1526 let start_idx = hgvs_pos_to_index(start); let end_idx = end as usize; if let Some((new_s, new_e)) = rules::shorten_inversion(ref_seq, start_idx, end_idx)
1532 {
1533 let shortened_edit = NaEdit::Inversion {
1535 sequence: sequence.clone(),
1536 length: *length,
1537 };
1538 return Ok((
1539 index_to_hgvs_pos(new_s),
1540 new_e as u64,
1541 shortened_edit,
1542 warnings,
1543 ));
1544 } else {
1545 return Ok((
1547 start,
1548 end,
1549 NaEdit::Identity {
1550 sequence: None,
1551 whole_entity: false,
1552 },
1553 warnings,
1554 ));
1555 }
1556 }
1557 NaEdit::Repeat {
1558 sequence,
1559 count,
1560 additional_counts,
1561 trailing,
1562 } => {
1563 use crate::hgvs::edit::RepeatCount;
1564
1565 let Some(seq) = sequence else {
1567 return Ok((start, end, edit.clone(), warnings.clone()));
1568 };
1569
1570 let RepeatCount::Exact(specified_count) = count else {
1571 return Ok((start, end, edit.clone(), warnings.clone()));
1572 };
1573
1574 if !additional_counts.is_empty() || trailing.is_some() {
1576 return Ok((start, end, edit.clone(), warnings.clone()));
1577 }
1578
1579 let repeat_unit: Vec<u8> = seq.bases().iter().map(|b| b.to_u8()).collect();
1581 let pos_idx = hgvs_pos_to_index(start); match rules::normalize_repeat(ref_seq, pos_idx, &repeat_unit, *specified_count) {
1585 rules::RepeatNormResult::Deletion {
1586 start: del_start,
1587 end: del_end,
1588 } => {
1589 let del_edit = NaEdit::Deletion {
1591 sequence: None,
1592 length: None,
1593 };
1594 return Ok((del_start, del_end, del_edit, warnings));
1595 }
1596 rules::RepeatNormResult::Duplication {
1597 start: dup_start,
1598 end: dup_end,
1599 sequence: _dup_seq,
1600 } => {
1601 let dup_edit = NaEdit::Duplication {
1603 sequence: None,
1604 length: None,
1605 uncertain_extent: None,
1606 };
1607 return Ok((dup_start, dup_end, dup_edit, warnings));
1608 }
1609 rules::RepeatNormResult::Repeat {
1610 start: rep_start,
1611 end: rep_end,
1612 sequence: rep_seq,
1613 count: rep_count,
1614 } => {
1615 use crate::hgvs::edit::{Base, RepeatCount, Sequence};
1616 let bases: Vec<Base> = rep_seq
1617 .iter()
1618 .filter_map(|&b| Base::from_char(b as char))
1619 .collect();
1620 let rep_edit = NaEdit::Repeat {
1621 sequence: Some(Sequence::new(bases)),
1622 count: RepeatCount::Exact(rep_count),
1623 additional_counts: vec![],
1624 trailing: None,
1625 };
1626 return Ok((rep_start, rep_end, rep_edit, warnings));
1627 }
1628 rules::RepeatNormResult::Unchanged => {
1629 return Ok((start, end, edit.clone(), warnings.clone()));
1630 }
1631 }
1632 }
1633 _ => return Ok((start, end, edit.clone(), warnings.clone())), };
1635
1636 let shuffle_end = match edit {
1650 NaEdit::Insertion { .. } => end.saturating_sub(1), _ => end,
1652 };
1653 let start_idx = hgvs_pos_to_index(start); let shuffle_start = match edit {
1655 NaEdit::Insertion { .. } => start_idx as u64 + 1, _ => start_idx as u64,
1657 };
1658 let result = shuffle(
1659 ref_seq,
1660 &alt_seq,
1661 shuffle_start,
1662 shuffle_end, boundaries,
1664 self.config.shuffle_direction,
1665 );
1666
1667 let shuffle_result_start = match edit {
1670 NaEdit::Insertion { .. } => result.start.saturating_sub(1), _ => result.start,
1672 };
1673 let new_start = index_to_hgvs_pos(shuffle_result_start as usize);
1674 let new_end = match edit {
1676 NaEdit::Insertion { .. } => index_to_hgvs_pos(result.end as usize), _ => result.end,
1678 };
1679
1680 let (final_start, final_end, new_edit) = match edit {
1686 NaEdit::Insertion { sequence } => {
1687 use crate::hgvs::edit::{InsertedSequence, RepeatCount, Sequence};
1688
1689 if let InsertedSequence::Literal(seq) = sequence {
1690 let seq_bytes: Vec<u8> = seq.bases().iter().map(|b| *b as u8).collect();
1691
1692 if seq_bytes.len() > 1 {
1696 let original_pos_idx = hgvs_pos_to_index(start) as u64; if let Some((_first, count, rep_start, rep_end, unit_bytes)) =
1698 rules::insertion_to_repeat(ref_seq, original_pos_idx, &seq_bytes)
1699 {
1700 use crate::hgvs::edit::Base;
1701 let bases: Vec<Base> = unit_bytes
1702 .iter()
1703 .filter_map(|&b| Base::from_char(b as char))
1704 .collect();
1705 if bases.len() == unit_bytes.len() {
1706 let repeat_seq = Sequence::new(bases);
1707 let repeat_edit = NaEdit::Repeat {
1708 sequence: Some(repeat_seq),
1709 count: RepeatCount::Exact(count),
1710 additional_counts: vec![],
1711 trailing: None,
1712 };
1713 return Ok((rep_start, rep_end, repeat_edit, warnings));
1714 }
1715 }
1716 }
1717
1718 let shift_amount =
1722 (result.start as usize).saturating_sub(shuffle_start as usize);
1723 let rotation = shift_amount % seq_bytes.len();
1724 let rotated_seq: Vec<u8> = if rotation > 0 {
1725 seq_bytes[rotation..]
1726 .iter()
1727 .chain(seq_bytes[..rotation].iter())
1728 .copied()
1729 .collect()
1730 } else {
1731 seq_bytes.clone()
1732 };
1733
1734 if rules::insertion_is_duplication(ref_seq, result.start, &rotated_seq) {
1735 let dup_len = rotated_seq.len() as u64;
1739 let (dup_start, dup_end) = if dup_len == 1 {
1740 (new_start, new_start) } else {
1742 (new_start - dup_len + 1, new_start)
1744 };
1745 (
1746 dup_start,
1747 dup_end,
1748 NaEdit::Duplication {
1749 sequence: None, length: None,
1751 uncertain_extent: None,
1752 },
1753 )
1754 } else {
1755 if rotation > 0 {
1757 use crate::hgvs::edit::{Base, InsertedSequence, Sequence};
1758 let rotated_bases: Vec<Base> = rotated_seq
1759 .iter()
1760 .filter_map(|&b| Base::from_char(b as char))
1761 .collect();
1762 let new_sequence =
1763 InsertedSequence::Literal(Sequence::new(rotated_bases));
1764 (
1765 new_start,
1766 new_end,
1767 NaEdit::Insertion {
1768 sequence: new_sequence,
1769 },
1770 )
1771 } else {
1772 (new_start, new_end, edit.clone())
1773 }
1774 }
1775 } else {
1776 (new_start, new_end, edit.clone())
1777 }
1778 }
1779 NaEdit::Duplication { .. } => {
1780 use crate::hgvs::edit::{Base, RepeatCount, Sequence};
1781
1782 if let Some(dup_result) =
1786 rules::duplication_to_repeat(ref_seq, result.start, result.end)
1787 {
1788 match dup_result {
1789 rules::DupToRepeatResult::Homopolymer {
1790 base,
1791 count,
1792 start: rep_start,
1793 end: rep_end,
1794 } => {
1795 if let Some(base_enum) = Base::from_char(base as char) {
1796 let repeat_seq = Sequence::new(vec![base_enum]);
1797 let repeat_edit = NaEdit::Repeat {
1798 sequence: Some(repeat_seq),
1799 count: RepeatCount::Exact(count),
1800 additional_counts: vec![],
1801 trailing: None,
1802 };
1803 return Ok((rep_start, rep_end, repeat_edit, warnings));
1804 }
1805 }
1806 rules::DupToRepeatResult::TandemRepeat {
1807 unit,
1808 count,
1809 start: rep_start,
1810 end: rep_end,
1811 } => {
1812 let bases: Vec<Base> = unit
1813 .iter()
1814 .filter_map(|&b| Base::from_char(b as char))
1815 .collect();
1816 if bases.len() == unit.len() {
1817 let repeat_seq = Sequence::new(bases);
1818 let repeat_edit = NaEdit::Repeat {
1819 sequence: Some(repeat_seq),
1820 count: RepeatCount::Exact(count),
1821 additional_counts: vec![],
1822 trailing: None,
1823 };
1824 return Ok((rep_start, rep_end, repeat_edit, warnings));
1825 }
1826 }
1827 }
1828 }
1829 (
1831 new_start,
1832 new_end,
1833 NaEdit::Duplication {
1834 sequence: None,
1835 length: None,
1836 uncertain_extent: None,
1837 },
1838 )
1839 }
1840 NaEdit::Delins { sequence } => {
1841 use crate::hgvs::edit::{InsertedSequence, Sequence};
1842
1843 if let InsertedSequence::Literal(seq) = sequence {
1846 let seq_bytes: Vec<u8> = seq.bases().iter().map(|b| *b as u8).collect();
1847 let start_0 = result.start as usize;
1848 let end_0 = result.end as usize;
1849
1850 if rules::delins_is_duplication(ref_seq, start_0, end_0, &seq_bytes) {
1851 let dup_len = end_0 - start_0;
1853 let dup_seq_bytes = &ref_seq[start_0..end_0];
1854 let dup_bases: Vec<crate::hgvs::edit::Base> = dup_seq_bytes
1855 .iter()
1856 .filter_map(|&b| crate::hgvs::edit::Base::from_char(b as char))
1857 .collect();
1858 let dup_seq = Sequence::new(dup_bases);
1859
1860 (
1861 new_start,
1862 new_end,
1863 NaEdit::Duplication {
1864 sequence: Some(dup_seq),
1865 length: Some(dup_len as u64),
1866 uncertain_extent: None,
1867 },
1868 )
1869 } else {
1870 (new_start, new_end, edit.clone())
1871 }
1872 } else {
1873 (new_start, new_end, edit.clone())
1874 }
1875 }
1876 NaEdit::Deletion { .. } => {
1888 use crate::hgvs::edit::{Base, RepeatCount, Sequence};
1889 if self.config.shuffle_direction == ShuffleDirection::ThreePrime {
1890 if let Some(rep) = rules::deletion_to_repeat(
1891 ref_seq,
1892 result.start as usize,
1893 result.end as usize,
1894 ) {
1895 let bases: Option<Vec<Base>> = rep
1896 .unit
1897 .iter()
1898 .map(|&b| Base::from_char(b as char))
1899 .collect();
1900 if let Some(bases) = bases {
1901 let repeat_edit = NaEdit::Repeat {
1902 sequence: Some(Sequence::new(bases)),
1903 count: RepeatCount::Exact(rep.count),
1904 additional_counts: vec![],
1905 trailing: None,
1906 };
1907 return Ok((rep.start, rep.end, repeat_edit, warnings));
1908 }
1909 }
1910 }
1911 (
1912 new_start,
1913 new_end,
1914 NaEdit::Deletion {
1915 sequence: None,
1916 length: None,
1917 },
1918 )
1919 }
1920 _ => (new_start, new_end, edit.clone()),
1922 };
1923
1924 Ok((final_start, final_end, new_edit, warnings))
1925 }
1926
1927 fn cds_to_tx_pos(
1929 &self,
1930 pos: &CdsPos,
1931 cds_start: u64,
1932 cds_end: Option<u64>,
1933 ) -> Result<u64, FerroError> {
1934 if pos.utr3 {
1935 let end = cds_end.ok_or_else(|| FerroError::ConversionError {
1936 msg: "No CDS end".to_string(),
1937 })?;
1938 let base = u64::try_from(pos.base).map_err(|_| FerroError::ConversionError {
1939 msg: format!("Negative base {} in 3' UTR position", pos.base),
1940 })?;
1941 Ok(end + base)
1942 } else if pos.base < 0 {
1943 let tx_pos = cds_start as i64 + pos.base;
1948 u64::try_from(tx_pos).map_err(|_| FerroError::ConversionError {
1949 msg: format!(
1950 "CDS position c.{} maps before transcript start (cds_start={})",
1951 pos.base, cds_start
1952 ),
1953 })
1954 } else if pos.base == 0 {
1955 Ok(cds_start.saturating_sub(1))
1959 } else {
1960 Ok(cds_start + pos.base as u64 - 1)
1961 }
1962 }
1963
1964 fn tx_to_cds_pos(
1966 &self,
1967 pos: u64,
1968 cds_start: u64,
1969 cds_end: Option<u64>,
1970 ) -> Result<CdsPos, FerroError> {
1971 let end = cds_end.ok_or_else(|| FerroError::ConversionError {
1972 msg: "No CDS end".to_string(),
1973 })?;
1974
1975 if pos < cds_start {
1976 Ok(CdsPos {
1984 base: pos as i64 - cds_start as i64,
1985 offset: None,
1986 utr3: false,
1987 })
1988 } else if pos > end {
1989 Ok(CdsPos {
1990 base: (pos - end) as i64,
1991 offset: None,
1992 utr3: true,
1993 })
1994 } else {
1995 Ok(CdsPos {
1996 base: (pos - cds_start + 1) as i64,
1997 offset: None,
1998 utr3: false,
1999 })
2000 }
2001 }
2002
2003 fn canonicalize_cds_variant(&self, variant: &CdsVariant) -> CdsVariant {
2008 let edit = match variant.loc_edit.edit.inner() {
2009 Some(e) => e,
2010 None => return variant.clone(),
2011 };
2012
2013 if !should_canonicalize(edit) {
2015 return variant.clone();
2016 }
2017
2018 let canonical_edit = canonicalize_edit(edit);
2019
2020 CdsVariant {
2021 accession: variant.accession.clone(),
2022 gene_symbol: variant.gene_symbol.clone(),
2023 loc_edit: LocEdit::with_uncertainty(
2024 variant.loc_edit.location.clone(),
2025 variant.loc_edit.edit.map_ref(|_| canonical_edit),
2026 ),
2027 }
2028 }
2029
2030 fn canonicalize_genome_variant(&self, variant: &GenomeVariant) -> GenomeVariant {
2032 let edit = match variant.loc_edit.edit.inner() {
2033 Some(e) => e,
2034 None => return variant.clone(),
2035 };
2036
2037 if !should_canonicalize(edit) {
2038 return variant.clone();
2039 }
2040
2041 let canonical_edit = canonicalize_edit(edit);
2042
2043 GenomeVariant {
2044 accession: variant.accession.clone(),
2045 gene_symbol: variant.gene_symbol.clone(),
2046 loc_edit: LocEdit::with_uncertainty(
2047 variant.loc_edit.location.clone(),
2048 variant.loc_edit.edit.map_ref(|_| canonical_edit),
2049 ),
2050 }
2051 }
2052
2053 fn canonicalize_tx_variant(&self, variant: &TxVariant) -> TxVariant {
2055 let edit = match variant.loc_edit.edit.inner() {
2056 Some(e) => e,
2057 None => return variant.clone(),
2058 };
2059
2060 if !should_canonicalize(edit) {
2061 return variant.clone();
2062 }
2063
2064 let canonical_edit = canonicalize_edit(edit);
2065
2066 TxVariant {
2067 accession: variant.accession.clone(),
2068 gene_symbol: variant.gene_symbol.clone(),
2069 loc_edit: LocEdit::with_uncertainty(
2070 variant.loc_edit.location.clone(),
2071 variant.loc_edit.edit.map_ref(|_| canonical_edit),
2072 ),
2073 }
2074 }
2075}
2076
2077fn flip_intronic_for_strand(
2083 strand: Strand,
2084 genomic_seq: &str,
2085 rel_start: u64,
2086 rel_end: u64,
2087 boundaries: &Boundaries,
2088) -> (String, u64, u64, Boundaries) {
2089 if strand != Strand::Minus {
2090 return (
2091 genomic_seq.to_string(),
2092 rel_start,
2093 rel_end,
2094 boundaries.clone(),
2095 );
2096 }
2097 let seq_len = genomic_seq.len() as u64;
2098 let rc = crate::cli::reverse_complement(genomic_seq);
2099 let new_rel_start = seq_len - rel_end + 1;
2100 let new_rel_end = seq_len - rel_start + 1;
2101 let new_boundaries = Boundaries::new(
2102 seq_len - boundaries.right + 1,
2103 seq_len - boundaries.left + 1,
2104 );
2105 (rc, new_rel_start, new_rel_end, new_boundaries)
2106}
2107
2108fn unflip_intronic_positions(
2113 strand: Strand,
2114 seq_len: u64,
2115 rel_start: u64,
2116 rel_end: u64,
2117) -> (u64, u64) {
2118 if strand == Strand::Minus {
2119 (seq_len - rel_end + 1, seq_len - rel_start + 1)
2120 } else {
2121 (rel_start, rel_end)
2122 }
2123}
2124
2125#[cfg(test)]
2126mod tests {
2127 use super::*;
2128 use crate::hgvs::parser::parse_hgvs;
2129 use crate::reference::MockProvider;
2130
2131 #[test]
2132 fn test_normalize_substitution_unchanged() {
2133 let provider = MockProvider::with_test_data();
2134 let normalizer = Normalizer::new(provider);
2135
2136 let variant = parse_hgvs("NM_000088.3:c.10A>G").unwrap();
2137 let normalized = normalizer.normalize(&variant).unwrap();
2138
2139 assert_eq!(format!("{}", variant), format!("{}", normalized));
2141 }
2142
2143 #[test]
2144 fn test_normalize_with_config() {
2145 let provider = MockProvider::with_test_data();
2146 let config = NormalizeConfig::default().with_direction(ShuffleDirection::FivePrime);
2147 let normalizer = Normalizer::with_config(provider, config);
2148
2149 assert_eq!(
2150 normalizer.config().shuffle_direction,
2151 ShuffleDirection::FivePrime
2152 );
2153 }
2154
2155 #[test]
2156 fn test_normalizer_handles_missing_transcript() {
2157 let provider = MockProvider::new(); let normalizer = Normalizer::new(provider);
2159
2160 let variant = parse_hgvs("NM_MISSING.1:c.100del").unwrap();
2162 let result = normalizer.normalize(&variant);
2163 assert!(result.is_ok());
2164 assert_eq!(
2166 format!("{}", variant),
2167 format!("{}", result.unwrap()),
2168 "Missing transcript should return variant unchanged"
2169 );
2170 }
2171
2172 #[test]
2173 fn test_normalize_deletion_shifts_3prime() {
2174 let provider = MockProvider::with_test_data();
2179 let normalizer = Normalizer::new(provider);
2180
2181 let variant = parse_hgvs("NM_001234.1:c.10del").unwrap();
2182 let result = normalizer.normalize(&variant).unwrap();
2183 let output = format!("{}", result);
2184
2185 assert!(
2187 output.contains("del"),
2188 "Deletion should remain a deletion, got: {}",
2189 output
2190 );
2191 assert!(
2193 output.contains("c.11del"),
2194 "Deletion should shift 3' to exon boundary (c.11), got: {}",
2195 output
2196 );
2197 }
2198
2199 #[test]
2200 fn test_normalize_insertion_becomes_dup() {
2201 let provider = MockProvider::with_test_data();
2204 let normalizer = Normalizer::new(provider);
2205
2206 let variant = parse_hgvs("NM_001234.1:c.10_11insG").unwrap();
2207 let result = normalizer.normalize(&variant).unwrap();
2208 let output = format!("{}", result);
2209
2210 assert!(
2212 output.contains("dup"),
2213 "Insertion of matching base should become dup, got: {}",
2214 output
2215 );
2216 }
2217
2218 #[test]
2219 fn test_normalize_duplication_shifts_3prime() {
2220 let provider = MockProvider::with_test_data();
2223 let normalizer = Normalizer::new(provider);
2224
2225 let variant = parse_hgvs("NM_001234.1:c.10dup").unwrap();
2226 let result = normalizer.normalize(&variant).unwrap();
2227 let output = format!("{}", result);
2228
2229 assert!(
2232 output.contains("dup"),
2233 "Single-base duplication should remain as dup notation, got: {}",
2234 output
2235 );
2236 }
2237
2238 #[test]
2239 fn test_normalize_delins_unchanged() {
2240 let provider = MockProvider::with_test_data();
2243 let normalizer = Normalizer::new(provider);
2244
2245 let variant = parse_hgvs("NM_000088.3:c.10delinsAT").unwrap();
2246 let result = normalizer.normalize(&variant).unwrap();
2247 let output = format!("{}", result);
2248
2249 assert!(
2250 output.contains("delinsAT"),
2251 "Delins should remain unchanged, got: {}",
2252 output
2253 );
2254 }
2255
2256 #[test]
2257 fn test_normalize_single_base_delins_becomes_substitution() {
2258 let provider = MockProvider::with_test_data();
2262 let normalizer = Normalizer::new(provider);
2263
2264 let variant = parse_hgvs("NM_000088.3:c.10delinsT").unwrap();
2265 let result = normalizer.normalize(&variant).unwrap();
2266 assert_eq!(format!("{}", result), "NM_000088.3:c.10G>T");
2267 }
2268
2269 #[test]
2270 fn test_normalize_single_base_delins_same_base_becomes_identity() {
2271 let provider = MockProvider::with_test_data();
2275 let normalizer = Normalizer::new(provider);
2276
2277 let variant = parse_hgvs("NM_000088.3:c.10delinsG").unwrap();
2278 let result = normalizer.normalize(&variant).unwrap();
2279 assert_eq!(format!("{}", result), "NM_000088.3:c.10=");
2280 }
2281
2282 #[test]
2283 fn test_normalize_multi_base_delins_same_seq_becomes_identity() {
2284 let provider = MockProvider::with_test_data();
2287 let normalizer = Normalizer::new(provider);
2288
2289 let variant = parse_hgvs("NM_000088.3:c.1_3delinsATG").unwrap();
2290 let result = normalizer.normalize(&variant).unwrap();
2291 assert_eq!(format!("{}", result), "NM_000088.3:c.1_3=");
2292 }
2293
2294 #[test]
2295 fn test_normalize_multi_base_delete_delins_unchanged() {
2296 let provider = MockProvider::with_test_data();
2299 let normalizer = Normalizer::new(provider);
2300
2301 let variant = parse_hgvs("NM_000088.3:c.10_11delinsT").unwrap();
2302 let result = normalizer.normalize(&variant).unwrap();
2303 let output = format!("{}", result);
2304 assert!(
2305 output.contains("delinsT"),
2306 "Multi-base delete delins should remain delins, got: {}",
2307 output
2308 );
2309 assert!(
2310 !output.contains(">"),
2311 "Multi-base delete delins must not become a substitution, got: {}",
2312 output
2313 );
2314 }
2315
2316 #[test]
2317 fn test_normalize_delins_to_dup_still_works() {
2318 let provider = MockProvider::with_test_data();
2323 let normalizer = Normalizer::new(provider);
2324
2325 let variant = parse_hgvs("NM_000088.3:c.5delinsCC").unwrap();
2326 let result = normalizer.normalize(&variant).unwrap();
2327 let output = format!("{}", result);
2328 assert!(
2329 output.contains("dup"),
2330 "delins matching dup pattern should normalize to dup, got: {}",
2331 output
2332 );
2333 assert!(
2334 !output.contains("delins"),
2335 "delins should not survive the dup conversion, got: {}",
2336 output
2337 );
2338 }
2339
2340 #[test]
2341 fn test_normalize_delins_different_bases_unchanged() {
2342 let provider = MockProvider::with_test_data();
2345 let normalizer = Normalizer::new(provider);
2346
2347 let variant = parse_hgvs("NM_000088.3:c.1_3delinsACG").unwrap();
2348 let result = normalizer.normalize(&variant).unwrap();
2349 assert_eq!(format!("{}", result), "NM_000088.3:c.1_3delinsACG");
2350 }
2351
2352 #[test]
2353 fn test_normalize_protein_substitution_unchanged() {
2354 let provider = MockProvider::with_test_data();
2355 let normalizer = Normalizer::new(provider);
2356
2357 let variant = parse_hgvs("NP_000079.2:p.Val600Glu").unwrap();
2359 let normalized = normalizer.normalize(&variant).unwrap();
2360 assert_eq!(format!("{}", variant), format!("{}", normalized));
2361 }
2362
2363 #[test]
2364 fn test_normalize_protein_deletion_removes_redundant_sequence() {
2365 let provider = MockProvider::with_test_data();
2367 let normalizer = Normalizer::new(provider);
2368
2369 let variant = parse_hgvs("NP_000079.2:p.Val600delVal").unwrap();
2370 let normalized = normalizer.normalize(&variant).unwrap();
2371 let output = format!("{}", normalized);
2372
2373 assert_eq!(
2375 output, "NP_000079.2:p.Val600del",
2376 "Redundant sequence should be removed from deletion"
2377 );
2378 }
2379
2380 #[test]
2381 fn test_normalize_protein_deletion_range_removes_redundant_sequence() {
2382 let provider = MockProvider::with_test_data();
2384 let normalizer = Normalizer::new(provider);
2385
2386 let variant = parse_hgvs("NP_000079.2:p.Lys23_Glu25delLysAlaGlu").unwrap();
2387 let normalized = normalizer.normalize(&variant).unwrap();
2388 let output = format!("{}", normalized);
2389
2390 assert_eq!(
2392 output, "NP_000079.2:p.Lys23_Glu25del",
2393 "Redundant sequence should be removed from range deletion"
2394 );
2395 }
2396
2397 #[test]
2398 fn test_normalize_protein_deletion_non_matching_sequence_unchanged() {
2399 let provider = MockProvider::with_test_data();
2401 let normalizer = Normalizer::new(provider);
2402
2403 let variant = parse_hgvs("NP_000079.2:p.Val600delGlu").unwrap();
2404 let normalized = normalizer.normalize(&variant).unwrap();
2405 let output = format!("{}", normalized);
2406
2407 assert_eq!(
2409 output, "NP_000079.2:p.Val600delGlu",
2410 "Non-matching sequence should not be removed"
2411 );
2412 }
2413
2414 #[test]
2415 fn test_normalize_protein_deletion_no_sequence_unchanged() {
2416 let provider = MockProvider::with_test_data();
2418 let normalizer = Normalizer::new(provider);
2419
2420 let variant = parse_hgvs("NP_000079.2:p.Val600del").unwrap();
2421 let normalized = normalizer.normalize(&variant).unwrap();
2422 let output = format!("{}", normalized);
2423
2424 assert_eq!(
2425 output, "NP_000079.2:p.Val600del",
2426 "Deletion without sequence should remain unchanged"
2427 );
2428 }
2429
2430 #[test]
2431 fn test_normalize_protein_duplication_unchanged() {
2432 let provider = MockProvider::with_test_data();
2434 let normalizer = Normalizer::new(provider);
2435
2436 let variant = parse_hgvs("NP_000079.2:p.Val600dup").unwrap();
2437 let normalized = normalizer.normalize(&variant).unwrap();
2438 assert_eq!(format!("{}", variant), format!("{}", normalized));
2439 }
2440
2441 #[test]
2442 fn test_normalize_protein_frameshift_unchanged() {
2443 let provider = MockProvider::with_test_data();
2445 let normalizer = Normalizer::new(provider);
2446
2447 let variant = parse_hgvs("NP_000079.2:p.Arg97ProfsTer23").unwrap();
2448 let normalized = normalizer.normalize(&variant).unwrap();
2449 assert_eq!(format!("{}", variant), format!("{}", normalized));
2450 }
2451
2452 #[test]
2453 fn test_normalize_protein_reference_validation_match() {
2454 let provider = MockProvider::with_test_data();
2457 let normalizer = Normalizer::new(provider);
2458
2459 let variant = parse_hgvs("NP_TEST.1:p.Met1Val").unwrap();
2461 let result = normalizer.normalize(&variant);
2462 assert!(
2463 result.is_ok(),
2464 "Validation should pass for matching amino acid"
2465 );
2466 }
2467
2468 #[test]
2469 fn test_normalize_protein_reference_validation_mismatch() {
2470 let provider = MockProvider::with_test_data();
2473 let normalizer = Normalizer::new(provider);
2474
2475 let variant = parse_hgvs("NP_TEST.1:p.Val1Glu").unwrap();
2477 let result = normalizer.normalize(&variant);
2478 assert!(
2479 result.is_err(),
2480 "Validation should fail for mismatched amino acid"
2481 );
2482
2483 if let Err(crate::error::FerroError::AminoAcidMismatch {
2484 position,
2485 expected,
2486 found,
2487 ..
2488 }) = result
2489 {
2490 assert_eq!(position, 1);
2491 assert_eq!(expected, "Val");
2492 assert_eq!(found, "M");
2493 } else {
2494 panic!("Expected AminoAcidMismatch error");
2495 }
2496 }
2497
2498 #[test]
2499 fn test_normalize_protein_reference_validation_deletion() {
2500 let provider = MockProvider::with_test_data();
2503 let normalizer = Normalizer::new(provider);
2504
2505 let variant = parse_hgvs("NP_TEST.1:p.Val2del").unwrap();
2507 let result = normalizer.normalize(&variant);
2508 assert!(
2509 result.is_ok(),
2510 "Validation should pass for matching deletion position"
2511 );
2512 }
2513
2514 #[test]
2515 fn test_normalize_protein_reference_validation_missing_protein() {
2516 let provider = MockProvider::with_test_data();
2518 let normalizer = Normalizer::new(provider);
2519
2520 let variant = parse_hgvs("NP_MISSING.1:p.Val600Glu").unwrap();
2522 let result = normalizer.normalize(&variant);
2523 assert!(
2525 result.is_ok(),
2526 "Missing protein should skip validation, not fail"
2527 );
2528 }
2529
2530 #[test]
2531 fn test_normalize_rna_substitution_unchanged() {
2532 let provider = MockProvider::with_test_data();
2533 let normalizer = Normalizer::new(provider);
2534
2535 let variant = parse_hgvs("NM_000088.3:r.10a>g").unwrap();
2537 let normalized = normalizer.normalize(&variant).unwrap();
2538 assert_eq!(format!("{}", variant), format!("{}", normalized));
2539 }
2540
2541 #[test]
2542 fn test_normalize_rna_deletion_shifts_3prime() {
2543 let provider = MockProvider::with_test_data();
2546 let normalizer = Normalizer::new(provider);
2547
2548 let variant = parse_hgvs("NM_001234.1:r.14del").unwrap();
2549 let result = normalizer.normalize(&variant).unwrap();
2550 let output = format!("{}", result);
2551
2552 assert!(
2554 output.contains("del"),
2555 "Deletion should remain a deletion, got: {}",
2556 output
2557 );
2558 assert!(
2560 output.contains("r.37del"),
2561 "Deletion should shift 3' to position 37, got: {}",
2562 output
2563 );
2564 }
2565
2566 #[test]
2567 fn test_normalize_rna_insertion_becomes_dup() {
2568 let provider = MockProvider::with_test_data();
2571 let normalizer = Normalizer::new(provider);
2572
2573 let variant = parse_hgvs("NM_001234.1:r.14_15insg").unwrap();
2574 let result = normalizer.normalize(&variant).unwrap();
2575 let output = format!("{}", result);
2576
2577 assert!(
2579 output.contains("dup"),
2580 "Insertion of matching base should become dup, got: {}",
2581 output
2582 );
2583 }
2584
2585 #[test]
2586 fn test_normalize_rna_duplication_shifts_3prime() {
2587 let provider = MockProvider::with_test_data();
2589 let normalizer = Normalizer::new(provider);
2590
2591 let variant = parse_hgvs("NM_001234.1:r.14dup").unwrap();
2592 let result = normalizer.normalize(&variant).unwrap();
2593 let output = format!("{}", result);
2594
2595 assert!(
2597 output.contains("dup"),
2598 "Single-base RNA duplication should remain as dup notation, got: {}",
2599 output
2600 );
2601 }
2602
2603 #[test]
2604 fn test_normalize_rna_intronic_returns_error() {
2605 let provider = MockProvider::with_test_data();
2606 let normalizer = Normalizer::new(provider);
2607
2608 let variant = parse_hgvs("NM_001234.1:r.10+5del").unwrap();
2610 let result = normalizer.normalize(&variant);
2611 assert!(result.is_err(), "Intronic RNA variant should return error");
2612 }
2613
2614 #[test]
2615 fn test_normalize_rna_missing_transcript_unchanged() {
2616 let provider = MockProvider::with_test_data();
2617 let normalizer = Normalizer::new(provider);
2618
2619 let variant = parse_hgvs("NM_MISSING.1:r.100del").unwrap();
2621 let normalized = normalizer.normalize(&variant).unwrap();
2622 assert_eq!(format!("{}", variant), format!("{}", normalized));
2623 }
2624
2625 #[test]
2626 fn test_normalize_null_allele() {
2627 let provider = MockProvider::with_test_data();
2628 let normalizer = Normalizer::new(provider);
2629
2630 let variant = HgvsVariant::NullAllele;
2632 let normalized = normalizer.normalize(&variant).unwrap();
2633 assert!(matches!(normalized, HgvsVariant::NullAllele));
2634 }
2635
2636 #[test]
2637 fn test_normalize_unknown_allele() {
2638 let provider = MockProvider::with_test_data();
2639 let normalizer = Normalizer::new(provider);
2640
2641 let variant = HgvsVariant::UnknownAllele;
2643 let normalized = normalizer.normalize(&variant).unwrap();
2644 assert!(matches!(normalized, HgvsVariant::UnknownAllele));
2645 }
2646
2647 #[test]
2648 fn test_normalize_allele_variant() {
2649 let provider = MockProvider::with_test_data();
2650 let normalizer = Normalizer::new(provider);
2651
2652 let variant = parse_hgvs("[NM_000088.3:c.10A>G;NM_000088.3:c.20C>T]").unwrap();
2655 let result = normalizer.normalize(&variant).unwrap();
2656
2657 assert!(matches!(result, HgvsVariant::Allele(_)));
2659
2660 assert_eq!(
2662 format!("{}", result),
2663 "NM_000088.3:c.[10A>G;20C>T]",
2664 "Allele display should use canonical compact form"
2665 );
2666 }
2667
2668 #[test]
2669 fn test_normalize_5prime_direction() {
2670 let provider = MockProvider::with_test_data();
2676 let config = NormalizeConfig::default().with_direction(ShuffleDirection::FivePrime);
2677 let normalizer = Normalizer::with_config(provider, config);
2678
2679 let variant = parse_hgvs("NM_001234.1:c.20del").unwrap();
2681 let result = normalizer.normalize(&variant).unwrap();
2682 let output = format!("{}", result);
2683
2684 assert!(
2685 output.contains("del"),
2686 "Should remain a deletion, got: {}",
2687 output
2688 );
2689 assert!(
2692 output.contains("c.13del") || output.contains("c.12del"),
2693 "5' direction should shift deletion toward exon start, got: {}",
2694 output
2695 );
2696 }
2697
2698 #[test]
2699 fn test_normalize_3prime_direction() {
2700 let provider = MockProvider::with_test_data();
2701 let config = NormalizeConfig::default().with_direction(ShuffleDirection::ThreePrime);
2702 let normalizer = Normalizer::with_config(provider, config);
2703
2704 let variant = parse_hgvs("NM_000088.3:c.10del").unwrap();
2705 let result = normalizer.normalize(&variant);
2706 assert!(result.is_ok());
2707 }
2708
2709 #[test]
2710 fn test_normalize_with_cross_boundaries() {
2711 let provider = MockProvider::with_test_data();
2712 let config = NormalizeConfig::default().allow_crossing_boundaries();
2713 let normalizer = Normalizer::with_config(provider, config);
2714
2715 let variant = parse_hgvs("NM_000088.3:c.10del").unwrap();
2716 let result = normalizer.normalize(&variant);
2717 assert!(result.is_ok());
2718 }
2719
2720 #[test]
2721 fn test_normalize_genomic_variant() {
2722 let provider = MockProvider::with_test_data();
2723 let normalizer = Normalizer::new(provider);
2724
2725 let variant = parse_hgvs("NC_000001.11:g.12345del").unwrap();
2727 let result = normalizer.normalize(&variant);
2728 assert!(result.is_ok());
2729 }
2730
2731 #[test]
2732 fn test_normalize_tx_variant() {
2733 let provider = MockProvider::with_test_data();
2734 let normalizer = Normalizer::new(provider);
2735
2736 let variant = parse_hgvs("NR_000001.1:n.100del").unwrap();
2737 let result = normalizer.normalize(&variant);
2738 assert!(result.is_ok());
2739 }
2740
2741 #[test]
2742 fn test_normalize_mt_variant() {
2743 let provider = MockProvider::with_test_data();
2744 let normalizer = Normalizer::new(provider);
2745
2746 let variant = parse_hgvs("NC_012920.1:m.100A>G").unwrap();
2748 let normalized = normalizer.normalize(&variant).unwrap();
2749 assert!(matches!(normalized, HgvsVariant::Mt(_)));
2750 }
2751
2752 #[test]
2753 fn test_cds_to_tx_pos_utr5() {
2754 let provider = MockProvider::with_test_data();
2755 let normalizer = Normalizer::new(provider);
2756
2757 let cds_pos = CdsPos {
2759 base: -5,
2760 offset: None,
2761 utr3: false,
2762 };
2763 let result = normalizer.cds_to_tx_pos(&cds_pos, 10, Some(50));
2764 assert!(result.is_ok());
2765 }
2766
2767 #[test]
2768 fn test_cds_to_tx_pos_utr3() {
2769 let provider = MockProvider::with_test_data();
2770 let normalizer = Normalizer::new(provider);
2771
2772 let cds_pos = CdsPos {
2774 base: 5,
2775 offset: None,
2776 utr3: true,
2777 };
2778 let result = normalizer.cds_to_tx_pos(&cds_pos, 10, Some(50));
2779 assert!(result.is_ok());
2780 assert_eq!(result.unwrap(), 55);
2781 }
2782
2783 #[test]
2784 fn test_cds_to_tx_pos_coding() {
2785 let provider = MockProvider::with_test_data();
2786 let normalizer = Normalizer::new(provider);
2787
2788 let cds_pos = CdsPos {
2790 base: 10,
2791 offset: None,
2792 utr3: false,
2793 };
2794 let result = normalizer.cds_to_tx_pos(&cds_pos, 5, Some(50));
2795 assert!(result.is_ok());
2796 assert_eq!(result.unwrap(), 14); }
2798
2799 #[test]
2800 fn test_tx_to_cds_pos_utr5() {
2801 let provider = MockProvider::with_test_data();
2802 let normalizer = Normalizer::new(provider);
2803
2804 let result = normalizer.tx_to_cds_pos(3, 10, Some(50));
2806 assert!(result.is_ok());
2807 let cds_pos = result.unwrap();
2808 assert!(cds_pos.base < 0);
2809 assert!(!cds_pos.utr3);
2810 }
2811
2812 #[test]
2813 fn test_tx_to_cds_pos_utr3() {
2814 let provider = MockProvider::with_test_data();
2815 let normalizer = Normalizer::new(provider);
2816
2817 let result = normalizer.tx_to_cds_pos(55, 10, Some(50));
2819 assert!(result.is_ok());
2820 let cds_pos = result.unwrap();
2821 assert!(cds_pos.utr3);
2822 }
2823
2824 #[test]
2825 fn test_tx_to_cds_pos_coding() {
2826 let provider = MockProvider::with_test_data();
2827 let normalizer = Normalizer::new(provider);
2828
2829 let result = normalizer.tx_to_cds_pos(20, 10, Some(50));
2831 assert!(result.is_ok());
2832 let cds_pos = result.unwrap();
2833 assert!(!cds_pos.utr3);
2834 assert_eq!(cds_pos.base, 11); }
2836
2837 #[test]
2838 fn test_config_default() {
2839 let config = NormalizeConfig::default();
2840 assert_eq!(config.shuffle_direction, ShuffleDirection::ThreePrime);
2841 assert!(!config.cross_boundaries);
2842 }
2843
2844 #[test]
2845 #[allow(deprecated)]
2846 fn test_config_builder() {
2847 let config = NormalizeConfig::default()
2848 .with_direction(ShuffleDirection::FivePrime)
2849 .allow_crossing_boundaries()
2850 .skip_validation();
2851
2852 assert_eq!(config.shuffle_direction, ShuffleDirection::FivePrime);
2853 assert!(config.cross_boundaries);
2854 assert!(!config.should_reject_ref_mismatch());
2856 assert!(!config.should_warn_ref_mismatch());
2857 }
2858
2859 #[test]
2860 fn test_duplication_3prime_shift_two_bases() {
2861 let provider = MockProvider::with_test_data();
2871 let normalizer = Normalizer::new(provider);
2872
2873 let variant = parse_hgvs("NM_888888.1:c.8dup").unwrap();
2874 let result = normalizer.normalize(&variant).unwrap();
2875 let output = format!("{}", result);
2876
2877 assert!(
2879 output.contains("dup"),
2880 "Single-base duplication should remain as dup notation, got: {}",
2881 output
2882 );
2883 }
2884
2885 #[test]
2886 fn test_duplication_3prime_shift_three_bases() {
2887 let provider = MockProvider::with_test_data();
2895 let normalizer = Normalizer::new(provider);
2896
2897 let variant = parse_hgvs("NM_888888.1:c.21dup").unwrap();
2898 let result = normalizer.normalize(&variant).unwrap();
2899 let output = format!("{}", result);
2900
2901 assert!(
2903 output.contains("dup"),
2904 "Single-base duplication should remain as dup notation, got: {}",
2905 output
2906 );
2907 }
2908
2909 #[test]
2910 fn test_duplication_no_shift_when_unique() {
2911 let provider = MockProvider::with_test_data();
2918 let normalizer = Normalizer::new(provider);
2919
2920 let variant = parse_hgvs("NM_888888.1:c.7dup").unwrap();
2921 let result = normalizer.normalize(&variant).unwrap();
2922 let output = format!("{}", result);
2923
2924 assert!(
2925 output.contains("dup"),
2926 "Should remain a duplication, got: {}",
2927 output
2928 );
2929 assert!(
2930 output.contains("c.7dup"),
2931 "Duplication of unique G at c.7 should not shift, got: {}",
2932 output
2933 );
2934 }
2935
2936 fn make_boundary_test_provider() -> MockProvider {
2973 use crate::reference::transcript::{Exon, ManeStatus, Strand, Transcript};
2974 use std::sync::OnceLock;
2975
2976 let mut provider = MockProvider::new();
2977
2978 let tx_seq = "ATGCCCAAAGGGTTTAGGCCAAAGGGTTTAGGCCCAAAAAGGGTTTAGGCCCAAATGA";
2984
2985 let mut genomic_seq = String::new();
2993
2994 for _ in 0..1000 {
2996 genomic_seq.push('N');
2997 }
2998
2999 genomic_seq.push_str("ATGCCCAAAGGGTTTAGGCC");
3001
3002 genomic_seq.push_str("GTAAGCTAAA");
3005
3006 genomic_seq.push_str("AAAGGGTTTAGGCCCAAAAA");
3008
3009 genomic_seq.push_str("AAAGTAAGGG");
3011
3012 genomic_seq.push_str("GGGTTTAGGCCCAAATGA");
3014
3015 for _ in 0..100 {
3017 genomic_seq.push('N');
3018 }
3019
3020 provider.add_genomic_sequence("chr1", genomic_seq);
3022
3023 provider.add_transcript(Transcript {
3025 id: "NM_BOUNDARY.1".to_string(),
3026 gene_symbol: Some("BOUNDARY".to_string()),
3027 strand: Strand::Plus,
3028 sequence: tx_seq.to_string(),
3029 cds_start: Some(1),
3030 cds_end: Some(58),
3031 exons: vec![
3032 Exon::with_genomic(1, 1, 20, 1000, 1019),
3033 Exon::with_genomic(2, 21, 40, 1030, 1049),
3034 Exon::with_genomic(3, 41, 58, 1060, 1077),
3035 ],
3036 chromosome: Some("chr1".to_string()),
3037 genomic_start: Some(1000),
3038 genomic_end: Some(1077),
3039 genome_build: Default::default(),
3040 mane_status: ManeStatus::None,
3041 refseq_match: None,
3042 ensembl_match: None,
3043 exon_cigars: Vec::new(),
3044 cached_introns: OnceLock::new(),
3045 });
3046
3047 provider
3048 }
3049
3050 #[test]
3051 fn test_boundary_spanning_exonic_to_intronic_del() {
3052 let provider = make_boundary_test_provider();
3059 let normalizer = Normalizer::new(provider);
3060
3061 let variant = parse_hgvs("NM_BOUNDARY.1:c.20_20+3del").unwrap();
3062 let result = normalizer.normalize(&variant);
3063
3064 assert!(
3065 result.is_ok(),
3066 "Boundary-spanning deletion should normalize, got error: {:?}",
3067 result.err()
3068 );
3069
3070 let output = format!("{}", result.unwrap());
3071 assert!(
3072 output.contains("del"),
3073 "Should remain a deletion, got: {}",
3074 output
3075 );
3076 }
3077
3078 #[test]
3079 fn test_boundary_spanning_intronic_to_exonic_del() {
3080 let provider = make_boundary_test_provider();
3088 let normalizer = Normalizer::new(provider);
3089
3090 let variant = parse_hgvs("NM_BOUNDARY.1:c.21-3_23del").unwrap();
3091 let result = normalizer.normalize(&variant);
3092
3093 assert!(
3094 result.is_ok(),
3095 "Boundary-spanning deletion should normalize, got error: {:?}",
3096 result.err()
3097 );
3098
3099 let output = format!("{}", result.unwrap());
3100 assert!(
3101 output.contains("del"),
3102 "Should remain a deletion, got: {}",
3103 output
3104 );
3105 }
3106
3107 #[test]
3108 fn test_boundary_spanning_same_base_position() {
3109 let provider = make_boundary_test_provider();
3112 let normalizer = Normalizer::new(provider);
3113
3114 let variant = parse_hgvs("NM_BOUNDARY.1:c.20_20+5del").unwrap();
3115 let result = normalizer.normalize(&variant);
3116
3117 assert!(
3118 result.is_ok(),
3119 "Same-base boundary-spanning deletion should normalize, got error: {:?}",
3120 result.err()
3121 );
3122 }
3123
3124 #[test]
3125 fn test_boundary_splice_site_plus1() {
3126 let provider = make_boundary_test_provider();
3129 let normalizer = Normalizer::new(provider);
3130
3131 let variant = parse_hgvs("NM_BOUNDARY.1:c.20_20+1del").unwrap();
3132 let result = normalizer.normalize(&variant);
3133
3134 assert!(
3135 result.is_ok(),
3136 "Splice site +1 deletion should normalize, got error: {:?}",
3137 result.err()
3138 );
3139 }
3140
3141 #[test]
3142 fn test_boundary_splice_site_minus1() {
3143 let provider = make_boundary_test_provider();
3147 let normalizer = Normalizer::new(provider);
3148
3149 let variant = parse_hgvs("NM_BOUNDARY.1:c.21-1_22del").unwrap();
3150 let result = normalizer.normalize(&variant);
3151
3152 assert!(
3153 result.is_ok(),
3154 "Splice site -1 deletion should normalize, got error: {:?}",
3155 result.err()
3156 );
3157 }
3158
3159 #[test]
3160 fn test_boundary_spanning_dup() {
3161 let provider = make_boundary_test_provider();
3167 let normalizer = Normalizer::new(provider);
3168
3169 let variant = parse_hgvs("NM_BOUNDARY.1:c.40_40+3dup").unwrap();
3170 let result = normalizer.normalize(&variant);
3171
3172 assert!(
3173 result.is_ok(),
3174 "Boundary-spanning duplication should normalize, got error: {:?}",
3175 result.err()
3176 );
3177
3178 let output = format!("{}", result.unwrap());
3179 assert!(
3181 output.contains("dup") || output.contains("["),
3182 "Should remain a duplication or become repeat notation, got: {}",
3183 output
3184 );
3185 }
3186
3187 #[test]
3188 fn test_boundary_spanning_delins() {
3189 let provider = make_boundary_test_provider();
3191 let normalizer = Normalizer::new(provider);
3192
3193 let variant = parse_hgvs("NM_BOUNDARY.1:c.20_20+2delinsTTT").unwrap();
3194 let result = normalizer.normalize(&variant);
3195
3196 assert!(
3197 result.is_ok(),
3198 "Boundary-spanning delins should normalize, got error: {:?}",
3199 result.err()
3200 );
3201
3202 let output = format!("{}", result.unwrap());
3203 assert!(
3204 output.contains("delins") || output.contains(">"),
3205 "Should remain a delins or become substitution, got: {}",
3206 output
3207 );
3208 }
3209
3210 #[test]
3211 fn test_boundary_no_genomic_data_returns_error() {
3212 let provider = MockProvider::with_test_data(); let normalizer = Normalizer::new(provider);
3215
3216 let variant = parse_hgvs("NM_001234.1:c.11_11+3del").unwrap();
3218 let result = normalizer.normalize(&variant);
3219
3220 assert!(
3221 result.is_err(),
3222 "Boundary-spanning without genomic data should return error"
3223 );
3224 }
3225
3226 #[test]
3227 fn test_deletion_3prime_shift_consecutive_bases() {
3228 use crate::reference::transcript::{Exon, ManeStatus, Strand};
3231 use std::sync::OnceLock;
3232
3233 let mut provider = MockProvider::new();
3234
3235 let mut seq = String::new();
3240 for _ in 0..516 {
3241 seq.push('G'); }
3243 seq.push('A'); seq.push('A'); for _ in 519..=600 {
3246 seq.push('G'); }
3248
3249 provider.add_transcript(crate::reference::transcript::Transcript {
3250 id: "NM_777777.1".to_string(),
3251 gene_symbol: Some("SHIFTTEST".to_string()),
3252 strand: Strand::Plus,
3253 sequence: seq.clone(),
3254 cds_start: Some(1),
3255 cds_end: Some(600),
3256 exons: vec![Exon::new(1, 1, 600)], chromosome: None,
3258 genomic_start: None,
3259 genomic_end: None,
3260 genome_build: Default::default(),
3261 mane_status: ManeStatus::None,
3262 refseq_match: None,
3263 ensembl_match: None,
3264 exon_cigars: Vec::new(),
3265 cached_introns: OnceLock::new(),
3266 });
3267
3268 let normalizer = Normalizer::new(provider);
3269
3270 let variant = parse_hgvs("NM_777777.1:c.517delA").unwrap();
3272
3273 println!("Sequence length: {}", seq.len());
3275 println!(
3276 "Position 516 (0-based 515): {}",
3277 seq.chars().nth(515).unwrap()
3278 );
3279 println!(
3280 "Position 517 (0-based 516): {}",
3281 seq.chars().nth(516).unwrap()
3282 );
3283 println!(
3284 "Position 518 (0-based 517): {}",
3285 seq.chars().nth(517).unwrap()
3286 );
3287 println!(
3288 "Position 519 (0-based 518): {}",
3289 seq.chars().nth(518).unwrap()
3290 );
3291
3292 let result = normalizer.normalize(&variant).unwrap();
3293 let output = format!("{}", result);
3294
3295 println!("Input: NM_777777.1:c.517delA");
3296 println!("Output: {}", output);
3297
3298 assert!(
3301 output.contains("c.518del"),
3302 "Deletion at c.517 should shift to c.518 (3' rule), got: {}",
3303 output
3304 );
3305 }
3306
3307 #[test]
3308 fn test_deletion_3prime_shift_with_utr() {
3309 use crate::reference::transcript::{Exon, ManeStatus, Strand};
3312 use std::sync::OnceLock;
3313
3314 let mut provider = MockProvider::new();
3315
3316 let utr_len = 100;
3322 let mut seq = String::new();
3323
3324 for _ in 0..utr_len {
3326 seq.push('T');
3327 }
3328 for _ in 0..516 {
3330 seq.push('G');
3331 }
3332 seq.push('A'); seq.push('A'); for _ in 0..100 {
3335 seq.push('G');
3336 }
3337
3338 let seq_len = seq.len();
3339 println!("Test with UTR:");
3340 println!(" Sequence length: {}", seq_len);
3341 println!(" CDS start (1-based): 101");
3342 println!(" c.517 = tx position 617 (0-based 616)");
3343 println!(" c.518 = tx position 618 (0-based 617)");
3344 println!(
3345 " tx pos 617 (0-based 616): {}",
3346 seq.chars().nth(616).unwrap()
3347 );
3348 println!(
3349 " tx pos 618 (0-based 617): {}",
3350 seq.chars().nth(617).unwrap()
3351 );
3352
3353 provider.add_transcript(crate::reference::transcript::Transcript {
3354 id: "NM_666666.1".to_string(),
3355 gene_symbol: Some("UTRTEST".to_string()),
3356 strand: Strand::Plus,
3357 sequence: seq.clone(),
3358 cds_start: Some(101),
3359 cds_end: Some(seq_len as u64),
3360 exons: vec![Exon::new(1, 1, seq_len as u64)], chromosome: None,
3362 genomic_start: None,
3363 genomic_end: None,
3364 genome_build: Default::default(),
3365 mane_status: ManeStatus::None,
3366 refseq_match: None,
3367 ensembl_match: None,
3368 exon_cigars: Vec::new(),
3369 cached_introns: OnceLock::new(),
3370 });
3371
3372 let normalizer = Normalizer::new(provider);
3373
3374 let variant = parse_hgvs("NM_666666.1:c.517delA").unwrap();
3375 let result = normalizer.normalize(&variant).unwrap();
3376 let output = format!("{}", result);
3377
3378 println!("Input: NM_666666.1:c.517delA");
3379 println!("Output: {}", output);
3380
3381 assert!(
3382 output.contains("c.518del"),
3383 "Deletion at c.517 should shift to c.518 (3' rule) even with UTR, got: {}",
3384 output
3385 );
3386 }
3387
3388 #[test]
3389 fn test_normalize_inverted_range_insertion_no_panic() {
3390 let provider = MockProvider::with_test_data();
3395 let normalizer = Normalizer::new(provider);
3396
3397 let variant = parse_hgvs("NC_000011.10:g.5238138_5153222insTATTT").unwrap();
3398 let result = normalizer.normalize(&variant);
3399 let _ = result;
3402 }
3403
3404 #[test]
3405 fn test_delins_should_not_shift() {
3406 use crate::reference::transcript::{Exon, ManeStatus, Strand};
3409 use std::sync::OnceLock;
3410
3411 let mut provider = MockProvider::new();
3412
3413 let seq = "GGGGGGGGGGAATTCCGGGGGGGGGG".to_string(); provider.add_transcript(crate::reference::transcript::Transcript {
3419 id: "NM_555555.1".to_string(),
3420 gene_symbol: Some("DELINSTEST".to_string()),
3421 strand: Strand::Plus,
3422 sequence: seq,
3423 cds_start: Some(1),
3424 cds_end: Some(26),
3425 exons: vec![Exon::new(1, 1, 26)],
3426 chromosome: None,
3427 genomic_start: None,
3428 genomic_end: None,
3429 genome_build: Default::default(),
3430 mane_status: ManeStatus::None,
3431 refseq_match: None,
3432 ensembl_match: None,
3433 exon_cigars: Vec::new(),
3434 cached_introns: OnceLock::new(),
3435 });
3436
3437 let normalizer = Normalizer::new(provider);
3438
3439 let variant = parse_hgvs("NM_555555.1:c.10_11delinsTT").unwrap();
3441 let result = normalizer.normalize(&variant).unwrap();
3442 let output = format!("{}", result);
3443
3444 assert!(
3445 output.contains("c.10_11delins"),
3446 "Delins should NOT be shifted (HGVS spec), got: {}",
3447 output
3448 );
3449 }
3450
3451 #[test]
3452 fn test_cds_to_tx_pos_utr5_underflow() {
3453 let provider = MockProvider::with_test_data();
3455 let normalizer = Normalizer::new(provider);
3456 let pos = CdsPos {
3457 base: -6,
3458 offset: None,
3459 utr3: false,
3460 };
3461 let result = normalizer.cds_to_tx_pos(&pos, 5, Some(38));
3462 assert!(
3463 result.is_err(),
3464 "cds_to_tx_pos should return Err for positions before transcript start, got: {:?}",
3465 result
3466 );
3467 }
3468
3469 #[test]
3470 fn test_cds_to_tx_pos_utr5_valid() {
3471 let provider = MockProvider::with_test_data();
3476 let normalizer = Normalizer::new(provider);
3477 let pos = CdsPos {
3478 base: -3,
3479 offset: None,
3480 utr3: false,
3481 };
3482 let result = normalizer.cds_to_tx_pos(&pos, 5, Some(38));
3483 assert_eq!(result.unwrap(), 2);
3484 }
3485
3486 #[test]
3487 fn test_variant_positions_negative_cds_base() {
3488 let provider = MockProvider::with_test_data();
3490 let normalizer = Normalizer::new(provider);
3491 let variant = parse_hgvs("NM_001234.1:c.-3A>G").unwrap();
3492 let positions = normalizer.extract_position_range(&variant);
3493 assert_eq!(
3494 positions, None,
3495 "variant_positions should return None for 5' UTR CDS positions"
3496 );
3497 }
3498
3499 #[test]
3500 fn test_normalize_cds_utr5_deep_negative() {
3501 let provider = MockProvider::with_test_data();
3503 let normalizer = Normalizer::new(provider);
3504 let variant = parse_hgvs("NM_001234.1:c.-88A>G").unwrap();
3506 let result = normalizer.normalize(&variant);
3507 let _ = result;
3509 }
3510
3511 #[test]
3512 fn test_normalize_unknown_offset_returns_unchanged() {
3513 let provider = MockProvider::with_test_data();
3516 let normalizer = Normalizer::new(provider);
3517
3518 let variant = parse_hgvs("NM_000088.3:c.-85-?_834+?del").unwrap();
3520 let result = normalizer.normalize(&variant);
3521 assert!(
3522 result.is_ok(),
3523 "Unknown offset should not error, got: {:?}",
3524 result.err()
3525 );
3526 }
3527
3528 #[test]
3529 fn test_normalize_unknown_offset_single_position() {
3530 let provider = MockProvider::with_test_data();
3532 let normalizer = Normalizer::new(provider);
3533
3534 let variant = parse_hgvs("NM_000088.3:c.10-?del").unwrap();
3535 let result = normalizer.normalize(&variant);
3536 assert!(
3537 result.is_ok(),
3538 "Single unknown offset should not error, got: {:?}",
3539 result.err()
3540 );
3541 }
3542
3543 #[test]
3544 fn test_normalize_utr_before_tx_start_returns_unchanged() {
3545 let provider = MockProvider::with_test_data();
3548 let normalizer = Normalizer::new(provider);
3549
3550 let variant = parse_hgvs("NM_001234.1:c.-215_-214del").unwrap();
3551 let result = normalizer.normalize(&variant);
3552 assert!(
3553 result.is_ok(),
3554 "UTR before transcript start should not error, got: {:?}",
3555 result.err()
3556 );
3557 }
3558
3559 #[test]
3560 fn test_normalize_no_cds_returns_unchanged() {
3561 let mut provider = MockProvider::new();
3563 use crate::reference::transcript::{Exon, Transcript};
3564 provider.add_transcript(Transcript {
3565 id: "NR_001566.1".to_string(),
3566 gene_symbol: Some("NCRNA".to_string()),
3567 strand: crate::reference::transcript::Strand::Plus,
3568 sequence: "ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC".to_string(),
3569 cds_start: None,
3570 cds_end: None,
3571 exons: vec![Exon::new(1, 1, 51)],
3572 chromosome: None,
3573 genomic_start: None,
3574 genomic_end: None,
3575 genome_build: Default::default(),
3576 mane_status: Default::default(),
3577 refseq_match: None,
3578 ensembl_match: None,
3579 exon_cigars: Vec::new(),
3580 cached_introns: std::sync::OnceLock::new(),
3581 });
3582 let normalizer = Normalizer::new(provider);
3583
3584 let variant = parse_hgvs("NR_001566.1:c.10del").unwrap();
3586 let result = normalizer.normalize(&variant);
3587 assert!(
3588 result.is_ok(),
3589 "No CDS should not error, got: {:?}",
3590 result.err()
3591 );
3592 }
3593
3594 #[test]
3595 fn test_normalize_tx_intronic() {
3596 use crate::reference::transcript::{Exon, GenomeBuild, ManeStatus, Strand, Transcript};
3599
3600 let mut provider = MockProvider::new();
3601
3602 let tx_sequence = "A".repeat(200);
3608
3609 provider.add_transcript(Transcript {
3610 id: "NR_038982.1".to_string(),
3611 gene_symbol: Some("NCRNA_TEST".to_string()),
3612 strand: Strand::Plus,
3613 sequence: tx_sequence,
3614 cds_start: None,
3615 cds_end: None,
3616 exons: vec![
3617 Exon::with_genomic(1, 1, 100, 1000, 1099),
3618 Exon::with_genomic(2, 101, 200, 1200, 1299),
3619 ],
3620 chromosome: Some("chr1".to_string()),
3621 genomic_start: Some(1000),
3622 genomic_end: Some(1299),
3623 genome_build: GenomeBuild::GRCh38,
3624 mane_status: ManeStatus::None,
3625 refseq_match: None,
3626 ensembl_match: None,
3627 exon_cigars: Vec::new(),
3628 cached_introns: std::sync::OnceLock::new(),
3629 });
3630
3631 let mut genomic = String::new();
3634 for _ in 0..325 {
3635 genomic.push_str("AGCT");
3636 }
3637 provider.add_genomic_sequence("chr1", genomic);
3638
3639 let normalizer = Normalizer::new(provider);
3640
3641 let variant = parse_hgvs("NR_038982.1:n.100+4del").unwrap();
3643 let result = normalizer.normalize(&variant);
3644 assert!(
3645 result.is_ok(),
3646 "n. intronic normalization should succeed, got: {:?}",
3647 result.err()
3648 );
3649 let output = format!("{}", result.unwrap());
3650 assert!(
3651 output.contains('+') || output.contains('-'),
3652 "Normalized intronic n. variant should retain intronic notation, got: {}",
3653 output
3654 );
3655 }
3656}