Skip to main content

ferro_hgvs/normalize/
mod.rs

1//! Normalization engine
2//!
3//! Implements the core HGVS variant normalization algorithm including
4//! 3'/5' shifting and boundary detection.
5//!
6//! # Coordinate Systems
7//!
8//! This module uses multiple coordinate systems:
9//!
10//! | Context | Basis | Type/Notes |
11//! |---------|-------|------------|
12//! | HGVS variant positions | 1-based | `u64` for genomic, `i64` for CDS/Tx |
13//! | Array indexing | 0-based | `usize` for sequence slicing |
14//! | Boundaries struct | 1-based | `(start, end)` inclusive |
15//! | Shuffle input/output | 0-based | Uses array indices |
16//! | Relative positions | 1-based | Positions within fetched window |
17//!
18//! Key conversions:
19//! - `hgvs_pos_to_index(pos)` converts 1-based HGVS position to 0-based index
20//! - `index_to_hgvs_pos(idx)` converts 0-based index to 1-based HGVS position
21//! - `pos.saturating_sub(1)` manually converts 1-based to 0-based
22//! - `idx + 1` manually converts 0-based to 1-based
23//!
24//! For type-safe coordinate handling, see [`crate::coords`].
25
26pub mod boundary;
27pub mod config;
28pub mod rules;
29pub mod shuffle;
30pub mod validate;
31
32use crate::coords::{hgvs_pos_to_index, index_to_hgvs_pos};
33use crate::error::FerroError;
34use crate::hgvs::edit::NaEdit;
35use crate::hgvs::interval::Interval;
36use crate::hgvs::location::{CdsPos, GenomePos, TxPos};
37use crate::hgvs::variant::{CdsVariant, GenomeVariant, HgvsVariant, LocEdit, TxVariant};
38use crate::hgvs::HgvsVariant as HV;
39use crate::reference::ReferenceProvider;
40use boundary::Boundaries;
41pub use config::{NormalizeConfig, ShuffleDirection};
42use rules::{canonicalize_edit, needs_normalization, should_canonicalize};
43use shuffle::shuffle;
44
45/// Warning generated during normalization
46#[derive(Debug, Clone)]
47pub struct NormalizationWarning {
48    /// Warning code (e.g., "REFSEQ_MISMATCH")
49    pub code: String,
50    /// Human-readable description
51    pub message: String,
52    /// What the input claimed as reference
53    pub stated_ref: String,
54    /// What the actual reference sequence has
55    pub actual_ref: String,
56    /// Position info
57    pub position: String,
58    /// Whether the mismatch was auto-corrected
59    pub corrected: bool,
60}
61
62/// Result of normalization with optional warnings
63#[derive(Debug, Clone)]
64pub struct NormalizeResultWithWarnings {
65    /// The normalized variant
66    pub result: HgvsVariant,
67    /// Warnings generated during normalization
68    pub warnings: Vec<NormalizationWarning>,
69}
70
71impl NormalizeResultWithWarnings {
72    /// Create a new result without warnings
73    pub fn new(result: HgvsVariant) -> Self {
74        Self {
75            result,
76            warnings: vec![],
77        }
78    }
79
80    /// Create a result with warnings
81    pub fn with_warnings(result: HgvsVariant, warnings: Vec<NormalizationWarning>) -> Self {
82        Self { result, warnings }
83    }
84
85    /// Add a warning to the result
86    pub fn add_warning(&mut self, warning: NormalizationWarning) {
87        self.warnings.push(warning);
88    }
89
90    /// Check if there are any warnings
91    pub fn has_warnings(&self) -> bool {
92        !self.warnings.is_empty()
93    }
94
95    /// Check if there's a reference mismatch warning
96    pub fn has_ref_mismatch(&self) -> bool {
97        self.warnings.iter().any(|w| w.code == "REFSEQ_MISMATCH")
98    }
99}
100
101/// Main normalizer struct
102pub struct Normalizer<P: ReferenceProvider> {
103    provider: P,
104    config: NormalizeConfig,
105}
106
107impl<P: ReferenceProvider> Normalizer<P> {
108    /// Create a new normalizer with the given reference provider
109    pub fn new(provider: P) -> Self {
110        Self {
111            provider,
112            config: NormalizeConfig::default(),
113        }
114    }
115
116    /// Create a normalizer with custom configuration
117    pub fn with_config(provider: P, config: NormalizeConfig) -> Self {
118        Self { provider, config }
119    }
120
121    /// Get the configuration
122    pub fn config(&self) -> &NormalizeConfig {
123        &self.config
124    }
125
126    /// Normalize a variant
127    ///
128    /// In strict mode (default), rejects variants with reference mismatches.
129    /// Use `normalize_with_warnings` for lenient mode that corrects mismatches.
130    pub fn normalize(&self, variant: &HgvsVariant) -> Result<HgvsVariant, FerroError> {
131        let result = self.normalize_with_warnings(variant)?;
132
133        // In strict mode, reject if there were reference mismatches
134        if self.config.should_reject_ref_mismatch() && result.has_ref_mismatch() {
135            if let Some(warning) = result.warnings.iter().find(|w| w.code == "REFSEQ_MISMATCH") {
136                return Err(FerroError::ReferenceMismatch {
137                    location: warning.position.clone(),
138                    expected: warning.stated_ref.clone(),
139                    found: warning.actual_ref.clone(),
140                });
141            }
142        }
143
144        Ok(result.result)
145    }
146
147    /// Normalize a variant with detailed warnings
148    ///
149    /// Returns the normalized variant along with any warnings generated during
150    /// normalization (e.g., reference sequence mismatches that were auto-corrected).
151    /// Use this method when you want to track what corrections were made.
152    pub fn normalize_with_warnings(
153        &self,
154        variant: &HgvsVariant,
155    ) -> Result<NormalizeResultWithWarnings, FerroError> {
156        let (result, warnings) = match variant {
157            HV::Genome(v) => self.normalize_genome(v)?,
158            HV::Cds(v) => self.normalize_cds(v)?,
159            HV::Tx(v) => self.normalize_tx(v)?,
160            HV::Protein(v) => self.normalize_protein(v)?,
161            HV::Rna(v) => self.normalize_rna(v)?,
162            HV::Mt(v) => self.normalize_mt(v)?,
163            HV::Allele(a) => self.normalize_allele(a)?,
164            // Circular variants normalize like genomic variants
165            HV::Circular(v) => (
166                HV::Circular(crate::hgvs::variant::CircularVariant {
167                    accession: v.accession.clone(),
168                    gene_symbol: v.gene_symbol.clone(),
169                    loc_edit: v.loc_edit.clone(),
170                }),
171                vec![],
172            ),
173            // RNA fusions pass through unchanged (no normalization needed for fusions)
174            HV::RnaFusion(v) => (HV::RnaFusion(v.clone()), vec![]),
175            // Null and unknown allele markers pass through unchanged
176            HV::NullAllele => (HV::NullAllele, vec![]),
177            HV::UnknownAllele => (HV::UnknownAllele, vec![]),
178        };
179
180        Ok(NormalizeResultWithWarnings::with_warnings(result, warnings))
181    }
182
183    /// Normalize an allele (compound) variant
184    ///
185    /// Normalizes each variant in the allele individually, with overlap prevention.
186    /// After normalization, checks if variants would overlap and constrains shifting
187    /// to prevent collisions.
188    fn normalize_allele(
189        &self,
190        allele: &crate::hgvs::variant::AlleleVariant,
191    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
192        // First pass: normalize each variant independently, collecting all warnings
193        let mut all_warnings = Vec::new();
194        let mut normalized = Vec::new();
195
196        for v in &allele.variants {
197            let result = self.normalize_with_warnings(v)?;
198            all_warnings.extend(result.warnings);
199            normalized.push(result.result);
200        }
201
202        // Second pass: check for overlaps and resolve
203        if self.config.prevent_overlap {
204            normalized = self.resolve_overlaps(normalized)?;
205        }
206
207        Ok((
208            HgvsVariant::Allele(crate::hgvs::variant::AlleleVariant::new(
209                normalized,
210                allele.phase,
211            )),
212            all_warnings,
213        ))
214    }
215
216    /// Resolve overlaps between normalized variants in a compound allele.
217    ///
218    /// When normalization shifts variants, they may end up overlapping. This function
219    /// detects overlaps and constrains the normalization to prevent collisions.
220    fn resolve_overlaps(&self, variants: Vec<HgvsVariant>) -> Result<Vec<HgvsVariant>, FerroError> {
221        if variants.len() < 2 {
222            return Ok(variants);
223        }
224
225        // Extract positions for comparison
226        let mut positions: Vec<(usize, Option<(u64, u64)>)> = variants
227            .iter()
228            .enumerate()
229            .map(|(i, v)| (i, self.extract_position_range(v)))
230            .collect();
231
232        // Sort by start position
233        positions.sort_by(|a, b| match (&a.1, &b.1) {
234            (Some((s1, _)), Some((s2, _))) => s1.cmp(s2),
235            (Some(_), None) => std::cmp::Ordering::Less,
236            (None, Some(_)) => std::cmp::Ordering::Greater,
237            (None, None) => std::cmp::Ordering::Equal,
238        });
239
240        // Check for overlaps and mark variants that need adjustment
241        let mut needs_adjustment = vec![false; variants.len()];
242
243        for i in 0..positions.len() - 1 {
244            let (_idx_a, pos_a) = &positions[i];
245            let (idx_b, pos_b) = &positions[i + 1];
246
247            if let (Some((_, end_a)), Some((start_b, _))) = (pos_a, pos_b) {
248                // Check for overlap: end of A >= start of B
249                if end_a >= start_b {
250                    // Mark the second variant for potential adjustment
251                    needs_adjustment[*idx_b] = true;
252                }
253            }
254        }
255
256        // For now, we report overlaps but don't modify the variants
257        // A more sophisticated implementation would re-normalize with constraints
258        // or report a warning
259        for (i, needs_adj) in needs_adjustment.iter().enumerate() {
260            if *needs_adj {
261                // In a complete implementation, we would constrain normalization
262                // For now, we preserve the variant as-is with a potential warning
263                // Future: add to a warnings collection
264                let _ = i; // Suppress unused variable warning
265            }
266        }
267
268        Ok(variants)
269    }
270
271    /// Extract the genomic/CDS position range from a variant.
272    fn extract_position_range(&self, variant: &HgvsVariant) -> Option<(u64, u64)> {
273        match variant {
274            HV::Genome(v) => {
275                let start = v.loc_edit.location.start.inner()?.base;
276                let end = v.loc_edit.location.end.inner()?.base;
277                Some((start, end))
278            }
279            HV::Cds(v) => {
280                let start_pos = v.loc_edit.location.start.inner()?;
281                let end_pos = v.loc_edit.location.end.inner()?;
282                // Only return positions for exonic variants
283                if start_pos.is_intronic() || end_pos.is_intronic() {
284                    None
285                } else {
286                    Some((start_pos.base as u64, end_pos.base as u64))
287                }
288            }
289            HV::Tx(v) => {
290                let start = v.loc_edit.location.start.inner()?.base;
291                let end = v.loc_edit.location.end.inner()?.base;
292                // Only return positions for positive bases (within transcript)
293                if start < 1 || end < 1 {
294                    None
295                } else {
296                    Some((start as u64, end as u64))
297                }
298            }
299            _ => None,
300        }
301    }
302
303    /// Normalize a genomic variant
304    fn normalize_genome(
305        &self,
306        variant: &GenomeVariant,
307    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
308        // Can't normalize variants with unknown edits or positions
309        let edit = match variant.loc_edit.edit.inner() {
310            Some(e) => e,
311            None => return Ok((HV::Genome(variant.clone()), vec![])),
312        };
313
314        // Only normalize indels
315        if !needs_normalization(edit) {
316            return Ok((HV::Genome(variant.clone()), vec![]));
317        }
318
319        // Get sequence from provider - can't normalize with unknown positions
320        let accession = variant.accession.full();
321        let start = match variant.loc_edit.location.start.inner() {
322            Some(pos) => pos.base,
323            None => {
324                return Ok((
325                    HV::Genome(self.canonicalize_genome_variant(variant)),
326                    vec![],
327                ))
328            }
329        };
330        let end = match variant.loc_edit.location.end.inner() {
331            Some(pos) => pos.base,
332            None => {
333                return Ok((
334                    HV::Genome(self.canonicalize_genome_variant(variant)),
335                    vec![],
336                ))
337            }
338        };
339
340        // Try to get transcript/sequence, fall back to minimal notation if not found
341        let window_start = start.saturating_sub(self.config.window_size);
342        let seq_result = self.provider.get_sequence(
343            &accession,
344            window_start,
345            end.saturating_add(self.config.window_size),
346        );
347
348        let ref_seq = match seq_result {
349            Ok(s) => s,
350            // Can't do full normalization without sequence, but apply minimal notation
351            Err(_) => {
352                return Ok((
353                    HV::Genome(self.canonicalize_genome_variant(variant)),
354                    vec![],
355                ))
356            }
357        };
358
359        // Adjust coordinates to be relative to the window
360        let rel_start = start - window_start;
361        let rel_end = end - window_start;
362
363        // Perform normalization
364        let (new_rel_start, new_rel_end, new_edit, warnings) = self.normalize_na_edit(
365            ref_seq.as_bytes(),
366            edit,
367            rel_start,
368            rel_end,
369            &Boundaries::new(1, ref_seq.len() as u64),
370        )?;
371
372        // Adjust back to genomic coordinates
373        let new_start = new_rel_start + window_start;
374        let new_end = new_rel_end + window_start;
375
376        // Reconstruct variant with new position (using adjusted coordinates)
377        let new_variant = GenomeVariant {
378            accession: variant.accession.clone(),
379            gene_symbol: variant.gene_symbol.clone(),
380            loc_edit: LocEdit::new(
381                Interval::new(GenomePos::new(new_start), GenomePos::new(new_end)),
382                new_edit,
383            ),
384        };
385
386        Ok((HV::Genome(new_variant), warnings))
387    }
388
389    /// Normalize a CDS variant
390    fn normalize_cds(
391        &self,
392        variant: &CdsVariant,
393    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
394        // Can't normalize variants with unknown edits or positions
395        let edit = match variant.loc_edit.edit.inner() {
396            Some(e) => e,
397            None => return Ok((HV::Cds(variant.clone()), vec![])),
398        };
399
400        // Only normalize indels
401        if !needs_normalization(edit) {
402            return Ok((HV::Cds(variant.clone()), vec![]));
403        }
404
405        // Check for intronic variants or unknown positions - can't normalize those
406        let start_pos = match variant.loc_edit.location.start.inner() {
407            Some(pos) => pos,
408            None => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
409        };
410        let end_pos = match variant.loc_edit.location.end.inner() {
411            Some(pos) => pos,
412            None => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
413        };
414
415        // Try to get transcript first - we need it for intronic normalization too
416        let accession = variant.accession.full();
417        let transcript = match self.provider.get_transcript(&accession) {
418            Ok(t) => t,
419            // Can't do full normalization without transcript, but apply minimal notation
420            Err(_) => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
421        };
422
423        // Handle intronic variants specially
424        if start_pos.is_intronic() || end_pos.is_intronic() {
425            // Check if both positions are intronic and in the same intron
426            if start_pos.is_intronic() && end_pos.is_intronic() {
427                return self.normalize_intronic_cds(variant, &transcript, start_pos, end_pos, edit);
428            }
429            // Variant spans exon-intron boundary - normalize in genomic space
430            return self.normalize_boundary_spanning_cds(
431                variant,
432                &transcript,
433                start_pos,
434                end_pos,
435                edit,
436            );
437        }
438
439        // Convert CDS to transcript coordinates for normalization
440        let cds_start = transcript
441            .cds_start
442            .ok_or_else(|| FerroError::ConversionError {
443                msg: "Transcript has no CDS".to_string(),
444            })?;
445
446        // Calculate transcript positions
447        let tx_start = self.cds_to_tx_pos(start_pos, cds_start, transcript.cds_end)?;
448        let tx_end = self.cds_to_tx_pos(end_pos, cds_start, transcript.cds_end)?;
449
450        // Get boundaries (stay within exon unless configured otherwise)
451        let boundaries = if self.config.cross_boundaries {
452            Boundaries::new(1, transcript.sequence.len() as u64)
453        } else {
454            boundary::get_cds_boundaries(&transcript, tx_start, &self.config)?
455        };
456
457        // Perform normalization on transcript sequence
458        let seq = transcript.sequence.as_bytes();
459        let (new_tx_start, new_tx_end, new_edit, warnings) =
460            self.normalize_na_edit(seq, edit, tx_start, tx_end, &boundaries)?;
461
462        // Convert back to CDS coordinates
463        let new_start = self.tx_to_cds_pos(new_tx_start, cds_start, transcript.cds_end)?;
464        let new_end = self.tx_to_cds_pos(new_tx_end, cds_start, transcript.cds_end)?;
465
466        let new_variant = CdsVariant {
467            accession: variant.accession.clone(),
468            gene_symbol: variant.gene_symbol.clone(),
469            loc_edit: LocEdit::new(Interval::new(new_start, new_end), new_edit),
470        };
471
472        Ok((HV::Cds(new_variant), warnings))
473    }
474
475    /// Normalize a transcript variant
476    fn normalize_tx(
477        &self,
478        variant: &TxVariant,
479    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
480        // Can't normalize variants with unknown edits or positions
481        let edit = match variant.loc_edit.edit.inner() {
482            Some(e) => e,
483            None => return Ok((HV::Tx(variant.clone()), vec![])),
484        };
485
486        // Only normalize indels
487        if !needs_normalization(edit) {
488            return Ok((HV::Tx(variant.clone()), vec![]));
489        }
490
491        // Check for intronic variants or unknown positions
492        let start_pos = match variant.loc_edit.location.start.inner() {
493            Some(pos) => pos,
494            None => return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![])),
495        };
496        let end_pos = match variant.loc_edit.location.end.inner() {
497            Some(pos) => pos,
498            None => return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![])),
499        };
500
501        if start_pos.is_intronic() || end_pos.is_intronic() {
502            return Err(FerroError::IntronicVariant {
503                variant: format!("{}", variant),
504            });
505        }
506
507        // Try to get transcript
508        let accession = variant.accession.full();
509        let transcript = match self.provider.get_transcript(&accession) {
510            Ok(t) => t,
511            // Can't do full normalization without transcript, but apply minimal notation
512            Err(_) => return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![])),
513        };
514
515        // Only normalize positive positions (within transcript)
516        // Negative positions are outside the transcript sequence
517        if start_pos.base < 1 || end_pos.base < 1 {
518            return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![]));
519        }
520
521        let tx_start = start_pos.base as u64;
522        let tx_end = end_pos.base as u64;
523
524        // Get boundaries
525        let boundaries = Boundaries::new(1, transcript.sequence.len() as u64);
526
527        // Perform normalization
528        let seq = transcript.sequence.as_bytes();
529        let (new_start, new_end, new_edit, warnings) =
530            self.normalize_na_edit(seq, edit, tx_start, tx_end, &boundaries)?;
531
532        let new_variant = TxVariant {
533            accession: variant.accession.clone(),
534            gene_symbol: variant.gene_symbol.clone(),
535            loc_edit: LocEdit::new(
536                Interval::new(TxPos::new(new_start as i64), TxPos::new(new_end as i64)),
537                new_edit,
538            ),
539        };
540
541        Ok((HV::Tx(new_variant), warnings))
542    }
543
544    /// Normalize a protein variant
545    ///
546    /// Protein normalization differs from nucleic acid normalization - there's no
547    /// 3'/5' shifting. Instead, we perform formatting standardization:
548    ///
549    /// 1. **Reference validation**: Check that position amino acids match the
550    ///    reference protein sequence (if protein data is available).
551    ///
552    /// 2. **Redundant sequence removal**: Remove explicit sequences in deletions
553    ///    when they match the amino acids at the deletion position.
554    ///    Example: `p.Val600delVal` → `p.Val600del`
555    ///
556    /// 3. **1-letter to 3-letter conversion**: (handled by parser/display)
557    fn normalize_protein(
558        &self,
559        variant: &crate::hgvs::variant::ProteinVariant,
560    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
561        use crate::hgvs::edit::ProteinEdit;
562        use crate::hgvs::variant::{LocEdit, ProteinVariant};
563
564        // Validate reference amino acids if provider has protein data
565        if self.provider.has_protein_data() {
566            self.validate_protein_reference(variant)?;
567        }
568
569        // Get the current edit
570        let edit = match variant.loc_edit.edit.inner() {
571            Some(e) => e,
572            None => return Ok((HV::Protein(variant.clone()), vec![])),
573        };
574
575        // Apply normalization based on edit type
576        let normalized_edit = match edit {
577            ProteinEdit::Deletion { sequence, count } => {
578                // Check for redundant sequence that matches the position
579                if let Some(seq) = sequence {
580                    if self.is_redundant_protein_deletion_sequence(&variant.loc_edit.location, seq)
581                    {
582                        // Remove redundant sequence
583                        ProteinEdit::Deletion {
584                            sequence: None,
585                            count: *count,
586                        }
587                    } else {
588                        edit.clone()
589                    }
590                } else {
591                    edit.clone()
592                }
593            }
594            // Other edits pass through unchanged
595            _ => edit.clone(),
596        };
597
598        // Only create a new variant if the edit changed
599        if &normalized_edit != edit {
600            let new_variant = ProteinVariant {
601                accession: variant.accession.clone(),
602                gene_symbol: variant.gene_symbol.clone(),
603                loc_edit: LocEdit::with_uncertainty(
604                    variant.loc_edit.location.clone(),
605                    variant.loc_edit.edit.map_ref(|_| normalized_edit),
606                ),
607            };
608            Ok((HV::Protein(new_variant), vec![]))
609        } else {
610            Ok((HV::Protein(variant.clone()), vec![]))
611        }
612    }
613
614    /// Validate that the amino acids in a protein variant match the reference
615    ///
616    /// Returns an error if there's a mismatch between the variant's stated
617    /// amino acid(s) and the actual reference protein sequence.
618    fn validate_protein_reference(
619        &self,
620        variant: &crate::hgvs::variant::ProteinVariant,
621    ) -> Result<(), FerroError> {
622        use crate::hgvs::edit::ProteinEdit;
623
624        let accession = variant.accession.full();
625
626        // Get start and end positions
627        let start_pos = match variant.loc_edit.location.start.inner() {
628            Some(pos) => pos,
629            None => return Ok(()), // Can't validate uncertain positions
630        };
631        let end_pos = match variant.loc_edit.location.end.inner() {
632            Some(pos) => pos,
633            None => return Ok(()),
634        };
635
636        // Get the edit to know what amino acids to validate
637        let edit = match variant.loc_edit.edit.inner() {
638            Some(e) => e,
639            None => return Ok(()),
640        };
641
642        // Only validate edits that specify reference amino acids
643        match edit {
644            ProteinEdit::Substitution { .. } => {
645                // For substitutions, the reference AA comes from the position (start_pos.aa),
646                // not from the edit (which may be Xaa placeholder)
647                self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
648            }
649            ProteinEdit::Deletion { .. } => {
650                // Validate start position (from the interval's start AA)
651                self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
652                // Validate end position if different
653                if end_pos.number != start_pos.number {
654                    self.validate_protein_position(&accession, end_pos.number, &end_pos.aa)?;
655                }
656            }
657            ProteinEdit::Duplication => {
658                // Validate start and end positions
659                self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
660                if end_pos.number != start_pos.number {
661                    self.validate_protein_position(&accession, end_pos.number, &end_pos.aa)?;
662                }
663            }
664            ProteinEdit::Insertion { .. } => {
665                // Validate flanking positions
666                self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
667                self.validate_protein_position(&accession, end_pos.number, &end_pos.aa)?;
668            }
669            ProteinEdit::Delins { .. } => {
670                // Validate start and end
671                self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
672                if end_pos.number != start_pos.number {
673                    self.validate_protein_position(&accession, end_pos.number, &end_pos.aa)?;
674                }
675            }
676            ProteinEdit::Frameshift { .. } => {
677                // Validate the frameshift position
678                self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
679            }
680            ProteinEdit::Extension { .. } => {
681                // Extension typically at Ter position - validate
682                self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
683            }
684            _ => {
685                // Identity, Unknown, etc. - no validation needed
686            }
687        }
688
689        Ok(())
690    }
691
692    /// Validate a single protein position against reference
693    fn validate_protein_position(
694        &self,
695        accession: &str,
696        position: u64,
697        expected_aa: &crate::hgvs::location::AminoAcid,
698    ) -> Result<(), FerroError> {
699        // Position is 1-based in HGVS, convert to 0-based for sequence access
700        // get_protein_sequence uses half-open interval [start, end)
701        let start = hgvs_pos_to_index(position) as u64;
702        let end = position; // exclusive end
703
704        // Try to get the reference amino acid
705        match self.provider.get_protein_sequence(accession, start, end) {
706            Ok(ref_seq) => {
707                if ref_seq.len() != 1 {
708                    return Ok(()); // Unexpected, skip validation
709                }
710
711                let ref_aa_char = ref_seq.chars().next().unwrap();
712                let expected_char = expected_aa.to_one_letter();
713
714                if ref_aa_char != expected_char {
715                    return Err(FerroError::AminoAcidMismatch {
716                        accession: accession.to_string(),
717                        position,
718                        expected: expected_aa.to_string(),
719                        found: ref_aa_char.to_string(),
720                    });
721                }
722            }
723            Err(_) => {
724                // Protein sequence not available, skip validation
725            }
726        }
727
728        Ok(())
729    }
730
731    /// Check if the deletion sequence is redundant (matches the position amino acids)
732    ///
733    /// A deletion sequence is redundant if it exactly matches the amino acids
734    /// specified in the interval. For example:
735    /// - `p.Val600delVal` - sequence [Val] matches position 600's Val → redundant
736    /// - `p.Lys23_Leu24delLysLeu` - sequence [Lys, Leu] matches positions 23-24 → redundant
737    fn is_redundant_protein_deletion_sequence(
738        &self,
739        interval: &crate::hgvs::interval::ProtInterval,
740        sequence: &crate::hgvs::edit::AminoAcidSeq,
741    ) -> bool {
742        // Get the start and end positions
743        let start_pos = match interval.start.inner() {
744            Some(pos) => pos,
745            None => return false,
746        };
747        let end_pos = match interval.end.inner() {
748            Some(pos) => pos,
749            None => return false,
750        };
751
752        // Calculate expected sequence length from interval
753        let interval_len = if end_pos.number >= start_pos.number {
754            (end_pos.number - start_pos.number + 1) as usize
755        } else {
756            return false;
757        };
758
759        // Check if sequence length matches
760        if sequence.len() != interval_len {
761            return false;
762        }
763
764        // For a point deletion (single AA), check if the sequence matches the position AA
765        if interval_len == 1 {
766            return sequence.0.len() == 1 && sequence.0[0] == start_pos.aa;
767        }
768
769        // For a range deletion, check first and last AAs
770        // The sequence should be [start_aa, ..., end_aa]
771        if let (Some(first), Some(last)) = (sequence.0.first(), sequence.0.last()) {
772            return *first == start_pos.aa && *last == end_pos.aa;
773        }
774
775        false
776    }
777
778    /// Normalize an RNA variant
779    ///
780    /// RNA variants (r.) are similar to transcript variants (n.) and undergo
781    /// the same 3'/5' shifting normalization for indels. The main difference
782    /// is that RNA uses lowercase nucleotides in HGVS notation.
783    fn normalize_rna(
784        &self,
785        variant: &crate::hgvs::variant::RnaVariant,
786    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
787        use crate::hgvs::interval::RnaInterval;
788        use crate::hgvs::location::RnaPos;
789        use crate::hgvs::variant::{LocEdit, RnaVariant};
790
791        // Can't normalize variants with unknown edits or positions
792        let edit = match variant.loc_edit.edit.inner() {
793            Some(e) => e,
794            None => return Ok((HV::Rna(variant.clone()), vec![])),
795        };
796
797        // Only normalize indels
798        if !needs_normalization(edit) {
799            return Ok((HV::Rna(variant.clone()), vec![]));
800        }
801
802        // Check for intronic variants or unknown positions
803        let start_pos = match variant.loc_edit.location.start.inner() {
804            Some(pos) => pos,
805            None => return Ok((HV::Rna(variant.clone()), vec![])),
806        };
807        let end_pos = match variant.loc_edit.location.end.inner() {
808            Some(pos) => pos,
809            None => return Ok((HV::Rna(variant.clone()), vec![])),
810        };
811
812        if start_pos.is_intronic() || end_pos.is_intronic() {
813            return Err(FerroError::IntronicVariant {
814                variant: format!("{}", variant),
815            });
816        }
817
818        // Try to get transcript (RNA uses the same accession as mRNA transcripts)
819        let accession = variant.accession.full();
820        let transcript = match self.provider.get_transcript(&accession) {
821            Ok(t) => t,
822            Err(_) => return Ok((HV::Rna(variant.clone()), vec![])),
823        };
824
825        // Only normalize positive positions (within transcript)
826        // Negative positions are outside the transcript sequence
827        if start_pos.base < 1 || end_pos.base < 1 {
828            return Ok((HV::Rna(variant.clone()), vec![]));
829        }
830
831        let rna_start = start_pos.base as u64;
832        let rna_end = end_pos.base as u64;
833
834        // Get boundaries
835        let boundaries = Boundaries::new(1, transcript.sequence.len() as u64);
836
837        // Perform normalization
838        let seq = transcript.sequence.as_bytes();
839        let (new_start, new_end, new_edit, warnings) =
840            self.normalize_na_edit(seq, edit, rna_start, rna_end, &boundaries)?;
841
842        let new_variant = RnaVariant {
843            accession: variant.accession.clone(),
844            gene_symbol: variant.gene_symbol.clone(),
845            loc_edit: LocEdit::new(
846                RnaInterval::new(RnaPos::new(new_start as i64), RnaPos::new(new_end as i64)),
847                new_edit,
848            ),
849        };
850
851        Ok((HV::Rna(new_variant), warnings))
852    }
853
854    /// Normalize a mitochondrial variant
855    fn normalize_mt(
856        &self,
857        variant: &crate::hgvs::variant::MtVariant,
858    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
859        // MT variants are similar to genomic
860        // For now, return as-is (could implement circular genome handling)
861        Ok((HV::Mt(variant.clone()), vec![]))
862    }
863
864    /// Normalize an intronic CDS variant
865    ///
866    /// This converts the intronic position to genomic coordinates, normalizes
867    /// in genomic space, and converts back to CDS intronic notation.
868    fn normalize_intronic_cds(
869        &self,
870        variant: &CdsVariant,
871        transcript: &crate::reference::transcript::Transcript,
872        start_pos: &CdsPos,
873        end_pos: &CdsPos,
874        edit: &NaEdit,
875    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
876        use crate::convert::CoordinateMapper;
877
878        // Check if we have genomic data available
879        if !self.provider.has_genomic_data() {
880            return Err(FerroError::IntronicVariant {
881                variant: format!("{}", variant),
882            });
883        }
884
885        // Get the chromosome for this transcript
886        let chromosome =
887            transcript
888                .chromosome
889                .as_ref()
890                .ok_or_else(|| FerroError::ConversionError {
891                    msg: "Transcript has no chromosome mapping for intronic normalization"
892                        .to_string(),
893                })?;
894
895        // Create coordinate mapper
896        let mapper = CoordinateMapper::new(transcript);
897
898        // Convert CDS intronic positions to genomic
899        let g_start = mapper.cds_to_genomic_with_intron(start_pos)?;
900        let g_end = mapper.cds_to_genomic_with_intron(end_pos)?;
901
902        // Ensure start <= end (may be reversed on minus strand)
903        let (g_start, g_end) = if g_start <= g_end {
904            (g_start, g_end)
905        } else {
906            (g_end, g_start)
907        };
908
909        // Get a window of genomic sequence around the variant for normalization
910        // Use the same window size as for exonic normalization
911        let window = self.config.window_size;
912        let seq_start = g_start.saturating_sub(window);
913        let seq_end = g_end.saturating_add(window);
914
915        // Fetch genomic sequence
916        let genomic_seq = self
917            .provider
918            .get_genomic_sequence(chromosome, seq_start, seq_end)?;
919
920        // Calculate the variant position relative to the fetched sequence
921        let rel_start = (g_start - seq_start) + 1; // 1-based
922        let rel_end = (g_end - seq_start) + 1;
923
924        // Define boundaries within the intron
925        // For intronic variants, we can shift within the intron but not into exons
926        // Find the intron boundaries
927        // Use exon-aware CDS-to-tx mapping to account for cdot's gap positions
928        let tx_pos = mapper.cds_to_tx(start_pos)?;
929        let tx_start = tx_pos.base as u64;
930
931        let intron = transcript
932            .find_intron_at_tx_boundary(tx_start, start_pos.offset.unwrap_or(0))
933            .ok_or_else(|| FerroError::ConversionError {
934                msg: "Could not find intron for normalization".to_string(),
935            })?;
936
937        // Get intron boundaries in genomic coordinates
938        let (intron_g_start, intron_g_end) = match (intron.genomic_start, intron.genomic_end) {
939            (Some(s), Some(e)) => (s, e),
940            _ => {
941                return Err(FerroError::ConversionError {
942                    msg: "Intron has no genomic coordinates".to_string(),
943                })
944            }
945        };
946
947        // Calculate relative intron boundaries
948        let intron_rel_start = intron_g_start.saturating_sub(seq_start) + 1;
949        let intron_rel_end = intron_g_end.saturating_sub(seq_start) + 1;
950        let boundaries = Boundaries::new(intron_rel_start, intron_rel_end);
951
952        // Perform normalization in genomic space
953        let seq_bytes = genomic_seq.as_bytes();
954        let (new_rel_start, new_rel_end, new_edit, warnings) =
955            self.normalize_na_edit(seq_bytes, edit, rel_start, rel_end, &boundaries)?;
956
957        // Convert the normalized genomic position back to absolute genomic
958        let new_g_start = seq_start + new_rel_start - 1;
959        let new_g_end = seq_start + new_rel_end - 1;
960
961        // Convert back to CDS intronic notation
962        let new_start = mapper.genomic_to_cds_intronic(new_g_start)?;
963        let new_end = mapper.genomic_to_cds_intronic(new_g_end)?;
964
965        let new_variant = CdsVariant {
966            accession: variant.accession.clone(),
967            gene_symbol: variant.gene_symbol.clone(),
968            loc_edit: LocEdit::new(Interval::new(new_start, new_end), new_edit),
969        };
970
971        Ok((HV::Cds(new_variant), warnings))
972    }
973
974    /// Normalize a CDS variant that spans an exon-intron boundary
975    ///
976    /// This handles cases like:
977    /// - c.914_918+3del (exonic start, intronic end)
978    /// - c.194-64_233del (intronic start, exonic end)
979    ///
980    /// Strategy: Convert to genomic coordinates, normalize there, convert back.
981    fn normalize_boundary_spanning_cds(
982        &self,
983        variant: &CdsVariant,
984        transcript: &crate::reference::transcript::Transcript,
985        start_pos: &CdsPos,
986        end_pos: &CdsPos,
987        edit: &NaEdit,
988    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
989        use crate::convert::CoordinateMapper;
990
991        // Require genomic data for boundary spanning normalization
992        if !self.provider.has_genomic_data() {
993            return Err(FerroError::ExonIntronBoundary {
994                exon: 0,
995                variant: format!("{}", variant),
996            });
997        }
998
999        let chromosome =
1000            transcript
1001                .chromosome
1002                .as_ref()
1003                .ok_or_else(|| FerroError::ConversionError {
1004                    msg: "Transcript has no chromosome for boundary normalization".to_string(),
1005                })?;
1006
1007        let mapper = CoordinateMapper::new(transcript);
1008
1009        // Convert both positions to genomic
1010        // For exonic positions, use standard conversion
1011        // For intronic positions, use intronic conversion
1012        let g_start = self.cds_pos_to_genomic(&mapper, start_pos)?;
1013        let g_end = self.cds_pos_to_genomic(&mapper, end_pos)?;
1014
1015        // Handle strand orientation - ensure start <= end
1016        let (g_start, g_end) = if g_start <= g_end {
1017            (g_start, g_end)
1018        } else {
1019            (g_end, g_start)
1020        };
1021
1022        // Fetch genomic sequence with window for normalization
1023        let window = self.config.window_size;
1024        let seq_start = g_start.saturating_sub(window);
1025        let seq_end = g_end.saturating_add(window);
1026        let genomic_seq = self
1027            .provider
1028            .get_genomic_sequence(chromosome, seq_start, seq_end)?;
1029
1030        // Calculate relative positions (1-based)
1031        let rel_start = (g_start - seq_start) + 1;
1032        let rel_end = (g_end - seq_start) + 1;
1033
1034        // Determine normalization boundaries
1035        // For boundary-spanning variants, use the union of exon and intron boundaries
1036        let boundaries =
1037            self.get_boundary_spanning_limits(transcript, &mapper, start_pos, end_pos, seq_start)?;
1038
1039        // Normalize in genomic space
1040        let seq_bytes = genomic_seq.as_bytes();
1041        let (new_rel_start, new_rel_end, new_edit, warnings) =
1042            self.normalize_na_edit(seq_bytes, edit, rel_start, rel_end, &boundaries)?;
1043
1044        // Convert back to absolute genomic
1045        let new_g_start = seq_start + new_rel_start - 1;
1046        let new_g_end = seq_start + new_rel_end - 1;
1047
1048        // Convert genomic back to CDS
1049        // The result might be:
1050        // - Still boundary-spanning
1051        // - Fully exonic (if shifted into exon)
1052        // - Fully intronic (if shifted into intron)
1053        let new_start = mapper.genomic_to_cds_intronic(new_g_start)?;
1054        let new_end = mapper.genomic_to_cds_intronic(new_g_end)?;
1055
1056        let new_variant = CdsVariant {
1057            accession: variant.accession.clone(),
1058            gene_symbol: variant.gene_symbol.clone(),
1059            loc_edit: LocEdit::new(Interval::new(new_start, new_end), new_edit),
1060        };
1061
1062        Ok((HV::Cds(new_variant), warnings))
1063    }
1064
1065    /// Convert a CDS position (exonic or intronic) to genomic coordinate
1066    fn cds_pos_to_genomic(
1067        &self,
1068        mapper: &crate::convert::CoordinateMapper,
1069        pos: &CdsPos,
1070    ) -> Result<u64, FerroError> {
1071        if pos.is_intronic() {
1072            mapper.cds_to_genomic_with_intron(pos)
1073        } else {
1074            mapper
1075                .cds_to_genomic(pos)?
1076                .ok_or_else(|| FerroError::ConversionError {
1077                    msg: format!("CDS position {} not in exons", pos.base),
1078                })
1079        }
1080    }
1081
1082    /// Calculate normalization boundaries for boundary-spanning variants
1083    ///
1084    /// Returns the genomic region within which we can shift the variant,
1085    /// which is the union of the exon and intron containing the variant ends.
1086    fn get_boundary_spanning_limits(
1087        &self,
1088        transcript: &crate::reference::transcript::Transcript,
1089        mapper: &crate::convert::CoordinateMapper,
1090        start_pos: &CdsPos,
1091        end_pos: &CdsPos,
1092        seq_start: u64,
1093    ) -> Result<Boundaries, FerroError> {
1094        // Find the genomic extent of the region we can shift within
1095        // This is the union of:
1096        // - The exon containing the exonic position
1097        // - The intron containing the intronic position
1098
1099        // Identify which position is exonic and which is intronic
1100        let (exonic_pos, intronic_pos) = if start_pos.is_intronic() {
1101            (end_pos, start_pos)
1102        } else {
1103            (start_pos, end_pos)
1104        };
1105
1106        // Get exon boundaries in genomic coords
1107        let tx_pos = mapper.cds_to_tx(exonic_pos)?;
1108        let exon =
1109            transcript
1110                .exon_at(tx_pos.base as u64)
1111                .ok_or_else(|| FerroError::ConversionError {
1112                    msg: "Could not find exon for boundary normalization".to_string(),
1113                })?;
1114
1115        // Get intron boundaries in genomic coords
1116        let tx_boundary = mapper.cds_to_tx(intronic_pos)?;
1117        let offset = intronic_pos.offset.unwrap_or(0);
1118        let intron = transcript
1119            .find_intron_at_tx_boundary(tx_boundary.base as u64, offset)
1120            .ok_or_else(|| FerroError::ConversionError {
1121                msg: "Could not find intron for boundary normalization".to_string(),
1122            })?;
1123
1124        // Get genomic coordinates from exon and intron
1125        let (exon_g_start, exon_g_end): (u64, u64) = match (exon.genomic_start, exon.genomic_end) {
1126            (Some(s), Some(e)) => (s, e),
1127            _ => {
1128                return Err(FerroError::ConversionError {
1129                    msg: "Exon has no genomic coordinates".to_string(),
1130                })
1131            }
1132        };
1133
1134        let (intron_g_start, intron_g_end): (u64, u64) =
1135            match (intron.genomic_start, intron.genomic_end) {
1136                (Some(s), Some(e)) => (s, e),
1137                _ => {
1138                    return Err(FerroError::ConversionError {
1139                        msg: "Intron has no genomic coordinates".to_string(),
1140                    })
1141                }
1142            };
1143
1144        // Union of exon and intron genomic coordinates
1145        let combined_start = exon_g_start.min(intron_g_start);
1146        let combined_end = exon_g_end.max(intron_g_end);
1147
1148        // Convert to relative positions (1-based within our fetched sequence)
1149        let rel_start = combined_start.saturating_sub(seq_start) + 1;
1150        let rel_end = combined_end.saturating_sub(seq_start) + 1;
1151
1152        Ok(Boundaries::new(rel_start, rel_end))
1153    }
1154
1155    /// Core normalization for nucleic acid edits
1156    ///
1157    /// Returns (new_start, new_end, new_edit, warnings)
1158    fn normalize_na_edit(
1159        &self,
1160        ref_seq: &[u8],
1161        edit: &NaEdit,
1162        start: u64,
1163        end: u64,
1164        boundaries: &Boundaries,
1165    ) -> Result<(u64, u64, NaEdit, Vec<NormalizationWarning>), FerroError> {
1166        let mut warnings = Vec::new();
1167
1168        // Validate reference allele before normalization
1169        let validation = validate::validate_reference(edit, ref_seq, start, end);
1170        if !validation.valid {
1171            warnings.push(NormalizationWarning {
1172                code: "REFSEQ_MISMATCH".to_string(),
1173                message: validation.warning.unwrap_or_default(),
1174                stated_ref: validation.stated_ref.unwrap_or_default(),
1175                actual_ref: validation.actual_ref.unwrap_or_default(),
1176                position: format!("{}-{}", start, end),
1177                corrected: true,
1178            });
1179        }
1180
1181        // Get the alternate sequence for the edit
1182        let alt_seq = match edit {
1183            NaEdit::Deletion { .. } => vec![],
1184            NaEdit::Insertion { sequence } => {
1185                // Only shuffle if we have a literal sequence
1186                if let Some(bases) = sequence.bases() {
1187                    bases.iter().map(|b| b.to_u8()).collect()
1188                } else {
1189                    // Cannot shuffle non-literal insertions (counts, ranges, etc.)
1190                    return Ok((start, end, edit.clone(), warnings.clone()));
1191                }
1192            }
1193            NaEdit::Duplication { sequence, .. } => {
1194                if let Some(seq) = sequence {
1195                    seq.bases().iter().map(|b| b.to_u8()).collect()
1196                } else {
1197                    // Get duplicated sequence from reference
1198                    // start is 1-based, convert to 0-based index for array access
1199                    let s = hgvs_pos_to_index(start);
1200                    let e = end as usize;
1201                    if e <= ref_seq.len() {
1202                        ref_seq[s..e].to_vec()
1203                    } else {
1204                        vec![]
1205                    }
1206                }
1207            }
1208            NaEdit::Delins { sequence } => {
1209                use crate::hgvs::edit::InsertedSequence;
1210
1211                // HGVS spec: delins should NOT be 3' shifted like del/dup/ins
1212                // But we should check if delins should become a duplication first
1213                // Example: c.5delinsGG where position 5 is G → dup
1214                if let InsertedSequence::Literal(seq) = sequence {
1215                    let seq_bytes: Vec<u8> = seq.bases().iter().map(|b| *b as u8).collect();
1216                    let start_idx = hgvs_pos_to_index(start);
1217                    let end_idx = end as usize;
1218
1219                    if rules::delins_is_duplication(ref_seq, start_idx, end_idx, &seq_bytes) {
1220                        // Convert to duplication with minimal notation
1221                        return Ok((
1222                            start,
1223                            end,
1224                            NaEdit::Duplication {
1225                                sequence: None,
1226                                length: None,
1227                                uncertain_extent: None,
1228                            },
1229                            warnings.clone(),
1230                        ));
1231                    }
1232                }
1233                // Return unchanged (only apply minimal representation rules, not shuffling)
1234                return Ok((start, end, edit.clone(), warnings.clone()));
1235            }
1236            NaEdit::Inversion { sequence, length } => {
1237                // Apply inversion shortening: remove outer bases that cancel out
1238                // when the first base is complementary to the last base
1239                let start_idx = hgvs_pos_to_index(start); // Convert 1-based to 0-based
1240                let end_idx = end as usize; // end is exclusive (0-based)
1241
1242                if let Some((new_s, new_e)) = rules::shorten_inversion(ref_seq, start_idx, end_idx)
1243                {
1244                    // Inversion was shortened - convert back to 1-based
1245                    let shortened_edit = NaEdit::Inversion {
1246                        sequence: sequence.clone(),
1247                        length: *length,
1248                    };
1249                    return Ok((
1250                        index_to_hgvs_pos(new_s),
1251                        new_e as u64,
1252                        shortened_edit,
1253                        warnings,
1254                    ));
1255                } else {
1256                    // Inversion reduced to identity - return as identity
1257                    return Ok((
1258                        start,
1259                        end,
1260                        NaEdit::Identity {
1261                            sequence: None,
1262                            whole_entity: false,
1263                        },
1264                        warnings,
1265                    ));
1266                }
1267            }
1268            NaEdit::Repeat {
1269                sequence,
1270                count,
1271                additional_counts,
1272                trailing,
1273            } => {
1274                use crate::hgvs::edit::RepeatCount;
1275
1276                // Only normalize exact counts with a sequence
1277                let Some(seq) = sequence else {
1278                    return Ok((start, end, edit.clone(), warnings.clone()));
1279                };
1280
1281                let RepeatCount::Exact(specified_count) = count else {
1282                    return Ok((start, end, edit.clone(), warnings.clone()));
1283                };
1284
1285                // Skip if there are additional counts (genotype notation)
1286                if !additional_counts.is_empty() || trailing.is_some() {
1287                    return Ok((start, end, edit.clone(), warnings.clone()));
1288                }
1289
1290                // Get the repeat unit as bytes
1291                let repeat_unit: Vec<u8> = seq.bases().iter().map(|b| b.to_u8()).collect();
1292                let pos_idx = hgvs_pos_to_index(start); // Convert 1-based to 0-based
1293
1294                // Normalize the repeat
1295                match rules::normalize_repeat(ref_seq, pos_idx, &repeat_unit, *specified_count) {
1296                    rules::RepeatNormResult::Deletion {
1297                        start: del_start,
1298                        end: del_end,
1299                    } => {
1300                        // Minimal notation - no explicit length
1301                        let del_edit = NaEdit::Deletion {
1302                            sequence: None,
1303                            length: None,
1304                        };
1305                        return Ok((del_start, del_end, del_edit, warnings));
1306                    }
1307                    rules::RepeatNormResult::Duplication {
1308                        start: dup_start,
1309                        end: dup_end,
1310                        sequence: _dup_seq,
1311                    } => {
1312                        // Minimal notation - no explicit sequence or length
1313                        let dup_edit = NaEdit::Duplication {
1314                            sequence: None,
1315                            length: None,
1316                            uncertain_extent: None,
1317                        };
1318                        return Ok((dup_start, dup_end, dup_edit, warnings));
1319                    }
1320                    rules::RepeatNormResult::Repeat {
1321                        start: rep_start,
1322                        end: rep_end,
1323                        sequence: rep_seq,
1324                        count: rep_count,
1325                    } => {
1326                        use crate::hgvs::edit::{Base, RepeatCount, Sequence};
1327                        let bases: Vec<Base> = rep_seq
1328                            .iter()
1329                            .filter_map(|&b| Base::from_char(b as char))
1330                            .collect();
1331                        let rep_edit = NaEdit::Repeat {
1332                            sequence: Some(Sequence::new(bases)),
1333                            count: RepeatCount::Exact(rep_count),
1334                            additional_counts: vec![],
1335                            trailing: None,
1336                        };
1337                        return Ok((rep_start, rep_end, rep_edit, warnings));
1338                    }
1339                    rules::RepeatNormResult::Unchanged => {
1340                        return Ok((start, end, edit.clone(), warnings.clone()));
1341                    }
1342                }
1343            }
1344            _ => return Ok((start, end, edit.clone(), warnings.clone())), // Other edits don't need shuffling
1345        };
1346
1347        // Perform shuffle
1348        // For insertions, the HGVS interval X_Y (where Y = X+1) represents flanking positions.
1349        // We need to adjust the end coordinate so shuffle checks the correct reference position.
1350        // For c.445_446insA: start=634, end=635 (1-based tx coords)
1351        // We want shuffle to check ref_seq[634-1] = ref_seq[633] for first flanking
1352        // and ref_seq[635-1] = ref_seq[634] for second flanking (the position to check for 3' shift)
1353        //
1354        // For insertions, we need to adjust the start passed to shuffle so that the alt_idx
1355        // calculation starts at 0 (not 1). The shuffle's alt_idx formula is:
1356        //   alt_idx = (new_end - start) % alt_seq.len()
1357        // For insertions, new_end starts at shuffle_end (which is end - 1 for insertions).
1358        // If we pass start_idx directly, alt_idx = (end-1) - start_idx = 1 (wrong, should be 0).
1359        // By passing start_idx + 1, we get alt_idx = (end-1) - (start_idx+1) = 0 (correct).
1360        let shuffle_end = match edit {
1361            NaEdit::Insertion { .. } => end.saturating_sub(1), // Use second flanking position
1362            _ => end,
1363        };
1364        let start_idx = hgvs_pos_to_index(start); // Convert 1-based to 0-based
1365        let shuffle_start = match edit {
1366            NaEdit::Insertion { .. } => start_idx as u64 + 1, // Adjust so alt_idx starts at 0
1367            _ => start_idx as u64,
1368        };
1369        let result = shuffle(
1370            ref_seq,
1371            &alt_seq,
1372            shuffle_start,
1373            shuffle_end, // Adjusted for insertions
1374            boundaries,
1375            self.config.shuffle_direction,
1376        );
1377
1378        // Convert back to 1-based HGVS positions
1379        // For insertions, we adjusted the start for shuffle, now adjust back
1380        let shuffle_result_start = match edit {
1381            NaEdit::Insertion { .. } => result.start.saturating_sub(1), // Adjust back
1382            _ => result.start,
1383        };
1384        let new_start = index_to_hgvs_pos(shuffle_result_start as usize);
1385        // For insertions, we adjusted the end for shuffle, now restore the HGVS X_(X+1) format
1386        let new_end = match edit {
1387            NaEdit::Insertion { .. } => index_to_hgvs_pos(result.end as usize), // Restore second flanking position
1388            _ => result.end,
1389        };
1390
1391        // Determine the canonical form for the edit
1392        // HGVS rules:
1393        // - Deletions ALWAYS stay as deletions (just shift 3')
1394        // - Insertions become duplications if single-base matches adjacent
1395        // - Multi-base insertions/dups in homopolymer become repeat notation
1396        let (final_start, final_end, new_edit) = match edit {
1397            NaEdit::Insertion { sequence } => {
1398                use crate::hgvs::edit::{InsertedSequence, RepeatCount, Sequence};
1399
1400                if let InsertedSequence::Literal(seq) = sequence {
1401                    let seq_bytes: Vec<u8> = seq.bases().iter().map(|b| *b as u8).collect();
1402
1403                    // Check for repeat notation first (multi-base homopolymer insertion)
1404                    // Use the ORIGINAL position (start), not shuffled position (result.start)
1405                    // because repeat notation refers to the reference tract position
1406                    if seq_bytes.len() > 1 {
1407                        let original_pos_idx = hgvs_pos_to_index(start) as u64; // 0-based original position
1408                        if let Some((base, count, rep_start, rep_end)) =
1409                            rules::insertion_to_repeat(ref_seq, original_pos_idx, &seq_bytes)
1410                        {
1411                            // Convert to repeat notation: BASE[count]
1412                            use crate::hgvs::edit::Base;
1413                            if let Some(base_enum) = Base::from_char(base as char) {
1414                                let repeat_seq = Sequence::new(vec![base_enum]);
1415                                let repeat_edit = NaEdit::Repeat {
1416                                    sequence: Some(repeat_seq),
1417                                    count: RepeatCount::Exact(count),
1418                                    additional_counts: vec![],
1419                                    trailing: None,
1420                                };
1421                                return Ok((rep_start, rep_end, repeat_edit, warnings));
1422                            }
1423                        }
1424                    }
1425
1426                    // Check for simple duplication (single-base or matching adjacent)
1427                    // When shifting an insertion through a repeat region, the effective sequence
1428                    // rotates. For example, shifting "GGC" by 1 position gives "GCG".
1429                    let shift_amount =
1430                        (result.start as usize).saturating_sub(shuffle_start as usize);
1431                    let rotation = shift_amount % seq_bytes.len();
1432                    let rotated_seq: Vec<u8> = if rotation > 0 {
1433                        seq_bytes[rotation..]
1434                            .iter()
1435                            .chain(seq_bytes[..rotation].iter())
1436                            .copied()
1437                            .collect()
1438                    } else {
1439                        seq_bytes.clone()
1440                    };
1441
1442                    if rules::insertion_is_duplication(ref_seq, result.start, &rotated_seq) {
1443                        // For duplication, use minimal notation without explicit sequence
1444                        // Position: for c.X_(X+1)ins that duplicates preceding sequence,
1445                        // the result is c.(X-len+1)_Xdup
1446                        let dup_len = rotated_seq.len() as u64;
1447                        let (dup_start, dup_end) = if dup_len == 1 {
1448                            (new_start, new_start) // Single position for single-base dup
1449                        } else {
1450                            // Multi-base dup: the duplicated region is BEFORE the insertion point
1451                            (new_start - dup_len + 1, new_start)
1452                        };
1453                        (
1454                            dup_start,
1455                            dup_end,
1456                            NaEdit::Duplication {
1457                                sequence: None, // Minimal notation - no explicit sequence
1458                                length: None,
1459                                uncertain_extent: None,
1460                            },
1461                        )
1462                    } else {
1463                        // Output the rotated sequence for shifted insertions
1464                        if rotation > 0 {
1465                            use crate::hgvs::edit::{Base, InsertedSequence, Sequence};
1466                            let rotated_bases: Vec<Base> = rotated_seq
1467                                .iter()
1468                                .filter_map(|&b| Base::from_char(b as char))
1469                                .collect();
1470                            let new_sequence =
1471                                InsertedSequence::Literal(Sequence::new(rotated_bases));
1472                            (
1473                                new_start,
1474                                new_end,
1475                                NaEdit::Insertion {
1476                                    sequence: new_sequence,
1477                                },
1478                            )
1479                        } else {
1480                            (new_start, new_end, edit.clone())
1481                        }
1482                    }
1483                } else {
1484                    (new_start, new_end, edit.clone())
1485                }
1486            }
1487            NaEdit::Duplication { .. } => {
1488                use crate::hgvs::edit::{Base, RepeatCount, Sequence};
1489
1490                // Check if duplication should become repeat notation
1491                // Use the shuffled positions (result.start, result.end) which are 0-based
1492                // This applies to both single-base dups in homopolymers and multi-base tandem dups
1493                if let Some(dup_result) =
1494                    rules::duplication_to_repeat(ref_seq, result.start, result.end)
1495                {
1496                    match dup_result {
1497                        rules::DupToRepeatResult::Homopolymer {
1498                            base,
1499                            count,
1500                            start: rep_start,
1501                            end: rep_end,
1502                        } => {
1503                            if let Some(base_enum) = Base::from_char(base as char) {
1504                                let repeat_seq = Sequence::new(vec![base_enum]);
1505                                let repeat_edit = NaEdit::Repeat {
1506                                    sequence: Some(repeat_seq),
1507                                    count: RepeatCount::Exact(count),
1508                                    additional_counts: vec![],
1509                                    trailing: None,
1510                                };
1511                                return Ok((rep_start, rep_end, repeat_edit, warnings));
1512                            }
1513                        }
1514                        rules::DupToRepeatResult::TandemRepeat {
1515                            unit,
1516                            count,
1517                            start: rep_start,
1518                            end: rep_end,
1519                        } => {
1520                            let bases: Vec<Base> = unit
1521                                .iter()
1522                                .filter_map(|&b| Base::from_char(b as char))
1523                                .collect();
1524                            if bases.len() == unit.len() {
1525                                let repeat_seq = Sequence::new(bases);
1526                                let repeat_edit = NaEdit::Repeat {
1527                                    sequence: Some(repeat_seq),
1528                                    count: RepeatCount::Exact(count),
1529                                    additional_counts: vec![],
1530                                    trailing: None,
1531                                };
1532                                return Ok((rep_start, rep_end, repeat_edit, warnings));
1533                            }
1534                        }
1535                    }
1536                }
1537                // Keep as duplication but strip explicit sequence (minimal notation)
1538                (
1539                    new_start,
1540                    new_end,
1541                    NaEdit::Duplication {
1542                        sequence: None,
1543                        length: None,
1544                        uncertain_extent: None,
1545                    },
1546                )
1547            }
1548            NaEdit::Delins { sequence } => {
1549                use crate::hgvs::edit::{InsertedSequence, Sequence};
1550
1551                // Check if delins should become a duplication
1552                // Example: c.5delinsGG where position 5 is G → dup
1553                if let InsertedSequence::Literal(seq) = sequence {
1554                    let seq_bytes: Vec<u8> = seq.bases().iter().map(|b| *b as u8).collect();
1555                    let start_0 = result.start as usize;
1556                    let end_0 = result.end as usize;
1557
1558                    if rules::delins_is_duplication(ref_seq, start_0, end_0, &seq_bytes) {
1559                        // Convert to duplication - the duplicated sequence is the deleted region
1560                        let dup_len = end_0 - start_0;
1561                        let dup_seq_bytes = &ref_seq[start_0..end_0];
1562                        let dup_bases: Vec<crate::hgvs::edit::Base> = dup_seq_bytes
1563                            .iter()
1564                            .filter_map(|&b| crate::hgvs::edit::Base::from_char(b as char))
1565                            .collect();
1566                        let dup_seq = Sequence::new(dup_bases);
1567
1568                        (
1569                            new_start,
1570                            new_end,
1571                            NaEdit::Duplication {
1572                                sequence: Some(dup_seq),
1573                                length: Some(dup_len as u64),
1574                                uncertain_extent: None,
1575                            },
1576                        )
1577                    } else {
1578                        (new_start, new_end, edit.clone())
1579                    }
1580                } else {
1581                    (new_start, new_end, edit.clone())
1582                }
1583            }
1584            // Deletions - strip explicit length for minimal notation
1585            NaEdit::Deletion { .. } => (
1586                new_start,
1587                new_end,
1588                NaEdit::Deletion {
1589                    sequence: None,
1590                    length: None,
1591                },
1592            ),
1593            // All other edit types stay unchanged
1594            _ => (new_start, new_end, edit.clone()),
1595        };
1596
1597        Ok((final_start, final_end, new_edit, warnings))
1598    }
1599
1600    /// Convert CDS position to transcript position
1601    fn cds_to_tx_pos(
1602        &self,
1603        pos: &CdsPos,
1604        cds_start: u64,
1605        cds_end: Option<u64>,
1606    ) -> Result<u64, FerroError> {
1607        if pos.utr3 {
1608            let end = cds_end.ok_or_else(|| FerroError::ConversionError {
1609                msg: "No CDS end".to_string(),
1610            })?;
1611            Ok(end + pos.base as u64)
1612        } else if pos.base < 1 {
1613            Ok((cds_start as i64 + pos.base - 1) as u64)
1614        } else {
1615            Ok(cds_start + pos.base as u64 - 1)
1616        }
1617    }
1618
1619    /// Convert transcript position to CDS position
1620    fn tx_to_cds_pos(
1621        &self,
1622        pos: u64,
1623        cds_start: u64,
1624        cds_end: Option<u64>,
1625    ) -> Result<CdsPos, FerroError> {
1626        let end = cds_end.ok_or_else(|| FerroError::ConversionError {
1627            msg: "No CDS end".to_string(),
1628        })?;
1629
1630        if pos < cds_start {
1631            // For 5'UTR: cds_to_tx_pos formula is tx = cds_start + base - 1
1632            // So inverse is: base = tx - cds_start + 1
1633            Ok(CdsPos {
1634                base: pos as i64 - cds_start as i64 + 1,
1635                offset: None,
1636                utr3: false,
1637            })
1638        } else if pos > end {
1639            Ok(CdsPos {
1640                base: (pos - end) as i64,
1641                offset: None,
1642                utr3: true,
1643            })
1644        } else {
1645            Ok(CdsPos {
1646                base: (pos - cds_start + 1) as i64,
1647                offset: None,
1648                utr3: false,
1649            })
1650        }
1651    }
1652
1653    /// Apply minimal notation to a CDS variant without full normalization.
1654    ///
1655    /// This is used when we can't do full normalization (e.g., missing transcript)
1656    /// but still want to apply minimal HGVS notation rules.
1657    fn canonicalize_cds_variant(&self, variant: &CdsVariant) -> CdsVariant {
1658        let edit = match variant.loc_edit.edit.inner() {
1659            Some(e) => e,
1660            None => return variant.clone(),
1661        };
1662
1663        // Only canonicalize if the edit has redundant information
1664        if !should_canonicalize(edit) {
1665            return variant.clone();
1666        }
1667
1668        let canonical_edit = canonicalize_edit(edit);
1669
1670        CdsVariant {
1671            accession: variant.accession.clone(),
1672            gene_symbol: variant.gene_symbol.clone(),
1673            loc_edit: LocEdit::with_uncertainty(
1674                variant.loc_edit.location.clone(),
1675                variant.loc_edit.edit.map_ref(|_| canonical_edit),
1676            ),
1677        }
1678    }
1679
1680    /// Apply minimal notation to a genome variant without full normalization.
1681    fn canonicalize_genome_variant(&self, variant: &GenomeVariant) -> GenomeVariant {
1682        let edit = match variant.loc_edit.edit.inner() {
1683            Some(e) => e,
1684            None => return variant.clone(),
1685        };
1686
1687        if !should_canonicalize(edit) {
1688            return variant.clone();
1689        }
1690
1691        let canonical_edit = canonicalize_edit(edit);
1692
1693        GenomeVariant {
1694            accession: variant.accession.clone(),
1695            gene_symbol: variant.gene_symbol.clone(),
1696            loc_edit: LocEdit::with_uncertainty(
1697                variant.loc_edit.location.clone(),
1698                variant.loc_edit.edit.map_ref(|_| canonical_edit),
1699            ),
1700        }
1701    }
1702
1703    /// Apply minimal notation to a transcript variant without full normalization.
1704    fn canonicalize_tx_variant(&self, variant: &TxVariant) -> TxVariant {
1705        let edit = match variant.loc_edit.edit.inner() {
1706            Some(e) => e,
1707            None => return variant.clone(),
1708        };
1709
1710        if !should_canonicalize(edit) {
1711            return variant.clone();
1712        }
1713
1714        let canonical_edit = canonicalize_edit(edit);
1715
1716        TxVariant {
1717            accession: variant.accession.clone(),
1718            gene_symbol: variant.gene_symbol.clone(),
1719            loc_edit: LocEdit::with_uncertainty(
1720                variant.loc_edit.location.clone(),
1721                variant.loc_edit.edit.map_ref(|_| canonical_edit),
1722            ),
1723        }
1724    }
1725}
1726
1727#[cfg(test)]
1728mod tests {
1729    use super::*;
1730    use crate::hgvs::parser::parse_hgvs;
1731    use crate::reference::MockProvider;
1732
1733    #[test]
1734    fn test_normalize_substitution_unchanged() {
1735        let provider = MockProvider::with_test_data();
1736        let normalizer = Normalizer::new(provider);
1737
1738        let variant = parse_hgvs("NM_000088.3:c.10A>G").unwrap();
1739        let normalized = normalizer.normalize(&variant).unwrap();
1740
1741        // Substitutions should not change
1742        assert_eq!(format!("{}", variant), format!("{}", normalized));
1743    }
1744
1745    #[test]
1746    fn test_normalize_with_config() {
1747        let provider = MockProvider::with_test_data();
1748        let config = NormalizeConfig::default().with_direction(ShuffleDirection::FivePrime);
1749        let normalizer = Normalizer::with_config(provider, config);
1750
1751        assert_eq!(
1752            normalizer.config().shuffle_direction,
1753            ShuffleDirection::FivePrime
1754        );
1755    }
1756
1757    #[test]
1758    fn test_normalizer_handles_missing_transcript() {
1759        let provider = MockProvider::new(); // Empty provider
1760        let normalizer = Normalizer::new(provider);
1761
1762        // Should return variant unchanged when transcript not found
1763        let variant = parse_hgvs("NM_MISSING.1:c.100del").unwrap();
1764        let result = normalizer.normalize(&variant);
1765        assert!(result.is_ok());
1766        // Verify output equals input (unchanged)
1767        assert_eq!(
1768            format!("{}", variant),
1769            format!("{}", result.unwrap()),
1770            "Missing transcript should return variant unchanged"
1771        );
1772    }
1773
1774    #[test]
1775    fn test_normalize_deletion_shifts_3prime() {
1776        // NM_001234.1 has G repeat spanning exon boundaries
1777        // Exon 1: c.1-11, Exon 2: c.12-26, Exon 3: c.27+
1778        // G repeat is at c.9-c.33, but shift stops at exon boundary
1779        // c.10 is in exon 1 (ends at c.11), so deletion shifts to c.11
1780        let provider = MockProvider::with_test_data();
1781        let normalizer = Normalizer::new(provider);
1782
1783        let variant = parse_hgvs("NM_001234.1:c.10del").unwrap();
1784        let result = normalizer.normalize(&variant).unwrap();
1785        let output = format!("{}", result);
1786
1787        // Should remain a deletion
1788        assert!(
1789            output.contains("del"),
1790            "Deletion should remain a deletion, got: {}",
1791            output
1792        );
1793        // Should shift from c.10 to c.11 (3'-most within exon 1)
1794        assert!(
1795            output.contains("c.11del"),
1796            "Deletion should shift 3' to exon boundary (c.11), got: {}",
1797            output
1798        );
1799    }
1800
1801    #[test]
1802    fn test_normalize_insertion_becomes_dup() {
1803        // NM_001234.1 has G repeat at CDS positions c.9-c.33
1804        // Inserting G after position 10 should shift 3' and become dup
1805        let provider = MockProvider::with_test_data();
1806        let normalizer = Normalizer::new(provider);
1807
1808        let variant = parse_hgvs("NM_001234.1:c.10_11insG").unwrap();
1809        let result = normalizer.normalize(&variant).unwrap();
1810        let output = format!("{}", result);
1811
1812        // Inserting G in a G-repeat should become dup
1813        assert!(
1814            output.contains("dup"),
1815            "Insertion of matching base should become dup, got: {}",
1816            output
1817        );
1818    }
1819
1820    #[test]
1821    fn test_normalize_duplication_shifts_3prime() {
1822        // NM_001234.1 has G repeat spanning positions c.9-33 (25 G's)
1823        // Single-base duplications stay as simple dups (only 2+ base dups become repeat notation)
1824        let provider = MockProvider::with_test_data();
1825        let normalizer = Normalizer::new(provider);
1826
1827        let variant = parse_hgvs("NM_001234.1:c.10dup").unwrap();
1828        let result = normalizer.normalize(&variant).unwrap();
1829        let output = format!("{}", result);
1830
1831        // Single-base duplication should shift 3' and stay as dup
1832        // c.10dup in GGGGG...GGG tract shifts to rightmost position (c.33) but stays as dup
1833        assert!(
1834            output.contains("dup"),
1835            "Single-base duplication should remain as dup notation, got: {}",
1836            output
1837        );
1838    }
1839
1840    #[test]
1841    fn test_normalize_delins_unchanged() {
1842        // A delins that doesn't simplify should stay as delins
1843        // Deleting G and inserting AT is not a dup pattern
1844        let provider = MockProvider::with_test_data();
1845        let normalizer = Normalizer::new(provider);
1846
1847        let variant = parse_hgvs("NM_000088.3:c.10delinsAT").unwrap();
1848        let result = normalizer.normalize(&variant).unwrap();
1849        let output = format!("{}", result);
1850
1851        assert!(
1852            output.contains("delinsAT"),
1853            "Delins should remain unchanged, got: {}",
1854            output
1855        );
1856    }
1857
1858    #[test]
1859    fn test_normalize_protein_substitution_unchanged() {
1860        let provider = MockProvider::with_test_data();
1861        let normalizer = Normalizer::new(provider);
1862
1863        // Protein substitution variants should pass through unchanged
1864        let variant = parse_hgvs("NP_000079.2:p.Val600Glu").unwrap();
1865        let normalized = normalizer.normalize(&variant).unwrap();
1866        assert_eq!(format!("{}", variant), format!("{}", normalized));
1867    }
1868
1869    #[test]
1870    fn test_normalize_protein_deletion_removes_redundant_sequence() {
1871        // Redundant sequence removal: p.Val600delVal → p.Val600del
1872        let provider = MockProvider::with_test_data();
1873        let normalizer = Normalizer::new(provider);
1874
1875        let variant = parse_hgvs("NP_000079.2:p.Val600delVal").unwrap();
1876        let normalized = normalizer.normalize(&variant).unwrap();
1877        let output = format!("{}", normalized);
1878
1879        // Should remove redundant "Val" from the deletion
1880        assert_eq!(
1881            output, "NP_000079.2:p.Val600del",
1882            "Redundant sequence should be removed from deletion"
1883        );
1884    }
1885
1886    #[test]
1887    fn test_normalize_protein_deletion_range_removes_redundant_sequence() {
1888        // Redundant sequence removal for range: p.Lys23_Glu25delLysAlaGlu → p.Lys23_Glu25del
1889        let provider = MockProvider::with_test_data();
1890        let normalizer = Normalizer::new(provider);
1891
1892        let variant = parse_hgvs("NP_000079.2:p.Lys23_Glu25delLysAlaGlu").unwrap();
1893        let normalized = normalizer.normalize(&variant).unwrap();
1894        let output = format!("{}", normalized);
1895
1896        // Should remove redundant sequence from the deletion
1897        assert_eq!(
1898            output, "NP_000079.2:p.Lys23_Glu25del",
1899            "Redundant sequence should be removed from range deletion"
1900        );
1901    }
1902
1903    #[test]
1904    fn test_normalize_protein_deletion_non_matching_sequence_unchanged() {
1905        // Non-matching sequence should stay: p.Val600delGlu should NOT change
1906        let provider = MockProvider::with_test_data();
1907        let normalizer = Normalizer::new(provider);
1908
1909        let variant = parse_hgvs("NP_000079.2:p.Val600delGlu").unwrap();
1910        let normalized = normalizer.normalize(&variant).unwrap();
1911        let output = format!("{}", normalized);
1912
1913        // Should NOT remove non-matching sequence
1914        assert_eq!(
1915            output, "NP_000079.2:p.Val600delGlu",
1916            "Non-matching sequence should not be removed"
1917        );
1918    }
1919
1920    #[test]
1921    fn test_normalize_protein_deletion_no_sequence_unchanged() {
1922        // Deletion without sequence should stay unchanged
1923        let provider = MockProvider::with_test_data();
1924        let normalizer = Normalizer::new(provider);
1925
1926        let variant = parse_hgvs("NP_000079.2:p.Val600del").unwrap();
1927        let normalized = normalizer.normalize(&variant).unwrap();
1928        let output = format!("{}", normalized);
1929
1930        assert_eq!(
1931            output, "NP_000079.2:p.Val600del",
1932            "Deletion without sequence should remain unchanged"
1933        );
1934    }
1935
1936    #[test]
1937    fn test_normalize_protein_duplication_unchanged() {
1938        // Duplications should pass through unchanged
1939        let provider = MockProvider::with_test_data();
1940        let normalizer = Normalizer::new(provider);
1941
1942        let variant = parse_hgvs("NP_000079.2:p.Val600dup").unwrap();
1943        let normalized = normalizer.normalize(&variant).unwrap();
1944        assert_eq!(format!("{}", variant), format!("{}", normalized));
1945    }
1946
1947    #[test]
1948    fn test_normalize_protein_frameshift_unchanged() {
1949        // Frameshifts should pass through unchanged
1950        let provider = MockProvider::with_test_data();
1951        let normalizer = Normalizer::new(provider);
1952
1953        let variant = parse_hgvs("NP_000079.2:p.Arg97ProfsTer23").unwrap();
1954        let normalized = normalizer.normalize(&variant).unwrap();
1955        assert_eq!(format!("{}", variant), format!("{}", normalized));
1956    }
1957
1958    #[test]
1959    fn test_normalize_protein_reference_validation_match() {
1960        // Test that validation passes when amino acid matches reference
1961        // NP_TEST.1 has: M at position 1, V at position 2
1962        let provider = MockProvider::with_test_data();
1963        let normalizer = Normalizer::new(provider);
1964
1965        // Position 1 = M (Met), Position 2 = V (Val)
1966        let variant = parse_hgvs("NP_TEST.1:p.Met1Val").unwrap();
1967        let result = normalizer.normalize(&variant);
1968        assert!(
1969            result.is_ok(),
1970            "Validation should pass for matching amino acid"
1971        );
1972    }
1973
1974    #[test]
1975    fn test_normalize_protein_reference_validation_mismatch() {
1976        // Test that validation fails when amino acid doesn't match reference
1977        // NP_TEST.1 has M at position 1, but we claim it's Val
1978        let provider = MockProvider::with_test_data();
1979        let normalizer = Normalizer::new(provider);
1980
1981        // Position 1 is M (Met), not V (Val)
1982        let variant = parse_hgvs("NP_TEST.1:p.Val1Glu").unwrap();
1983        let result = normalizer.normalize(&variant);
1984        assert!(
1985            result.is_err(),
1986            "Validation should fail for mismatched amino acid"
1987        );
1988
1989        if let Err(crate::error::FerroError::AminoAcidMismatch {
1990            position,
1991            expected,
1992            found,
1993            ..
1994        }) = result
1995        {
1996            assert_eq!(position, 1);
1997            assert_eq!(expected, "Val");
1998            assert_eq!(found, "M");
1999        } else {
2000            panic!("Expected AminoAcidMismatch error");
2001        }
2002    }
2003
2004    #[test]
2005    fn test_normalize_protein_reference_validation_deletion() {
2006        // Test validation for deletion variants
2007        // NP_TEST.1 has V at position 2
2008        let provider = MockProvider::with_test_data();
2009        let normalizer = Normalizer::new(provider);
2010
2011        // Position 2 = V (Val) - should pass
2012        let variant = parse_hgvs("NP_TEST.1:p.Val2del").unwrap();
2013        let result = normalizer.normalize(&variant);
2014        assert!(
2015            result.is_ok(),
2016            "Validation should pass for matching deletion position"
2017        );
2018    }
2019
2020    #[test]
2021    fn test_normalize_protein_reference_validation_missing_protein() {
2022        // Test that missing protein data skips validation (doesn't fail)
2023        let provider = MockProvider::with_test_data();
2024        let normalizer = Normalizer::new(provider);
2025
2026        // NP_MISSING.1 doesn't exist in provider
2027        let variant = parse_hgvs("NP_MISSING.1:p.Val600Glu").unwrap();
2028        let result = normalizer.normalize(&variant);
2029        // Should NOT error - just skip validation when protein not available
2030        assert!(
2031            result.is_ok(),
2032            "Missing protein should skip validation, not fail"
2033        );
2034    }
2035
2036    #[test]
2037    fn test_normalize_rna_substitution_unchanged() {
2038        let provider = MockProvider::with_test_data();
2039        let normalizer = Normalizer::new(provider);
2040
2041        // RNA substitutions should pass through unchanged
2042        let variant = parse_hgvs("NM_000088.3:r.10a>g").unwrap();
2043        let normalized = normalizer.normalize(&variant).unwrap();
2044        assert_eq!(format!("{}", variant), format!("{}", normalized));
2045    }
2046
2047    #[test]
2048    fn test_normalize_rna_deletion_shifts_3prime() {
2049        // NM_001234.1 has G repeat at positions 13-37 (position 37 is actually T,
2050        // but the normalization shifts to the 3'-most position)
2051        let provider = MockProvider::with_test_data();
2052        let normalizer = Normalizer::new(provider);
2053
2054        let variant = parse_hgvs("NM_001234.1:r.14del").unwrap();
2055        let result = normalizer.normalize(&variant).unwrap();
2056        let output = format!("{}", result);
2057
2058        // Should remain a deletion and shift 3'
2059        assert!(
2060            output.contains("del"),
2061            "Deletion should remain a deletion, got: {}",
2062            output
2063        );
2064        // Verify it shifted 3' (position should be > 14)
2065        assert!(
2066            output.contains("r.37del"),
2067            "Deletion should shift 3' to position 37, got: {}",
2068            output
2069        );
2070    }
2071
2072    #[test]
2073    fn test_normalize_rna_insertion_becomes_dup() {
2074        // NM_001234.1 has G repeat at positions 13-36
2075        // Inserting g after position 14 should shift 3' and become dup
2076        let provider = MockProvider::with_test_data();
2077        let normalizer = Normalizer::new(provider);
2078
2079        let variant = parse_hgvs("NM_001234.1:r.14_15insg").unwrap();
2080        let result = normalizer.normalize(&variant).unwrap();
2081        let output = format!("{}", result);
2082
2083        // Inserting g in a G-repeat should become dup
2084        assert!(
2085            output.contains("dup"),
2086            "Insertion of matching base should become dup, got: {}",
2087            output
2088        );
2089    }
2090
2091    #[test]
2092    fn test_normalize_rna_duplication_shifts_3prime() {
2093        // NM_001234.1 has G repeat - single-base duplications stay as simple dups
2094        let provider = MockProvider::with_test_data();
2095        let normalizer = Normalizer::new(provider);
2096
2097        let variant = parse_hgvs("NM_001234.1:r.14dup").unwrap();
2098        let result = normalizer.normalize(&variant).unwrap();
2099        let output = format!("{}", result);
2100
2101        // Single-base duplication should shift 3' and stay as dup
2102        assert!(
2103            output.contains("dup"),
2104            "Single-base RNA duplication should remain as dup notation, got: {}",
2105            output
2106        );
2107    }
2108
2109    #[test]
2110    fn test_normalize_rna_intronic_returns_error() {
2111        let provider = MockProvider::with_test_data();
2112        let normalizer = Normalizer::new(provider);
2113
2114        // Intronic RNA variants should return an error
2115        let variant = parse_hgvs("NM_001234.1:r.10+5del").unwrap();
2116        let result = normalizer.normalize(&variant);
2117        assert!(result.is_err(), "Intronic RNA variant should return error");
2118    }
2119
2120    #[test]
2121    fn test_normalize_rna_missing_transcript_unchanged() {
2122        let provider = MockProvider::with_test_data();
2123        let normalizer = Normalizer::new(provider);
2124
2125        // Missing transcript should return variant unchanged
2126        let variant = parse_hgvs("NM_MISSING.1:r.100del").unwrap();
2127        let normalized = normalizer.normalize(&variant).unwrap();
2128        assert_eq!(format!("{}", variant), format!("{}", normalized));
2129    }
2130
2131    #[test]
2132    fn test_normalize_null_allele() {
2133        let provider = MockProvider::with_test_data();
2134        let normalizer = Normalizer::new(provider);
2135
2136        // Null alleles should pass through
2137        let variant = HgvsVariant::NullAllele;
2138        let normalized = normalizer.normalize(&variant).unwrap();
2139        assert!(matches!(normalized, HgvsVariant::NullAllele));
2140    }
2141
2142    #[test]
2143    fn test_normalize_unknown_allele() {
2144        let provider = MockProvider::with_test_data();
2145        let normalizer = Normalizer::new(provider);
2146
2147        // Unknown alleles should pass through
2148        let variant = HgvsVariant::UnknownAllele;
2149        let normalized = normalizer.normalize(&variant).unwrap();
2150        assert!(matches!(normalized, HgvsVariant::UnknownAllele));
2151    }
2152
2153    #[test]
2154    fn test_normalize_allele_variant() {
2155        let provider = MockProvider::with_test_data();
2156        let normalizer = Normalizer::new(provider);
2157
2158        // Allele variants should normalize each component
2159        // Using substitutions which should remain unchanged
2160        let variant = parse_hgvs("[NM_000088.3:c.10A>G;NM_000088.3:c.20C>T]").unwrap();
2161        let result = normalizer.normalize(&variant).unwrap();
2162
2163        // Verify it's still an allele
2164        assert!(matches!(result, HgvsVariant::Allele(_)));
2165
2166        // Verify output format preserves both variants
2167        let output = format!("{}", result);
2168        assert!(
2169            output.contains("c.10A>G"),
2170            "Allele should contain first variant, got: {}",
2171            output
2172        );
2173        assert!(
2174            output.contains("c.20C>T"),
2175            "Allele should contain second variant, got: {}",
2176            output
2177        );
2178    }
2179
2180    #[test]
2181    fn test_normalize_5prime_direction() {
2182        // Test that 5' direction shifts toward 5' end instead of 3'
2183        // NM_001234.1 has G repeat spanning exons
2184        // Exon 2: c.12-c.26
2185        // With 5' direction, deletion at c.20 should shift toward c.12
2186        // Note: Actual shift depends on boundary handling
2187        let provider = MockProvider::with_test_data();
2188        let config = NormalizeConfig::default().with_direction(ShuffleDirection::FivePrime);
2189        let normalizer = Normalizer::with_config(provider, config);
2190
2191        // Delete G at position 20 (middle of G repeat in exon 2)
2192        let variant = parse_hgvs("NM_001234.1:c.20del").unwrap();
2193        let result = normalizer.normalize(&variant).unwrap();
2194        let output = format!("{}", result);
2195
2196        assert!(
2197            output.contains("del"),
2198            "Should remain a deletion, got: {}",
2199            output
2200        );
2201        // With 5' direction within exon 2, should shift toward 5' boundary
2202        // The exact position depends on exon boundary handling
2203        assert!(
2204            output.contains("c.13del") || output.contains("c.12del"),
2205            "5' direction should shift deletion toward exon start, got: {}",
2206            output
2207        );
2208    }
2209
2210    #[test]
2211    fn test_normalize_3prime_direction() {
2212        let provider = MockProvider::with_test_data();
2213        let config = NormalizeConfig::default().with_direction(ShuffleDirection::ThreePrime);
2214        let normalizer = Normalizer::with_config(provider, config);
2215
2216        let variant = parse_hgvs("NM_000088.3:c.10del").unwrap();
2217        let result = normalizer.normalize(&variant);
2218        assert!(result.is_ok());
2219    }
2220
2221    #[test]
2222    fn test_normalize_with_cross_boundaries() {
2223        let provider = MockProvider::with_test_data();
2224        let config = NormalizeConfig::default().allow_crossing_boundaries();
2225        let normalizer = Normalizer::with_config(provider, config);
2226
2227        let variant = parse_hgvs("NM_000088.3:c.10del").unwrap();
2228        let result = normalizer.normalize(&variant);
2229        assert!(result.is_ok());
2230    }
2231
2232    #[test]
2233    fn test_normalize_genomic_variant() {
2234        let provider = MockProvider::with_test_data();
2235        let normalizer = Normalizer::new(provider);
2236
2237        // Genomic variants with missing sequence should pass through unchanged
2238        let variant = parse_hgvs("NC_000001.11:g.12345del").unwrap();
2239        let result = normalizer.normalize(&variant);
2240        assert!(result.is_ok());
2241    }
2242
2243    #[test]
2244    fn test_normalize_tx_variant() {
2245        let provider = MockProvider::with_test_data();
2246        let normalizer = Normalizer::new(provider);
2247
2248        let variant = parse_hgvs("NR_000001.1:n.100del").unwrap();
2249        let result = normalizer.normalize(&variant);
2250        assert!(result.is_ok());
2251    }
2252
2253    #[test]
2254    fn test_normalize_mt_variant() {
2255        let provider = MockProvider::with_test_data();
2256        let normalizer = Normalizer::new(provider);
2257
2258        // MT variants pass through unchanged
2259        let variant = parse_hgvs("NC_012920.1:m.100A>G").unwrap();
2260        let normalized = normalizer.normalize(&variant).unwrap();
2261        assert!(matches!(normalized, HgvsVariant::Mt(_)));
2262    }
2263
2264    #[test]
2265    fn test_cds_to_tx_pos_utr5() {
2266        let provider = MockProvider::with_test_data();
2267        let normalizer = Normalizer::new(provider);
2268
2269        // 5' UTR position (negative)
2270        let cds_pos = CdsPos {
2271            base: -5,
2272            offset: None,
2273            utr3: false,
2274        };
2275        let result = normalizer.cds_to_tx_pos(&cds_pos, 10, Some(50));
2276        assert!(result.is_ok());
2277    }
2278
2279    #[test]
2280    fn test_cds_to_tx_pos_utr3() {
2281        let provider = MockProvider::with_test_data();
2282        let normalizer = Normalizer::new(provider);
2283
2284        // 3' UTR position
2285        let cds_pos = CdsPos {
2286            base: 5,
2287            offset: None,
2288            utr3: true,
2289        };
2290        let result = normalizer.cds_to_tx_pos(&cds_pos, 10, Some(50));
2291        assert!(result.is_ok());
2292        assert_eq!(result.unwrap(), 55);
2293    }
2294
2295    #[test]
2296    fn test_cds_to_tx_pos_coding() {
2297        let provider = MockProvider::with_test_data();
2298        let normalizer = Normalizer::new(provider);
2299
2300        // Normal coding position
2301        let cds_pos = CdsPos {
2302            base: 10,
2303            offset: None,
2304            utr3: false,
2305        };
2306        let result = normalizer.cds_to_tx_pos(&cds_pos, 5, Some(50));
2307        assert!(result.is_ok());
2308        assert_eq!(result.unwrap(), 14); // 5 + 10 - 1 = 14
2309    }
2310
2311    #[test]
2312    fn test_tx_to_cds_pos_utr5() {
2313        let provider = MockProvider::with_test_data();
2314        let normalizer = Normalizer::new(provider);
2315
2316        // Position before CDS start
2317        let result = normalizer.tx_to_cds_pos(3, 10, Some(50));
2318        assert!(result.is_ok());
2319        let cds_pos = result.unwrap();
2320        assert!(cds_pos.base < 0);
2321        assert!(!cds_pos.utr3);
2322    }
2323
2324    #[test]
2325    fn test_tx_to_cds_pos_utr3() {
2326        let provider = MockProvider::with_test_data();
2327        let normalizer = Normalizer::new(provider);
2328
2329        // Position after CDS end
2330        let result = normalizer.tx_to_cds_pos(55, 10, Some(50));
2331        assert!(result.is_ok());
2332        let cds_pos = result.unwrap();
2333        assert!(cds_pos.utr3);
2334    }
2335
2336    #[test]
2337    fn test_tx_to_cds_pos_coding() {
2338        let provider = MockProvider::with_test_data();
2339        let normalizer = Normalizer::new(provider);
2340
2341        // Normal coding position
2342        let result = normalizer.tx_to_cds_pos(20, 10, Some(50));
2343        assert!(result.is_ok());
2344        let cds_pos = result.unwrap();
2345        assert!(!cds_pos.utr3);
2346        assert_eq!(cds_pos.base, 11); // 20 - 10 + 1 = 11
2347    }
2348
2349    #[test]
2350    fn test_config_default() {
2351        let config = NormalizeConfig::default();
2352        assert_eq!(config.shuffle_direction, ShuffleDirection::ThreePrime);
2353        assert!(!config.cross_boundaries);
2354    }
2355
2356    #[test]
2357    #[allow(deprecated)]
2358    fn test_config_builder() {
2359        let config = NormalizeConfig::default()
2360            .with_direction(ShuffleDirection::FivePrime)
2361            .allow_crossing_boundaries()
2362            .skip_validation();
2363
2364        assert_eq!(config.shuffle_direction, ShuffleDirection::FivePrime);
2365        assert!(config.cross_boundaries);
2366        // skip_validation now sets RefSeqMismatch to SilentCorrect
2367        assert!(!config.should_reject_ref_mismatch());
2368        assert!(!config.should_warn_ref_mismatch());
2369    }
2370
2371    #[test]
2372    fn test_duplication_3prime_shift_two_bases() {
2373        // Test the exact scenario from ClinVar: c.4159dup vs c.4160dup
2374        // When duplicating an A in a homopolymer tract (AA),
2375        // HGVS requires converting to repeat notation.
2376        //
2377        // NM_888888.1 sequence: ATGCCCGAAGCCCCCCCCCGTTTGCATGCATGCATGCAT
2378        // Positions (1-based):  12345678901234567890...
2379        // c.8 = A, c.9 = A (the "AA" in "GAA")
2380        //
2381        // Single-base duplications stay as simple dups
2382        let provider = MockProvider::with_test_data();
2383        let normalizer = Normalizer::new(provider);
2384
2385        let variant = parse_hgvs("NM_888888.1:c.8dup").unwrap();
2386        let result = normalizer.normalize(&variant).unwrap();
2387        let output = format!("{}", result);
2388
2389        // Single-base duplication should shift 3' and stay as dup
2390        assert!(
2391            output.contains("dup"),
2392            "Single-base duplication should remain as dup notation, got: {}",
2393            output
2394        );
2395    }
2396
2397    #[test]
2398    fn test_duplication_3prime_shift_three_bases() {
2399        // Test with three consecutive identical bases (TTT)
2400        //
2401        // NM_888888.1 sequence: ATGCCCGAAGCCCCCCCCCGTTTGCATGCATGCATGCAT
2402        // Positions:           ...              2021222324...
2403        // c.20 = G, c.21 = T, c.22 = T, c.23 = T, c.24 = G
2404        //
2405        // Single-base duplications stay as simple dups
2406        let provider = MockProvider::with_test_data();
2407        let normalizer = Normalizer::new(provider);
2408
2409        let variant = parse_hgvs("NM_888888.1:c.21dup").unwrap();
2410        let result = normalizer.normalize(&variant).unwrap();
2411        let output = format!("{}", result);
2412
2413        // Single-base duplication should shift 3' to c.23 and stay as dup
2414        assert!(
2415            output.contains("dup"),
2416            "Single-base duplication should remain as dup notation, got: {}",
2417            output
2418        );
2419    }
2420
2421    #[test]
2422    fn test_duplication_no_shift_when_unique() {
2423        // Test that a duplication of a unique base doesn't shift
2424        //
2425        // NM_888888.1 sequence: ATGCCCGAAGCCCCCCCCCGTTTGCATGCATGCATGCAT
2426        // c.7 = G (followed by AA, so no G to shift to)
2427        //
2428        // c.7dup should stay as c.7dup
2429        let provider = MockProvider::with_test_data();
2430        let normalizer = Normalizer::new(provider);
2431
2432        let variant = parse_hgvs("NM_888888.1:c.7dup").unwrap();
2433        let result = normalizer.normalize(&variant).unwrap();
2434        let output = format!("{}", result);
2435
2436        assert!(
2437            output.contains("dup"),
2438            "Should remain a duplication, got: {}",
2439            output
2440        );
2441        assert!(
2442            output.contains("c.7dup"),
2443            "Duplication of unique G at c.7 should not shift, got: {}",
2444            output
2445        );
2446    }
2447
2448    // =====================================================================
2449    // Exon-Intron Boundary Spanning Variant Tests
2450    // =====================================================================
2451
2452    /// Create a provider with a transcript that has genomic coordinates and introns
2453    /// for testing boundary-spanning variant normalization.
2454    ///
2455    /// Transcript structure (NM_BOUNDARY.1):
2456    /// - Gene: BOUNDARY
2457    /// - Strand: Plus
2458    /// - Chromosome: chr1
2459    ///
2460    /// Genomic layout (chr1):
2461    /// Position: 1000                   1020  1030                   1050  1060                   1080
2462    ///           |-------- Exon 1 ------|      |-------- Exon 2 ------|      |-------- Exon 3 ------|
2463    ///           ATGCCCAAAGGGTTTAGGCCC       AAAGGGTTTAGGCCCAAAAAA       GGGTTTAGGCCCAAATGA
2464    ///                                 ^^^  ^^^                   ^^^  ^^^
2465    ///                              intron 1                   intron 2
2466    ///
2467    /// Transcript positions (tx):
2468    /// - Exon 1: tx 1-20 = genomic 1000-1019
2469    /// - Intron 1: genomic 1020-1029 (10 bp)
2470    /// - Exon 2: tx 21-40 = genomic 1030-1049
2471    /// - Intron 2: genomic 1050-1059 (10 bp)
2472    /// - Exon 3: tx 41-58 = genomic 1060-1077
2473    ///
2474    /// CDS: starts at tx 1 (no 5' UTR for simplicity)
2475    /// CDS positions: c.1 = tx 1, c.20 = tx 20 (last of exon 1), c.21 = tx 21 (first of exon 2)
2476    ///
2477    /// Intron 1 sequence (g.1020-1029): "GTAAGCTAGG" (10 bp)
2478    ///   - c.20+1 = g.1020 (G)
2479    ///   - c.20+10 = g.1029 (G)
2480    ///   - c.21-10 = g.1020 (G)
2481    ///   - c.21-1 = g.1029 (G)
2482    ///
2483    /// Intron 2 sequence (g.1050-1059): "GTAAGTAAGG" (10 bp)
2484    fn make_boundary_test_provider() -> MockProvider {
2485        use crate::reference::transcript::{Exon, ManeStatus, Strand, Transcript};
2486        use std::sync::OnceLock;
2487
2488        let mut provider = MockProvider::new();
2489
2490        // Build transcript sequence (exons only, spliced)
2491        // Exon 1 (20bp): ATGCCCAAAGGGTTTAGGCC (ends with CC at exon boundary)
2492        // Exon 2 (20bp): AAAGGGTTTAGGCCCAAAAA (AA repeat at both boundaries)
2493        // Exon 3 (18bp): GGGTTTAGGCCCAAATGA
2494        // Total: 58bp transcript
2495        let tx_seq = "ATGCCCAAAGGGTTTAGGCCAAAGGGTTTAGGCCCAAAAAGGGTTTAGGCCCAAATGA";
2496
2497        // Build genomic sequence around the transcript
2498        // We'll create 100bp before, the gene region, and 100bp after
2499        // Gene region: exon1 + intron1 + exon2 + intron2 + exon3
2500        // = 20 + 10 + 20 + 10 + 18 = 78bp
2501        //
2502        // Genomic: 900-999 (padding) + 1000-1019 (exon1) + 1020-1029 (intron1) +
2503        //          1030-1049 (exon2) + 1050-1059 (intron2) + 1060-1077 (exon3) + 1078+ (padding)
2504        let mut genomic_seq = String::new();
2505
2506        // Padding before (positions 0-999, 1000 bytes at 0-based)
2507        for _ in 0..1000 {
2508            genomic_seq.push('N');
2509        }
2510
2511        // Exon 1 (positions 1000-1019, 0-based 1000-1019)
2512        genomic_seq.push_str("ATGCCCAAAGGGTTTAGGCC");
2513
2514        // Intron 1 (positions 1020-1029) - with splice consensus
2515        // Note: The intron has AAA at the end (1027-1029) to test shifting
2516        genomic_seq.push_str("GTAAGCTAAA");
2517
2518        // Exon 2 (positions 1030-1049)
2519        genomic_seq.push_str("AAAGGGTTTAGGCCCAAAAA");
2520
2521        // Intron 2 (positions 1050-1059) - with AAA at start for testing
2522        genomic_seq.push_str("AAAGTAAGGG");
2523
2524        // Exon 3 (positions 1060-1077)
2525        genomic_seq.push_str("GGGTTTAGGCCCAAATGA");
2526
2527        // Padding after (100 bytes)
2528        for _ in 0..100 {
2529            genomic_seq.push('N');
2530        }
2531
2532        // Add genomic sequence to provider
2533        provider.add_genomic_sequence("chr1", genomic_seq);
2534
2535        // Create transcript with exons that have genomic coordinates
2536        provider.add_transcript(Transcript {
2537            id: "NM_BOUNDARY.1".to_string(),
2538            gene_symbol: Some("BOUNDARY".to_string()),
2539            strand: Strand::Plus,
2540            sequence: tx_seq.to_string(),
2541            cds_start: Some(1),
2542            cds_end: Some(58),
2543            exons: vec![
2544                Exon::with_genomic(1, 1, 20, 1000, 1019),
2545                Exon::with_genomic(2, 21, 40, 1030, 1049),
2546                Exon::with_genomic(3, 41, 58, 1060, 1077),
2547            ],
2548            chromosome: Some("chr1".to_string()),
2549            genomic_start: Some(1000),
2550            genomic_end: Some(1077),
2551            genome_build: Default::default(),
2552            mane_status: ManeStatus::None,
2553            refseq_match: None,
2554            ensembl_match: None,
2555            cached_introns: OnceLock::new(),
2556        });
2557
2558        provider
2559    }
2560
2561    #[test]
2562    fn test_boundary_spanning_exonic_to_intronic_del() {
2563        // Test: c.20_20+3del - deletion from last exon base into intron
2564        // c.20 = last base of exon 1 (C at g.1019)
2565        // c.20+3 = 3rd intronic base (A at g.1022)
2566        // Deletes: C (exonic) + GTA (intronic) = 4 bases
2567        //
2568        // This should normalize without error (using genomic space)
2569        let provider = make_boundary_test_provider();
2570        let normalizer = Normalizer::new(provider);
2571
2572        let variant = parse_hgvs("NM_BOUNDARY.1:c.20_20+3del").unwrap();
2573        let result = normalizer.normalize(&variant);
2574
2575        assert!(
2576            result.is_ok(),
2577            "Boundary-spanning deletion should normalize, got error: {:?}",
2578            result.err()
2579        );
2580
2581        let output = format!("{}", result.unwrap());
2582        assert!(
2583            output.contains("del"),
2584            "Should remain a deletion, got: {}",
2585            output
2586        );
2587    }
2588
2589    #[test]
2590    fn test_boundary_spanning_intronic_to_exonic_del() {
2591        // Test: c.21-3_23del - deletion from intron into exon
2592        // c.21-3 = 3rd base before exon 2 (A at g.1027)
2593        // c.23 = 3rd base of exon 2 (A at g.1032)
2594        // Deletes: AAA (intronic) + AAA (exonic) = 6 bases
2595        //
2596        // The intron ends with AAA (g.1027-1029) and exon starts with AAA (g.1030-1032)
2597        // This is a repeat, so normalization might shift
2598        let provider = make_boundary_test_provider();
2599        let normalizer = Normalizer::new(provider);
2600
2601        let variant = parse_hgvs("NM_BOUNDARY.1:c.21-3_23del").unwrap();
2602        let result = normalizer.normalize(&variant);
2603
2604        assert!(
2605            result.is_ok(),
2606            "Boundary-spanning deletion should normalize, got error: {:?}",
2607            result.err()
2608        );
2609
2610        let output = format!("{}", result.unwrap());
2611        assert!(
2612            output.contains("del"),
2613            "Should remain a deletion, got: {}",
2614            output
2615        );
2616    }
2617
2618    #[test]
2619    fn test_boundary_spanning_same_base_position() {
2620        // Test: c.20_20+5del - deletion starting and ending at same CDS base
2621        // Start is exonic (c.20), end is intronic (c.20+5)
2622        let provider = make_boundary_test_provider();
2623        let normalizer = Normalizer::new(provider);
2624
2625        let variant = parse_hgvs("NM_BOUNDARY.1:c.20_20+5del").unwrap();
2626        let result = normalizer.normalize(&variant);
2627
2628        assert!(
2629            result.is_ok(),
2630            "Same-base boundary-spanning deletion should normalize, got error: {:?}",
2631            result.err()
2632        );
2633    }
2634
2635    #[test]
2636    fn test_boundary_splice_site_plus1() {
2637        // Test: c.20_20+1del - deletion of last exon base + splice donor (GT)
2638        // This is a clinically important splice site variant
2639        let provider = make_boundary_test_provider();
2640        let normalizer = Normalizer::new(provider);
2641
2642        let variant = parse_hgvs("NM_BOUNDARY.1:c.20_20+1del").unwrap();
2643        let result = normalizer.normalize(&variant);
2644
2645        assert!(
2646            result.is_ok(),
2647            "Splice site +1 deletion should normalize, got error: {:?}",
2648            result.err()
2649        );
2650    }
2651
2652    #[test]
2653    fn test_boundary_splice_site_minus1() {
2654        // Test: c.21-1_22del - deletion of splice acceptor + first exon bases
2655        // c.21-1 = last intronic base before exon 2 (A at g.1029)
2656        // c.22 = 2nd base of exon 2 (A at g.1031)
2657        let provider = make_boundary_test_provider();
2658        let normalizer = Normalizer::new(provider);
2659
2660        let variant = parse_hgvs("NM_BOUNDARY.1:c.21-1_22del").unwrap();
2661        let result = normalizer.normalize(&variant);
2662
2663        assert!(
2664            result.is_ok(),
2665            "Splice site -1 deletion should normalize, got error: {:?}",
2666            result.err()
2667        );
2668    }
2669
2670    #[test]
2671    fn test_boundary_spanning_dup() {
2672        // Test: c.40_40+3dup - duplication spanning exon-intron boundary
2673        // c.40 = last base of exon 2 (A at g.1049)
2674        // c.40+3 = 3rd intronic base of intron 2 (A at g.1052)
2675        // Since intron 2 starts with AAA (and exon 2 ends with AAAAA), this creates a repeat pattern
2676        // The normalizer correctly converts this to repeat notation (A[N])
2677        let provider = make_boundary_test_provider();
2678        let normalizer = Normalizer::new(provider);
2679
2680        let variant = parse_hgvs("NM_BOUNDARY.1:c.40_40+3dup").unwrap();
2681        let result = normalizer.normalize(&variant);
2682
2683        assert!(
2684            result.is_ok(),
2685            "Boundary-spanning duplication should normalize, got error: {:?}",
2686            result.err()
2687        );
2688
2689        let output = format!("{}", result.unwrap());
2690        // May remain as dup or be converted to repeat notation for poly-A
2691        assert!(
2692            output.contains("dup") || output.contains("["),
2693            "Should remain a duplication or become repeat notation, got: {}",
2694            output
2695        );
2696    }
2697
2698    #[test]
2699    fn test_boundary_spanning_delins() {
2700        // Test: c.20_20+2delinsTTT - delins spanning exon-intron boundary
2701        let provider = make_boundary_test_provider();
2702        let normalizer = Normalizer::new(provider);
2703
2704        let variant = parse_hgvs("NM_BOUNDARY.1:c.20_20+2delinsTTT").unwrap();
2705        let result = normalizer.normalize(&variant);
2706
2707        assert!(
2708            result.is_ok(),
2709            "Boundary-spanning delins should normalize, got error: {:?}",
2710            result.err()
2711        );
2712
2713        let output = format!("{}", result.unwrap());
2714        assert!(
2715            output.contains("delins") || output.contains(">"),
2716            "Should remain a delins or become substitution, got: {}",
2717            output
2718        );
2719    }
2720
2721    #[test]
2722    fn test_boundary_no_genomic_data_returns_error() {
2723        // Test that without genomic data, we still get the ExonIntronBoundary error
2724        let provider = MockProvider::with_test_data(); // No genomic data
2725        let normalizer = Normalizer::new(provider);
2726
2727        // NM_001234.1 doesn't have genomic coordinates
2728        let variant = parse_hgvs("NM_001234.1:c.11_11+3del").unwrap();
2729        let result = normalizer.normalize(&variant);
2730
2731        assert!(
2732            result.is_err(),
2733            "Boundary-spanning without genomic data should return error"
2734        );
2735    }
2736
2737    #[test]
2738    fn test_deletion_3prime_shift_consecutive_bases() {
2739        // Test case simulating NM_001408491.1:c.517delA -> should become c.518del
2740        // Create a transcript with consecutive A's at positions that should shift
2741        use crate::reference::transcript::{Exon, ManeStatus, Strand};
2742        use std::sync::OnceLock;
2743
2744        let mut provider = MockProvider::new();
2745
2746        // Create a sequence where c.517 and c.518 are both 'A'
2747        // CDS starts at position 1, so c.N = transcript position N
2748        // Put "AA" at positions 517-518 (1-based)
2749        // Sequence: 516 bases of padding + "AA" + more padding
2750        let mut seq = String::new();
2751        for _ in 0..516 {
2752            seq.push('G'); // Padding (not A to ensure we see the shift)
2753        }
2754        seq.push('A'); // Position 517 (c.517)
2755        seq.push('A'); // Position 518 (c.518)
2756        for _ in 519..=600 {
2757            seq.push('G'); // More padding
2758        }
2759
2760        provider.add_transcript(crate::reference::transcript::Transcript {
2761            id: "NM_777777.1".to_string(),
2762            gene_symbol: Some("SHIFTTEST".to_string()),
2763            strand: Strand::Plus,
2764            sequence: seq.clone(),
2765            cds_start: Some(1),
2766            cds_end: Some(600),
2767            exons: vec![Exon::new(1, 1, 600)], // Single exon covering all
2768            chromosome: None,
2769            genomic_start: None,
2770            genomic_end: None,
2771            genome_build: Default::default(),
2772            mane_status: ManeStatus::None,
2773            refseq_match: None,
2774            ensembl_match: None,
2775            cached_introns: OnceLock::new(),
2776        });
2777
2778        let normalizer = Normalizer::new(provider);
2779
2780        // Parse c.517delA - deleting the first A
2781        let variant = parse_hgvs("NM_777777.1:c.517delA").unwrap();
2782
2783        // Debug: print the sequence around positions 517-518
2784        println!("Sequence length: {}", seq.len());
2785        println!(
2786            "Position 516 (0-based 515): {}",
2787            seq.chars().nth(515).unwrap()
2788        );
2789        println!(
2790            "Position 517 (0-based 516): {}",
2791            seq.chars().nth(516).unwrap()
2792        );
2793        println!(
2794            "Position 518 (0-based 517): {}",
2795            seq.chars().nth(517).unwrap()
2796        );
2797        println!(
2798            "Position 519 (0-based 518): {}",
2799            seq.chars().nth(518).unwrap()
2800        );
2801
2802        let result = normalizer.normalize(&variant).unwrap();
2803        let output = format!("{}", result);
2804
2805        println!("Input:  NM_777777.1:c.517delA");
2806        println!("Output: {}", output);
2807
2808        // The deletion should shift from c.517 to c.518 (3' rule)
2809        // because both positions are 'A'
2810        assert!(
2811            output.contains("c.518del"),
2812            "Deletion at c.517 should shift to c.518 (3' rule), got: {}",
2813            output
2814        );
2815    }
2816
2817    #[test]
2818    fn test_deletion_3prime_shift_with_utr() {
2819        // Same test but with a 5' UTR (cds_start > 1)
2820        // This simulates real transcripts more accurately
2821        use crate::reference::transcript::{Exon, ManeStatus, Strand};
2822        use std::sync::OnceLock;
2823
2824        let mut provider = MockProvider::new();
2825
2826        // Create a transcript with 100bp 5' UTR
2827        // CDS starts at position 101, so:
2828        // c.1 = tx position 101
2829        // c.517 = tx position 617
2830        // c.518 = tx position 618
2831        let utr_len = 100;
2832        let mut seq = String::new();
2833
2834        // 5' UTR (100 bases)
2835        for _ in 0..utr_len {
2836            seq.push('T');
2837        }
2838        // CDS: 516 bases of G padding, then "AA", then more G
2839        for _ in 0..516 {
2840            seq.push('G');
2841        }
2842        seq.push('A'); // tx position 617 = c.517
2843        seq.push('A'); // tx position 618 = c.518
2844        for _ in 0..100 {
2845            seq.push('G');
2846        }
2847
2848        let seq_len = seq.len();
2849        println!("Test with UTR:");
2850        println!("  Sequence length: {}", seq_len);
2851        println!("  CDS start (1-based): 101");
2852        println!("  c.517 = tx position 617 (0-based 616)");
2853        println!("  c.518 = tx position 618 (0-based 617)");
2854        println!(
2855            "  tx pos 617 (0-based 616): {}",
2856            seq.chars().nth(616).unwrap()
2857        );
2858        println!(
2859            "  tx pos 618 (0-based 617): {}",
2860            seq.chars().nth(617).unwrap()
2861        );
2862
2863        provider.add_transcript(crate::reference::transcript::Transcript {
2864            id: "NM_666666.1".to_string(),
2865            gene_symbol: Some("UTRTEST".to_string()),
2866            strand: Strand::Plus,
2867            sequence: seq.clone(),
2868            cds_start: Some(101),
2869            cds_end: Some(seq_len as u64),
2870            exons: vec![Exon::new(1, 1, seq_len as u64)], // Single exon
2871            chromosome: None,
2872            genomic_start: None,
2873            genomic_end: None,
2874            genome_build: Default::default(),
2875            mane_status: ManeStatus::None,
2876            refseq_match: None,
2877            ensembl_match: None,
2878            cached_introns: OnceLock::new(),
2879        });
2880
2881        let normalizer = Normalizer::new(provider);
2882
2883        let variant = parse_hgvs("NM_666666.1:c.517delA").unwrap();
2884        let result = normalizer.normalize(&variant).unwrap();
2885        let output = format!("{}", result);
2886
2887        println!("Input:  NM_666666.1:c.517delA");
2888        println!("Output: {}", output);
2889
2890        assert!(
2891            output.contains("c.518del"),
2892            "Deletion at c.517 should shift to c.518 (3' rule) even with UTR, got: {}",
2893            output
2894        );
2895    }
2896
2897    #[test]
2898    fn test_delins_should_not_shift() {
2899        // HGVS spec: delins should NOT be 3' shifted like del/dup/ins
2900        // This test ensures we don't incorrectly shift delins positions
2901        use crate::reference::transcript::{Exon, ManeStatus, Strand};
2902        use std::sync::OnceLock;
2903
2904        let mut provider = MockProvider::new();
2905
2906        // Create a transcript where delins could theoretically shift if we were wrong
2907        // Sequence: ...GGAATTCC... where we do c.10_11delinsXX
2908        // If incorrectly shifted, it might become c.11_12delinsXX
2909        let seq = "GGGGGGGGGGAATTCCGGGGGGGGGG".to_string(); // c.10=A, c.11=A, c.12=T, c.13=T
2910
2911        provider.add_transcript(crate::reference::transcript::Transcript {
2912            id: "NM_555555.1".to_string(),
2913            gene_symbol: Some("DELINSTEST".to_string()),
2914            strand: Strand::Plus,
2915            sequence: seq,
2916            cds_start: Some(1),
2917            cds_end: Some(26),
2918            exons: vec![Exon::new(1, 1, 26)],
2919            chromosome: None,
2920            genomic_start: None,
2921            genomic_end: None,
2922            genome_build: Default::default(),
2923            mane_status: ManeStatus::None,
2924            refseq_match: None,
2925            ensembl_match: None,
2926            cached_introns: OnceLock::new(),
2927        });
2928
2929        let normalizer = Normalizer::new(provider);
2930
2931        // Test delins - should NOT shift
2932        let variant = parse_hgvs("NM_555555.1:c.10_11delinsTT").unwrap();
2933        let result = normalizer.normalize(&variant).unwrap();
2934        let output = format!("{}", result);
2935
2936        assert!(
2937            output.contains("c.10_11delins"),
2938            "Delins should NOT be shifted (HGVS spec), got: {}",
2939            output
2940        );
2941    }
2942}