Skip to main content

ferro_hgvs/normalize/
mod.rs

1//! Normalization engine
2//!
3//! Implements the core HGVS variant normalization algorithm including
4//! 3'/5' shifting and boundary detection.
5//!
6//! # Coordinate Systems
7//!
8//! This module uses multiple coordinate systems:
9//!
10//! | Context | Basis | Type/Notes |
11//! |---------|-------|------------|
12//! | HGVS variant positions | 1-based | `u64` for genomic, `i64` for CDS/Tx |
13//! | Array indexing | 0-based | `usize` for sequence slicing |
14//! | Boundaries struct | 1-based | `(start, end)` inclusive |
15//! | Shuffle input/output | 0-based | Uses array indices |
16//! | Relative positions | 1-based | Positions within fetched window |
17//!
18//! Key conversions:
19//! - `hgvs_pos_to_index(pos)` converts 1-based HGVS position to 0-based index
20//! - `index_to_hgvs_pos(idx)` converts 0-based index to 1-based HGVS position
21//! - `pos.saturating_sub(1)` manually converts 1-based to 0-based
22//! - `idx + 1` manually converts 0-based to 1-based
23//!
24//! For type-safe coordinate handling, see [`crate::coords`].
25
26pub mod boundary;
27pub mod config;
28pub(crate) mod merge;
29pub mod rules;
30pub mod shuffle;
31pub mod validate;
32
33use crate::coords::{hgvs_pos_to_index, index_to_hgvs_pos};
34use crate::error::FerroError;
35use crate::hgvs::edit::NaEdit;
36use crate::hgvs::interval::Interval;
37use crate::hgvs::location::{CdsPos, GenomePos, TxPos};
38use crate::hgvs::parser::position::{OFFSET_UNKNOWN_NEGATIVE, OFFSET_UNKNOWN_POSITIVE};
39use crate::hgvs::variant::{CdsVariant, GenomeVariant, HgvsVariant, LocEdit, TxVariant};
40use crate::hgvs::HgvsVariant as HV;
41use crate::reference::transcript::Strand;
42use crate::reference::ReferenceProvider;
43use boundary::Boundaries;
44pub use config::{NormalizeConfig, ShuffleDirection};
45use rules::{canonicalize_edit, needs_normalization, should_canonicalize};
46use shuffle::shuffle;
47
48/// Check if a CDS position has an unknown (?) offset sentinel value
49fn has_unknown_offset_cds(pos: &CdsPos) -> bool {
50    matches!(
51        pos.offset,
52        Some(OFFSET_UNKNOWN_POSITIVE) | Some(OFFSET_UNKNOWN_NEGATIVE)
53    )
54}
55
56/// Check if a TxPos has an unknown (?) offset sentinel value
57fn has_unknown_offset_tx(pos: &TxPos) -> bool {
58    matches!(
59        pos.offset,
60        Some(OFFSET_UNKNOWN_POSITIVE) | Some(OFFSET_UNKNOWN_NEGATIVE)
61    )
62}
63
64/// Warning generated during normalization
65#[derive(Debug, Clone)]
66pub struct NormalizationWarning {
67    /// Warning code (e.g., "REFSEQ_MISMATCH")
68    pub code: String,
69    /// Human-readable description
70    pub message: String,
71    /// What the input claimed as reference
72    pub stated_ref: String,
73    /// What the actual reference sequence has
74    pub actual_ref: String,
75    /// Position info
76    pub position: String,
77    /// Whether the mismatch was auto-corrected
78    pub corrected: bool,
79}
80
81/// Result of normalization with optional warnings
82#[derive(Debug, Clone)]
83pub struct NormalizeResultWithWarnings {
84    /// The normalized variant
85    pub result: HgvsVariant,
86    /// Warnings generated during normalization
87    pub warnings: Vec<NormalizationWarning>,
88}
89
90impl NormalizeResultWithWarnings {
91    /// Create a new result without warnings
92    pub fn new(result: HgvsVariant) -> Self {
93        Self {
94            result,
95            warnings: vec![],
96        }
97    }
98
99    /// Create a result with warnings
100    pub fn with_warnings(result: HgvsVariant, warnings: Vec<NormalizationWarning>) -> Self {
101        Self { result, warnings }
102    }
103
104    /// Add a warning to the result
105    pub fn add_warning(&mut self, warning: NormalizationWarning) {
106        self.warnings.push(warning);
107    }
108
109    /// Check if there are any warnings
110    pub fn has_warnings(&self) -> bool {
111        !self.warnings.is_empty()
112    }
113
114    /// Check if there's a reference mismatch warning
115    pub fn has_ref_mismatch(&self) -> bool {
116        self.warnings.iter().any(|w| w.code == "REFSEQ_MISMATCH")
117    }
118}
119
120/// Main normalizer struct
121pub struct Normalizer<P: ReferenceProvider> {
122    provider: P,
123    config: NormalizeConfig,
124}
125
126impl<P: ReferenceProvider> Normalizer<P> {
127    /// Create a new normalizer with the given reference provider
128    pub fn new(provider: P) -> Self {
129        Self {
130            provider,
131            config: NormalizeConfig::default(),
132        }
133    }
134
135    /// Create a normalizer with custom configuration
136    pub fn with_config(provider: P, config: NormalizeConfig) -> Self {
137        Self { provider, config }
138    }
139
140    /// Get the configuration
141    pub fn config(&self) -> &NormalizeConfig {
142        &self.config
143    }
144
145    /// Normalize a variant
146    ///
147    /// In strict mode (default), rejects variants with reference mismatches.
148    /// Use `normalize_with_warnings` for lenient mode that corrects mismatches.
149    pub fn normalize(&self, variant: &HgvsVariant) -> Result<HgvsVariant, FerroError> {
150        let result = self.normalize_with_warnings(variant)?;
151
152        // In strict mode, reject if there were reference mismatches
153        if self.config.should_reject_ref_mismatch() && result.has_ref_mismatch() {
154            if let Some(warning) = result.warnings.iter().find(|w| w.code == "REFSEQ_MISMATCH") {
155                return Err(FerroError::ReferenceMismatch {
156                    location: warning.position.clone(),
157                    expected: warning.stated_ref.clone(),
158                    found: warning.actual_ref.clone(),
159                });
160            }
161        }
162
163        Ok(result.result)
164    }
165
166    /// Normalize a variant with detailed warnings
167    ///
168    /// Returns the normalized variant along with any warnings generated during
169    /// normalization (e.g., reference sequence mismatches that were auto-corrected).
170    /// Use this method when you want to track what corrections were made.
171    pub fn normalize_with_warnings(
172        &self,
173        variant: &HgvsVariant,
174    ) -> Result<NormalizeResultWithWarnings, FerroError> {
175        let (result, warnings) = match variant {
176            HV::Genome(v) => self.normalize_genome(v)?,
177            HV::Cds(v) => self.normalize_cds(v)?,
178            HV::Tx(v) => self.normalize_tx(v)?,
179            HV::Protein(v) => self.normalize_protein(v)?,
180            HV::Rna(v) => self.normalize_rna(v)?,
181            HV::Mt(v) => self.normalize_mt(v)?,
182            HV::Allele(a) => self.normalize_allele(a)?,
183            // Circular variants normalize like genomic variants
184            HV::Circular(v) => (
185                HV::Circular(crate::hgvs::variant::CircularVariant {
186                    accession: v.accession.clone(),
187                    gene_symbol: v.gene_symbol.clone(),
188                    loc_edit: v.loc_edit.clone(),
189                }),
190                vec![],
191            ),
192            // RNA fusions pass through unchanged (no normalization needed for fusions)
193            HV::RnaFusion(v) => (HV::RnaFusion(v.clone()), vec![]),
194            // Null and unknown allele markers pass through unchanged
195            HV::NullAllele => (HV::NullAllele, vec![]),
196            HV::UnknownAllele => (HV::UnknownAllele, vec![]),
197        };
198
199        Ok(NormalizeResultWithWarnings::with_warnings(result, warnings))
200    }
201
202    /// Normalize an allele (compound) variant
203    ///
204    /// Normalizes each variant in the allele individually, with overlap prevention.
205    /// After normalization, checks if variants would overlap and constrains shifting
206    /// to prevent collisions.
207    fn normalize_allele(
208        &self,
209        allele: &crate::hgvs::variant::AlleleVariant,
210    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
211        // First pass: normalize each variant independently, collecting all warnings
212        let mut all_warnings = Vec::new();
213        let mut normalized = Vec::new();
214
215        for v in &allele.variants {
216            let result = self.normalize_with_warnings(v)?;
217            all_warnings.extend(result.warnings);
218            normalized.push(result.result);
219        }
220
221        // Second pass: check for overlaps and resolve
222        if self.config.prevent_overlap {
223            normalized = self.resolve_overlaps(normalized)?;
224        }
225
226        // HGVS requires consecutive edits in cis to render as a single delins.
227        // Track input length so we only unwrap when a merge actually collapsed
228        // multiple sub-variants — not for pre-existing singleton alleles, which
229        // must round-trip with the Allele wrapper intact for programmatic callers
230        // (Display already renders singletons in bare form regardless).
231        let original_len = normalized.len();
232        let mut merged = merge::merge_consecutive_edits(normalized, allele.phase, &self.provider);
233
234        let result = if allele.phase == crate::hgvs::variant::AllelePhase::Cis
235            && original_len > 1
236            && merged.len() == 1
237        {
238            merged.pop().unwrap()
239        } else {
240            HgvsVariant::Allele(crate::hgvs::variant::AlleleVariant::new(
241                merged,
242                allele.phase,
243            ))
244        };
245
246        Ok((result, all_warnings))
247    }
248
249    /// Resolve overlaps between normalized variants in a compound allele.
250    ///
251    /// When normalization shifts variants, they may end up overlapping. This function
252    /// detects overlaps and constrains the normalization to prevent collisions.
253    fn resolve_overlaps(&self, variants: Vec<HgvsVariant>) -> Result<Vec<HgvsVariant>, FerroError> {
254        if variants.len() < 2 {
255            return Ok(variants);
256        }
257
258        // Extract positions for comparison
259        let mut positions: Vec<(usize, Option<(u64, u64)>)> = variants
260            .iter()
261            .enumerate()
262            .map(|(i, v)| (i, self.extract_position_range(v)))
263            .collect();
264
265        // Sort by start position
266        positions.sort_by(|a, b| match (&a.1, &b.1) {
267            (Some((s1, _)), Some((s2, _))) => s1.cmp(s2),
268            (Some(_), None) => std::cmp::Ordering::Less,
269            (None, Some(_)) => std::cmp::Ordering::Greater,
270            (None, None) => std::cmp::Ordering::Equal,
271        });
272
273        // Check for overlaps and mark variants that need adjustment
274        let mut needs_adjustment = vec![false; variants.len()];
275
276        for i in 0..positions.len() - 1 {
277            let (_idx_a, pos_a) = &positions[i];
278            let (idx_b, pos_b) = &positions[i + 1];
279
280            if let (Some((_, end_a)), Some((start_b, _))) = (pos_a, pos_b) {
281                // Check for overlap: end of A >= start of B
282                if end_a >= start_b {
283                    // Mark the second variant for potential adjustment
284                    needs_adjustment[*idx_b] = true;
285                }
286            }
287        }
288
289        // For now, we report overlaps but don't modify the variants
290        // A more sophisticated implementation would re-normalize with constraints
291        // or report a warning
292        for (i, needs_adj) in needs_adjustment.iter().enumerate() {
293            if *needs_adj {
294                // In a complete implementation, we would constrain normalization
295                // For now, we preserve the variant as-is with a potential warning
296                // Future: add to a warnings collection
297                let _ = i; // Suppress unused variable warning
298            }
299        }
300
301        Ok(variants)
302    }
303
304    /// Extract the genomic/CDS position range from a variant.
305    fn extract_position_range(&self, variant: &HgvsVariant) -> Option<(u64, u64)> {
306        match variant {
307            HV::Genome(v) => {
308                let start = v.loc_edit.location.start.inner()?.base;
309                let end = v.loc_edit.location.end.inner()?.base;
310                Some((start, end))
311            }
312            HV::Cds(v) => {
313                let start_pos = v.loc_edit.location.start.inner()?;
314                let end_pos = v.loc_edit.location.end.inner()?;
315                // Only return positions for exonic, non-UTR variants
316                if start_pos.is_intronic()
317                    || end_pos.is_intronic()
318                    || start_pos.base < 1
319                    || end_pos.base < 1
320                {
321                    None
322                } else {
323                    Some((start_pos.base as u64, end_pos.base as u64))
324                }
325            }
326            HV::Tx(v) => {
327                let start = v.loc_edit.location.start.inner()?.base;
328                let end = v.loc_edit.location.end.inner()?.base;
329                // Only return positions for positive bases (within transcript)
330                if start < 1 || end < 1 {
331                    None
332                } else {
333                    Some((start as u64, end as u64))
334                }
335            }
336            _ => None,
337        }
338    }
339
340    /// Normalize a genomic variant
341    fn normalize_genome(
342        &self,
343        variant: &GenomeVariant,
344    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
345        // Can't normalize variants with unknown edits or positions
346        let edit = match variant.loc_edit.edit.inner() {
347            Some(e) => e,
348            None => return Ok((HV::Genome(variant.clone()), vec![])),
349        };
350
351        // Only normalize indels
352        if !needs_normalization(edit) {
353            return Ok((HV::Genome(variant.clone()), vec![]));
354        }
355
356        // Get sequence from provider - can't normalize with unknown positions
357        let accession = variant.accession.transcript_accession();
358        let start = match variant.loc_edit.location.start.inner() {
359            Some(pos) => pos.base,
360            None => {
361                return Ok((
362                    HV::Genome(self.canonicalize_genome_variant(variant)),
363                    vec![],
364                ))
365            }
366        };
367        let end = match variant.loc_edit.location.end.inner() {
368            Some(pos) => pos.base,
369            None => {
370                return Ok((
371                    HV::Genome(self.canonicalize_genome_variant(variant)),
372                    vec![],
373                ))
374            }
375        };
376
377        // Try to get transcript/sequence, fall back to minimal notation if not found
378        let window_start = start.saturating_sub(self.config.window_size);
379        let seq_result = self.provider.get_sequence(
380            &accession,
381            window_start,
382            end.saturating_add(self.config.window_size),
383        );
384
385        let ref_seq = match seq_result {
386            Ok(s) => s,
387            // Can't do full normalization without sequence, but apply minimal notation
388            Err(_) => {
389                return Ok((
390                    HV::Genome(self.canonicalize_genome_variant(variant)),
391                    vec![],
392                ))
393            }
394        };
395
396        // Adjust coordinates to be relative to the window
397        let rel_start = start - window_start;
398        let rel_end = end - window_start;
399
400        // Perform normalization
401        let (new_rel_start, new_rel_end, new_edit, warnings) = self.normalize_na_edit(
402            ref_seq.as_bytes(),
403            edit,
404            rel_start,
405            rel_end,
406            &Boundaries::new(1, ref_seq.len() as u64),
407        )?;
408
409        // Adjust back to genomic coordinates
410        let new_start = new_rel_start + window_start;
411        let new_end = new_rel_end + window_start;
412
413        // Reconstruct variant with new position (using adjusted coordinates)
414        let new_variant = GenomeVariant {
415            accession: variant.accession.clone(),
416            gene_symbol: variant.gene_symbol.clone(),
417            loc_edit: LocEdit::new(
418                Interval::new(GenomePos::new(new_start), GenomePos::new(new_end)),
419                new_edit,
420            ),
421        };
422
423        Ok((HV::Genome(new_variant), warnings))
424    }
425
426    /// Normalize a CDS variant
427    fn normalize_cds(
428        &self,
429        variant: &CdsVariant,
430    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
431        // Can't normalize variants with unknown edits or positions
432        let edit = match variant.loc_edit.edit.inner() {
433            Some(e) => e,
434            None => return Ok((HV::Cds(variant.clone()), vec![])),
435        };
436
437        // Only normalize indels
438        if !needs_normalization(edit) {
439            return Ok((HV::Cds(variant.clone()), vec![]));
440        }
441
442        // Check for intronic variants or unknown positions - can't normalize those
443        let start_pos = match variant.loc_edit.location.start.inner() {
444            Some(pos) => pos,
445            None => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
446        };
447        let end_pos = match variant.loc_edit.location.end.inner() {
448            Some(pos) => pos,
449            None => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
450        };
451
452        // Can't normalize variants with unknown (?) offsets - return unchanged
453        if has_unknown_offset_cds(start_pos) || has_unknown_offset_cds(end_pos) {
454            return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![]));
455        }
456
457        // Try to get transcript first - we need it for intronic normalization too
458        let accession = variant.accession.transcript_accession();
459        let transcript = match self.provider.get_transcript(&accession) {
460            Ok(t) => t,
461            // Can't do full normalization without transcript, but apply minimal notation
462            Err(_) => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
463        };
464
465        // Handle intronic variants specially
466        if start_pos.is_intronic() || end_pos.is_intronic() {
467            // Check if both positions are intronic and in the same intron
468            if start_pos.is_intronic() && end_pos.is_intronic() {
469                return self.normalize_intronic_cds(variant, &transcript, start_pos, end_pos, edit);
470            }
471            // Variant spans exon-intron boundary - normalize in genomic space
472            return self.normalize_boundary_spanning_cds(
473                variant,
474                &transcript,
475                start_pos,
476                end_pos,
477                edit,
478            );
479        }
480
481        // Convert CDS to transcript coordinates for normalization
482        let cds_start = match transcript.cds_start {
483            Some(s) => s,
484            None => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
485        };
486
487        // Calculate transcript positions - return unchanged if position is out of range
488        let tx_start = match self.cds_to_tx_pos(start_pos, cds_start, transcript.cds_end) {
489            Ok(v) => v,
490            Err(_) => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
491        };
492        let tx_end = match self.cds_to_tx_pos(end_pos, cds_start, transcript.cds_end) {
493            Ok(v) => v,
494            Err(_) => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
495        };
496
497        // Get boundaries (stay within exon unless configured otherwise)
498        let boundaries = if self.config.cross_boundaries {
499            Boundaries::new(1, transcript.sequence.len() as u64)
500        } else {
501            match boundary::get_cds_boundaries(&transcript, tx_start, &self.config) {
502                Ok(b) => b,
503                Err(_) => return Ok((HV::Cds(self.canonicalize_cds_variant(variant)), vec![])),
504            }
505        };
506
507        // Perform normalization on transcript sequence
508        let seq = transcript.sequence.as_bytes();
509        let (new_tx_start, new_tx_end, new_edit, warnings) =
510            self.normalize_na_edit(seq, edit, tx_start, tx_end, &boundaries)?;
511
512        // Convert back to CDS coordinates
513        let new_start = self.tx_to_cds_pos(new_tx_start, cds_start, transcript.cds_end)?;
514        let new_end = self.tx_to_cds_pos(new_tx_end, cds_start, transcript.cds_end)?;
515
516        let new_variant = CdsVariant {
517            accession: variant.accession.clone(),
518            gene_symbol: variant.gene_symbol.clone(),
519            loc_edit: LocEdit::new(Interval::new(new_start, new_end), new_edit),
520        };
521
522        Ok((HV::Cds(new_variant), warnings))
523    }
524
525    /// Normalize a transcript variant
526    fn normalize_tx(
527        &self,
528        variant: &TxVariant,
529    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
530        // Can't normalize variants with unknown edits or positions
531        let edit = match variant.loc_edit.edit.inner() {
532            Some(e) => e,
533            None => return Ok((HV::Tx(variant.clone()), vec![])),
534        };
535
536        // Only normalize indels
537        if !needs_normalization(edit) {
538            return Ok((HV::Tx(variant.clone()), vec![]));
539        }
540
541        // Check for intronic variants or unknown positions
542        let start_pos = match variant.loc_edit.location.start.inner() {
543            Some(pos) => pos,
544            None => return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![])),
545        };
546        let end_pos = match variant.loc_edit.location.end.inner() {
547            Some(pos) => pos,
548            None => return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![])),
549        };
550
551        // Can't normalize variants with unknown (?) offsets - return unchanged
552        if has_unknown_offset_tx(start_pos) || has_unknown_offset_tx(end_pos) {
553            return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![]));
554        }
555
556        // Try to get transcript first - we need it for intronic normalization too
557        let accession = variant.accession.transcript_accession();
558        let transcript = match self.provider.get_transcript(&accession) {
559            Ok(t) => t,
560            // Can't do full normalization without transcript, but apply minimal notation
561            Err(_) => return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![])),
562        };
563
564        if start_pos.is_intronic() || end_pos.is_intronic() {
565            // Route intronic tx variants to the intronic normalization path
566            if start_pos.is_intronic() && end_pos.is_intronic() {
567                return self.normalize_intronic_tx(variant, &transcript, start_pos, end_pos, edit);
568            }
569            // Variant spans exon-intron boundary - not yet supported for n. coords
570            return Err(FerroError::IntronicVariant {
571                variant: format!("{}", variant),
572            });
573        }
574
575        // Only normalize positive positions (within transcript)
576        // Negative positions are outside the transcript sequence
577        if start_pos.base < 1 || end_pos.base < 1 {
578            return Ok((HV::Tx(self.canonicalize_tx_variant(variant)), vec![]));
579        }
580
581        let tx_start = start_pos.base as u64;
582        let tx_end = end_pos.base as u64;
583
584        // Get boundaries
585        let boundaries = Boundaries::new(1, transcript.sequence.len() as u64);
586
587        // Perform normalization
588        let seq = transcript.sequence.as_bytes();
589        let (new_start, new_end, new_edit, warnings) =
590            self.normalize_na_edit(seq, edit, tx_start, tx_end, &boundaries)?;
591
592        let new_variant = TxVariant {
593            accession: variant.accession.clone(),
594            gene_symbol: variant.gene_symbol.clone(),
595            loc_edit: LocEdit::new(
596                Interval::new(TxPos::new(new_start as i64), TxPos::new(new_end as i64)),
597                new_edit,
598            ),
599        };
600
601        Ok((HV::Tx(new_variant), warnings))
602    }
603
604    /// Normalize a protein variant
605    ///
606    /// Protein normalization differs from nucleic acid normalization - there's no
607    /// 3'/5' shifting. Instead, we perform formatting standardization:
608    ///
609    /// 1. **Reference validation**: Check that position amino acids match the
610    ///    reference protein sequence (if protein data is available).
611    ///
612    /// 2. **Redundant sequence removal**: Remove explicit sequences in deletions
613    ///    when they match the amino acids at the deletion position.
614    ///    Example: `p.Val600delVal` → `p.Val600del`
615    ///
616    /// 3. **1-letter to 3-letter conversion**: (handled by parser/display)
617    fn normalize_protein(
618        &self,
619        variant: &crate::hgvs::variant::ProteinVariant,
620    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
621        use crate::hgvs::edit::ProteinEdit;
622        use crate::hgvs::variant::{LocEdit, ProteinVariant};
623
624        // Validate reference amino acids if provider has protein data
625        if self.provider.has_protein_data() {
626            self.validate_protein_reference(variant)?;
627        }
628
629        // Get the current edit
630        let edit = match variant.loc_edit.edit.inner() {
631            Some(e) => e,
632            None => return Ok((HV::Protein(variant.clone()), vec![])),
633        };
634
635        // Apply normalization based on edit type
636        let normalized_edit = match edit {
637            ProteinEdit::Deletion { sequence, count } => {
638                // Check for redundant sequence that matches the position
639                if let Some(seq) = sequence {
640                    if self.is_redundant_protein_deletion_sequence(&variant.loc_edit.location, seq)
641                    {
642                        // Remove redundant sequence
643                        ProteinEdit::Deletion {
644                            sequence: None,
645                            count: *count,
646                        }
647                    } else {
648                        edit.clone()
649                    }
650                } else {
651                    edit.clone()
652                }
653            }
654            // Other edits pass through unchanged
655            _ => edit.clone(),
656        };
657
658        // Only create a new variant if the edit changed
659        if &normalized_edit != edit {
660            let new_variant = ProteinVariant {
661                accession: variant.accession.clone(),
662                gene_symbol: variant.gene_symbol.clone(),
663                loc_edit: LocEdit::with_uncertainty(
664                    variant.loc_edit.location.clone(),
665                    variant.loc_edit.edit.map_ref(|_| normalized_edit),
666                ),
667            };
668            Ok((HV::Protein(new_variant), vec![]))
669        } else {
670            Ok((HV::Protein(variant.clone()), vec![]))
671        }
672    }
673
674    /// Validate that the amino acids in a protein variant match the reference
675    ///
676    /// Returns an error if there's a mismatch between the variant's stated
677    /// amino acid(s) and the actual reference protein sequence.
678    fn validate_protein_reference(
679        &self,
680        variant: &crate::hgvs::variant::ProteinVariant,
681    ) -> Result<(), FerroError> {
682        use crate::hgvs::edit::ProteinEdit;
683
684        let accession = variant.accession.transcript_accession();
685
686        // Get start and end positions
687        let start_pos = match variant.loc_edit.location.start.inner() {
688            Some(pos) => pos,
689            None => return Ok(()), // Can't validate uncertain positions
690        };
691        let end_pos = match variant.loc_edit.location.end.inner() {
692            Some(pos) => pos,
693            None => return Ok(()),
694        };
695
696        // Get the edit to know what amino acids to validate
697        let edit = match variant.loc_edit.edit.inner() {
698            Some(e) => e,
699            None => return Ok(()),
700        };
701
702        // Only validate edits that specify reference amino acids
703        match edit {
704            ProteinEdit::Substitution { .. } => {
705                // For substitutions, the reference AA comes from the position (start_pos.aa),
706                // not from the edit (which may be Xaa placeholder)
707                self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
708            }
709            ProteinEdit::Deletion { .. } => {
710                // Validate start position (from the interval's start AA)
711                self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
712                // Validate end position if different
713                if end_pos.number != start_pos.number {
714                    self.validate_protein_position(&accession, end_pos.number, &end_pos.aa)?;
715                }
716            }
717            ProteinEdit::Duplication => {
718                // Validate start and end positions
719                self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
720                if end_pos.number != start_pos.number {
721                    self.validate_protein_position(&accession, end_pos.number, &end_pos.aa)?;
722                }
723            }
724            ProteinEdit::Insertion { .. } => {
725                // Validate flanking positions
726                self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
727                self.validate_protein_position(&accession, end_pos.number, &end_pos.aa)?;
728            }
729            ProteinEdit::Delins { .. } => {
730                // Validate start and end
731                self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
732                if end_pos.number != start_pos.number {
733                    self.validate_protein_position(&accession, end_pos.number, &end_pos.aa)?;
734                }
735            }
736            ProteinEdit::Frameshift { .. } => {
737                // Validate the frameshift position
738                self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
739            }
740            ProteinEdit::Extension { .. } => {
741                // Extension typically at Ter position - validate
742                self.validate_protein_position(&accession, start_pos.number, &start_pos.aa)?;
743            }
744            _ => {
745                // Identity, Unknown, etc. - no validation needed
746            }
747        }
748
749        Ok(())
750    }
751
752    /// Validate a single protein position against reference
753    fn validate_protein_position(
754        &self,
755        accession: &str,
756        position: u64,
757        expected_aa: &crate::hgvs::location::AminoAcid,
758    ) -> Result<(), FerroError> {
759        // Position is 1-based in HGVS, convert to 0-based for sequence access
760        // get_protein_sequence uses half-open interval [start, end)
761        let start = hgvs_pos_to_index(position) as u64;
762        let end = position; // exclusive end
763
764        // Try to get the reference amino acid
765        match self.provider.get_protein_sequence(accession, start, end) {
766            Ok(ref_seq) => {
767                if ref_seq.len() != 1 {
768                    return Ok(()); // Unexpected, skip validation
769                }
770
771                let ref_aa_char = ref_seq.chars().next().unwrap();
772                let expected_char = expected_aa.to_one_letter();
773
774                if ref_aa_char != expected_char {
775                    return Err(FerroError::AminoAcidMismatch {
776                        accession: accession.to_string(),
777                        position,
778                        expected: expected_aa.to_string(),
779                        found: ref_aa_char.to_string(),
780                    });
781                }
782            }
783            Err(_) => {
784                // Protein sequence not available, skip validation
785            }
786        }
787
788        Ok(())
789    }
790
791    /// Check if the deletion sequence is redundant (matches the position amino acids)
792    ///
793    /// A deletion sequence is redundant if it exactly matches the amino acids
794    /// specified in the interval. For example:
795    /// - `p.Val600delVal` - sequence [Val] matches position 600's Val → redundant
796    /// - `p.Lys23_Leu24delLysLeu` - sequence [Lys, Leu] matches positions 23-24 → redundant
797    fn is_redundant_protein_deletion_sequence(
798        &self,
799        interval: &crate::hgvs::interval::ProtInterval,
800        sequence: &crate::hgvs::edit::AminoAcidSeq,
801    ) -> bool {
802        // Get the start and end positions
803        let start_pos = match interval.start.inner() {
804            Some(pos) => pos,
805            None => return false,
806        };
807        let end_pos = match interval.end.inner() {
808            Some(pos) => pos,
809            None => return false,
810        };
811
812        // Calculate expected sequence length from interval
813        let interval_len = if end_pos.number >= start_pos.number {
814            (end_pos.number - start_pos.number + 1) as usize
815        } else {
816            return false;
817        };
818
819        // Check if sequence length matches
820        if sequence.len() != interval_len {
821            return false;
822        }
823
824        // For a point deletion (single AA), check if the sequence matches the position AA
825        if interval_len == 1 {
826            return sequence.0.len() == 1 && sequence.0[0] == start_pos.aa;
827        }
828
829        // For a range deletion, check first and last AAs
830        // The sequence should be [start_aa, ..., end_aa]
831        if let (Some(first), Some(last)) = (sequence.0.first(), sequence.0.last()) {
832            return *first == start_pos.aa && *last == end_pos.aa;
833        }
834
835        false
836    }
837
838    /// Normalize an RNA variant
839    ///
840    /// RNA variants (r.) are similar to transcript variants (n.) and undergo
841    /// the same 3'/5' shifting normalization for indels. The main difference
842    /// is that RNA uses lowercase nucleotides in HGVS notation.
843    fn normalize_rna(
844        &self,
845        variant: &crate::hgvs::variant::RnaVariant,
846    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
847        use crate::hgvs::interval::RnaInterval;
848        use crate::hgvs::location::RnaPos;
849        use crate::hgvs::variant::{LocEdit, RnaVariant};
850
851        // Can't normalize variants with unknown edits or positions
852        let edit = match variant.loc_edit.edit.inner() {
853            Some(e) => e,
854            None => return Ok((HV::Rna(variant.clone()), vec![])),
855        };
856
857        // Only normalize indels
858        if !needs_normalization(edit) {
859            return Ok((HV::Rna(variant.clone()), vec![]));
860        }
861
862        // Check for intronic variants or unknown positions
863        let start_pos = match variant.loc_edit.location.start.inner() {
864            Some(pos) => pos,
865            None => return Ok((HV::Rna(variant.clone()), vec![])),
866        };
867        let end_pos = match variant.loc_edit.location.end.inner() {
868            Some(pos) => pos,
869            None => return Ok((HV::Rna(variant.clone()), vec![])),
870        };
871
872        if start_pos.is_intronic() || end_pos.is_intronic() {
873            return Err(FerroError::IntronicVariant {
874                variant: format!("{}", variant),
875            });
876        }
877
878        // Try to get transcript (RNA uses the same accession as mRNA transcripts)
879        let accession = variant.accession.transcript_accession();
880        let transcript = match self.provider.get_transcript(&accession) {
881            Ok(t) => t,
882            Err(_) => return Ok((HV::Rna(variant.clone()), vec![])),
883        };
884
885        // Only normalize positive positions (within transcript)
886        // Negative positions are outside the transcript sequence
887        if start_pos.base < 1 || end_pos.base < 1 {
888            return Ok((HV::Rna(variant.clone()), vec![]));
889        }
890
891        let rna_start = start_pos.base as u64;
892        let rna_end = end_pos.base as u64;
893
894        // Get boundaries
895        let boundaries = Boundaries::new(1, transcript.sequence.len() as u64);
896
897        // Perform normalization
898        let seq = transcript.sequence.as_bytes();
899        let (new_start, new_end, new_edit, warnings) =
900            self.normalize_na_edit(seq, edit, rna_start, rna_end, &boundaries)?;
901
902        let new_variant = RnaVariant {
903            accession: variant.accession.clone(),
904            gene_symbol: variant.gene_symbol.clone(),
905            loc_edit: LocEdit::new(
906                RnaInterval::new(RnaPos::new(new_start as i64), RnaPos::new(new_end as i64)),
907                new_edit,
908            ),
909        };
910
911        Ok((HV::Rna(new_variant), warnings))
912    }
913
914    /// Normalize a mitochondrial variant
915    fn normalize_mt(
916        &self,
917        variant: &crate::hgvs::variant::MtVariant,
918    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
919        // MT variants are similar to genomic
920        // For now, return as-is (could implement circular genome handling)
921        Ok((HV::Mt(variant.clone()), vec![]))
922    }
923
924    /// Normalize an intronic CDS variant
925    ///
926    /// This converts the intronic position to genomic coordinates, normalizes
927    /// in genomic space, and converts back to CDS intronic notation.
928    fn normalize_intronic_cds(
929        &self,
930        variant: &CdsVariant,
931        transcript: &crate::reference::transcript::Transcript,
932        start_pos: &CdsPos,
933        end_pos: &CdsPos,
934        edit: &NaEdit,
935    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
936        use crate::convert::CoordinateMapper;
937
938        // Check if we have genomic data available
939        if !self.provider.has_genomic_data() {
940            return Err(FerroError::IntronicVariant {
941                variant: format!("{}", variant),
942            });
943        }
944
945        // Get the chromosome for this transcript
946        let chromosome =
947            transcript
948                .chromosome
949                .as_ref()
950                .ok_or_else(|| FerroError::ConversionError {
951                    msg: "Transcript has no chromosome mapping for intronic normalization"
952                        .to_string(),
953                })?;
954
955        // Create coordinate mapper
956        let mapper = CoordinateMapper::new(transcript);
957
958        // Convert CDS intronic positions to genomic
959        let g_start = mapper.cds_to_genomic_with_intron(start_pos)?;
960        let g_end = mapper.cds_to_genomic_with_intron(end_pos)?;
961
962        // On minus strand, genomic coords may be reversed relative to coding order.
963        // Track whether we swap so we can restore coding order after normalization.
964        let swapped = g_start > g_end;
965        let (g_start, g_end) = if swapped {
966            (g_end, g_start)
967        } else {
968            (g_start, g_end)
969        };
970
971        // Get a window of genomic sequence around the variant for normalization
972        // Use the same window size as for exonic normalization
973        let window = self.config.window_size;
974        let seq_start = g_start.saturating_sub(window);
975        let seq_end = g_end.saturating_add(window);
976
977        // Fetch genomic sequence
978        let genomic_seq = self
979            .provider
980            .get_genomic_sequence(chromosome, seq_start, seq_end)?;
981
982        // Calculate the variant position relative to the fetched sequence
983        let rel_start = (g_start - seq_start) + 1; // 1-based
984        let rel_end = (g_end - seq_start) + 1;
985
986        // Define boundaries within the intron
987        // For intronic variants, we can shift within the intron but not into exons
988        // Find the intron boundaries
989        // Use exon-aware CDS-to-tx mapping to account for cdot's gap positions
990        let tx_pos = mapper.cds_to_tx(start_pos)?;
991        let tx_start = u64::try_from(tx_pos.base).map_err(|_| FerroError::ConversionError {
992            msg: format!(
993                "Negative transcript position {} during intronic normalization",
994                tx_pos.base
995            ),
996        })?;
997
998        let intron = transcript
999            .find_intron_at_tx_boundary(tx_start, start_pos.offset.unwrap_or(0))
1000            .ok_or_else(|| FerroError::ConversionError {
1001                msg: "Could not find intron for normalization".to_string(),
1002            })?;
1003
1004        // Get intron boundaries in genomic coordinates
1005        let (intron_g_start, intron_g_end) = match (intron.genomic_start, intron.genomic_end) {
1006            (Some(s), Some(e)) => (s, e),
1007            _ => {
1008                return Err(FerroError::ConversionError {
1009                    msg: "Intron has no genomic coordinates".to_string(),
1010                })
1011            }
1012        };
1013
1014        // Calculate relative intron boundaries
1015        let intron_rel_start = intron_g_start.saturating_sub(seq_start) + 1;
1016        let intron_rel_end = intron_g_end.saturating_sub(seq_start) + 1;
1017        let boundaries = Boundaries::new(intron_rel_start, intron_rel_end);
1018
1019        // On minus-strand transcripts the genomic-strand sequence is the
1020        // reverse complement of the transcript view, but the variant's
1021        // edit alt is in transcript view. Running `normalize_na_edit` on
1022        // the genomic-strand bytes therefore defeats every rule that
1023        // compares the alt against the local reference window. Flip the
1024        // sequence and the relative positions / boundaries to transcript
1025        // view here, run normalization, then map the result positions
1026        // back to the genomic frame. (Issue #98.)
1027        let (work_seq, work_rel_start, work_rel_end, work_boundaries) = flip_intronic_for_strand(
1028            transcript.strand,
1029            &genomic_seq,
1030            rel_start,
1031            rel_end,
1032            &boundaries,
1033        );
1034
1035        // Perform normalization in transcript-view space
1036        let seq_bytes = work_seq.as_bytes();
1037        let (work_new_rel_start, work_new_rel_end, new_edit, warnings) = self.normalize_na_edit(
1038            seq_bytes,
1039            edit,
1040            work_rel_start,
1041            work_rel_end,
1042            &work_boundaries,
1043        )?;
1044
1045        // Map the result positions back to the genomic-strand frame
1046        let (new_rel_start, new_rel_end) = unflip_intronic_positions(
1047            transcript.strand,
1048            work_seq.len() as u64,
1049            work_new_rel_start,
1050            work_new_rel_end,
1051        );
1052
1053        // Convert the normalized genomic position back to absolute genomic
1054        let new_g_start = seq_start + new_rel_start - 1;
1055        let new_g_end = seq_start + new_rel_end - 1;
1056
1057        // Convert back to CDS intronic notation
1058        let new_start = mapper.genomic_to_cds_intronic(new_g_start)?;
1059        let new_end = mapper.genomic_to_cds_intronic(new_g_end)?;
1060
1061        // Restore coding order if positions were swapped for genomic processing
1062        let (new_start, new_end) = if swapped {
1063            (new_end, new_start)
1064        } else {
1065            (new_start, new_end)
1066        };
1067
1068        let new_variant = CdsVariant {
1069            accession: variant.accession.clone(),
1070            gene_symbol: variant.gene_symbol.clone(),
1071            loc_edit: LocEdit::new(Interval::new(new_start, new_end), new_edit),
1072        };
1073
1074        Ok((HV::Cds(new_variant), warnings))
1075    }
1076
1077    /// Normalize an intronic transcript (n.) variant
1078    ///
1079    /// This mirrors `normalize_intronic_cds()` but works with TxPos instead of CdsPos.
1080    /// Converts to genomic coordinates, normalizes in genomic space, and converts back.
1081    fn normalize_intronic_tx(
1082        &self,
1083        variant: &TxVariant,
1084        transcript: &crate::reference::transcript::Transcript,
1085        start_pos: &TxPos,
1086        end_pos: &TxPos,
1087        edit: &NaEdit,
1088    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
1089        use crate::convert::CoordinateMapper;
1090
1091        // Check if we have genomic data available
1092        if !self.provider.has_genomic_data() {
1093            return Err(FerroError::IntronicVariant {
1094                variant: format!("{}", variant),
1095            });
1096        }
1097
1098        // Get the chromosome for this transcript
1099        let chromosome =
1100            transcript
1101                .chromosome
1102                .as_ref()
1103                .ok_or_else(|| FerroError::ConversionError {
1104                    msg: "Transcript has no chromosome mapping for intronic normalization"
1105                        .to_string(),
1106                })?;
1107
1108        // Create coordinate mapper
1109        let mapper = CoordinateMapper::new(transcript);
1110
1111        // Convert tx intronic positions to genomic
1112        let g_start = mapper.tx_to_genomic_with_intron(start_pos)?;
1113        let g_end = mapper.tx_to_genomic_with_intron(end_pos)?;
1114
1115        // On minus strand, genomic coords may be reversed relative to coding order.
1116        // Track whether we swap so we can restore coding order after normalization.
1117        let swapped = g_start > g_end;
1118        let (g_start, g_end) = if swapped {
1119            (g_end, g_start)
1120        } else {
1121            (g_start, g_end)
1122        };
1123
1124        // Get a window of genomic sequence around the variant
1125        let window = self.config.window_size;
1126        let seq_start = g_start.saturating_sub(window);
1127        let seq_end = g_end.saturating_add(window);
1128
1129        // Fetch genomic sequence
1130        let genomic_seq = self
1131            .provider
1132            .get_genomic_sequence(chromosome, seq_start, seq_end)?;
1133
1134        // Calculate the variant position relative to the fetched sequence
1135        let rel_start = (g_start - seq_start) + 1; // 1-based
1136        let rel_end = (g_end - seq_start) + 1;
1137
1138        // Find the intron boundaries for normalization limits
1139        let tx_start = u64::try_from(start_pos.base).map_err(|_| FerroError::ConversionError {
1140            msg: format!(
1141                "Negative transcript position {} during intronic normalization",
1142                start_pos.base
1143            ),
1144        })?;
1145
1146        let intron = transcript
1147            .find_intron_at_tx_boundary(tx_start, start_pos.offset.unwrap_or(0))
1148            .ok_or_else(|| FerroError::ConversionError {
1149                msg: "Could not find intron for normalization".to_string(),
1150            })?;
1151
1152        // Get intron boundaries in genomic coordinates
1153        let (intron_g_start, intron_g_end) = match (intron.genomic_start, intron.genomic_end) {
1154            (Some(s), Some(e)) => (s, e),
1155            _ => {
1156                return Err(FerroError::ConversionError {
1157                    msg: "Intron has no genomic coordinates".to_string(),
1158                })
1159            }
1160        };
1161
1162        // Calculate relative intron boundaries
1163        let intron_rel_start = intron_g_start.saturating_sub(seq_start) + 1;
1164        let intron_rel_end = intron_g_end.saturating_sub(seq_start) + 1;
1165        let boundaries = Boundaries::new(intron_rel_start, intron_rel_end);
1166
1167        // See `normalize_intronic_cds`: same orientation fix for #98.
1168        let (work_seq, work_rel_start, work_rel_end, work_boundaries) = flip_intronic_for_strand(
1169            transcript.strand,
1170            &genomic_seq,
1171            rel_start,
1172            rel_end,
1173            &boundaries,
1174        );
1175
1176        // Perform normalization in transcript-view space
1177        let seq_bytes = work_seq.as_bytes();
1178        let (work_new_rel_start, work_new_rel_end, new_edit, warnings) = self.normalize_na_edit(
1179            seq_bytes,
1180            edit,
1181            work_rel_start,
1182            work_rel_end,
1183            &work_boundaries,
1184        )?;
1185
1186        // Map the result positions back to the genomic-strand frame
1187        let (new_rel_start, new_rel_end) = unflip_intronic_positions(
1188            transcript.strand,
1189            work_seq.len() as u64,
1190            work_new_rel_start,
1191            work_new_rel_end,
1192        );
1193
1194        // Convert the normalized genomic position back to absolute genomic
1195        let new_g_start = seq_start + new_rel_start - 1;
1196        let new_g_end = seq_start + new_rel_end - 1;
1197
1198        // Convert back to transcript intronic notation
1199        let new_start = mapper.genomic_to_tx_with_intron(new_g_start)?;
1200        let new_end = mapper.genomic_to_tx_with_intron(new_g_end)?;
1201
1202        // Restore coding order if positions were swapped for genomic processing
1203        let (new_start, new_end) = if swapped {
1204            (new_end, new_start)
1205        } else {
1206            (new_start, new_end)
1207        };
1208
1209        let new_variant = TxVariant {
1210            accession: variant.accession.clone(),
1211            gene_symbol: variant.gene_symbol.clone(),
1212            loc_edit: LocEdit::new(Interval::new(new_start, new_end), new_edit),
1213        };
1214
1215        Ok((HV::Tx(new_variant), warnings))
1216    }
1217
1218    /// Normalize a CDS variant that spans an exon-intron boundary
1219    ///
1220    /// This handles cases like:
1221    /// - c.914_918+3del (exonic start, intronic end)
1222    /// - c.194-64_233del (intronic start, exonic end)
1223    ///
1224    /// Strategy: Convert to genomic coordinates, normalize there, convert back.
1225    fn normalize_boundary_spanning_cds(
1226        &self,
1227        variant: &CdsVariant,
1228        transcript: &crate::reference::transcript::Transcript,
1229        start_pos: &CdsPos,
1230        end_pos: &CdsPos,
1231        edit: &NaEdit,
1232    ) -> Result<(HgvsVariant, Vec<NormalizationWarning>), FerroError> {
1233        use crate::convert::CoordinateMapper;
1234
1235        // Require genomic data for boundary spanning normalization
1236        if !self.provider.has_genomic_data() {
1237            return Err(FerroError::ExonIntronBoundary {
1238                exon: 0,
1239                variant: format!("{}", variant),
1240            });
1241        }
1242
1243        let chromosome =
1244            transcript
1245                .chromosome
1246                .as_ref()
1247                .ok_or_else(|| FerroError::ConversionError {
1248                    msg: "Transcript has no chromosome for boundary normalization".to_string(),
1249                })?;
1250
1251        let mapper = CoordinateMapper::new(transcript);
1252
1253        // Convert both positions to genomic
1254        // For exonic positions, use standard conversion
1255        // For intronic positions, use intronic conversion
1256        let g_start = self.cds_pos_to_genomic(&mapper, start_pos)?;
1257        let g_end = self.cds_pos_to_genomic(&mapper, end_pos)?;
1258
1259        // On minus strand, genomic coords may be reversed relative to coding order.
1260        // Track whether we swap so we can restore coding order after normalization.
1261        let swapped = g_start > g_end;
1262        let (g_start, g_end) = if swapped {
1263            (g_end, g_start)
1264        } else {
1265            (g_start, g_end)
1266        };
1267
1268        // Fetch genomic sequence with window for normalization
1269        let window = self.config.window_size;
1270        let seq_start = g_start.saturating_sub(window);
1271        let seq_end = g_end.saturating_add(window);
1272        let genomic_seq = self
1273            .provider
1274            .get_genomic_sequence(chromosome, seq_start, seq_end)?;
1275
1276        // Calculate relative positions (1-based)
1277        let rel_start = (g_start - seq_start) + 1;
1278        let rel_end = (g_end - seq_start) + 1;
1279
1280        // Determine normalization boundaries
1281        // For boundary-spanning variants, use the union of exon and intron boundaries
1282        let boundaries =
1283            self.get_boundary_spanning_limits(transcript, &mapper, start_pos, end_pos, seq_start)?;
1284
1285        // Normalize in genomic space
1286        let seq_bytes = genomic_seq.as_bytes();
1287        let (new_rel_start, new_rel_end, new_edit, warnings) =
1288            self.normalize_na_edit(seq_bytes, edit, rel_start, rel_end, &boundaries)?;
1289
1290        // Convert back to absolute genomic
1291        let new_g_start = seq_start + new_rel_start - 1;
1292        let new_g_end = seq_start + new_rel_end - 1;
1293
1294        // Convert genomic back to CDS
1295        // The result might be:
1296        // - Still boundary-spanning
1297        // - Fully exonic (if shifted into exon)
1298        // - Fully intronic (if shifted into intron)
1299        let new_start = mapper.genomic_to_cds_intronic(new_g_start)?;
1300        let new_end = mapper.genomic_to_cds_intronic(new_g_end)?;
1301
1302        // Restore coding order if positions were swapped for genomic processing
1303        let (new_start, new_end) = if swapped {
1304            (new_end, new_start)
1305        } else {
1306            (new_start, new_end)
1307        };
1308
1309        let new_variant = CdsVariant {
1310            accession: variant.accession.clone(),
1311            gene_symbol: variant.gene_symbol.clone(),
1312            loc_edit: LocEdit::new(Interval::new(new_start, new_end), new_edit),
1313        };
1314
1315        Ok((HV::Cds(new_variant), warnings))
1316    }
1317
1318    /// Convert a CDS position (exonic or intronic) to genomic coordinate
1319    fn cds_pos_to_genomic(
1320        &self,
1321        mapper: &crate::convert::CoordinateMapper,
1322        pos: &CdsPos,
1323    ) -> Result<u64, FerroError> {
1324        if pos.is_intronic() {
1325            mapper.cds_to_genomic_with_intron(pos)
1326        } else {
1327            mapper
1328                .cds_to_genomic(pos)?
1329                .ok_or_else(|| FerroError::ConversionError {
1330                    msg: format!("CDS position {} not in exons", pos.base),
1331                })
1332        }
1333    }
1334
1335    /// Calculate normalization boundaries for boundary-spanning variants
1336    ///
1337    /// Returns the genomic region within which we can shift the variant,
1338    /// which is the union of the exon and intron containing the variant ends.
1339    fn get_boundary_spanning_limits(
1340        &self,
1341        transcript: &crate::reference::transcript::Transcript,
1342        mapper: &crate::convert::CoordinateMapper,
1343        start_pos: &CdsPos,
1344        end_pos: &CdsPos,
1345        seq_start: u64,
1346    ) -> Result<Boundaries, FerroError> {
1347        // Find the genomic extent of the region we can shift within
1348        // This is the union of:
1349        // - The exon containing the exonic position
1350        // - The intron containing the intronic position
1351
1352        // Identify which position is exonic and which is intronic
1353        let (exonic_pos, intronic_pos) = if start_pos.is_intronic() {
1354            (end_pos, start_pos)
1355        } else {
1356            (start_pos, end_pos)
1357        };
1358
1359        // Get exon boundaries in genomic coords
1360        let tx_pos = mapper.cds_to_tx(exonic_pos)?;
1361        let tx_pos_base = u64::try_from(tx_pos.base).map_err(|_| FerroError::ConversionError {
1362            msg: format!(
1363                "Negative transcript position {} during boundary normalization",
1364                tx_pos.base
1365            ),
1366        })?;
1367        let exon = transcript
1368            .exon_at(tx_pos_base)
1369            .ok_or_else(|| FerroError::ConversionError {
1370                msg: "Could not find exon for boundary normalization".to_string(),
1371            })?;
1372
1373        // Get intron boundaries in genomic coords
1374        let tx_boundary = mapper.cds_to_tx(intronic_pos)?;
1375        let tx_boundary_base =
1376            u64::try_from(tx_boundary.base).map_err(|_| FerroError::ConversionError {
1377                msg: format!(
1378                    "Negative transcript position {} during boundary normalization",
1379                    tx_boundary.base
1380                ),
1381            })?;
1382        let offset = intronic_pos.offset.unwrap_or(0);
1383        let intron = transcript
1384            .find_intron_at_tx_boundary(tx_boundary_base, offset)
1385            .ok_or_else(|| FerroError::ConversionError {
1386                msg: "Could not find intron for boundary normalization".to_string(),
1387            })?;
1388
1389        // Get genomic coordinates from exon and intron
1390        let (exon_g_start, exon_g_end): (u64, u64) = match (exon.genomic_start, exon.genomic_end) {
1391            (Some(s), Some(e)) => (s, e),
1392            _ => {
1393                return Err(FerroError::ConversionError {
1394                    msg: "Exon has no genomic coordinates".to_string(),
1395                })
1396            }
1397        };
1398
1399        let (intron_g_start, intron_g_end): (u64, u64) =
1400            match (intron.genomic_start, intron.genomic_end) {
1401                (Some(s), Some(e)) => (s, e),
1402                _ => {
1403                    return Err(FerroError::ConversionError {
1404                        msg: "Intron has no genomic coordinates".to_string(),
1405                    })
1406                }
1407            };
1408
1409        // Union of exon and intron genomic coordinates
1410        let combined_start = exon_g_start.min(intron_g_start);
1411        let combined_end = exon_g_end.max(intron_g_end);
1412
1413        // Convert to relative positions (1-based within our fetched sequence)
1414        let rel_start = combined_start.saturating_sub(seq_start) + 1;
1415        let rel_end = combined_end.saturating_sub(seq_start) + 1;
1416
1417        Ok(Boundaries::new(rel_start, rel_end))
1418    }
1419
1420    /// Core normalization for nucleic acid edits
1421    ///
1422    /// Returns (new_start, new_end, new_edit, warnings)
1423    fn normalize_na_edit(
1424        &self,
1425        ref_seq: &[u8],
1426        edit: &NaEdit,
1427        start: u64,
1428        end: u64,
1429        boundaries: &Boundaries,
1430    ) -> Result<(u64, u64, NaEdit, Vec<NormalizationWarning>), FerroError> {
1431        let mut warnings = Vec::new();
1432
1433        // Validate reference allele before normalization
1434        let validation = validate::validate_reference(edit, ref_seq, start, end);
1435        if !validation.valid {
1436            warnings.push(NormalizationWarning {
1437                code: "REFSEQ_MISMATCH".to_string(),
1438                message: validation.warning.unwrap_or_default(),
1439                stated_ref: validation.stated_ref.unwrap_or_default(),
1440                actual_ref: validation.actual_ref.unwrap_or_default(),
1441                position: format!("{}-{}", start, end),
1442                corrected: true,
1443            });
1444        }
1445
1446        // Get the alternate sequence for the edit
1447        let alt_seq = match edit {
1448            NaEdit::Deletion { .. } => vec![],
1449            NaEdit::Insertion { sequence } => {
1450                // Only shuffle if we have a literal sequence
1451                if let Some(bases) = sequence.bases() {
1452                    bases.iter().map(|b| b.to_u8()).collect()
1453                } else {
1454                    // Cannot shuffle non-literal insertions (counts, ranges, etc.)
1455                    return Ok((start, end, edit.clone(), warnings.clone()));
1456                }
1457            }
1458            NaEdit::Duplication { sequence, .. } => {
1459                if let Some(seq) = sequence {
1460                    seq.bases().iter().map(|b| b.to_u8()).collect()
1461                } else {
1462                    // Get duplicated sequence from reference
1463                    // start is 1-based, convert to 0-based index for array access
1464                    let s = hgvs_pos_to_index(start);
1465                    let e = end as usize;
1466                    if e <= ref_seq.len() {
1467                        ref_seq[s..e].to_vec()
1468                    } else {
1469                        vec![]
1470                    }
1471                }
1472            }
1473            NaEdit::Delins { sequence } => {
1474                use crate::hgvs::edit::InsertedSequence;
1475
1476                // HGVS spec: delins should NOT be 3' shifted like del/dup/ins,
1477                // but the edit-type priority (sub > del > inv > dup > ins) means
1478                // we may need to rewrite it as a higher-priority form: identity
1479                // (insert == ref), substitution (1->1 ref!=alt), or duplication.
1480                if let InsertedSequence::Literal(seq) = sequence {
1481                    let seq_bytes: Vec<u8> = seq.bases().iter().map(|b| *b as u8).collect();
1482                    let start_idx = hgvs_pos_to_index(start);
1483                    let end_idx = end as usize;
1484
1485                    // Identity (highest priority): insert == deleted reference.
1486                    // Example: c.10delinsG where ref[10]=G → c.10=.
1487                    if rules::delins_is_identity(ref_seq, start_idx, end_idx, &seq_bytes) {
1488                        return Ok((start, end, NaEdit::position_identity(), warnings.clone()));
1489                    }
1490
1491                    // Substitution: 1-base delins with a different alt base.
1492                    // Example: g.1000delinsA where ref[1000]=G → g.1000G>A.
1493                    if let Some((reference, alternative)) =
1494                        rules::delins_is_substitution(ref_seq, start_idx, end_idx, &seq_bytes)
1495                    {
1496                        return Ok((
1497                            start,
1498                            start,
1499                            NaEdit::Substitution {
1500                                reference,
1501                                alternative,
1502                            },
1503                            warnings.clone(),
1504                        ));
1505                    }
1506
1507                    // Duplication: c.5delinsGG where position 5 is G → dup.
1508                    if rules::delins_is_duplication(ref_seq, start_idx, end_idx, &seq_bytes) {
1509                        // Convert to duplication with minimal notation
1510                        return Ok((
1511                            start,
1512                            end,
1513                            NaEdit::Duplication {
1514                                sequence: None,
1515                                length: None,
1516                                uncertain_extent: None,
1517                            },
1518                            warnings.clone(),
1519                        ));
1520                    }
1521                }
1522                // Return unchanged (only apply minimal representation rules, not shuffling)
1523                return Ok((start, end, edit.clone(), warnings.clone()));
1524            }
1525            NaEdit::Inversion { sequence, length } => {
1526                // Apply inversion shortening: remove outer bases that cancel out
1527                // when the first base is complementary to the last base
1528                let start_idx = hgvs_pos_to_index(start); // Convert 1-based to 0-based
1529                let end_idx = end as usize; // end is exclusive (0-based)
1530
1531                if let Some((new_s, new_e)) = rules::shorten_inversion(ref_seq, start_idx, end_idx)
1532                {
1533                    // Inversion was shortened - convert back to 1-based
1534                    let shortened_edit = NaEdit::Inversion {
1535                        sequence: sequence.clone(),
1536                        length: *length,
1537                    };
1538                    return Ok((
1539                        index_to_hgvs_pos(new_s),
1540                        new_e as u64,
1541                        shortened_edit,
1542                        warnings,
1543                    ));
1544                } else {
1545                    // Inversion reduced to identity - return as identity
1546                    return Ok((
1547                        start,
1548                        end,
1549                        NaEdit::Identity {
1550                            sequence: None,
1551                            whole_entity: false,
1552                        },
1553                        warnings,
1554                    ));
1555                }
1556            }
1557            NaEdit::Repeat {
1558                sequence,
1559                count,
1560                additional_counts,
1561                trailing,
1562            } => {
1563                use crate::hgvs::edit::RepeatCount;
1564
1565                // Only normalize exact counts with a sequence
1566                let Some(seq) = sequence else {
1567                    return Ok((start, end, edit.clone(), warnings.clone()));
1568                };
1569
1570                let RepeatCount::Exact(specified_count) = count else {
1571                    return Ok((start, end, edit.clone(), warnings.clone()));
1572                };
1573
1574                // Skip if there are additional counts (genotype notation)
1575                if !additional_counts.is_empty() || trailing.is_some() {
1576                    return Ok((start, end, edit.clone(), warnings.clone()));
1577                }
1578
1579                // Get the repeat unit as bytes
1580                let repeat_unit: Vec<u8> = seq.bases().iter().map(|b| b.to_u8()).collect();
1581                let pos_idx = hgvs_pos_to_index(start); // Convert 1-based to 0-based
1582
1583                // Normalize the repeat
1584                match rules::normalize_repeat(ref_seq, pos_idx, &repeat_unit, *specified_count) {
1585                    rules::RepeatNormResult::Deletion {
1586                        start: del_start,
1587                        end: del_end,
1588                    } => {
1589                        // Minimal notation - no explicit length
1590                        let del_edit = NaEdit::Deletion {
1591                            sequence: None,
1592                            length: None,
1593                        };
1594                        return Ok((del_start, del_end, del_edit, warnings));
1595                    }
1596                    rules::RepeatNormResult::Duplication {
1597                        start: dup_start,
1598                        end: dup_end,
1599                        sequence: _dup_seq,
1600                    } => {
1601                        // Minimal notation - no explicit sequence or length
1602                        let dup_edit = NaEdit::Duplication {
1603                            sequence: None,
1604                            length: None,
1605                            uncertain_extent: None,
1606                        };
1607                        return Ok((dup_start, dup_end, dup_edit, warnings));
1608                    }
1609                    rules::RepeatNormResult::Repeat {
1610                        start: rep_start,
1611                        end: rep_end,
1612                        sequence: rep_seq,
1613                        count: rep_count,
1614                    } => {
1615                        use crate::hgvs::edit::{Base, RepeatCount, Sequence};
1616                        let bases: Vec<Base> = rep_seq
1617                            .iter()
1618                            .filter_map(|&b| Base::from_char(b as char))
1619                            .collect();
1620                        let rep_edit = NaEdit::Repeat {
1621                            sequence: Some(Sequence::new(bases)),
1622                            count: RepeatCount::Exact(rep_count),
1623                            additional_counts: vec![],
1624                            trailing: None,
1625                        };
1626                        return Ok((rep_start, rep_end, rep_edit, warnings));
1627                    }
1628                    rules::RepeatNormResult::Unchanged => {
1629                        return Ok((start, end, edit.clone(), warnings.clone()));
1630                    }
1631                }
1632            }
1633            _ => return Ok((start, end, edit.clone(), warnings.clone())), // Other edits don't need shuffling
1634        };
1635
1636        // Perform shuffle
1637        // For insertions, the HGVS interval X_Y (where Y = X+1) represents flanking positions.
1638        // We need to adjust the end coordinate so shuffle checks the correct reference position.
1639        // For c.445_446insA: start=634, end=635 (1-based tx coords)
1640        // We want shuffle to check ref_seq[634-1] = ref_seq[633] for first flanking
1641        // and ref_seq[635-1] = ref_seq[634] for second flanking (the position to check for 3' shift)
1642        //
1643        // For insertions, we need to adjust the start passed to shuffle so that the alt_idx
1644        // calculation starts at 0 (not 1). The shuffle's alt_idx formula is:
1645        //   alt_idx = (new_end - start) % alt_seq.len()
1646        // For insertions, new_end starts at shuffle_end (which is end - 1 for insertions).
1647        // If we pass start_idx directly, alt_idx = (end-1) - start_idx = 1 (wrong, should be 0).
1648        // By passing start_idx + 1, we get alt_idx = (end-1) - (start_idx+1) = 0 (correct).
1649        let shuffle_end = match edit {
1650            NaEdit::Insertion { .. } => end.saturating_sub(1), // Use second flanking position
1651            _ => end,
1652        };
1653        let start_idx = hgvs_pos_to_index(start); // Convert 1-based to 0-based
1654        let shuffle_start = match edit {
1655            NaEdit::Insertion { .. } => start_idx as u64 + 1, // Adjust so alt_idx starts at 0
1656            _ => start_idx as u64,
1657        };
1658        let result = shuffle(
1659            ref_seq,
1660            &alt_seq,
1661            shuffle_start,
1662            shuffle_end, // Adjusted for insertions
1663            boundaries,
1664            self.config.shuffle_direction,
1665        );
1666
1667        // Convert back to 1-based HGVS positions
1668        // For insertions, we adjusted the start for shuffle, now adjust back
1669        let shuffle_result_start = match edit {
1670            NaEdit::Insertion { .. } => result.start.saturating_sub(1), // Adjust back
1671            _ => result.start,
1672        };
1673        let new_start = index_to_hgvs_pos(shuffle_result_start as usize);
1674        // For insertions, we adjusted the end for shuffle, now restore the HGVS X_(X+1) format
1675        let new_end = match edit {
1676            NaEdit::Insertion { .. } => index_to_hgvs_pos(result.end as usize), // Restore second flanking position
1677            _ => result.end,
1678        };
1679
1680        // Determine the canonical form for the edit
1681        // HGVS rules:
1682        // - Deletions ALWAYS stay as deletions (just shift 3')
1683        // - Insertions become duplications if single-base matches adjacent
1684        // - Multi-base insertions/dups in homopolymer become repeat notation
1685        let (final_start, final_end, new_edit) = match edit {
1686            NaEdit::Insertion { sequence } => {
1687                use crate::hgvs::edit::{InsertedSequence, RepeatCount, Sequence};
1688
1689                if let InsertedSequence::Literal(seq) = sequence {
1690                    let seq_bytes: Vec<u8> = seq.bases().iter().map(|b| *b as u8).collect();
1691
1692                    // Check for repeat notation first (multi-base homopolymer insertion)
1693                    // Use the ORIGINAL position (start), not shuffled position (result.start)
1694                    // because repeat notation refers to the reference tract position
1695                    if seq_bytes.len() > 1 {
1696                        let original_pos_idx = hgvs_pos_to_index(start) as u64; // 0-based original position
1697                        if let Some((_first, count, rep_start, rep_end, unit_bytes)) =
1698                            rules::insertion_to_repeat(ref_seq, original_pos_idx, &seq_bytes)
1699                        {
1700                            use crate::hgvs::edit::Base;
1701                            let bases: Vec<Base> = unit_bytes
1702                                .iter()
1703                                .filter_map(|&b| Base::from_char(b as char))
1704                                .collect();
1705                            if bases.len() == unit_bytes.len() {
1706                                let repeat_seq = Sequence::new(bases);
1707                                let repeat_edit = NaEdit::Repeat {
1708                                    sequence: Some(repeat_seq),
1709                                    count: RepeatCount::Exact(count),
1710                                    additional_counts: vec![],
1711                                    trailing: None,
1712                                };
1713                                return Ok((rep_start, rep_end, repeat_edit, warnings));
1714                            }
1715                        }
1716                    }
1717
1718                    // Check for simple duplication (single-base or matching adjacent)
1719                    // When shifting an insertion through a repeat region, the effective sequence
1720                    // rotates. For example, shifting "GGC" by 1 position gives "GCG".
1721                    let shift_amount =
1722                        (result.start as usize).saturating_sub(shuffle_start as usize);
1723                    let rotation = shift_amount % seq_bytes.len();
1724                    let rotated_seq: Vec<u8> = if rotation > 0 {
1725                        seq_bytes[rotation..]
1726                            .iter()
1727                            .chain(seq_bytes[..rotation].iter())
1728                            .copied()
1729                            .collect()
1730                    } else {
1731                        seq_bytes.clone()
1732                    };
1733
1734                    if rules::insertion_is_duplication(ref_seq, result.start, &rotated_seq) {
1735                        // For duplication, use minimal notation without explicit sequence
1736                        // Position: for c.X_(X+1)ins that duplicates preceding sequence,
1737                        // the result is c.(X-len+1)_Xdup
1738                        let dup_len = rotated_seq.len() as u64;
1739                        let (dup_start, dup_end) = if dup_len == 1 {
1740                            (new_start, new_start) // Single position for single-base dup
1741                        } else {
1742                            // Multi-base dup: the duplicated region is BEFORE the insertion point
1743                            (new_start - dup_len + 1, new_start)
1744                        };
1745                        (
1746                            dup_start,
1747                            dup_end,
1748                            NaEdit::Duplication {
1749                                sequence: None, // Minimal notation - no explicit sequence
1750                                length: None,
1751                                uncertain_extent: None,
1752                            },
1753                        )
1754                    } else {
1755                        // Output the rotated sequence for shifted insertions
1756                        if rotation > 0 {
1757                            use crate::hgvs::edit::{Base, InsertedSequence, Sequence};
1758                            let rotated_bases: Vec<Base> = rotated_seq
1759                                .iter()
1760                                .filter_map(|&b| Base::from_char(b as char))
1761                                .collect();
1762                            let new_sequence =
1763                                InsertedSequence::Literal(Sequence::new(rotated_bases));
1764                            (
1765                                new_start,
1766                                new_end,
1767                                NaEdit::Insertion {
1768                                    sequence: new_sequence,
1769                                },
1770                            )
1771                        } else {
1772                            (new_start, new_end, edit.clone())
1773                        }
1774                    }
1775                } else {
1776                    (new_start, new_end, edit.clone())
1777                }
1778            }
1779            NaEdit::Duplication { .. } => {
1780                use crate::hgvs::edit::{Base, RepeatCount, Sequence};
1781
1782                // Check if duplication should become repeat notation
1783                // Use the shuffled positions (result.start, result.end) which are 0-based
1784                // This applies to both single-base dups in homopolymers and multi-base tandem dups
1785                if let Some(dup_result) =
1786                    rules::duplication_to_repeat(ref_seq, result.start, result.end)
1787                {
1788                    match dup_result {
1789                        rules::DupToRepeatResult::Homopolymer {
1790                            base,
1791                            count,
1792                            start: rep_start,
1793                            end: rep_end,
1794                        } => {
1795                            if let Some(base_enum) = Base::from_char(base as char) {
1796                                let repeat_seq = Sequence::new(vec![base_enum]);
1797                                let repeat_edit = NaEdit::Repeat {
1798                                    sequence: Some(repeat_seq),
1799                                    count: RepeatCount::Exact(count),
1800                                    additional_counts: vec![],
1801                                    trailing: None,
1802                                };
1803                                return Ok((rep_start, rep_end, repeat_edit, warnings));
1804                            }
1805                        }
1806                        rules::DupToRepeatResult::TandemRepeat {
1807                            unit,
1808                            count,
1809                            start: rep_start,
1810                            end: rep_end,
1811                        } => {
1812                            let bases: Vec<Base> = unit
1813                                .iter()
1814                                .filter_map(|&b| Base::from_char(b as char))
1815                                .collect();
1816                            if bases.len() == unit.len() {
1817                                let repeat_seq = Sequence::new(bases);
1818                                let repeat_edit = NaEdit::Repeat {
1819                                    sequence: Some(repeat_seq),
1820                                    count: RepeatCount::Exact(count),
1821                                    additional_counts: vec![],
1822                                    trailing: None,
1823                                };
1824                                return Ok((rep_start, rep_end, repeat_edit, warnings));
1825                            }
1826                        }
1827                    }
1828                }
1829                // Keep as duplication but strip explicit sequence (minimal notation)
1830                (
1831                    new_start,
1832                    new_end,
1833                    NaEdit::Duplication {
1834                        sequence: None,
1835                        length: None,
1836                        uncertain_extent: None,
1837                    },
1838                )
1839            }
1840            NaEdit::Delins { sequence } => {
1841                use crate::hgvs::edit::{InsertedSequence, Sequence};
1842
1843                // Check if delins should become a duplication
1844                // Example: c.5delinsGG where position 5 is G → dup
1845                if let InsertedSequence::Literal(seq) = sequence {
1846                    let seq_bytes: Vec<u8> = seq.bases().iter().map(|b| *b as u8).collect();
1847                    let start_0 = result.start as usize;
1848                    let end_0 = result.end as usize;
1849
1850                    if rules::delins_is_duplication(ref_seq, start_0, end_0, &seq_bytes) {
1851                        // Convert to duplication - the duplicated sequence is the deleted region
1852                        let dup_len = end_0 - start_0;
1853                        let dup_seq_bytes = &ref_seq[start_0..end_0];
1854                        let dup_bases: Vec<crate::hgvs::edit::Base> = dup_seq_bytes
1855                            .iter()
1856                            .filter_map(|&b| crate::hgvs::edit::Base::from_char(b as char))
1857                            .collect();
1858                        let dup_seq = Sequence::new(dup_bases);
1859
1860                        (
1861                            new_start,
1862                            new_end,
1863                            NaEdit::Duplication {
1864                                sequence: Some(dup_seq),
1865                                length: Some(dup_len as u64),
1866                                uncertain_extent: None,
1867                            },
1868                        )
1869                    } else {
1870                        (new_start, new_end, edit.clone())
1871                    }
1872                } else {
1873                    (new_start, new_end, edit.clone())
1874                }
1875            }
1876            // Deletions: post-shift, check for B2 canonical-form rule
1877            // (deletion of >=2 tandem-repeat units → unit[N-k]); otherwise
1878            // strip explicit length for minimal `del` notation. The
1879            // collect-into-Option short-circuits if any byte in the unit isn't
1880            // a valid `Base` (e.g. `N`), in which case we fall through to del.
1881            //
1882            // B2 is defined for a *post-3'-shift* deletion (the shuffle phase-
1883            // alignment lemma justifies emitting `unit[N-k]` without rotation).
1884            // Under FivePrime shuffle, applying it would re-anchor the
1885            // 5'-normalized deletion to the canonical tract position, defeating
1886            // the user's choice of direction — so gate it on ThreePrime.
1887            NaEdit::Deletion { .. } => {
1888                use crate::hgvs::edit::{Base, RepeatCount, Sequence};
1889                if self.config.shuffle_direction == ShuffleDirection::ThreePrime {
1890                    if let Some(rep) = rules::deletion_to_repeat(
1891                        ref_seq,
1892                        result.start as usize,
1893                        result.end as usize,
1894                    ) {
1895                        let bases: Option<Vec<Base>> = rep
1896                            .unit
1897                            .iter()
1898                            .map(|&b| Base::from_char(b as char))
1899                            .collect();
1900                        if let Some(bases) = bases {
1901                            let repeat_edit = NaEdit::Repeat {
1902                                sequence: Some(Sequence::new(bases)),
1903                                count: RepeatCount::Exact(rep.count),
1904                                additional_counts: vec![],
1905                                trailing: None,
1906                            };
1907                            return Ok((rep.start, rep.end, repeat_edit, warnings));
1908                        }
1909                    }
1910                }
1911                (
1912                    new_start,
1913                    new_end,
1914                    NaEdit::Deletion {
1915                        sequence: None,
1916                        length: None,
1917                    },
1918                )
1919            }
1920            // All other edit types stay unchanged
1921            _ => (new_start, new_end, edit.clone()),
1922        };
1923
1924        Ok((final_start, final_end, new_edit, warnings))
1925    }
1926
1927    /// Convert CDS position to transcript position
1928    fn cds_to_tx_pos(
1929        &self,
1930        pos: &CdsPos,
1931        cds_start: u64,
1932        cds_end: Option<u64>,
1933    ) -> Result<u64, FerroError> {
1934        if pos.utr3 {
1935            let end = cds_end.ok_or_else(|| FerroError::ConversionError {
1936                msg: "No CDS end".to_string(),
1937            })?;
1938            let base = u64::try_from(pos.base).map_err(|_| FerroError::ConversionError {
1939                msg: format!("Negative base {} in 3' UTR position", pos.base),
1940            })?;
1941            Ok(end + base)
1942        } else if pos.base < 0 {
1943            // 5'UTR: HGVS numbering skips c.0 (c.-1 is the base immediately
1944            // upstream of c.1), so c.-N maps to tx position cds_start - N.
1945            // Issue #97 — the previous formula `cds_start + base - 1`
1946            // double-counted the gap and emitted the wrong tx position.
1947            let tx_pos = cds_start as i64 + pos.base;
1948            u64::try_from(tx_pos).map_err(|_| FerroError::ConversionError {
1949                msg: format!(
1950                    "CDS position c.{} maps before transcript start (cds_start={})",
1951                    pos.base, cds_start
1952                ),
1953            })
1954        } else if pos.base == 0 {
1955            // c.0 is not a valid HGVS position, but historical inputs
1956            // can land here. Preserve the legacy mapping (treat as the
1957            // last 5'UTR base, equivalent to c.-1) rather than failing.
1958            Ok(cds_start.saturating_sub(1))
1959        } else {
1960            Ok(cds_start + pos.base as u64 - 1)
1961        }
1962    }
1963
1964    /// Convert transcript position to CDS position
1965    fn tx_to_cds_pos(
1966        &self,
1967        pos: u64,
1968        cds_start: u64,
1969        cds_end: Option<u64>,
1970    ) -> Result<CdsPos, FerroError> {
1971        let end = cds_end.ok_or_else(|| FerroError::ConversionError {
1972            msg: "No CDS end".to_string(),
1973        })?;
1974
1975        if pos < cds_start {
1976            // 5'UTR: HGVS numbering skips c.0, so a tx position one
1977            // base 5' of cds_start is c.-1 (not c.0). Inverse of the
1978            // forward formula `tx = cds_start + base` for negative
1979            // base: `base = tx - cds_start`. Issue #97 — the previous
1980            // formula `tx - cds_start + 1` would emit base = 0 for
1981            // tx = cds_start - 1, rendered by `CdsPos::Display` as
1982            // `c.?` (`CDS_BASE_UNKNOWN`).
1983            Ok(CdsPos {
1984                base: pos as i64 - cds_start as i64,
1985                offset: None,
1986                utr3: false,
1987            })
1988        } else if pos > end {
1989            Ok(CdsPos {
1990                base: (pos - end) as i64,
1991                offset: None,
1992                utr3: true,
1993            })
1994        } else {
1995            Ok(CdsPos {
1996                base: (pos - cds_start + 1) as i64,
1997                offset: None,
1998                utr3: false,
1999            })
2000        }
2001    }
2002
2003    /// Apply minimal notation to a CDS variant without full normalization.
2004    ///
2005    /// This is used when we can't do full normalization (e.g., missing transcript)
2006    /// but still want to apply minimal HGVS notation rules.
2007    fn canonicalize_cds_variant(&self, variant: &CdsVariant) -> CdsVariant {
2008        let edit = match variant.loc_edit.edit.inner() {
2009            Some(e) => e,
2010            None => return variant.clone(),
2011        };
2012
2013        // Only canonicalize if the edit has redundant information
2014        if !should_canonicalize(edit) {
2015            return variant.clone();
2016        }
2017
2018        let canonical_edit = canonicalize_edit(edit);
2019
2020        CdsVariant {
2021            accession: variant.accession.clone(),
2022            gene_symbol: variant.gene_symbol.clone(),
2023            loc_edit: LocEdit::with_uncertainty(
2024                variant.loc_edit.location.clone(),
2025                variant.loc_edit.edit.map_ref(|_| canonical_edit),
2026            ),
2027        }
2028    }
2029
2030    /// Apply minimal notation to a genome variant without full normalization.
2031    fn canonicalize_genome_variant(&self, variant: &GenomeVariant) -> GenomeVariant {
2032        let edit = match variant.loc_edit.edit.inner() {
2033            Some(e) => e,
2034            None => return variant.clone(),
2035        };
2036
2037        if !should_canonicalize(edit) {
2038            return variant.clone();
2039        }
2040
2041        let canonical_edit = canonicalize_edit(edit);
2042
2043        GenomeVariant {
2044            accession: variant.accession.clone(),
2045            gene_symbol: variant.gene_symbol.clone(),
2046            loc_edit: LocEdit::with_uncertainty(
2047                variant.loc_edit.location.clone(),
2048                variant.loc_edit.edit.map_ref(|_| canonical_edit),
2049            ),
2050        }
2051    }
2052
2053    /// Apply minimal notation to a transcript variant without full normalization.
2054    fn canonicalize_tx_variant(&self, variant: &TxVariant) -> TxVariant {
2055        let edit = match variant.loc_edit.edit.inner() {
2056            Some(e) => e,
2057            None => return variant.clone(),
2058        };
2059
2060        if !should_canonicalize(edit) {
2061            return variant.clone();
2062        }
2063
2064        let canonical_edit = canonicalize_edit(edit);
2065
2066        TxVariant {
2067            accession: variant.accession.clone(),
2068            gene_symbol: variant.gene_symbol.clone(),
2069            loc_edit: LocEdit::with_uncertainty(
2070                variant.loc_edit.location.clone(),
2071                variant.loc_edit.edit.map_ref(|_| canonical_edit),
2072            ),
2073        }
2074    }
2075}
2076
2077/// Flip a fetched intronic genomic-strand window into transcript-view
2078/// orientation when the host transcript is on the minus strand. Returns
2079/// the input unchanged on plus strand. The relative positions and the
2080/// shuffle boundaries are flipped so they index into the returned
2081/// sequence consistently. Companion to [`unflip_intronic_positions`].
2082fn flip_intronic_for_strand(
2083    strand: Strand,
2084    genomic_seq: &str,
2085    rel_start: u64,
2086    rel_end: u64,
2087    boundaries: &Boundaries,
2088) -> (String, u64, u64, Boundaries) {
2089    if strand != Strand::Minus {
2090        return (
2091            genomic_seq.to_string(),
2092            rel_start,
2093            rel_end,
2094            boundaries.clone(),
2095        );
2096    }
2097    let seq_len = genomic_seq.len() as u64;
2098    let rc = crate::cli::reverse_complement(genomic_seq);
2099    let new_rel_start = seq_len - rel_end + 1;
2100    let new_rel_end = seq_len - rel_start + 1;
2101    let new_boundaries = Boundaries::new(
2102        seq_len - boundaries.right + 1,
2103        seq_len - boundaries.left + 1,
2104    );
2105    (rc, new_rel_start, new_rel_end, new_boundaries)
2106}
2107
2108/// Inverse of [`flip_intronic_for_strand`] for the result positions
2109/// emitted by `normalize_na_edit`. On plus strand returns the input
2110/// unchanged; on minus strand maps from transcript-view back to the
2111/// genomic-strand frame.
2112fn unflip_intronic_positions(
2113    strand: Strand,
2114    seq_len: u64,
2115    rel_start: u64,
2116    rel_end: u64,
2117) -> (u64, u64) {
2118    if strand == Strand::Minus {
2119        (seq_len - rel_end + 1, seq_len - rel_start + 1)
2120    } else {
2121        (rel_start, rel_end)
2122    }
2123}
2124
2125#[cfg(test)]
2126mod tests {
2127    use super::*;
2128    use crate::hgvs::parser::parse_hgvs;
2129    use crate::reference::MockProvider;
2130
2131    #[test]
2132    fn test_normalize_substitution_unchanged() {
2133        let provider = MockProvider::with_test_data();
2134        let normalizer = Normalizer::new(provider);
2135
2136        let variant = parse_hgvs("NM_000088.3:c.10A>G").unwrap();
2137        let normalized = normalizer.normalize(&variant).unwrap();
2138
2139        // Substitutions should not change
2140        assert_eq!(format!("{}", variant), format!("{}", normalized));
2141    }
2142
2143    #[test]
2144    fn test_normalize_with_config() {
2145        let provider = MockProvider::with_test_data();
2146        let config = NormalizeConfig::default().with_direction(ShuffleDirection::FivePrime);
2147        let normalizer = Normalizer::with_config(provider, config);
2148
2149        assert_eq!(
2150            normalizer.config().shuffle_direction,
2151            ShuffleDirection::FivePrime
2152        );
2153    }
2154
2155    #[test]
2156    fn test_normalizer_handles_missing_transcript() {
2157        let provider = MockProvider::new(); // Empty provider
2158        let normalizer = Normalizer::new(provider);
2159
2160        // Should return variant unchanged when transcript not found
2161        let variant = parse_hgvs("NM_MISSING.1:c.100del").unwrap();
2162        let result = normalizer.normalize(&variant);
2163        assert!(result.is_ok());
2164        // Verify output equals input (unchanged)
2165        assert_eq!(
2166            format!("{}", variant),
2167            format!("{}", result.unwrap()),
2168            "Missing transcript should return variant unchanged"
2169        );
2170    }
2171
2172    #[test]
2173    fn test_normalize_deletion_shifts_3prime() {
2174        // NM_001234.1 has G repeat spanning exon boundaries
2175        // Exon 1: c.1-11, Exon 2: c.12-26, Exon 3: c.27+
2176        // G repeat is at c.9-c.33, but shift stops at exon boundary
2177        // c.10 is in exon 1 (ends at c.11), so deletion shifts to c.11
2178        let provider = MockProvider::with_test_data();
2179        let normalizer = Normalizer::new(provider);
2180
2181        let variant = parse_hgvs("NM_001234.1:c.10del").unwrap();
2182        let result = normalizer.normalize(&variant).unwrap();
2183        let output = format!("{}", result);
2184
2185        // Should remain a deletion
2186        assert!(
2187            output.contains("del"),
2188            "Deletion should remain a deletion, got: {}",
2189            output
2190        );
2191        // Should shift from c.10 to c.11 (3'-most within exon 1)
2192        assert!(
2193            output.contains("c.11del"),
2194            "Deletion should shift 3' to exon boundary (c.11), got: {}",
2195            output
2196        );
2197    }
2198
2199    #[test]
2200    fn test_normalize_insertion_becomes_dup() {
2201        // NM_001234.1 has G repeat at CDS positions c.9-c.33
2202        // Inserting G after position 10 should shift 3' and become dup
2203        let provider = MockProvider::with_test_data();
2204        let normalizer = Normalizer::new(provider);
2205
2206        let variant = parse_hgvs("NM_001234.1:c.10_11insG").unwrap();
2207        let result = normalizer.normalize(&variant).unwrap();
2208        let output = format!("{}", result);
2209
2210        // Inserting G in a G-repeat should become dup
2211        assert!(
2212            output.contains("dup"),
2213            "Insertion of matching base should become dup, got: {}",
2214            output
2215        );
2216    }
2217
2218    #[test]
2219    fn test_normalize_duplication_shifts_3prime() {
2220        // NM_001234.1 has G repeat spanning positions c.9-33 (25 G's)
2221        // Single-base duplications stay as simple dups (only 2+ base dups become repeat notation)
2222        let provider = MockProvider::with_test_data();
2223        let normalizer = Normalizer::new(provider);
2224
2225        let variant = parse_hgvs("NM_001234.1:c.10dup").unwrap();
2226        let result = normalizer.normalize(&variant).unwrap();
2227        let output = format!("{}", result);
2228
2229        // Single-base duplication should shift 3' and stay as dup
2230        // c.10dup in GGGGG...GGG tract shifts to rightmost position (c.33) but stays as dup
2231        assert!(
2232            output.contains("dup"),
2233            "Single-base duplication should remain as dup notation, got: {}",
2234            output
2235        );
2236    }
2237
2238    #[test]
2239    fn test_normalize_delins_unchanged() {
2240        // A delins that doesn't simplify should stay as delins
2241        // Deleting G and inserting AT is not a dup pattern
2242        let provider = MockProvider::with_test_data();
2243        let normalizer = Normalizer::new(provider);
2244
2245        let variant = parse_hgvs("NM_000088.3:c.10delinsAT").unwrap();
2246        let result = normalizer.normalize(&variant).unwrap();
2247        let output = format!("{}", result);
2248
2249        assert!(
2250            output.contains("delinsAT"),
2251            "Delins should remain unchanged, got: {}",
2252            output
2253        );
2254    }
2255
2256    #[test]
2257    fn test_normalize_single_base_delins_becomes_substitution() {
2258        // HGVS edit-type priority: a 1→1 delins with ref!=alt must be expressed
2259        // as a substitution. Transcript NM_000088.3 starts ATGCCCAAGG...; position
2260        // 10 is G. c.10delinsT replaces G with T → c.10G>T.
2261        let provider = MockProvider::with_test_data();
2262        let normalizer = Normalizer::new(provider);
2263
2264        let variant = parse_hgvs("NM_000088.3:c.10delinsT").unwrap();
2265        let result = normalizer.normalize(&variant).unwrap();
2266        assert_eq!(format!("{}", result), "NM_000088.3:c.10G>T");
2267    }
2268
2269    #[test]
2270    fn test_normalize_single_base_delins_same_base_becomes_identity() {
2271        // Per HGVS, a delins whose insert equals the reference is identity (=).
2272        // Transcript NM_000088.3 starts ATGCCCAAGG...; position 10 is G.
2273        // c.10delinsG produces no change → c.10=.
2274        let provider = MockProvider::with_test_data();
2275        let normalizer = Normalizer::new(provider);
2276
2277        let variant = parse_hgvs("NM_000088.3:c.10delinsG").unwrap();
2278        let result = normalizer.normalize(&variant).unwrap();
2279        assert_eq!(format!("{}", result), "NM_000088.3:c.10=");
2280    }
2281
2282    #[test]
2283    fn test_normalize_multi_base_delins_same_seq_becomes_identity() {
2284        // Transcript NM_000088.3 starts ATG at positions 1-3.
2285        // c.1_3delinsATG produces no change → c.1_3=.
2286        let provider = MockProvider::with_test_data();
2287        let normalizer = Normalizer::new(provider);
2288
2289        let variant = parse_hgvs("NM_000088.3:c.1_3delinsATG").unwrap();
2290        let result = normalizer.normalize(&variant).unwrap();
2291        assert_eq!(format!("{}", result), "NM_000088.3:c.1_3=");
2292    }
2293
2294    #[test]
2295    fn test_normalize_multi_base_delete_delins_unchanged() {
2296        // 2→1 delins is not a single-base substitution and must not be rewritten.
2297        // c.10_11delinsT (deletes GG, inserts T) stays as delins.
2298        let provider = MockProvider::with_test_data();
2299        let normalizer = Normalizer::new(provider);
2300
2301        let variant = parse_hgvs("NM_000088.3:c.10_11delinsT").unwrap();
2302        let result = normalizer.normalize(&variant).unwrap();
2303        let output = format!("{}", result);
2304        assert!(
2305            output.contains("delinsT"),
2306            "Multi-base delete delins should remain delins, got: {}",
2307            output
2308        );
2309        assert!(
2310            !output.contains(">"),
2311            "Multi-base delete delins must not become a substitution, got: {}",
2312            output
2313        );
2314    }
2315
2316    #[test]
2317    fn test_normalize_delins_to_dup_still_works() {
2318        // Regression guard: adding identity/substitution checks before the dup
2319        // check must not block legitimate dup conversions. ref[5] = C;
2320        // c.5delinsCC matches the dup pattern (insert == deleted twice) and
2321        // must still normalize to dup.
2322        let provider = MockProvider::with_test_data();
2323        let normalizer = Normalizer::new(provider);
2324
2325        let variant = parse_hgvs("NM_000088.3:c.5delinsCC").unwrap();
2326        let result = normalizer.normalize(&variant).unwrap();
2327        let output = format!("{}", result);
2328        assert!(
2329            output.contains("dup"),
2330            "delins matching dup pattern should normalize to dup, got: {}",
2331            output
2332        );
2333        assert!(
2334            !output.contains("delins"),
2335            "delins should not survive the dup conversion, got: {}",
2336            output
2337        );
2338    }
2339
2340    #[test]
2341    fn test_normalize_delins_different_bases_unchanged() {
2342        // c.1_3delinsACG (ref=ATG) differs at the middle base — not identity,
2343        // not a dup pattern, so it stays as delins.
2344        let provider = MockProvider::with_test_data();
2345        let normalizer = Normalizer::new(provider);
2346
2347        let variant = parse_hgvs("NM_000088.3:c.1_3delinsACG").unwrap();
2348        let result = normalizer.normalize(&variant).unwrap();
2349        assert_eq!(format!("{}", result), "NM_000088.3:c.1_3delinsACG");
2350    }
2351
2352    #[test]
2353    fn test_normalize_protein_substitution_unchanged() {
2354        let provider = MockProvider::with_test_data();
2355        let normalizer = Normalizer::new(provider);
2356
2357        // Protein substitution variants should pass through unchanged
2358        let variant = parse_hgvs("NP_000079.2:p.Val600Glu").unwrap();
2359        let normalized = normalizer.normalize(&variant).unwrap();
2360        assert_eq!(format!("{}", variant), format!("{}", normalized));
2361    }
2362
2363    #[test]
2364    fn test_normalize_protein_deletion_removes_redundant_sequence() {
2365        // Redundant sequence removal: p.Val600delVal → p.Val600del
2366        let provider = MockProvider::with_test_data();
2367        let normalizer = Normalizer::new(provider);
2368
2369        let variant = parse_hgvs("NP_000079.2:p.Val600delVal").unwrap();
2370        let normalized = normalizer.normalize(&variant).unwrap();
2371        let output = format!("{}", normalized);
2372
2373        // Should remove redundant "Val" from the deletion
2374        assert_eq!(
2375            output, "NP_000079.2:p.Val600del",
2376            "Redundant sequence should be removed from deletion"
2377        );
2378    }
2379
2380    #[test]
2381    fn test_normalize_protein_deletion_range_removes_redundant_sequence() {
2382        // Redundant sequence removal for range: p.Lys23_Glu25delLysAlaGlu → p.Lys23_Glu25del
2383        let provider = MockProvider::with_test_data();
2384        let normalizer = Normalizer::new(provider);
2385
2386        let variant = parse_hgvs("NP_000079.2:p.Lys23_Glu25delLysAlaGlu").unwrap();
2387        let normalized = normalizer.normalize(&variant).unwrap();
2388        let output = format!("{}", normalized);
2389
2390        // Should remove redundant sequence from the deletion
2391        assert_eq!(
2392            output, "NP_000079.2:p.Lys23_Glu25del",
2393            "Redundant sequence should be removed from range deletion"
2394        );
2395    }
2396
2397    #[test]
2398    fn test_normalize_protein_deletion_non_matching_sequence_unchanged() {
2399        // Non-matching sequence should stay: p.Val600delGlu should NOT change
2400        let provider = MockProvider::with_test_data();
2401        let normalizer = Normalizer::new(provider);
2402
2403        let variant = parse_hgvs("NP_000079.2:p.Val600delGlu").unwrap();
2404        let normalized = normalizer.normalize(&variant).unwrap();
2405        let output = format!("{}", normalized);
2406
2407        // Should NOT remove non-matching sequence
2408        assert_eq!(
2409            output, "NP_000079.2:p.Val600delGlu",
2410            "Non-matching sequence should not be removed"
2411        );
2412    }
2413
2414    #[test]
2415    fn test_normalize_protein_deletion_no_sequence_unchanged() {
2416        // Deletion without sequence should stay unchanged
2417        let provider = MockProvider::with_test_data();
2418        let normalizer = Normalizer::new(provider);
2419
2420        let variant = parse_hgvs("NP_000079.2:p.Val600del").unwrap();
2421        let normalized = normalizer.normalize(&variant).unwrap();
2422        let output = format!("{}", normalized);
2423
2424        assert_eq!(
2425            output, "NP_000079.2:p.Val600del",
2426            "Deletion without sequence should remain unchanged"
2427        );
2428    }
2429
2430    #[test]
2431    fn test_normalize_protein_duplication_unchanged() {
2432        // Duplications should pass through unchanged
2433        let provider = MockProvider::with_test_data();
2434        let normalizer = Normalizer::new(provider);
2435
2436        let variant = parse_hgvs("NP_000079.2:p.Val600dup").unwrap();
2437        let normalized = normalizer.normalize(&variant).unwrap();
2438        assert_eq!(format!("{}", variant), format!("{}", normalized));
2439    }
2440
2441    #[test]
2442    fn test_normalize_protein_frameshift_unchanged() {
2443        // Frameshifts should pass through unchanged
2444        let provider = MockProvider::with_test_data();
2445        let normalizer = Normalizer::new(provider);
2446
2447        let variant = parse_hgvs("NP_000079.2:p.Arg97ProfsTer23").unwrap();
2448        let normalized = normalizer.normalize(&variant).unwrap();
2449        assert_eq!(format!("{}", variant), format!("{}", normalized));
2450    }
2451
2452    #[test]
2453    fn test_normalize_protein_reference_validation_match() {
2454        // Test that validation passes when amino acid matches reference
2455        // NP_TEST.1 has: M at position 1, V at position 2
2456        let provider = MockProvider::with_test_data();
2457        let normalizer = Normalizer::new(provider);
2458
2459        // Position 1 = M (Met), Position 2 = V (Val)
2460        let variant = parse_hgvs("NP_TEST.1:p.Met1Val").unwrap();
2461        let result = normalizer.normalize(&variant);
2462        assert!(
2463            result.is_ok(),
2464            "Validation should pass for matching amino acid"
2465        );
2466    }
2467
2468    #[test]
2469    fn test_normalize_protein_reference_validation_mismatch() {
2470        // Test that validation fails when amino acid doesn't match reference
2471        // NP_TEST.1 has M at position 1, but we claim it's Val
2472        let provider = MockProvider::with_test_data();
2473        let normalizer = Normalizer::new(provider);
2474
2475        // Position 1 is M (Met), not V (Val)
2476        let variant = parse_hgvs("NP_TEST.1:p.Val1Glu").unwrap();
2477        let result = normalizer.normalize(&variant);
2478        assert!(
2479            result.is_err(),
2480            "Validation should fail for mismatched amino acid"
2481        );
2482
2483        if let Err(crate::error::FerroError::AminoAcidMismatch {
2484            position,
2485            expected,
2486            found,
2487            ..
2488        }) = result
2489        {
2490            assert_eq!(position, 1);
2491            assert_eq!(expected, "Val");
2492            assert_eq!(found, "M");
2493        } else {
2494            panic!("Expected AminoAcidMismatch error");
2495        }
2496    }
2497
2498    #[test]
2499    fn test_normalize_protein_reference_validation_deletion() {
2500        // Test validation for deletion variants
2501        // NP_TEST.1 has V at position 2
2502        let provider = MockProvider::with_test_data();
2503        let normalizer = Normalizer::new(provider);
2504
2505        // Position 2 = V (Val) - should pass
2506        let variant = parse_hgvs("NP_TEST.1:p.Val2del").unwrap();
2507        let result = normalizer.normalize(&variant);
2508        assert!(
2509            result.is_ok(),
2510            "Validation should pass for matching deletion position"
2511        );
2512    }
2513
2514    #[test]
2515    fn test_normalize_protein_reference_validation_missing_protein() {
2516        // Test that missing protein data skips validation (doesn't fail)
2517        let provider = MockProvider::with_test_data();
2518        let normalizer = Normalizer::new(provider);
2519
2520        // NP_MISSING.1 doesn't exist in provider
2521        let variant = parse_hgvs("NP_MISSING.1:p.Val600Glu").unwrap();
2522        let result = normalizer.normalize(&variant);
2523        // Should NOT error - just skip validation when protein not available
2524        assert!(
2525            result.is_ok(),
2526            "Missing protein should skip validation, not fail"
2527        );
2528    }
2529
2530    #[test]
2531    fn test_normalize_rna_substitution_unchanged() {
2532        let provider = MockProvider::with_test_data();
2533        let normalizer = Normalizer::new(provider);
2534
2535        // RNA substitutions should pass through unchanged
2536        let variant = parse_hgvs("NM_000088.3:r.10a>g").unwrap();
2537        let normalized = normalizer.normalize(&variant).unwrap();
2538        assert_eq!(format!("{}", variant), format!("{}", normalized));
2539    }
2540
2541    #[test]
2542    fn test_normalize_rna_deletion_shifts_3prime() {
2543        // NM_001234.1 has G repeat at positions 13-37 (position 37 is actually T,
2544        // but the normalization shifts to the 3'-most position)
2545        let provider = MockProvider::with_test_data();
2546        let normalizer = Normalizer::new(provider);
2547
2548        let variant = parse_hgvs("NM_001234.1:r.14del").unwrap();
2549        let result = normalizer.normalize(&variant).unwrap();
2550        let output = format!("{}", result);
2551
2552        // Should remain a deletion and shift 3'
2553        assert!(
2554            output.contains("del"),
2555            "Deletion should remain a deletion, got: {}",
2556            output
2557        );
2558        // Verify it shifted 3' (position should be > 14)
2559        assert!(
2560            output.contains("r.37del"),
2561            "Deletion should shift 3' to position 37, got: {}",
2562            output
2563        );
2564    }
2565
2566    #[test]
2567    fn test_normalize_rna_insertion_becomes_dup() {
2568        // NM_001234.1 has G repeat at positions 13-36
2569        // Inserting g after position 14 should shift 3' and become dup
2570        let provider = MockProvider::with_test_data();
2571        let normalizer = Normalizer::new(provider);
2572
2573        let variant = parse_hgvs("NM_001234.1:r.14_15insg").unwrap();
2574        let result = normalizer.normalize(&variant).unwrap();
2575        let output = format!("{}", result);
2576
2577        // Inserting g in a G-repeat should become dup
2578        assert!(
2579            output.contains("dup"),
2580            "Insertion of matching base should become dup, got: {}",
2581            output
2582        );
2583    }
2584
2585    #[test]
2586    fn test_normalize_rna_duplication_shifts_3prime() {
2587        // NM_001234.1 has G repeat - single-base duplications stay as simple dups
2588        let provider = MockProvider::with_test_data();
2589        let normalizer = Normalizer::new(provider);
2590
2591        let variant = parse_hgvs("NM_001234.1:r.14dup").unwrap();
2592        let result = normalizer.normalize(&variant).unwrap();
2593        let output = format!("{}", result);
2594
2595        // Single-base duplication should shift 3' and stay as dup
2596        assert!(
2597            output.contains("dup"),
2598            "Single-base RNA duplication should remain as dup notation, got: {}",
2599            output
2600        );
2601    }
2602
2603    #[test]
2604    fn test_normalize_rna_intronic_returns_error() {
2605        let provider = MockProvider::with_test_data();
2606        let normalizer = Normalizer::new(provider);
2607
2608        // Intronic RNA variants should return an error
2609        let variant = parse_hgvs("NM_001234.1:r.10+5del").unwrap();
2610        let result = normalizer.normalize(&variant);
2611        assert!(result.is_err(), "Intronic RNA variant should return error");
2612    }
2613
2614    #[test]
2615    fn test_normalize_rna_missing_transcript_unchanged() {
2616        let provider = MockProvider::with_test_data();
2617        let normalizer = Normalizer::new(provider);
2618
2619        // Missing transcript should return variant unchanged
2620        let variant = parse_hgvs("NM_MISSING.1:r.100del").unwrap();
2621        let normalized = normalizer.normalize(&variant).unwrap();
2622        assert_eq!(format!("{}", variant), format!("{}", normalized));
2623    }
2624
2625    #[test]
2626    fn test_normalize_null_allele() {
2627        let provider = MockProvider::with_test_data();
2628        let normalizer = Normalizer::new(provider);
2629
2630        // Null alleles should pass through
2631        let variant = HgvsVariant::NullAllele;
2632        let normalized = normalizer.normalize(&variant).unwrap();
2633        assert!(matches!(normalized, HgvsVariant::NullAllele));
2634    }
2635
2636    #[test]
2637    fn test_normalize_unknown_allele() {
2638        let provider = MockProvider::with_test_data();
2639        let normalizer = Normalizer::new(provider);
2640
2641        // Unknown alleles should pass through
2642        let variant = HgvsVariant::UnknownAllele;
2643        let normalized = normalizer.normalize(&variant).unwrap();
2644        assert!(matches!(normalized, HgvsVariant::UnknownAllele));
2645    }
2646
2647    #[test]
2648    fn test_normalize_allele_variant() {
2649        let provider = MockProvider::with_test_data();
2650        let normalizer = Normalizer::new(provider);
2651
2652        // Allele variants should normalize each component
2653        // Using substitutions which should remain unchanged
2654        let variant = parse_hgvs("[NM_000088.3:c.10A>G;NM_000088.3:c.20C>T]").unwrap();
2655        let result = normalizer.normalize(&variant).unwrap();
2656
2657        // Verify it's still an allele
2658        assert!(matches!(result, HgvsVariant::Allele(_)));
2659
2660        // Verify output is the canonical compact form (ACC:c.[edit1;edit2])
2661        assert_eq!(
2662            format!("{}", result),
2663            "NM_000088.3:c.[10A>G;20C>T]",
2664            "Allele display should use canonical compact form"
2665        );
2666    }
2667
2668    #[test]
2669    fn test_normalize_5prime_direction() {
2670        // Test that 5' direction shifts toward 5' end instead of 3'
2671        // NM_001234.1 has G repeat spanning exons
2672        // Exon 2: c.12-c.26
2673        // With 5' direction, deletion at c.20 should shift toward c.12
2674        // Note: Actual shift depends on boundary handling
2675        let provider = MockProvider::with_test_data();
2676        let config = NormalizeConfig::default().with_direction(ShuffleDirection::FivePrime);
2677        let normalizer = Normalizer::with_config(provider, config);
2678
2679        // Delete G at position 20 (middle of G repeat in exon 2)
2680        let variant = parse_hgvs("NM_001234.1:c.20del").unwrap();
2681        let result = normalizer.normalize(&variant).unwrap();
2682        let output = format!("{}", result);
2683
2684        assert!(
2685            output.contains("del"),
2686            "Should remain a deletion, got: {}",
2687            output
2688        );
2689        // With 5' direction within exon 2, should shift toward 5' boundary
2690        // The exact position depends on exon boundary handling
2691        assert!(
2692            output.contains("c.13del") || output.contains("c.12del"),
2693            "5' direction should shift deletion toward exon start, got: {}",
2694            output
2695        );
2696    }
2697
2698    #[test]
2699    fn test_normalize_3prime_direction() {
2700        let provider = MockProvider::with_test_data();
2701        let config = NormalizeConfig::default().with_direction(ShuffleDirection::ThreePrime);
2702        let normalizer = Normalizer::with_config(provider, config);
2703
2704        let variant = parse_hgvs("NM_000088.3:c.10del").unwrap();
2705        let result = normalizer.normalize(&variant);
2706        assert!(result.is_ok());
2707    }
2708
2709    #[test]
2710    fn test_normalize_with_cross_boundaries() {
2711        let provider = MockProvider::with_test_data();
2712        let config = NormalizeConfig::default().allow_crossing_boundaries();
2713        let normalizer = Normalizer::with_config(provider, config);
2714
2715        let variant = parse_hgvs("NM_000088.3:c.10del").unwrap();
2716        let result = normalizer.normalize(&variant);
2717        assert!(result.is_ok());
2718    }
2719
2720    #[test]
2721    fn test_normalize_genomic_variant() {
2722        let provider = MockProvider::with_test_data();
2723        let normalizer = Normalizer::new(provider);
2724
2725        // Genomic variants with missing sequence should pass through unchanged
2726        let variant = parse_hgvs("NC_000001.11:g.12345del").unwrap();
2727        let result = normalizer.normalize(&variant);
2728        assert!(result.is_ok());
2729    }
2730
2731    #[test]
2732    fn test_normalize_tx_variant() {
2733        let provider = MockProvider::with_test_data();
2734        let normalizer = Normalizer::new(provider);
2735
2736        let variant = parse_hgvs("NR_000001.1:n.100del").unwrap();
2737        let result = normalizer.normalize(&variant);
2738        assert!(result.is_ok());
2739    }
2740
2741    #[test]
2742    fn test_normalize_mt_variant() {
2743        let provider = MockProvider::with_test_data();
2744        let normalizer = Normalizer::new(provider);
2745
2746        // MT variants pass through unchanged
2747        let variant = parse_hgvs("NC_012920.1:m.100A>G").unwrap();
2748        let normalized = normalizer.normalize(&variant).unwrap();
2749        assert!(matches!(normalized, HgvsVariant::Mt(_)));
2750    }
2751
2752    #[test]
2753    fn test_cds_to_tx_pos_utr5() {
2754        let provider = MockProvider::with_test_data();
2755        let normalizer = Normalizer::new(provider);
2756
2757        // 5' UTR position (negative)
2758        let cds_pos = CdsPos {
2759            base: -5,
2760            offset: None,
2761            utr3: false,
2762        };
2763        let result = normalizer.cds_to_tx_pos(&cds_pos, 10, Some(50));
2764        assert!(result.is_ok());
2765    }
2766
2767    #[test]
2768    fn test_cds_to_tx_pos_utr3() {
2769        let provider = MockProvider::with_test_data();
2770        let normalizer = Normalizer::new(provider);
2771
2772        // 3' UTR position
2773        let cds_pos = CdsPos {
2774            base: 5,
2775            offset: None,
2776            utr3: true,
2777        };
2778        let result = normalizer.cds_to_tx_pos(&cds_pos, 10, Some(50));
2779        assert!(result.is_ok());
2780        assert_eq!(result.unwrap(), 55);
2781    }
2782
2783    #[test]
2784    fn test_cds_to_tx_pos_coding() {
2785        let provider = MockProvider::with_test_data();
2786        let normalizer = Normalizer::new(provider);
2787
2788        // Normal coding position
2789        let cds_pos = CdsPos {
2790            base: 10,
2791            offset: None,
2792            utr3: false,
2793        };
2794        let result = normalizer.cds_to_tx_pos(&cds_pos, 5, Some(50));
2795        assert!(result.is_ok());
2796        assert_eq!(result.unwrap(), 14); // 5 + 10 - 1 = 14
2797    }
2798
2799    #[test]
2800    fn test_tx_to_cds_pos_utr5() {
2801        let provider = MockProvider::with_test_data();
2802        let normalizer = Normalizer::new(provider);
2803
2804        // Position before CDS start
2805        let result = normalizer.tx_to_cds_pos(3, 10, Some(50));
2806        assert!(result.is_ok());
2807        let cds_pos = result.unwrap();
2808        assert!(cds_pos.base < 0);
2809        assert!(!cds_pos.utr3);
2810    }
2811
2812    #[test]
2813    fn test_tx_to_cds_pos_utr3() {
2814        let provider = MockProvider::with_test_data();
2815        let normalizer = Normalizer::new(provider);
2816
2817        // Position after CDS end
2818        let result = normalizer.tx_to_cds_pos(55, 10, Some(50));
2819        assert!(result.is_ok());
2820        let cds_pos = result.unwrap();
2821        assert!(cds_pos.utr3);
2822    }
2823
2824    #[test]
2825    fn test_tx_to_cds_pos_coding() {
2826        let provider = MockProvider::with_test_data();
2827        let normalizer = Normalizer::new(provider);
2828
2829        // Normal coding position
2830        let result = normalizer.tx_to_cds_pos(20, 10, Some(50));
2831        assert!(result.is_ok());
2832        let cds_pos = result.unwrap();
2833        assert!(!cds_pos.utr3);
2834        assert_eq!(cds_pos.base, 11); // 20 - 10 + 1 = 11
2835    }
2836
2837    #[test]
2838    fn test_config_default() {
2839        let config = NormalizeConfig::default();
2840        assert_eq!(config.shuffle_direction, ShuffleDirection::ThreePrime);
2841        assert!(!config.cross_boundaries);
2842    }
2843
2844    #[test]
2845    #[allow(deprecated)]
2846    fn test_config_builder() {
2847        let config = NormalizeConfig::default()
2848            .with_direction(ShuffleDirection::FivePrime)
2849            .allow_crossing_boundaries()
2850            .skip_validation();
2851
2852        assert_eq!(config.shuffle_direction, ShuffleDirection::FivePrime);
2853        assert!(config.cross_boundaries);
2854        // skip_validation now sets RefSeqMismatch to SilentCorrect
2855        assert!(!config.should_reject_ref_mismatch());
2856        assert!(!config.should_warn_ref_mismatch());
2857    }
2858
2859    #[test]
2860    fn test_duplication_3prime_shift_two_bases() {
2861        // Test the exact scenario from ClinVar: c.4159dup vs c.4160dup
2862        // When duplicating an A in a homopolymer tract (AA),
2863        // HGVS requires converting to repeat notation.
2864        //
2865        // NM_888888.1 sequence: ATGCCCGAAGCCCCCCCCCGTTTGCATGCATGCATGCAT
2866        // Positions (1-based):  12345678901234567890...
2867        // c.8 = A, c.9 = A (the "AA" in "GAA")
2868        //
2869        // Single-base duplications stay as simple dups
2870        let provider = MockProvider::with_test_data();
2871        let normalizer = Normalizer::new(provider);
2872
2873        let variant = parse_hgvs("NM_888888.1:c.8dup").unwrap();
2874        let result = normalizer.normalize(&variant).unwrap();
2875        let output = format!("{}", result);
2876
2877        // Single-base duplication should shift 3' and stay as dup
2878        assert!(
2879            output.contains("dup"),
2880            "Single-base duplication should remain as dup notation, got: {}",
2881            output
2882        );
2883    }
2884
2885    #[test]
2886    fn test_duplication_3prime_shift_three_bases() {
2887        // Test with three consecutive identical bases (TTT)
2888        //
2889        // NM_888888.1 sequence: ATGCCCGAAGCCCCCCCCCGTTTGCATGCATGCATGCAT
2890        // Positions:           ...              2021222324...
2891        // c.20 = G, c.21 = T, c.22 = T, c.23 = T, c.24 = G
2892        //
2893        // Single-base duplications stay as simple dups
2894        let provider = MockProvider::with_test_data();
2895        let normalizer = Normalizer::new(provider);
2896
2897        let variant = parse_hgvs("NM_888888.1:c.21dup").unwrap();
2898        let result = normalizer.normalize(&variant).unwrap();
2899        let output = format!("{}", result);
2900
2901        // Single-base duplication should shift 3' to c.23 and stay as dup
2902        assert!(
2903            output.contains("dup"),
2904            "Single-base duplication should remain as dup notation, got: {}",
2905            output
2906        );
2907    }
2908
2909    #[test]
2910    fn test_duplication_no_shift_when_unique() {
2911        // Test that a duplication of a unique base doesn't shift
2912        //
2913        // NM_888888.1 sequence: ATGCCCGAAGCCCCCCCCCGTTTGCATGCATGCATGCAT
2914        // c.7 = G (followed by AA, so no G to shift to)
2915        //
2916        // c.7dup should stay as c.7dup
2917        let provider = MockProvider::with_test_data();
2918        let normalizer = Normalizer::new(provider);
2919
2920        let variant = parse_hgvs("NM_888888.1:c.7dup").unwrap();
2921        let result = normalizer.normalize(&variant).unwrap();
2922        let output = format!("{}", result);
2923
2924        assert!(
2925            output.contains("dup"),
2926            "Should remain a duplication, got: {}",
2927            output
2928        );
2929        assert!(
2930            output.contains("c.7dup"),
2931            "Duplication of unique G at c.7 should not shift, got: {}",
2932            output
2933        );
2934    }
2935
2936    // =====================================================================
2937    // Exon-Intron Boundary Spanning Variant Tests
2938    // =====================================================================
2939
2940    /// Create a provider with a transcript that has genomic coordinates and introns
2941    /// for testing boundary-spanning variant normalization.
2942    ///
2943    /// Transcript structure (NM_BOUNDARY.1):
2944    /// - Gene: BOUNDARY
2945    /// - Strand: Plus
2946    /// - Chromosome: chr1
2947    ///
2948    /// Genomic layout (chr1):
2949    /// Position: 1000                   1020  1030                   1050  1060                   1080
2950    ///           |-------- Exon 1 ------|      |-------- Exon 2 ------|      |-------- Exon 3 ------|
2951    ///           ATGCCCAAAGGGTTTAGGCCC       AAAGGGTTTAGGCCCAAAAAA       GGGTTTAGGCCCAAATGA
2952    ///                                 ^^^  ^^^                   ^^^  ^^^
2953    ///                              intron 1                   intron 2
2954    ///
2955    /// Transcript positions (tx):
2956    /// - Exon 1: tx 1-20 = genomic 1000-1019
2957    /// - Intron 1: genomic 1020-1029 (10 bp)
2958    /// - Exon 2: tx 21-40 = genomic 1030-1049
2959    /// - Intron 2: genomic 1050-1059 (10 bp)
2960    /// - Exon 3: tx 41-58 = genomic 1060-1077
2961    ///
2962    /// CDS: starts at tx 1 (no 5' UTR for simplicity)
2963    /// CDS positions: c.1 = tx 1, c.20 = tx 20 (last of exon 1), c.21 = tx 21 (first of exon 2)
2964    ///
2965    /// Intron 1 sequence (g.1020-1029): "GTAAGCTAGG" (10 bp)
2966    ///   - c.20+1 = g.1020 (G)
2967    ///   - c.20+10 = g.1029 (G)
2968    ///   - c.21-10 = g.1020 (G)
2969    ///   - c.21-1 = g.1029 (G)
2970    ///
2971    /// Intron 2 sequence (g.1050-1059): "GTAAGTAAGG" (10 bp)
2972    fn make_boundary_test_provider() -> MockProvider {
2973        use crate::reference::transcript::{Exon, ManeStatus, Strand, Transcript};
2974        use std::sync::OnceLock;
2975
2976        let mut provider = MockProvider::new();
2977
2978        // Build transcript sequence (exons only, spliced)
2979        // Exon 1 (20bp): ATGCCCAAAGGGTTTAGGCC (ends with CC at exon boundary)
2980        // Exon 2 (20bp): AAAGGGTTTAGGCCCAAAAA (AA repeat at both boundaries)
2981        // Exon 3 (18bp): GGGTTTAGGCCCAAATGA
2982        // Total: 58bp transcript
2983        let tx_seq = "ATGCCCAAAGGGTTTAGGCCAAAGGGTTTAGGCCCAAAAAGGGTTTAGGCCCAAATGA";
2984
2985        // Build genomic sequence around the transcript
2986        // We'll create 100bp before, the gene region, and 100bp after
2987        // Gene region: exon1 + intron1 + exon2 + intron2 + exon3
2988        // = 20 + 10 + 20 + 10 + 18 = 78bp
2989        //
2990        // Genomic: 900-999 (padding) + 1000-1019 (exon1) + 1020-1029 (intron1) +
2991        //          1030-1049 (exon2) + 1050-1059 (intron2) + 1060-1077 (exon3) + 1078+ (padding)
2992        let mut genomic_seq = String::new();
2993
2994        // Padding before (positions 0-999, 1000 bytes at 0-based)
2995        for _ in 0..1000 {
2996            genomic_seq.push('N');
2997        }
2998
2999        // Exon 1 (positions 1000-1019, 0-based 1000-1019)
3000        genomic_seq.push_str("ATGCCCAAAGGGTTTAGGCC");
3001
3002        // Intron 1 (positions 1020-1029) - with splice consensus
3003        // Note: The intron has AAA at the end (1027-1029) to test shifting
3004        genomic_seq.push_str("GTAAGCTAAA");
3005
3006        // Exon 2 (positions 1030-1049)
3007        genomic_seq.push_str("AAAGGGTTTAGGCCCAAAAA");
3008
3009        // Intron 2 (positions 1050-1059) - with AAA at start for testing
3010        genomic_seq.push_str("AAAGTAAGGG");
3011
3012        // Exon 3 (positions 1060-1077)
3013        genomic_seq.push_str("GGGTTTAGGCCCAAATGA");
3014
3015        // Padding after (100 bytes)
3016        for _ in 0..100 {
3017            genomic_seq.push('N');
3018        }
3019
3020        // Add genomic sequence to provider
3021        provider.add_genomic_sequence("chr1", genomic_seq);
3022
3023        // Create transcript with exons that have genomic coordinates
3024        provider.add_transcript(Transcript {
3025            id: "NM_BOUNDARY.1".to_string(),
3026            gene_symbol: Some("BOUNDARY".to_string()),
3027            strand: Strand::Plus,
3028            sequence: tx_seq.to_string(),
3029            cds_start: Some(1),
3030            cds_end: Some(58),
3031            exons: vec![
3032                Exon::with_genomic(1, 1, 20, 1000, 1019),
3033                Exon::with_genomic(2, 21, 40, 1030, 1049),
3034                Exon::with_genomic(3, 41, 58, 1060, 1077),
3035            ],
3036            chromosome: Some("chr1".to_string()),
3037            genomic_start: Some(1000),
3038            genomic_end: Some(1077),
3039            genome_build: Default::default(),
3040            mane_status: ManeStatus::None,
3041            refseq_match: None,
3042            ensembl_match: None,
3043            exon_cigars: Vec::new(),
3044            cached_introns: OnceLock::new(),
3045        });
3046
3047        provider
3048    }
3049
3050    #[test]
3051    fn test_boundary_spanning_exonic_to_intronic_del() {
3052        // Test: c.20_20+3del - deletion from last exon base into intron
3053        // c.20 = last base of exon 1 (C at g.1019)
3054        // c.20+3 = 3rd intronic base (A at g.1022)
3055        // Deletes: C (exonic) + GTA (intronic) = 4 bases
3056        //
3057        // This should normalize without error (using genomic space)
3058        let provider = make_boundary_test_provider();
3059        let normalizer = Normalizer::new(provider);
3060
3061        let variant = parse_hgvs("NM_BOUNDARY.1:c.20_20+3del").unwrap();
3062        let result = normalizer.normalize(&variant);
3063
3064        assert!(
3065            result.is_ok(),
3066            "Boundary-spanning deletion should normalize, got error: {:?}",
3067            result.err()
3068        );
3069
3070        let output = format!("{}", result.unwrap());
3071        assert!(
3072            output.contains("del"),
3073            "Should remain a deletion, got: {}",
3074            output
3075        );
3076    }
3077
3078    #[test]
3079    fn test_boundary_spanning_intronic_to_exonic_del() {
3080        // Test: c.21-3_23del - deletion from intron into exon
3081        // c.21-3 = 3rd base before exon 2 (A at g.1027)
3082        // c.23 = 3rd base of exon 2 (A at g.1032)
3083        // Deletes: AAA (intronic) + AAA (exonic) = 6 bases
3084        //
3085        // The intron ends with AAA (g.1027-1029) and exon starts with AAA (g.1030-1032)
3086        // This is a repeat, so normalization might shift
3087        let provider = make_boundary_test_provider();
3088        let normalizer = Normalizer::new(provider);
3089
3090        let variant = parse_hgvs("NM_BOUNDARY.1:c.21-3_23del").unwrap();
3091        let result = normalizer.normalize(&variant);
3092
3093        assert!(
3094            result.is_ok(),
3095            "Boundary-spanning deletion should normalize, got error: {:?}",
3096            result.err()
3097        );
3098
3099        let output = format!("{}", result.unwrap());
3100        assert!(
3101            output.contains("del"),
3102            "Should remain a deletion, got: {}",
3103            output
3104        );
3105    }
3106
3107    #[test]
3108    fn test_boundary_spanning_same_base_position() {
3109        // Test: c.20_20+5del - deletion starting and ending at same CDS base
3110        // Start is exonic (c.20), end is intronic (c.20+5)
3111        let provider = make_boundary_test_provider();
3112        let normalizer = Normalizer::new(provider);
3113
3114        let variant = parse_hgvs("NM_BOUNDARY.1:c.20_20+5del").unwrap();
3115        let result = normalizer.normalize(&variant);
3116
3117        assert!(
3118            result.is_ok(),
3119            "Same-base boundary-spanning deletion should normalize, got error: {:?}",
3120            result.err()
3121        );
3122    }
3123
3124    #[test]
3125    fn test_boundary_splice_site_plus1() {
3126        // Test: c.20_20+1del - deletion of last exon base + splice donor (GT)
3127        // This is a clinically important splice site variant
3128        let provider = make_boundary_test_provider();
3129        let normalizer = Normalizer::new(provider);
3130
3131        let variant = parse_hgvs("NM_BOUNDARY.1:c.20_20+1del").unwrap();
3132        let result = normalizer.normalize(&variant);
3133
3134        assert!(
3135            result.is_ok(),
3136            "Splice site +1 deletion should normalize, got error: {:?}",
3137            result.err()
3138        );
3139    }
3140
3141    #[test]
3142    fn test_boundary_splice_site_minus1() {
3143        // Test: c.21-1_22del - deletion of splice acceptor + first exon bases
3144        // c.21-1 = last intronic base before exon 2 (A at g.1029)
3145        // c.22 = 2nd base of exon 2 (A at g.1031)
3146        let provider = make_boundary_test_provider();
3147        let normalizer = Normalizer::new(provider);
3148
3149        let variant = parse_hgvs("NM_BOUNDARY.1:c.21-1_22del").unwrap();
3150        let result = normalizer.normalize(&variant);
3151
3152        assert!(
3153            result.is_ok(),
3154            "Splice site -1 deletion should normalize, got error: {:?}",
3155            result.err()
3156        );
3157    }
3158
3159    #[test]
3160    fn test_boundary_spanning_dup() {
3161        // Test: c.40_40+3dup - duplication spanning exon-intron boundary
3162        // c.40 = last base of exon 2 (A at g.1049)
3163        // c.40+3 = 3rd intronic base of intron 2 (A at g.1052)
3164        // Since intron 2 starts with AAA (and exon 2 ends with AAAAA), this creates a repeat pattern
3165        // The normalizer correctly converts this to repeat notation (A[N])
3166        let provider = make_boundary_test_provider();
3167        let normalizer = Normalizer::new(provider);
3168
3169        let variant = parse_hgvs("NM_BOUNDARY.1:c.40_40+3dup").unwrap();
3170        let result = normalizer.normalize(&variant);
3171
3172        assert!(
3173            result.is_ok(),
3174            "Boundary-spanning duplication should normalize, got error: {:?}",
3175            result.err()
3176        );
3177
3178        let output = format!("{}", result.unwrap());
3179        // May remain as dup or be converted to repeat notation for poly-A
3180        assert!(
3181            output.contains("dup") || output.contains("["),
3182            "Should remain a duplication or become repeat notation, got: {}",
3183            output
3184        );
3185    }
3186
3187    #[test]
3188    fn test_boundary_spanning_delins() {
3189        // Test: c.20_20+2delinsTTT - delins spanning exon-intron boundary
3190        let provider = make_boundary_test_provider();
3191        let normalizer = Normalizer::new(provider);
3192
3193        let variant = parse_hgvs("NM_BOUNDARY.1:c.20_20+2delinsTTT").unwrap();
3194        let result = normalizer.normalize(&variant);
3195
3196        assert!(
3197            result.is_ok(),
3198            "Boundary-spanning delins should normalize, got error: {:?}",
3199            result.err()
3200        );
3201
3202        let output = format!("{}", result.unwrap());
3203        assert!(
3204            output.contains("delins") || output.contains(">"),
3205            "Should remain a delins or become substitution, got: {}",
3206            output
3207        );
3208    }
3209
3210    #[test]
3211    fn test_boundary_no_genomic_data_returns_error() {
3212        // Test that without genomic data, we still get the ExonIntronBoundary error
3213        let provider = MockProvider::with_test_data(); // No genomic data
3214        let normalizer = Normalizer::new(provider);
3215
3216        // NM_001234.1 doesn't have genomic coordinates
3217        let variant = parse_hgvs("NM_001234.1:c.11_11+3del").unwrap();
3218        let result = normalizer.normalize(&variant);
3219
3220        assert!(
3221            result.is_err(),
3222            "Boundary-spanning without genomic data should return error"
3223        );
3224    }
3225
3226    #[test]
3227    fn test_deletion_3prime_shift_consecutive_bases() {
3228        // Test case simulating NM_001408491.1:c.517delA -> should become c.518del
3229        // Create a transcript with consecutive A's at positions that should shift
3230        use crate::reference::transcript::{Exon, ManeStatus, Strand};
3231        use std::sync::OnceLock;
3232
3233        let mut provider = MockProvider::new();
3234
3235        // Create a sequence where c.517 and c.518 are both 'A'
3236        // CDS starts at position 1, so c.N = transcript position N
3237        // Put "AA" at positions 517-518 (1-based)
3238        // Sequence: 516 bases of padding + "AA" + more padding
3239        let mut seq = String::new();
3240        for _ in 0..516 {
3241            seq.push('G'); // Padding (not A to ensure we see the shift)
3242        }
3243        seq.push('A'); // Position 517 (c.517)
3244        seq.push('A'); // Position 518 (c.518)
3245        for _ in 519..=600 {
3246            seq.push('G'); // More padding
3247        }
3248
3249        provider.add_transcript(crate::reference::transcript::Transcript {
3250            id: "NM_777777.1".to_string(),
3251            gene_symbol: Some("SHIFTTEST".to_string()),
3252            strand: Strand::Plus,
3253            sequence: seq.clone(),
3254            cds_start: Some(1),
3255            cds_end: Some(600),
3256            exons: vec![Exon::new(1, 1, 600)], // Single exon covering all
3257            chromosome: None,
3258            genomic_start: None,
3259            genomic_end: None,
3260            genome_build: Default::default(),
3261            mane_status: ManeStatus::None,
3262            refseq_match: None,
3263            ensembl_match: None,
3264            exon_cigars: Vec::new(),
3265            cached_introns: OnceLock::new(),
3266        });
3267
3268        let normalizer = Normalizer::new(provider);
3269
3270        // Parse c.517delA - deleting the first A
3271        let variant = parse_hgvs("NM_777777.1:c.517delA").unwrap();
3272
3273        // Debug: print the sequence around positions 517-518
3274        println!("Sequence length: {}", seq.len());
3275        println!(
3276            "Position 516 (0-based 515): {}",
3277            seq.chars().nth(515).unwrap()
3278        );
3279        println!(
3280            "Position 517 (0-based 516): {}",
3281            seq.chars().nth(516).unwrap()
3282        );
3283        println!(
3284            "Position 518 (0-based 517): {}",
3285            seq.chars().nth(517).unwrap()
3286        );
3287        println!(
3288            "Position 519 (0-based 518): {}",
3289            seq.chars().nth(518).unwrap()
3290        );
3291
3292        let result = normalizer.normalize(&variant).unwrap();
3293        let output = format!("{}", result);
3294
3295        println!("Input:  NM_777777.1:c.517delA");
3296        println!("Output: {}", output);
3297
3298        // The deletion should shift from c.517 to c.518 (3' rule)
3299        // because both positions are 'A'
3300        assert!(
3301            output.contains("c.518del"),
3302            "Deletion at c.517 should shift to c.518 (3' rule), got: {}",
3303            output
3304        );
3305    }
3306
3307    #[test]
3308    fn test_deletion_3prime_shift_with_utr() {
3309        // Same test but with a 5' UTR (cds_start > 1)
3310        // This simulates real transcripts more accurately
3311        use crate::reference::transcript::{Exon, ManeStatus, Strand};
3312        use std::sync::OnceLock;
3313
3314        let mut provider = MockProvider::new();
3315
3316        // Create a transcript with 100bp 5' UTR
3317        // CDS starts at position 101, so:
3318        // c.1 = tx position 101
3319        // c.517 = tx position 617
3320        // c.518 = tx position 618
3321        let utr_len = 100;
3322        let mut seq = String::new();
3323
3324        // 5' UTR (100 bases)
3325        for _ in 0..utr_len {
3326            seq.push('T');
3327        }
3328        // CDS: 516 bases of G padding, then "AA", then more G
3329        for _ in 0..516 {
3330            seq.push('G');
3331        }
3332        seq.push('A'); // tx position 617 = c.517
3333        seq.push('A'); // tx position 618 = c.518
3334        for _ in 0..100 {
3335            seq.push('G');
3336        }
3337
3338        let seq_len = seq.len();
3339        println!("Test with UTR:");
3340        println!("  Sequence length: {}", seq_len);
3341        println!("  CDS start (1-based): 101");
3342        println!("  c.517 = tx position 617 (0-based 616)");
3343        println!("  c.518 = tx position 618 (0-based 617)");
3344        println!(
3345            "  tx pos 617 (0-based 616): {}",
3346            seq.chars().nth(616).unwrap()
3347        );
3348        println!(
3349            "  tx pos 618 (0-based 617): {}",
3350            seq.chars().nth(617).unwrap()
3351        );
3352
3353        provider.add_transcript(crate::reference::transcript::Transcript {
3354            id: "NM_666666.1".to_string(),
3355            gene_symbol: Some("UTRTEST".to_string()),
3356            strand: Strand::Plus,
3357            sequence: seq.clone(),
3358            cds_start: Some(101),
3359            cds_end: Some(seq_len as u64),
3360            exons: vec![Exon::new(1, 1, seq_len as u64)], // Single exon
3361            chromosome: None,
3362            genomic_start: None,
3363            genomic_end: None,
3364            genome_build: Default::default(),
3365            mane_status: ManeStatus::None,
3366            refseq_match: None,
3367            ensembl_match: None,
3368            exon_cigars: Vec::new(),
3369            cached_introns: OnceLock::new(),
3370        });
3371
3372        let normalizer = Normalizer::new(provider);
3373
3374        let variant = parse_hgvs("NM_666666.1:c.517delA").unwrap();
3375        let result = normalizer.normalize(&variant).unwrap();
3376        let output = format!("{}", result);
3377
3378        println!("Input:  NM_666666.1:c.517delA");
3379        println!("Output: {}", output);
3380
3381        assert!(
3382            output.contains("c.518del"),
3383            "Deletion at c.517 should shift to c.518 (3' rule) even with UTR, got: {}",
3384            output
3385        );
3386    }
3387
3388    #[test]
3389    fn test_normalize_inverted_range_insertion_no_panic() {
3390        // Regression: ClinVar pattern NC_000011.10:g.5238138_5153222insTATTT
3391        // has start > end (inverted range).  Previously caused a panic in
3392        // insertion_is_duplication due to slice index out of bounds.
3393        // The normalizer should return an error, not panic.
3394        let provider = MockProvider::with_test_data();
3395        let normalizer = Normalizer::new(provider);
3396
3397        let variant = parse_hgvs("NC_000011.10:g.5238138_5153222insTATTT").unwrap();
3398        let result = normalizer.normalize(&variant);
3399        // It's fine if this returns Ok (unchanged) or Err (validation failure),
3400        // but it must NOT panic.
3401        let _ = result;
3402    }
3403
3404    #[test]
3405    fn test_delins_should_not_shift() {
3406        // HGVS spec: delins should NOT be 3' shifted like del/dup/ins
3407        // This test ensures we don't incorrectly shift delins positions
3408        use crate::reference::transcript::{Exon, ManeStatus, Strand};
3409        use std::sync::OnceLock;
3410
3411        let mut provider = MockProvider::new();
3412
3413        // Create a transcript where delins could theoretically shift if we were wrong
3414        // Sequence: ...GGAATTCC... where we do c.10_11delinsXX
3415        // If incorrectly shifted, it might become c.11_12delinsXX
3416        let seq = "GGGGGGGGGGAATTCCGGGGGGGGGG".to_string(); // c.10=A, c.11=A, c.12=T, c.13=T
3417
3418        provider.add_transcript(crate::reference::transcript::Transcript {
3419            id: "NM_555555.1".to_string(),
3420            gene_symbol: Some("DELINSTEST".to_string()),
3421            strand: Strand::Plus,
3422            sequence: seq,
3423            cds_start: Some(1),
3424            cds_end: Some(26),
3425            exons: vec![Exon::new(1, 1, 26)],
3426            chromosome: None,
3427            genomic_start: None,
3428            genomic_end: None,
3429            genome_build: Default::default(),
3430            mane_status: ManeStatus::None,
3431            refseq_match: None,
3432            ensembl_match: None,
3433            exon_cigars: Vec::new(),
3434            cached_introns: OnceLock::new(),
3435        });
3436
3437        let normalizer = Normalizer::new(provider);
3438
3439        // Test delins - should NOT shift
3440        let variant = parse_hgvs("NM_555555.1:c.10_11delinsTT").unwrap();
3441        let result = normalizer.normalize(&variant).unwrap();
3442        let output = format!("{}", result);
3443
3444        assert!(
3445            output.contains("c.10_11delins"),
3446            "Delins should NOT be shifted (HGVS spec), got: {}",
3447            output
3448        );
3449    }
3450
3451    #[test]
3452    fn test_cds_to_tx_pos_utr5_underflow() {
3453        // cds_start=5, base=-6 → 5 + (-6) - 1 = -2, should return Err not wrap to u64::MAX
3454        let provider = MockProvider::with_test_data();
3455        let normalizer = Normalizer::new(provider);
3456        let pos = CdsPos {
3457            base: -6,
3458            offset: None,
3459            utr3: false,
3460        };
3461        let result = normalizer.cds_to_tx_pos(&pos, 5, Some(38));
3462        assert!(
3463            result.is_err(),
3464            "cds_to_tx_pos should return Err for positions before transcript start, got: {:?}",
3465            result
3466        );
3467    }
3468
3469    #[test]
3470    fn test_cds_to_tx_pos_utr5_valid() {
3471        // HGVS numbering skips c.0, so c.-N maps to tx position
3472        // cds_start - N. For cds_start=5, c.-3 → tx = 5 + (-3) = 2.
3473        // Issue #97 — the previous formula `cds_start + base - 1`
3474        // double-counted the gap and returned tx 1 (the c.-4 base).
3475        let provider = MockProvider::with_test_data();
3476        let normalizer = Normalizer::new(provider);
3477        let pos = CdsPos {
3478            base: -3,
3479            offset: None,
3480            utr3: false,
3481        };
3482        let result = normalizer.cds_to_tx_pos(&pos, 5, Some(38));
3483        assert_eq!(result.unwrap(), 2);
3484    }
3485
3486    #[test]
3487    fn test_variant_positions_negative_cds_base() {
3488        // CDS variant with negative base should return None from variant_positions
3489        let provider = MockProvider::with_test_data();
3490        let normalizer = Normalizer::new(provider);
3491        let variant = parse_hgvs("NM_001234.1:c.-3A>G").unwrap();
3492        let positions = normalizer.extract_position_range(&variant);
3493        assert_eq!(
3494            positions, None,
3495            "variant_positions should return None for 5' UTR CDS positions"
3496        );
3497    }
3498
3499    #[test]
3500    fn test_normalize_cds_utr5_deep_negative() {
3501        // A deeply negative 5' UTR position that would overflow should return an error, not panic
3502        let provider = MockProvider::with_test_data();
3503        let normalizer = Normalizer::new(provider);
3504        // c.-88 with cds_start=5 → 5 + (-88) - 1 = -84, which would wrap to huge u64
3505        let variant = parse_hgvs("NM_001234.1:c.-88A>G").unwrap();
3506        let result = normalizer.normalize(&variant);
3507        // The primary check is that this doesn't panic.
3508        let _ = result;
3509    }
3510
3511    #[test]
3512    fn test_normalize_unknown_offset_returns_unchanged() {
3513        // Variants with ? offsets (sentinel values i64::MAX/MIN) should return unchanged
3514        // because we can't normalize with indeterminate boundaries
3515        let provider = MockProvider::with_test_data();
3516        let normalizer = Normalizer::new(provider);
3517
3518        // c.-85-?_834+?del has unknown offsets on both positions
3519        let variant = parse_hgvs("NM_000088.3:c.-85-?_834+?del").unwrap();
3520        let result = normalizer.normalize(&variant);
3521        assert!(
3522            result.is_ok(),
3523            "Unknown offset should not error, got: {:?}",
3524            result.err()
3525        );
3526    }
3527
3528    #[test]
3529    fn test_normalize_unknown_offset_single_position() {
3530        // Even a single unknown offset should cause early return
3531        let provider = MockProvider::with_test_data();
3532        let normalizer = Normalizer::new(provider);
3533
3534        let variant = parse_hgvs("NM_000088.3:c.10-?del").unwrap();
3535        let result = normalizer.normalize(&variant);
3536        assert!(
3537            result.is_ok(),
3538            "Single unknown offset should not error, got: {:?}",
3539            result.err()
3540        );
3541    }
3542
3543    #[test]
3544    fn test_normalize_utr_before_tx_start_returns_unchanged() {
3545        // c.-215 with a small UTR should not error - return unchanged
3546        // NM_001234.1 has cds_start=5, so c.-215 maps to 5 + (-215) - 1 = -211
3547        let provider = MockProvider::with_test_data();
3548        let normalizer = Normalizer::new(provider);
3549
3550        let variant = parse_hgvs("NM_001234.1:c.-215_-214del").unwrap();
3551        let result = normalizer.normalize(&variant);
3552        assert!(
3553            result.is_ok(),
3554            "UTR before transcript start should not error, got: {:?}",
3555            result.err()
3556        );
3557    }
3558
3559    #[test]
3560    fn test_normalize_no_cds_returns_unchanged() {
3561        // An NR_ transcript with c. coordinates should not error
3562        let mut provider = MockProvider::new();
3563        use crate::reference::transcript::{Exon, Transcript};
3564        provider.add_transcript(Transcript {
3565            id: "NR_001566.1".to_string(),
3566            gene_symbol: Some("NCRNA".to_string()),
3567            strand: crate::reference::transcript::Strand::Plus,
3568            sequence: "ATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGC".to_string(),
3569            cds_start: None,
3570            cds_end: None,
3571            exons: vec![Exon::new(1, 1, 51)],
3572            chromosome: None,
3573            genomic_start: None,
3574            genomic_end: None,
3575            genome_build: Default::default(),
3576            mane_status: Default::default(),
3577            refseq_match: None,
3578            ensembl_match: None,
3579            exon_cigars: Vec::new(),
3580            cached_introns: std::sync::OnceLock::new(),
3581        });
3582        let normalizer = Normalizer::new(provider);
3583
3584        // c. variant on a non-coding transcript (no CDS)
3585        let variant = parse_hgvs("NR_001566.1:c.10del").unwrap();
3586        let result = normalizer.normalize(&variant);
3587        assert!(
3588            result.is_ok(),
3589            "No CDS should not error, got: {:?}",
3590            result.err()
3591        );
3592    }
3593
3594    #[test]
3595    fn test_normalize_tx_intronic() {
3596        // n. intronic variants should normalize via genomic space
3597        // Build a non-coding transcript with genomic coords and intronic positions
3598        use crate::reference::transcript::{Exon, GenomeBuild, ManeStatus, Strand, Transcript};
3599
3600        let mut provider = MockProvider::new();
3601
3602        // Create transcript: 2 exons with an intron in between
3603        // Exon 1: tx 1-100, genomic 1000-1099
3604        // Intron: genomic 1100-1199
3605        // Exon 2: tx 101-200, genomic 1200-1299
3606        // Sequence in the intron around position 1100+: AAAA... (for shifting test)
3607        let tx_sequence = "A".repeat(200);
3608
3609        provider.add_transcript(Transcript {
3610            id: "NR_038982.1".to_string(),
3611            gene_symbol: Some("NCRNA_TEST".to_string()),
3612            strand: Strand::Plus,
3613            sequence: tx_sequence,
3614            cds_start: None,
3615            cds_end: None,
3616            exons: vec![
3617                Exon::with_genomic(1, 1, 100, 1000, 1099),
3618                Exon::with_genomic(2, 101, 200, 1200, 1299),
3619            ],
3620            chromosome: Some("chr1".to_string()),
3621            genomic_start: Some(1000),
3622            genomic_end: Some(1299),
3623            genome_build: GenomeBuild::GRCh38,
3624            mane_status: ManeStatus::None,
3625            refseq_match: None,
3626            ensembl_match: None,
3627            exon_cigars: Vec::new(),
3628            cached_introns: std::sync::OnceLock::new(),
3629        });
3630
3631        // Add genomic sequence for chr1 around positions 1000-1299
3632        // Make the intron region (1100-1199) be "AGCT" repeated to test shifting
3633        let mut genomic = String::new();
3634        for _ in 0..325 {
3635            genomic.push_str("AGCT");
3636        }
3637        provider.add_genomic_sequence("chr1", genomic);
3638
3639        let normalizer = Normalizer::new(provider);
3640
3641        // n.100+4del - intronic deletion in a non-coding transcript
3642        let variant = parse_hgvs("NR_038982.1:n.100+4del").unwrap();
3643        let result = normalizer.normalize(&variant);
3644        assert!(
3645            result.is_ok(),
3646            "n. intronic normalization should succeed, got: {:?}",
3647            result.err()
3648        );
3649        let output = format!("{}", result.unwrap());
3650        assert!(
3651            output.contains('+') || output.contains('-'),
3652            "Normalized intronic n. variant should retain intronic notation, got: {}",
3653            output
3654        );
3655    }
3656}