Skip to main content

ferro_hgvs/project/
projector.rs

1//! `VariantProjector` orchestrator.
2
3use crate::data::mapping::MappingInfo;
4use crate::data::projection::Projector;
5use crate::error::FerroError;
6use crate::hgvs::edit::NaEdit;
7use crate::hgvs::interval::{CdsInterval, TxInterval};
8use crate::hgvs::location::{CdsPos, GenomePos, TxPos};
9use crate::hgvs::variant::{
10    is_frameshift, AlleleVariant, CdsVariant, HgvsVariant, LocEdit, TxVariant,
11};
12use crate::normalize::{NormalizeConfig, Normalizer};
13use crate::project::accession::parse_accession;
14use crate::project::edit::transform_edit_for_strand;
15use crate::project::protein::{predict_indel_protein, predict_substitution_protein};
16use crate::project::result::VariantProjection;
17use crate::reference::ReferenceProvider;
18
19pub struct VariantProjector<P: ReferenceProvider + Clone> {
20    projector: Projector,
21    provider: P,
22    normalizer: Normalizer<P>,
23}
24
25impl<P: ReferenceProvider + Clone> VariantProjector<P> {
26    pub fn new(projector: Projector, provider: P) -> Self {
27        let normalizer = Normalizer::with_config(provider.clone(), NormalizeConfig::default());
28        Self {
29            projector,
30            provider,
31            normalizer,
32        }
33    }
34
35    pub fn with_normalize_config(mut self, config: NormalizeConfig) -> Self {
36        self.normalizer = Normalizer::with_config(self.provider.clone(), config);
37        self
38    }
39
40    /// Parse, normalize, and project an HGVS string onto a transcript.
41    pub fn project(
42        &self,
43        hgvs_string: &str,
44        transcript_id: &str,
45    ) -> Result<VariantProjection, FerroError> {
46        let variant = crate::parse_hgvs(hgvs_string)?;
47        self.project_variant(&variant, transcript_id)
48    }
49
50    /// Normalize and project an already-parsed g. variant onto a transcript.
51    ///
52    /// The variant is normalized first; for pre-normalized variants use
53    /// [`project_normalized`] to skip the redundant normalization step.
54    pub fn project_variant(
55        &self,
56        variant: &HgvsVariant,
57        transcript_id: &str,
58    ) -> Result<VariantProjection, FerroError> {
59        // 1. Normalize the genomic variant. The normalizer is built once at
60        // construction time so we don't clone the (potentially heavy) provider
61        // on every call.
62        let normalized = self.normalizer.normalize(variant)?;
63        self.project_variant_inner(&normalized, transcript_id)
64    }
65
66    /// Project an already-normalized g. variant onto a transcript, skipping the
67    /// normalization step.
68    ///
69    /// Callers that pre-normalize once and then project against many transcripts
70    /// should use this method to avoid the cost of re-normalization.
71    ///
72    /// **Warning**: passing a non-normalized variant will produce coordinates
73    /// that are technically valid but may not match other tools' canonical form.
74    pub fn project_normalized(
75        &self,
76        variant: &HgvsVariant,
77        transcript_id: &str,
78    ) -> Result<VariantProjection, FerroError> {
79        self.project_variant_inner(variant, transcript_id)
80    }
81
82    /// Parse, normalize, and project an HGVS string onto ALL overlapping
83    /// transcripts, returning results in clinical priority order (MANE Select
84    /// first, then Plus Clinical, then canonical, then longest CDS).
85    ///
86    /// Returns an empty `Vec` when the variant overlaps no known transcripts.
87    /// Individual transcript errors are logged at trace level and silently
88    /// skipped so that a single bad transcript does not abort the whole call.
89    pub fn project_all(&self, hgvs_string: &str) -> Result<Vec<VariantProjection>, FerroError> {
90        let variant = crate::parse_hgvs(hgvs_string)?;
91        self.project_variant_all(&variant)
92    }
93
94    /// Normalize and project an already-parsed g. variant onto ALL overlapping
95    /// transcripts.
96    ///
97    /// See [`project_all`] for ordering and error-handling semantics.
98    pub fn project_variant_all(
99        &self,
100        variant: &HgvsVariant,
101    ) -> Result<Vec<VariantProjection>, FerroError> {
102        // 1. Normalize once via the cached normalizer (built at construction time).
103        let normalized = self.normalizer.normalize(variant)?;
104        self.project_normalized_all(&normalized)
105    }
106
107    /// Project an already-normalized g. variant onto ALL overlapping
108    /// transcripts, skipping re-normalization.
109    ///
110    /// Callers that pre-normalize once and then fan-out across transcripts
111    /// should use this method.
112    pub fn project_normalized_all(
113        &self,
114        variant: &HgvsVariant,
115    ) -> Result<Vec<VariantProjection>, FerroError> {
116        // 2. Extract contig + first-base position from the normalized variant.
117        //    For Allele variants we use the first inner variant's accession.
118        let (contig, pos) = extract_contig_and_pos(variant)?;
119
120        // 3. Find overlapping transcripts via the Projector (sorted by priority).
121        let projection_result = self.projector.project(&contig, pos)?;
122
123        // 4. Project against each overlapping transcript.
124        let mut results = Vec::with_capacity(projection_result.projections.len());
125        for tx_proj in &projection_result.projections {
126            match self.project_variant_inner(variant, &tx_proj.transcript_id) {
127                Ok(vp) => results.push(vp),
128                Err(e) => {
129                    // Skip transcripts that fail — log at trace level only.
130                    log::trace!(
131                        "project_normalized_all: skipping {} for {}: {}",
132                        tx_proj.transcript_id,
133                        variant,
134                        e
135                    );
136                }
137            }
138        }
139
140        Ok(results)
141    }
142
143    // -------------------------------------------------------------------------
144    // Private helpers
145    // -------------------------------------------------------------------------
146
147    /// Core projection logic, operating on a variant that is assumed to be
148    /// already normalized.  Does NOT re-normalize.
149    fn project_variant_inner(
150        &self,
151        variant: &HgvsVariant,
152        transcript_id: &str,
153    ) -> Result<VariantProjection, FerroError> {
154        // Dispatch on variant kind.
155        match variant {
156            HgvsVariant::Allele(allele) => {
157                self.project_allele_inner(allele, variant, transcript_id)
158            }
159            _ => self.project_single_inner(variant, transcript_id),
160        }
161    }
162
163    /// Project a compound [`AlleleVariant`] by recursively projecting each
164    /// inner variant and combining the results.
165    fn project_allele_inner(
166        &self,
167        allele: &AlleleVariant,
168        original: &HgvsVariant,
169        transcript_id: &str,
170    ) -> Result<VariantProjection, FerroError> {
171        // Empty allele: pass through with no coding/protein, but validate the
172        // transcript ID — every other path through `project_variant_inner` errors
173        // out for unknown transcripts (via `project_single_inner`), and this
174        // branch must behave the same to avoid silently returning bogus
175        // projections for typo'd or missing accessions.
176        if allele.variants.is_empty() {
177            let gene_symbol = self
178                .projector
179                .mapper()
180                .cdot()
181                .get_transcript(transcript_id)
182                .ok_or_else(|| FerroError::ReferenceNotFound {
183                    id: transcript_id.to_string(),
184                })?
185                .gene_name
186                .clone();
187            return Ok(VariantProjection {
188                genomic: original.clone(),
189                coding: None,
190                protein: None,
191                transcript_id: transcript_id.to_string(),
192                gene_symbol,
193                is_frameshift: false,
194                is_intronic: false,
195                is_utr: false,
196            });
197        }
198
199        // Recursively project each inner variant.
200        let mut inner_projections = Vec::with_capacity(allele.variants.len());
201        for inner in &allele.variants {
202            let proj = self.project_variant_inner(inner, transcript_id)?;
203            inner_projections.push(proj);
204        }
205
206        // Aggregate flags.
207        let is_frameshift = inner_projections.iter().any(|p| p.is_frameshift);
208        let is_intronic = inner_projections.iter().any(|p| p.is_intronic);
209        let is_utr = inner_projections.iter().any(|p| p.is_utr);
210
211        // gene_symbol from any projection that has one.
212        let gene_symbol = inner_projections.iter().find_map(|p| p.gene_symbol.clone());
213
214        // Build the coding allele from inner c./n. variants.
215        let coding_variants: Option<Vec<HgvsVariant>> =
216            inner_projections.iter().map(|p| p.coding.clone()).collect();
217        let coding = coding_variants
218            .map(|variants| HgvsVariant::Allele(AlleleVariant::new(variants, allele.phase)));
219
220        // Build the protein allele only if ALL inner projections have a protein.
221        let all_have_protein = inner_projections.iter().all(|p| p.protein.is_some());
222        let protein = if all_have_protein {
223            let protein_variants: Vec<HgvsVariant> = inner_projections
224                .iter()
225                .filter_map(|p| p.protein.clone())
226                .collect();
227            Some(HgvsVariant::Allele(AlleleVariant::new(
228                protein_variants,
229                allele.phase,
230            )))
231        } else {
232            None
233        };
234
235        Ok(VariantProjection {
236            genomic: original.clone(),
237            coding,
238            protein,
239            transcript_id: transcript_id.to_string(),
240            gene_symbol,
241            is_frameshift,
242            is_intronic,
243            is_utr,
244        })
245    }
246
247    /// Project a single (non-allele) g. variant, assuming it has already been
248    /// normalized.
249    fn project_single_inner(
250        &self,
251        normalized: &HgvsVariant,
252        transcript_id: &str,
253    ) -> Result<VariantProjection, FerroError> {
254        // Require a g. variant.
255        let genome_variant = match normalized {
256            HgvsVariant::Genome(g) => g.clone(),
257            _ => {
258                return Err(FerroError::UnsupportedProjection {
259                    reason: "VariantProjector currently only accepts g. variants".to_string(),
260                });
261            }
262        };
263
264        let edit = genome_variant
265            .loc_edit
266            .edit
267            .inner()
268            .cloned()
269            .ok_or_else(|| FerroError::UnsupportedProjection {
270                reason: "g. variant has no concrete edit".to_string(),
271            })?;
272
273        // Look up the transcript in the cdot mapper.
274        let cdot_tx = self
275            .projector
276            .mapper()
277            .cdot()
278            .get_transcript(transcript_id)
279            .ok_or_else(|| FerroError::ReferenceNotFound {
280                id: transcript_id.to_string(),
281            })?;
282        let strand = cdot_tx.strand;
283        let gene_symbol = cdot_tx.gene_name.clone();
284        let cdot_protein = cdot_tx.protein.clone();
285        let is_coding = cdot_tx.cds_start.is_some();
286
287        // Extract start and end genomic positions from the variant interval.
288        let g_start = genome_variant
289            .loc_edit
290            .location
291            .start
292            .inner()
293            .cloned()
294            .ok_or_else(|| FerroError::InvalidCoordinates {
295                msg: "genomic interval start is unknown".to_string(),
296            })?;
297        let g_end = genome_variant
298            .loc_edit
299            .location
300            .end
301            .inner()
302            .cloned()
303            .ok_or_else(|| FerroError::InvalidCoordinates {
304                msg: "genomic interval end is unknown".to_string(),
305            })?;
306
307        let mapper = self.projector.mapper();
308        let normalized_str = normalized.to_string();
309
310        // Compute the overall genomic extent of the transcript from its exons.
311        // Exon format: [genome_start(0-based), genome_end(0-based excl), tx_start, tx_end].
312        let (tx_genome_start, tx_genome_end) = {
313            let exons = &cdot_tx.exons;
314            if exons.is_empty() {
315                return Err(FerroError::ReferenceNotFound {
316                    id: transcript_id.to_string(),
317                });
318            }
319            let starts = exons.iter().map(|e| e[0]).min().unwrap();
320            let ends = exons.iter().map(|e| e[1]).max().unwrap();
321            (starts, ends)
322        };
323
324        // Helper: map one GenomePos → CdsPos, converting out-of-range errors.
325        let map_position = |gp: &GenomePos| -> Result<(CdsPos, MappingInfo), FerroError> {
326            if gp.base < tx_genome_start || gp.base >= tx_genome_end {
327                return Err(FerroError::TranscriptNotOverlapping {
328                    variant: normalized_str.clone(),
329                    transcript_id: transcript_id.to_string(),
330                });
331            }
332            match mapper.genome_to_cds(transcript_id, gp) {
333                Ok(res) => Ok((res.variant, res.info)),
334                Err(FerroError::InvalidCoordinates { .. })
335                | Err(FerroError::ConversionError { .. }) => {
336                    Err(FerroError::TranscriptNotOverlapping {
337                        variant: normalized_str.clone(),
338                        transcript_id: transcript_id.to_string(),
339                    })
340                }
341                Err(other) => Err(other),
342            }
343        };
344
345        let (cds_start_raw, info_start) = map_position(&g_start)?;
346        let (cds_end_raw, info_end) = map_position(&g_end)?;
347
348        // On minus strand the start and end of the c. interval are swapped.
349        let (cds_start, cds_end) = match strand {
350            crate::reference::Strand::Plus => (cds_start_raw, cds_end_raw),
351            crate::reference::Strand::Minus => (cds_end_raw, cds_start_raw),
352            crate::reference::Strand::Unknown => {
353                return Err(FerroError::UnsupportedProjection {
354                    reason: format!(
355                        "transcript {} has unknown strand; cannot project g. → c./n.",
356                        transcript_id
357                    ),
358                });
359            }
360        };
361
362        // Transform the edit for the transcript strand.
363        let c_edit = transform_edit_for_strand(&edit, strand);
364
365        // Build the c./n. HGVS variant.
366        let coding = if is_coding {
367            let interval = CdsInterval::new(cds_start, cds_end);
368            HgvsVariant::Cds(CdsVariant {
369                accession: parse_accession(transcript_id),
370                gene_symbol: gene_symbol.clone(),
371                loc_edit: LocEdit::new(interval, c_edit.clone()),
372            })
373        } else {
374            let tx_start = TxPos::new(cds_start.base);
375            let tx_end = TxPos::new(cds_end.base);
376            HgvsVariant::Tx(TxVariant {
377                accession: parse_accession(transcript_id),
378                gene_symbol: gene_symbol.clone(),
379                loc_edit: LocEdit::new(TxInterval::new(tx_start, tx_end), c_edit.clone()),
380            })
381        };
382
383        // Derive per-position flags from MappingInfo.
384        let is_intronic = info_start.is_intronic || info_end.is_intronic;
385        let is_utr = !is_intronic
386            && (info_start.in_5utr || info_start.in_3utr || info_end.in_5utr || info_end.in_3utr);
387
388        // Predict protein consequence for CDS variants.
389        let mut protein = None;
390        if !is_intronic && !is_utr && is_coding {
391            // Prefer the explicit cdot.protein accession; otherwise infer NP_/XP_
392            // from NM_/XM_ by stripping the prefix (not by substring replace, which
393            // would mangle accessions whose suffix happens to contain "NM_"). If
394            // we cannot infer a protein accession, skip protein prediction rather
395            // than fabricate a bogus one.
396            let prot_acc = match cdot_protein {
397                Some(p) => Some(p),
398                None => transcript_id
399                    .strip_prefix("NM_")
400                    .map(|rest| format!("NP_{rest}"))
401                    .or_else(|| {
402                        transcript_id
403                            .strip_prefix("XM_")
404                            .map(|rest| format!("XP_{rest}"))
405                    }),
406            };
407            if let Some(prot_acc) = prot_acc {
408                match &c_edit {
409                    NaEdit::Substitution { .. } => {
410                        let tx_for_codon = self.provider.get_transcript(transcript_id)?;
411                        protein = Some(predict_substitution_protein(
412                            &tx_for_codon,
413                            cds_start.base,
414                            &c_edit,
415                            &prot_acc,
416                        )?);
417                    }
418                    NaEdit::Deletion { .. }
419                    | NaEdit::Insertion { .. }
420                    | NaEdit::Duplication { .. }
421                    | NaEdit::Delins { .. }
422                    | NaEdit::Inversion { .. }
423                        // Only predict when both CDS positions are concrete exonic positions
424                        // (no intronic offsets — already guarded above).
425                        if cds_start.offset.is_none()
426                            && cds_end.offset.is_none()
427                            && cds_start.base > 0
428                            && cds_end.base > 0 =>
429                    {
430                        let tx_for_codon = self.provider.get_transcript(transcript_id)?;
431                        match predict_indel_protein(
432                            &tx_for_codon,
433                            cds_start.base,
434                            cds_end.base,
435                            &c_edit,
436                            &prot_acc,
437                        ) {
438                            Ok(pv) => protein = Some(pv),
439                            // Non-fatal: unsupported edits or missing sequence → leave protein=None.
440                            Err(FerroError::UnsupportedProjection { .. })
441                            | Err(FerroError::ProteinSequenceUnavailable { .. }) => {}
442                            Err(other) => return Err(other),
443                        }
444                    }
445                    _ => {}
446                }
447            }
448        }
449
450        let frameshift = is_frameshift(&coding);
451        Ok(VariantProjection {
452            genomic: normalized.clone(),
453            coding: Some(coding),
454            protein,
455            transcript_id: transcript_id.to_string(),
456            gene_symbol,
457            is_frameshift: frameshift,
458            is_intronic,
459            is_utr,
460        })
461    }
462}
463
464/// Extract the contig name and a representative 1-based genomic position from
465/// an already-normalized `HgvsVariant`.
466///
467/// For `HgvsVariant::Allele`, the first inner g. variant is used.
468///
469/// # Contig name resolution
470///
471/// The contig name is taken directly from the variant's `Accession`. For
472/// standard RefSeq genomic accessions (e.g. `NC_000001.11`) the cdot mapper
473/// stores contigs under those same names and also aliases UCSC names
474/// (`chr1`) to them via `populate_contig_aliases`.  For assembly-notation
475/// accessions (`GRCh38(chr1)`) the `chromosome` field of `Accession` is
476/// used instead.  Callers that use non-standard accession formats (e.g.
477/// plain `chr1` keys) should ensure their cdot data was loaded with matching
478/// contig keys.
479fn extract_contig_and_pos(variant: &HgvsVariant) -> Result<(String, u64), FerroError> {
480    let effective = match variant {
481        HgvsVariant::Allele(allele) => {
482            allele
483                .variants
484                .first()
485                .ok_or_else(|| FerroError::UnsupportedProjection {
486                    reason: "cannot project an empty allele to all transcripts".to_string(),
487                })?
488        }
489        other => other,
490    };
491
492    match effective {
493        HgvsVariant::Genome(gv) => {
494            // Prefer the chromosome field for assembly-notation accessions.
495            let contig = if let Some(chr) = &gv.accession.chromosome {
496                chr.to_string()
497            } else {
498                gv.accession.full()
499            };
500
501            let pos = gv
502                .loc_edit
503                .location
504                .start
505                .inner()
506                .cloned()
507                .ok_or_else(|| FerroError::InvalidCoordinates {
508                    msg: "genomic interval start is unknown".to_string(),
509                })?
510                .base;
511
512            Ok((contig, pos))
513        }
514        _ => Err(FerroError::UnsupportedProjection {
515            reason: "project_all currently only accepts g. variants".to_string(),
516        }),
517    }
518}
519
520#[cfg(test)]
521mod tests {
522    use super::*;
523    use crate::data::cdot::{CdotMapper, CdotTranscript};
524    use crate::data::projection::Projector;
525    use crate::hgvs::variant::{AllelePhase, AlleleVariant};
526    use crate::reference::mock::MockProvider;
527    use crate::reference::transcript::{Exon, ManeStatus, Strand as TxStrand, Transcript};
528    use crate::reference::Strand as ProvStrand;
529    use std::sync::OnceLock;
530
531    fn make_test_provider_and_projector() -> (Projector, MockProvider) {
532        // A single coding transcript on chr1, plus strand.
533        //
534        // Genomic layout (cdot 0-based coords):
535        //   Exon: genome [1000, 1009), tx [0, 9), so 9 bases total.
536        //   cds_start = 0 (0-based, no 5'UTR), cds_end = 9.
537        //
538        // Sequence: "ATGCGCTAA" = Met-Arg-Stop (3 codons).
539        //
540        // Coordinate mapping (cdot genome 0-based → HGVS c. 1-based):
541        //   g.1000 → tx_pos=0 → c.1 (A = first base of Met)
542        //   g.1001 → tx_pos=1 → c.2 (T)
543        //   g.1002 → tx_pos=2 → c.3 (G)
544        //   g.1003 → tx_pos=3 → c.4 (C ← ref base for test substitution)
545        //   g.1004 → tx_pos=4 → c.5 (G)
546        //   ...
547        //
548        // c.4C>A: codon 2 CGC (Arg) → AGC (Ser), missense.
549        let mut cdot = CdotMapper::new();
550        cdot.add_transcript(
551            "NM_TEST.1".to_string(),
552            CdotTranscript {
553                gene_name: Some("TESTGENE".to_string()),
554                contig: "chr1".to_string(),
555                strand: ProvStrand::Plus,
556                // [genome_start(0-based), genome_end(0-based excl), tx_start(1-based), tx_end(1-based)]
557                exons: vec![[1000, 1009, 0, 9]],
558                cds_start: Some(0), // 0-based: CDS starts at tx_pos 0 (no 5'UTR)
559                cds_end: Some(9),   // 0-based exclusive: CDS ends at tx_pos 9
560                gene_id: None,
561                protein: Some("NP_TEST.1".to_string()),
562                exon_cigars: Vec::new(),
563            },
564        );
565        let projector = Projector::new(cdot);
566
567        let mut provider = MockProvider::new();
568        // Transcript: sequence "ATGCGCTAA", cds_start=1 (1-based, first base).
569        provider.add_transcript(Transcript {
570            id: "NM_TEST.1".to_string(),
571            gene_symbol: Some("TESTGENE".to_string()),
572            strand: TxStrand::Plus,
573            sequence: Some("ATGCGCTAA".to_string()),
574            cds_start: Some(1), // 1-based inclusive per Transcript convention
575            cds_end: Some(9),
576            exons: vec![Exon::new(1, 1, 9)],
577            chromosome: Some("chr1".to_string()),
578            genomic_start: Some(1000),
579            genomic_end: Some(1008),
580            genome_build: Default::default(),
581            mane_status: ManeStatus::default(),
582            refseq_match: None,
583            ensembl_match: None,
584            exon_cigars: Vec::new(),
585            cached_introns: OnceLock::new(),
586        });
587        // Genomic sequence: 999 N's + "ATGCGCTAA" + 100 N's.
588        let prefix = "N".repeat(999);
589        let suffix = "N".repeat(100);
590        provider.add_genomic_sequence("chr1", format!("{}{}{}", prefix, "ATGCGCTAA", suffix));
591        (projector, provider)
592    }
593
594    fn make_minus_strand_provider_and_projector() -> (Projector, MockProvider) {
595        // Same 9bp CDS on chr1, but transcript is on the minus strand.
596        let mut cdot = CdotMapper::new();
597        cdot.add_transcript(
598            "NM_TEST_MINUS.1".to_string(),
599            CdotTranscript {
600                gene_name: Some("TESTGENE".to_string()),
601                contig: "chr1".to_string(),
602                strand: ProvStrand::Minus,
603                exons: vec![[1000, 1009, 0, 9]],
604                cds_start: Some(0),
605                cds_end: Some(9),
606                gene_id: None,
607                protein: Some("NP_TEST_MINUS.1".to_string()),
608                exon_cigars: Vec::new(),
609            },
610        );
611        let projector = Projector::new(cdot);
612
613        let mut provider = MockProvider::new();
614        provider.add_transcript(Transcript {
615            id: "NM_TEST_MINUS.1".to_string(),
616            gene_symbol: Some("TESTGENE".to_string()),
617            strand: TxStrand::Minus,
618            sequence: Some("ATGCGCTAA".to_string()), // CDS as the transcript reads it
619            cds_start: Some(1),                      // 1-based inclusive per Transcript convention
620            cds_end: Some(9),
621            exons: vec![Exon::new(1, 1, 9)],
622            chromosome: Some("chr1".to_string()),
623            genomic_start: Some(1000),
624            genomic_end: Some(1008),
625            genome_build: Default::default(),
626            mane_status: ManeStatus::default(),
627            refseq_match: None,
628            ensembl_match: None,
629            exon_cigars: Vec::new(),
630            cached_introns: OnceLock::new(),
631        });
632        let prefix = "N".repeat(999);
633        let suffix = "N".repeat(100);
634        provider.add_genomic_sequence("chr1", format!("{}TTAGCGCAT{}", prefix, suffix));
635        (projector, provider)
636    }
637
638    /// Build a two-transcript setup for project_all tests.
639    ///
640    /// NM_TX1.1: chr1 [1000,1009), plus strand, 9bp CDS "ATGCGCTAA"
641    /// NM_TX2.1: chr1 [1000,1009), plus strand, 9bp CDS "ATGCGCTAA" (same region)
642    ///           NM_TX2.1 is registered as MANE Select so it sorts first.
643    fn make_two_transcript_setup() -> (Projector, MockProvider) {
644        let mut cdot = CdotMapper::new();
645        cdot.add_transcript(
646            "NM_TX1.1".to_string(),
647            CdotTranscript {
648                gene_name: Some("GENE1".to_string()),
649                contig: "chr1".to_string(),
650                strand: ProvStrand::Plus,
651                exons: vec![[1000, 1009, 0, 9]],
652                cds_start: Some(0),
653                cds_end: Some(9),
654                gene_id: None,
655                protein: Some("NP_TX1.1".to_string()),
656                exon_cigars: Vec::new(),
657            },
658        );
659        cdot.add_transcript(
660            "NM_TX2.1".to_string(),
661            CdotTranscript {
662                gene_name: Some("GENE1".to_string()),
663                contig: "chr1".to_string(),
664                strand: ProvStrand::Plus,
665                exons: vec![[1000, 1009, 0, 9]],
666                cds_start: Some(0),
667                cds_end: Some(9),
668                gene_id: None,
669                protein: Some("NP_TX2.1".to_string()),
670                exon_cigars: Vec::new(),
671            },
672        );
673        let projector = Projector::new(cdot)
674            // Make NM_TX2.1 the MANE Select so it sorts first.
675            .with_mane(vec!["NM_TX2.1".to_string()], vec![]);
676
677        let mut provider = MockProvider::new();
678        for id in ["NM_TX1.1", "NM_TX2.1"] {
679            provider.add_transcript(Transcript {
680                id: id.to_string(),
681                gene_symbol: Some("GENE1".to_string()),
682                strand: TxStrand::Plus,
683                sequence: Some("ATGCGCTAA".to_string()),
684                cds_start: Some(1),
685                cds_end: Some(9),
686                exons: vec![Exon::new(1, 1, 9)],
687                chromosome: Some("chr1".to_string()),
688                genomic_start: Some(1000),
689                genomic_end: Some(1008),
690                genome_build: Default::default(),
691                mane_status: ManeStatus::default(),
692                refseq_match: None,
693                ensembl_match: None,
694                exon_cigars: Vec::new(),
695                cached_introns: OnceLock::new(),
696            });
697        }
698        let prefix = "N".repeat(999);
699        let suffix = "N".repeat(100);
700        provider.add_genomic_sequence("chr1", format!("{}{}{}", prefix, "ATGCGCTAA", suffix));
701        (projector, provider)
702    }
703
704    // -------------------------------------------------------------------------
705    // Existing tests (preserved)
706    // -------------------------------------------------------------------------
707
708    #[test]
709    fn project_substitution_minus_strand_revcomps_ref_alt() {
710        let (projector, provider) = make_minus_strand_provider_and_projector();
711        let vp = VariantProjector::new(projector, provider);
712        let result = vp
713            .project("chr1:g.1005G>A", "NM_TEST_MINUS.1")
714            .expect("minus-strand projection should succeed");
715        let c = result
716            .coding
717            .as_ref()
718            .expect("c. should be present")
719            .to_string();
720        assert!(
721            c.contains(":c.4C>T"),
722            "expected revcomp c.4C>T for minus strand, got: {}",
723            c
724        );
725        let p = result
726            .protein
727            .as_ref()
728            .expect("p. should be present")
729            .to_string();
730        assert_eq!(p, "NP_TEST_MINUS.1(TESTGENE):p.(Arg2Cys)");
731        assert!(!result.is_frameshift);
732        assert!(!result.is_intronic);
733        assert!(!result.is_utr);
734    }
735
736    fn make_intronic_test_data() -> (Projector, MockProvider) {
737        // Two-exon coding transcript on chr1, plus strand.
738        //
739        //   Exon 1: genome [1000, 1010), tx [0, 10)  — 10 bp
740        //   Intron: genome [1010, 2000)               — 990 bp
741        //   Exon 2: genome [2000, 2010), tx [10, 20) — 10 bp (last 2 are pad)
742        //
743        // CDS: tx [0, 18) → 18 bp → 6 codons: ATG-CGC-AAA-GGG-TAA-CCC
744        //   (Met-Arg-Lys-Gly-Stop-Pro)
745        //
746        // Test position g.1015 is in the intron, 6 bases after the end of exon 1
747        // (exon 1 ends at genome 1010, exclusive; last exonic base is g.1009;
748        //  distance = 1015-1010+1 = 6).  Maps to c.9+6 or c.10+6 depending on
749        // whether tx_end is inclusive or exclusive in the boundary calculation.
750        let mut cdot = CdotMapper::new();
751        cdot.add_transcript(
752            "NM_INTR.1".to_string(),
753            CdotTranscript {
754                gene_name: Some("INTRGENE".to_string()),
755                contig: "chr1".to_string(),
756                strand: ProvStrand::Plus,
757                exons: vec![[1000, 1010, 0, 10], [2000, 2010, 10, 20]],
758                cds_start: Some(0),
759                cds_end: Some(18),
760                gene_id: None,
761                protein: Some("NP_INTR.1".to_string()),
762                exon_cigars: Vec::new(),
763            },
764        );
765        let projector = Projector::new(cdot);
766
767        let mut provider = MockProvider::new();
768        provider.add_transcript(Transcript {
769            id: "NM_INTR.1".to_string(),
770            gene_symbol: Some("INTRGENE".to_string()),
771            strand: TxStrand::Plus,
772            sequence: Some("ATGCGCAAAGGGTAACCC".to_string()), // 18 bp
773            cds_start: Some(1),
774            cds_end: Some(18),
775            exons: vec![Exon::new(1, 1, 10), Exon::new(2, 11, 20)],
776            chromosome: Some("chr1".to_string()),
777            genomic_start: Some(1000),
778            genomic_end: Some(2009),
779            genome_build: Default::default(),
780            mane_status: ManeStatus::default(),
781            refseq_match: None,
782            ensembl_match: None,
783            exon_cigars: Vec::new(),
784            cached_introns: OnceLock::new(),
785        });
786        let exon1 = "ATGCGCAAAG"; // 10 bp (genomic positions 1000..1009)
787        let intron = "N".repeat(990); // genomic positions 1010..1999 (990 bp)
788        let exon2 = "GGTAACCCNN"; // 10 bp pad (genomic positions 2000..2009)
789        let prefix = "N".repeat(999);
790        let suffix = "N".repeat(100);
791        provider.add_genomic_sequence("chr1", format!("{prefix}{exon1}{intron}{exon2}{suffix}"));
792        (projector, provider)
793    }
794
795    #[test]
796    fn project_intronic_substitution_no_protein() {
797        let (projector, provider) = make_intronic_test_data();
798        let vp = VariantProjector::new(projector, provider);
799        let result = vp
800            .project("NC_000001.11:g.1015A>G", "NM_INTR.1")
801            .expect("intronic substitution should project to c. with offset");
802        assert!(result.is_intronic, "expected is_intronic=true");
803        assert!(result.protein.is_none(), "no p. for intronic substitutions");
804        let c = result.coding.as_ref().expect("c. expected").to_string();
805        assert!(
806            c.contains('+'),
807            "expected intronic offset notation (e.g. c.9+6 or c.10+6), got: {}",
808            c
809        );
810    }
811
812    #[test]
813    fn project_no_overlap_returns_transcript_not_overlapping() {
814        let (projector, provider) = make_test_provider_and_projector();
815        let vp = VariantProjector::new(projector, provider);
816        let err = vp
817            .project("NC_000001.11:g.5000A>G", "NM_TEST.1")
818            .expect_err("should fail to project outside the transcript");
819        assert!(
820            matches!(err, FerroError::TranscriptNotOverlapping { .. }),
821            "expected TranscriptNotOverlapping, got: {:?}",
822            err
823        );
824    }
825
826    #[test]
827    fn project_substitution_plus_strand_missense() {
828        let (projector, provider) = make_test_provider_and_projector();
829        let vp = VariantProjector::new(projector, provider);
830
831        let result = vp
832            .project("chr1:g.1003C>A", "NM_TEST.1")
833            .expect("projection should succeed");
834
835        assert_eq!(result.transcript_id, "NM_TEST.1");
836        assert_eq!(result.gene_symbol.as_deref(), Some("TESTGENE"));
837
838        let c = result
839            .coding
840            .as_ref()
841            .expect("c. should be present")
842            .to_string();
843        assert!(c.contains(":c.4C>A"), "expected ':c.4C>A' in '{}' ", c);
844
845        let p = result
846            .protein
847            .as_ref()
848            .expect("p. should be present")
849            .to_string();
850        assert_eq!(p, "NP_TEST.1(TESTGENE):p.(Arg2Ser)");
851
852        assert!(!result.is_frameshift);
853        assert!(!result.is_intronic);
854        assert!(!result.is_utr);
855    }
856
857    #[test]
858    fn project_single_base_deletion_is_frameshift() {
859        let (projector, provider) = make_test_provider_and_projector();
860        let vp = VariantProjector::new(projector, provider);
861        let result = vp
862            .project("NC_000001.11:g.1004del", "NM_TEST.1")
863            .expect("deletion should project");
864        let c = result.coding.as_ref().expect("c. expected").to_string();
865        assert!(c.contains(":c."), "expected c. variant, got: {}", c);
866        assert!(c.contains("del"), "expected del notation in c., got: {}", c);
867        let p = result
868            .protein
869            .as_ref()
870            .expect("p. expected for CDS del")
871            .to_string();
872        assert!(p.contains("Arg2"), "expected Arg2 in p.: {}", p);
873        assert!(p.contains("fs"), "expected fs in p.: {}", p);
874        assert!(result.is_frameshift, "1-base del should be frameshift");
875        assert!(!result.is_intronic);
876    }
877
878    #[test]
879    fn project_three_base_deletion_in_frame() {
880        let (projector, provider) = make_test_provider_and_projector();
881        let vp = VariantProjector::new(projector, provider);
882        let result = vp
883            .project("NC_000001.11:g.1003_1005del", "NM_TEST.1")
884            .expect("3-base del should project");
885        let c = result.coding.as_ref().expect("c. expected").to_string();
886        assert!(c.contains("del"), "expected del notation, got: {}", c);
887        let p = result
888            .protein
889            .as_ref()
890            .expect("p. expected for in-frame del")
891            .to_string();
892        assert!(p.contains("Arg2del"), "expected Arg2del in p.: {}", p);
893        assert!(!result.is_frameshift, "3-base deletion is in-frame");
894    }
895
896    #[test]
897    fn project_single_base_insertion_is_frameshift() {
898        let (projector, provider) = make_test_provider_and_projector();
899        let vp = VariantProjector::new(projector, provider);
900        let result = vp
901            .project("NC_000001.11:g.1003_1004insA", "NM_TEST.1")
902            .expect("insertion should project");
903        let c = result.coding.as_ref().expect("c. expected").to_string();
904        assert!(c.contains("ins"), "expected ins notation, got: {}", c);
905        assert!(result.protein.is_some(), "p. expected for CDS insertion");
906        assert!(result.is_frameshift, "1-base insertion is frameshift");
907    }
908
909    #[test]
910    fn project_duplication() {
911        let (projector, provider) = make_test_provider_and_projector();
912        let vp = VariantProjector::new(projector, provider);
913        let result = vp
914            .project("NC_000001.11:g.1004dup", "NM_TEST.1")
915            .expect("dup should project");
916        let c = result.coding.as_ref().expect("c. expected").to_string();
917        assert!(c.contains("dup"), "expected dup notation, got: {}", c);
918        assert!(result.protein.is_some(), "p. expected for CDS dup");
919        assert!(result.is_frameshift, "1-base dup is frameshift");
920    }
921
922    #[test]
923    fn project_delins() {
924        let (projector, provider) = make_test_provider_and_projector();
925        let vp = VariantProjector::new(projector, provider);
926        let result = vp
927            .project("NC_000001.11:g.1003_1004delinsAT", "NM_TEST.1")
928            .expect("delins should project");
929        let c = result.coding.as_ref().expect("c. expected").to_string();
930        assert!(c.contains("delins"), "expected delins notation, got: {}", c);
931        let p = result
932            .protein
933            .as_ref()
934            .expect("p. expected for delins")
935            .to_string();
936        assert!(p.contains("delins"), "expected delins in p.: {}", p);
937        assert!(
938            !result.is_frameshift,
939            "delins of equal length is not a frameshift"
940        );
941    }
942
943    #[test]
944    fn project_inversion() {
945        let (projector, provider) = make_test_provider_and_projector();
946        let vp = VariantProjector::new(projector, provider);
947        let result = vp
948            .project("NC_000001.11:g.1003_1005inv", "NM_TEST.1")
949            .expect("inv should project");
950        let c = result.coding.as_ref().expect("c. expected").to_string();
951        assert!(c.contains("inv"), "expected inv notation, got: {}", c);
952        let p = result
953            .protein
954            .as_ref()
955            .expect("p. expected for inv")
956            .to_string();
957        assert!(p.contains("Arg2"), "expected Arg2 in p.: {}", p);
958        assert!(p.contains("delins"), "expected delins in p.: {}", p);
959        assert!(p.contains("Ala"), "expected Ala in p.: {}", p);
960        assert!(!result.is_frameshift, "inversion is not a frameshift");
961    }
962
963    fn make_ensembl_provider_and_projector() -> (Projector, MockProvider) {
964        // Same CDS as make_test_provider_and_projector but the transcript ID is
965        // an Ensembl accession (no NM_/XM_ prefix) and cdot.protein is absent.
966        // The projector must NOT fabricate a bogus protein accession via
967        // substring substitution; it should skip protein prediction instead.
968        let mut cdot = CdotMapper::new();
969        cdot.add_transcript(
970            "ENST00000000001.1".to_string(),
971            CdotTranscript {
972                gene_name: Some("TESTGENE".to_string()),
973                contig: "chr1".to_string(),
974                strand: ProvStrand::Plus,
975                exons: vec![[1000, 1009, 0, 9]],
976                cds_start: Some(0),
977                cds_end: Some(9),
978                gene_id: None,
979                protein: None,
980                exon_cigars: Vec::new(),
981            },
982        );
983        let projector = Projector::new(cdot);
984
985        let mut provider = MockProvider::new();
986        provider.add_transcript(Transcript {
987            id: "ENST00000000001.1".to_string(),
988            gene_symbol: Some("TESTGENE".to_string()),
989            strand: TxStrand::Plus,
990            sequence: Some("ATGCGCTAA".to_string()),
991            cds_start: Some(1),
992            cds_end: Some(9),
993            exons: vec![Exon::new(1, 1, 9)],
994            chromosome: Some("chr1".to_string()),
995            genomic_start: Some(1000),
996            genomic_end: Some(1008),
997            genome_build: Default::default(),
998            mane_status: ManeStatus::default(),
999            refseq_match: None,
1000            ensembl_match: None,
1001            exon_cigars: Vec::new(),
1002            cached_introns: OnceLock::new(),
1003        });
1004        let prefix = "N".repeat(999);
1005        let suffix = "N".repeat(100);
1006        provider.add_genomic_sequence("chr1", format!("{}{}{}", prefix, "ATGCGCTAA", suffix));
1007        (projector, provider)
1008    }
1009
1010    #[test]
1011    fn project_substitution_no_protein_accession_skips_protein() {
1012        let (projector, provider) = make_ensembl_provider_and_projector();
1013        let vp = VariantProjector::new(projector, provider);
1014        let result = vp
1015            .project("chr1:g.1003C>A", "ENST00000000001.1")
1016            .expect("projection should succeed");
1017        let c = result.coding.as_ref().expect("c. expected").to_string();
1018        assert!(c.contains(":c.4C>A"), "expected c.4C>A, got: {}", c);
1019        // Neither NM_/XM_ prefix nor cdot.protein: must skip protein prediction.
1020        assert!(
1021            result.protein.is_none(),
1022            "expected no protein for accession with no NM_/XM_ prefix and no cdot.protein, got: {:?}",
1023            result.protein
1024        );
1025    }
1026
1027    // -------------------------------------------------------------------------
1028    // project_normalized tests
1029    // -------------------------------------------------------------------------
1030
1031    #[test]
1032    fn project_normalized_same_result_as_project_variant() {
1033        let (projector, provider) = make_test_provider_and_projector();
1034        let vp = VariantProjector::new(projector, provider);
1035
1036        let variant = crate::parse_hgvs("chr1:g.1003C>A").expect("parse should succeed");
1037        let via_project = vp
1038            .project_variant(&variant, "NM_TEST.1")
1039            .expect("project_variant should succeed");
1040        // Pre-normalize via the cached normalizer, then call project_normalized.
1041        let normalized = vp.normalizer.normalize(&variant).expect("normalize failed");
1042        let via_normalized = vp
1043            .project_normalized(&normalized, "NM_TEST.1")
1044            .expect("project_normalized should succeed");
1045
1046        assert_eq!(
1047            via_project.coding.as_ref().map(|v| v.to_string()),
1048            via_normalized.coding.as_ref().map(|v| v.to_string()),
1049            "project_normalized should produce same c. as project_variant"
1050        );
1051        assert_eq!(
1052            via_project.protein.as_ref().map(|v| v.to_string()),
1053            via_normalized.protein.as_ref().map(|v| v.to_string()),
1054            "project_normalized should produce same p. as project_variant"
1055        );
1056    }
1057
1058    // -------------------------------------------------------------------------
1059    // project_all / project_variant_all tests
1060    // -------------------------------------------------------------------------
1061
1062    #[test]
1063    fn project_variant_all_returns_both_transcripts() {
1064        let (projector, provider) = make_two_transcript_setup();
1065        let vp = VariantProjector::new(projector, provider);
1066
1067        let results = vp
1068            .project_all("chr1:g.1003C>A")
1069            .expect("project_all should succeed");
1070
1071        assert_eq!(
1072            results.len(),
1073            2,
1074            "expected projections onto both transcripts, got: {}",
1075            results.len()
1076        );
1077    }
1078
1079    #[test]
1080    fn project_all_mane_select_sorts_first() {
1081        let (projector, provider) = make_two_transcript_setup();
1082        let vp = VariantProjector::new(projector, provider);
1083
1084        let results = vp
1085            .project_all("chr1:g.1003C>A")
1086            .expect("project_all should succeed");
1087
1088        // NM_TX2.1 is MANE Select → must be first.
1089        assert_eq!(
1090            results[0].transcript_id, "NM_TX2.1",
1091            "MANE Select should be first"
1092        );
1093    }
1094
1095    #[test]
1096    fn project_all_no_overlap_returns_empty() {
1097        let (projector, provider) = make_two_transcript_setup();
1098        let vp = VariantProjector::new(projector, provider);
1099
1100        // g.5000 is far outside all transcripts.
1101        let results = vp
1102            .project_all("NC_000001.11:g.5000A>G")
1103            .expect("project_all should return Ok for no overlaps");
1104
1105        assert!(
1106            results.is_empty(),
1107            "expected empty result for non-overlapping variant"
1108        );
1109    }
1110
1111    // -------------------------------------------------------------------------
1112    // Allele compound projection tests
1113    // -------------------------------------------------------------------------
1114
1115    /// Helper: build a cis allele `[chr1:g.1003C>A;chr1:g.1006T>A]` and
1116    /// project it onto NM_TEST.1.
1117    fn project_cis_allele() -> VariantProjection {
1118        let (projector, provider) = make_test_provider_and_projector();
1119        let vp = VariantProjector::new(projector, provider);
1120
1121        // Use parse → project_variant to build the allele naturally.
1122        // g.1003 is C (c.4) and g.1006 is T (c.7) in the test fixture
1123        // "ATGCGCTAA"; using the correct ref bases keeps the test
1124        // valid HGVS rather than just exercising control flow.
1125        let v1 = crate::parse_hgvs("chr1:g.1003C>A").expect("v1 parse");
1126        let v2 = crate::parse_hgvs("chr1:g.1006T>A").expect("v2 parse");
1127        let allele = HgvsVariant::Allele(AlleleVariant::cis(vec![v1, v2]));
1128        vp.project_variant(&allele, "NM_TEST.1")
1129            .expect("cis allele projection should succeed")
1130    }
1131
1132    #[test]
1133    fn project_cis_allele_produces_cis_coding_allele() {
1134        let result = project_cis_allele();
1135        let coding = result.coding.as_ref().expect("c. allele should be present");
1136        // The coding variant should itself be an Allele.
1137        assert!(
1138            matches!(coding, HgvsVariant::Allele(av) if av.phase == AllelePhase::Cis),
1139            "expected Cis coding allele, got: {}",
1140            coding
1141        );
1142        // Two inner c. variants.
1143        if let HgvsVariant::Allele(av) = coding {
1144            assert_eq!(
1145                av.variants.len(),
1146                2,
1147                "expected 2 inner c. variants in cis allele"
1148            );
1149        }
1150    }
1151
1152    #[test]
1153    fn project_trans_allele_preserves_trans_phase() {
1154        let (projector, provider) = make_test_provider_and_projector();
1155        let vp = VariantProjector::new(projector, provider);
1156
1157        // Ref bases match the fixture: g.1003 = C, g.1006 = T.
1158        let v1 = crate::parse_hgvs("chr1:g.1003C>A").expect("v1 parse");
1159        let v2 = crate::parse_hgvs("chr1:g.1006T>A").expect("v2 parse");
1160        let allele = HgvsVariant::Allele(AlleleVariant::trans(vec![v1, v2]));
1161        let result = vp
1162            .project_variant(&allele, "NM_TEST.1")
1163            .expect("trans allele projection should succeed");
1164
1165        let coding = result.coding.as_ref().expect("c. expected");
1166        assert!(
1167            matches!(coding, HgvsVariant::Allele(av) if av.phase == AllelePhase::Trans),
1168            "expected Trans coding allele, got: {}",
1169            coding
1170        );
1171    }
1172
1173    #[test]
1174    fn project_allele_with_frameshift_inner_sets_is_frameshift() {
1175        let (projector, provider) = make_test_provider_and_projector();
1176        let vp = VariantProjector::new(projector, provider);
1177
1178        // g.1004del is a 1-base deletion → frameshift.
1179        let v1 = crate::parse_hgvs("chr1:g.1003C>A").expect("v1 parse");
1180        let v_fs = crate::parse_hgvs("NC_000001.11:g.1004del").expect("fs parse");
1181        let allele = HgvsVariant::Allele(AlleleVariant::cis(vec![v1, v_fs]));
1182        let result = vp
1183            .project_variant(&allele, "NM_TEST.1")
1184            .expect("allele with frameshift should project");
1185
1186        assert!(
1187            result.is_frameshift,
1188            "allele containing a frameshift inner variant should set is_frameshift=true"
1189        );
1190    }
1191
1192    #[test]
1193    fn project_allele_with_non_protein_inner_has_no_protein() {
1194        // An allele where one inner variant has no protein (intronic) →
1195        // the whole allele protein should be None.
1196        let (projector, provider) = make_intronic_test_data();
1197        let vp = VariantProjector::new(projector, provider);
1198
1199        // g.1003 (exonic, ref C in NM_INTR.1 fixture) + g.1015 (intronic, ref N
1200        // since g.1015 is in the intron gap — the projector doesn't validate
1201        // intronic ref bases against the genomic reference).
1202        let v_exon = crate::parse_hgvs("NC_000001.11:g.1003C>G").expect("exon parse");
1203        let v_intron = crate::parse_hgvs("NC_000001.11:g.1015N>G").expect("intron parse");
1204        let allele = HgvsVariant::Allele(AlleleVariant::cis(vec![v_exon, v_intron]));
1205        let result = vp
1206            .project_variant(&allele, "NM_INTR.1")
1207            .expect("allele with intronic variant should project");
1208
1209        assert!(
1210            result.protein.is_none(),
1211            "allele with intronic inner variant should have no protein"
1212        );
1213        assert!(result.is_intronic, "should be marked intronic");
1214    }
1215
1216    #[test]
1217    fn project_empty_allele_unknown_transcript_returns_reference_not_found() {
1218        // Regression: the empty-allele fast path used to silently return Ok(...)
1219        // for any transcript_id, even one not present in the cdot mapper. That
1220        // was inconsistent with every other projection path, which surfaces
1221        // ReferenceNotFound for unknown accessions.
1222        let (projector, provider) = make_test_provider_and_projector();
1223        let vp = VariantProjector::new(projector, provider);
1224        let empty_allele = HgvsVariant::Allele(AlleleVariant::cis(vec![]));
1225        let err = vp
1226            .project_variant(&empty_allele, "NM_DOES_NOT_EXIST.1")
1227            .expect_err("empty allele on unknown transcript should error");
1228        assert!(
1229            matches!(err, FerroError::ReferenceNotFound { ref id } if id == "NM_DOES_NOT_EXIST.1"),
1230            "expected ReferenceNotFound for unknown transcript, got: {:?}",
1231            err
1232        );
1233    }
1234}