Skip to main content

vareffect/consequence/
snv.rs

1//! SNV (single-nucleotide variant) consequence annotation.
2
3use super::helpers::{
4    build_alt_codon, build_base_result, compute_cdna_position_exonic,
5    compute_cdna_position_for_cds, fetch_ref_codon, finalize_consequences,
6    is_incomplete_terminal_codon,
7};
8use super::{Consequence, ConsequenceResult};
9use crate::codon::{complement, format_amino_acids, format_codons, translate_codon_for_transcript};
10use crate::error::VarEffectError;
11use crate::fasta::FastaReader;
12use crate::hgvs_c;
13use crate::locate::{
14    LocateIndex, VariantLocation, format_exon_number, format_intron_number, locate_variant,
15};
16use crate::types::{Strand, TranscriptModel, TranscriptTier};
17
18/// Annotate a single-nucleotide variant against one transcript.
19///
20/// `ref_base` and `alt_base` are on the **plus strand** (VCF convention).
21/// For minus-strand transcripts, internal complementing is performed to
22/// derive the coding-strand alleles.
23///
24/// # Arguments
25///
26/// * `chrom` -- UCSC-style chromosome name (e.g., `"chr17"`)
27/// * `pos` -- 0-based genomic position
28/// * `ref_base` -- VCF REF base (uppercase ASCII)
29/// * `alt_base` -- VCF ALT base (uppercase ASCII)
30/// * `transcript` -- Transcript model to annotate against
31/// * `locate_index` -- Precomputed locate index for the transcript
32/// * `fasta` -- Reference FASTA reader for codon extraction and ref
33///   verification
34///
35/// # Returns
36///
37/// A [`ConsequenceResult`] with consequence terms, amino acids, codons,
38/// and IMPACT. HGVS fields (`hgvs_c`, `hgvs_p`) are left as `None`.
39///
40/// # Errors
41///
42/// - [`VarEffectError::RefMismatch`] if the VCF REF base does not match
43///   the reference FASTA
44/// - [`VarEffectError::ChromNotFound`] if the chromosome is not in the
45///   FASTA index
46/// - [`VarEffectError::CoordinateOutOfRange`] if the position exceeds
47///   chromosome length
48pub fn annotate_snv(
49    chrom: &str,
50    pos: u64,
51    ref_base: u8,
52    alt_base: u8,
53    transcript: &TranscriptModel,
54    locate_index: &LocateIndex,
55    fasta: &FastaReader,
56) -> Result<ConsequenceResult, VarEffectError> {
57    if !fasta.verify_ref(chrom, pos, &[ref_base])? {
58        let expected = fasta.fetch_base(chrom, pos)?;
59        return Err(VarEffectError::RefMismatch {
60            chrom: chrom.to_string(),
61            pos,
62            expected: String::from(expected as char),
63            got: String::from(ref_base as char),
64        });
65    }
66    annotate_snv_verified(
67        chrom,
68        pos,
69        ref_base,
70        alt_base,
71        transcript,
72        locate_index,
73        fasta,
74    )
75}
76
77/// Internal SNV annotator that skips REF verification. Called by the
78/// [`super::annotate`] dispatcher after it has already verified the full
79/// VCF REF, avoiding a redundant FASTA seek per transcript.
80pub(super) fn annotate_snv_verified(
81    chrom: &str,
82    pos: u64,
83    ref_base: u8,
84    alt_base: u8,
85    transcript: &TranscriptModel,
86    locate_index: &LocateIndex,
87    fasta: &FastaReader,
88) -> Result<ConsequenceResult, VarEffectError> {
89    let location = locate_variant(chrom, pos, pos + 1, transcript, locate_index)?;
90    let hgvs_c =
91        hgvs_c::format_snv_hgvs(pos, ref_base, alt_base, &location, transcript, locate_index)?;
92
93    let mut result = match location {
94        VariantLocation::CdsExon {
95            exon_index,
96            cds_offset,
97            codon_number,
98            codon_position,
99            is_splice_region,
100            ..
101        } => annotate_cds_snv(
102            chrom,
103            alt_base,
104            transcript,
105            locate_index,
106            fasta,
107            exon_index,
108            cds_offset,
109            codon_number,
110            codon_position,
111            is_splice_region,
112        ),
113
114        VariantLocation::SpliceDonor { intron_index, .. } => {
115            let mut result = build_base_result(transcript, vec![Consequence::SpliceDonorVariant]);
116            result.intron = Some(format_intron_number(intron_index, transcript.exon_count));
117            Ok(result)
118        }
119
120        VariantLocation::SpliceAcceptor { intron_index, .. } => {
121            let mut result =
122                build_base_result(transcript, vec![Consequence::SpliceAcceptorVariant]);
123            result.intron = Some(format_intron_number(intron_index, transcript.exon_count));
124            Ok(result)
125        }
126
127        VariantLocation::SpliceRegion { intron_index, .. } => {
128            let mut result = build_base_result(
129                transcript,
130                vec![Consequence::SpliceRegionVariant, Consequence::IntronVariant],
131            );
132            result.intron = Some(format_intron_number(intron_index, transcript.exon_count));
133            Ok(result)
134        }
135
136        VariantLocation::Intron { intron_index, .. } => {
137            let mut result = build_base_result(transcript, vec![Consequence::IntronVariant]);
138            result.intron = Some(format_intron_number(intron_index, transcript.exon_count));
139            Ok(result)
140        }
141
142        VariantLocation::FivePrimeUtr {
143            exon_index,
144            is_splice_region,
145            ..
146        } => {
147            let mut consequences = vec![Consequence::FivePrimeUtrVariant];
148            if is_splice_region {
149                consequences.push(Consequence::SpliceRegionVariant);
150            }
151            let mut result = build_base_result(transcript, consequences);
152            result.exon = Some(format_exon_number(exon_index, transcript.exon_count));
153            result.cdna_position = compute_cdna_position_exonic(pos, transcript);
154            result.cdna_position_end = result.cdna_position;
155            Ok(result)
156        }
157
158        VariantLocation::ThreePrimeUtr {
159            exon_index,
160            is_splice_region,
161            ..
162        } => {
163            let mut consequences = vec![Consequence::ThreePrimeUtrVariant];
164            if is_splice_region {
165                consequences.push(Consequence::SpliceRegionVariant);
166            }
167            let mut result = build_base_result(transcript, consequences);
168            result.exon = Some(format_exon_number(exon_index, transcript.exon_count));
169            result.cdna_position = compute_cdna_position_exonic(pos, transcript);
170            result.cdna_position_end = result.cdna_position;
171            Ok(result)
172        }
173
174        VariantLocation::Upstream { .. } => Ok(build_base_result(
175            transcript,
176            vec![Consequence::UpstreamGeneVariant],
177        )),
178
179        VariantLocation::Downstream { .. } => Ok(build_base_result(
180            transcript,
181            vec![Consequence::DownstreamGeneVariant],
182        )),
183
184        VariantLocation::NonCodingExon {
185            exon_index,
186            is_splice_region,
187        } => {
188            let mut consequences = vec![Consequence::NonCodingTranscriptExonVariant];
189            if is_splice_region {
190                consequences.push(Consequence::SpliceRegionVariant);
191            }
192            let mut result = build_base_result(transcript, consequences);
193            result.exon = Some(format_exon_number(exon_index, transcript.exon_count));
194            result.cdna_position = compute_cdna_position_exonic(pos, transcript);
195            result.cdna_position_end = result.cdna_position;
196            Ok(result)
197        }
198
199        VariantLocation::NonCodingIntron { intron_index, .. } => {
200            let mut result = build_base_result(transcript, vec![Consequence::IntronVariant]);
201            result.intron = Some(format_intron_number(intron_index, transcript.exon_count));
202            Ok(result)
203        }
204
205        VariantLocation::Distal => Err(VarEffectError::Malformed(
206            "variant is distal to transcript -- callers should filter by overlap first".to_string(),
207        )),
208    }?;
209
210    result.hgvs_c = hgvs_c;
211
212    // Stop_lost: compute the actual extension distance via a 3'UTR scan.
213    if result.consequences.contains(&Consequence::StopLost) {
214        if let Some(ref aa_str) = result.amino_acids
215            && let Some((_, alt_str)) = aa_str.split_once('/')
216            && let Some(&alt_byte) = alt_str.as_bytes().first()
217        {
218            result.hgvs_p = Some(crate::hgvs_p::format_hgvs_p_extension(
219                alt_byte,
220                result
221                    .protein_start
222                    .expect("protein_start always set for coding SNV"),
223                chrom,
224                transcript,
225                locate_index,
226                fasta,
227            )?);
228        }
229    } else {
230        result.hgvs_p = crate::hgvs_p::format_hgvs_p_snv(
231            &result.consequences,
232            result.amino_acids.as_deref(),
233            result.protein_start,
234        );
235    }
236    Ok(result)
237}
238
239/// Annotate a CDS exon SNV: codon extraction, translation, AA comparison.
240#[allow(clippy::too_many_arguments)]
241fn annotate_cds_snv(
242    chrom: &str,
243    alt_base: u8,
244    transcript: &TranscriptModel,
245    index: &LocateIndex,
246    fasta: &FastaReader,
247    exon_index: u16,
248    cds_offset: u32,
249    codon_number: u32,
250    codon_position: u8,
251    is_splice_region: bool,
252) -> Result<ConsequenceResult, VarEffectError> {
253    let codon_start_offset = cds_offset - codon_position as u32;
254
255    if is_incomplete_terminal_codon(cds_offset, index) {
256        let consequences = vec![Consequence::IncompleteTerminalCodonVariant];
257        let mut result = build_base_result(transcript, consequences);
258        result.exon = Some(format_exon_number(exon_index, transcript.exon_count));
259        result.cds_position = Some(cds_offset + 1);
260        result.cds_position_end = Some(cds_offset + 1);
261        result.cdna_position = Some(compute_cdna_position_for_cds(cds_offset, index));
262        result.cdna_position_end = result.cdna_position;
263        return Ok(result);
264    }
265
266    let ref_codon = fetch_ref_codon(codon_start_offset, chrom, transcript, index, fasta)?;
267
268    // Complement the VCF ALT for minus-strand to get coding-strand base
269    let coding_alt = match transcript.strand {
270        Strand::Plus => alt_base,
271        Strand::Minus => complement(alt_base),
272    };
273    let alt_codon = build_alt_codon(&ref_codon, codon_position, coding_alt);
274
275    let is_mito = transcript.chrom == "chrM";
276    let ref_aa = translate_codon_for_transcript(&ref_codon, is_mito);
277    let alt_aa = translate_codon_for_transcript(&alt_codon, is_mito);
278
279    // Ambiguous codons (ref contains N)
280    if ref_aa == b'X' || alt_aa == b'X' {
281        let mut consequences = vec![Consequence::CodingSequenceVariant];
282        if is_splice_region {
283            consequences.push(Consequence::SpliceRegionVariant);
284        }
285        let mut result = build_base_result(transcript, consequences);
286        result.exon = Some(format_exon_number(exon_index, transcript.exon_count));
287        result.cds_position = Some(cds_offset + 1);
288        result.cds_position_end = Some(cds_offset + 1);
289        result.cdna_position = Some(compute_cdna_position_for_cds(cds_offset, index));
290        result.cdna_position_end = Some(compute_cdna_position_for_cds(cds_offset, index));
291        return Ok(result);
292    }
293
294    let coding_consequence = if ref_aa == alt_aa {
295        if codon_number == 1 && ref_aa == b'M' {
296            Consequence::StartRetainedVariant
297        } else if ref_aa == b'*' {
298            Consequence::StopRetainedVariant
299        } else {
300            Consequence::SynonymousVariant
301        }
302    } else if alt_aa == b'*' {
303        Consequence::StopGained
304    } else if ref_aa == b'*' {
305        Consequence::StopLost
306    } else if codon_number == 1 && ref_aa == b'M' {
307        Consequence::StartLost
308    } else {
309        Consequence::MissenseVariant
310    };
311
312    let mut consequences = vec![coding_consequence];
313    if is_splice_region {
314        consequences.push(Consequence::SpliceRegionVariant);
315    }
316    let impact = finalize_consequences(&mut consequences);
317    let predicts_nmd = coding_consequence == Consequence::StopGained
318        && super::nmd::predicts_nmd(cds_offset + 1, index);
319
320    Ok(ConsequenceResult {
321        transcript: transcript.accession.clone(),
322        gene_symbol: transcript.gene_symbol.clone(),
323        protein_accession: transcript.protein_accession.clone(),
324        consequences,
325        impact,
326        protein_start: Some(codon_number),
327        protein_end: Some(codon_number),
328        codons: Some(format_codons(&ref_codon, &alt_codon, codon_position)),
329        amino_acids: Some(format_amino_acids(ref_aa, alt_aa)),
330        exon: Some(format_exon_number(exon_index, transcript.exon_count)),
331        intron: None,
332        cds_position: Some(cds_offset + 1),
333        cds_position_end: Some(cds_offset + 1),
334        cdna_position: Some(compute_cdna_position_for_cds(cds_offset, index)),
335        cdna_position_end: Some(compute_cdna_position_for_cds(cds_offset, index)),
336        strand: transcript.strand,
337        biotype: transcript.biotype.clone(),
338        is_mane_select: transcript.tier == TranscriptTier::ManeSelect,
339        is_mane_plus_clinical: transcript.tier == TranscriptTier::ManePlusClinical,
340        is_refseq_select: transcript.tier == TranscriptTier::RefSeqSelect,
341        hgvs_c: None,
342        hgvs_p: None,
343        predicts_nmd,
344    })
345}