Skip to main content

vareffect/
types.rs

//! Runtime types for the transcript model store.
//!
//! All coordinates use the **0-based, half-open** convention (BED/UCSC style).
//! The GFF3 parser in `vareffect-cli` converts NCBI's 1-based fully-closed
//! coordinates on ingest, so consumers of this crate never see 1-based indices.
//!
//! Every type in this module round-trips through MessagePack via
//! [`serde::Serialize`]/[`serde::Deserialize`]. [`Biotype`] has a hand-written
//! `Serialize`/`Deserialize` so the on-disk format stays a single flat string
//! and upstream labels without a dedicated variant (e.g. `misc_RNA`, or any
//! future biotype) survive as [`Biotype::Other`] without schema changes.
12
13use serde::{Deserialize, Serialize};
14
15/// Transcript strand orientation relative to the reference genome.
16///
17/// Marked `#[non_exhaustive]` so future assemblies with unknown / ambiguous
18/// strand annotations can extend this enum without a SemVer break.
19#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
20#[non_exhaustive]
21pub enum Strand {
22    /// Plus strand (5'→3' runs in the direction of increasing genomic coordinate).
23    Plus,
24    /// Minus strand (5'→3' runs in the direction of decreasing genomic coordinate).
25    Minus,
26}
27
28/// Curation tier that produced a transcript model.
29///
30/// `ManeSelect` and `ManePlusClinical` are the primary clinical tiers;
31/// `RefSeqSelect` is provided as a fallback for genes without MANE coverage.
32/// `#[non_exhaustive]` leaves room for future tiers (Ensembl Canonical,
33/// CCDS-only, …) without breaking downstream matches.
34#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
35#[non_exhaustive]
36pub enum TranscriptTier {
37    /// NCBI/Ensembl jointly-curated MANE Select transcript (the default
38    /// clinical reference isoform for each gene).
39    ManeSelect,
40    /// MANE Plus Clinical — a second isoform curated for clinically
41    /// actionable variants not captured on the MANE Select isoform.
42    ManePlusClinical,
43    /// RefSeq Select — NCBI's fallback canonical transcript for genes
44    /// without MANE coverage.
45    RefSeqSelect,
46}
47
48/// A single exon within a [`TranscriptModel`].
49///
50/// Exons inside a `TranscriptModel` are ordered 5'→3' on the *transcript*,
51/// which for minus-strand genes is the reverse of genomic order.
52#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
53pub struct Exon {
54    /// 1-based exon number, counting from the 5' end of the transcript.
55    /// `u16` is sufficient — the largest known human transcript (TTN) has
56    /// ~363 exons, far below the 65,535 ceiling.
57    pub exon_number: u16,
58    /// Genomic start coordinate, 0-based inclusive.
59    pub genomic_start: u64,
60    /// Genomic end coordinate, 0-based exclusive (half-open).
61    pub genomic_end: u64,
62}
63
64/// A single CDS segment within a [`TranscriptModel`].
65///
66/// One `CdsSegment` corresponds to one GFF3 `CDS` row. Segments are ordered
67/// 5'→3' on the *transcript* (reversed for minus-strand genes), matching the
68/// `exons` vector on [`TranscriptModel`]. The per-segment [`phase`] captures
69/// the reading-frame offset that VEP needs for frameshift detection and p. HGVS
70/// notation across exon boundaries.
71///
72/// `exon_index` is the 0-based index into `TranscriptModel::exons` of the exon
73/// that contains this CDS segment — this lets downstream code walk codons
74/// without re-scanning the exon vector for every CDS row.
75///
76/// [`phase`]: Self::phase
77#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
78pub struct CdsSegment {
79    /// 0-based index into `TranscriptModel::exons` of the containing exon.
80    pub exon_index: u16,
81    /// Genomic start coordinate, 0-based inclusive.
82    pub genomic_start: u64,
83    /// Genomic end coordinate, 0-based exclusive (half-open).
84    pub genomic_end: u64,
85    /// GFF3 column-8 phase: `0`, `1`, or `2` — the number of bases at the
86    /// *transcript-5'* end of this CDS segment that belong to the final codon
87    /// of the previous segment. A missing GFF3 phase (`.`) is normalized to
88    /// `0` at build time. Any value > 2 is rejected as malformed input.
89    pub phase: u8,
90}
91
/// Biotype assigned to a transcript.
///
/// Labels this crate knows about are modelled as dedicated variants so they
/// can be matched on safely. Any other upstream label (say, a `Y_RNA` that
/// MANE starts shipping later) is kept untouched inside [`Biotype::Other`],
/// which lets it pass through the MessagePack store with no code changes.
///
/// On disk a biotype is always one flat string: the canonical NCBI/Ensembl
/// label for a known variant, or the raw upstream label for `Other`. The
/// `Serialize`/`Deserialize` impls for this type are written by hand instead
/// of relying on `#[serde(untagged)]` precisely to pin that flat wire format.
///
/// Prefer [`Biotype::from_label`] over building `Other` by hand: `from_label`
/// only yields `Other` for labels that have no dedicated variant, whereas a
/// hand-built `Other("snoRNA")` would come back as `SnoRna` after a
/// label round-trip.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Biotype {
    /// Protein-coding mRNA (`NM_*` accessions).
    ProteinCoding,
    /// Generic non-coding RNA; the default for `NR_*` accessions that carry
    /// no more specific gene-level biotype.
    NonCodingRna,
    /// Long non-coding RNA.
    LncRna,
    /// Antisense RNA.
    AntisenseRna,
    /// Small nucleolar RNA.
    SnoRna,
    /// Small nuclear RNA.
    SnRna,
    /// RNase MRP RNA.
    RnaseMrpRna,
    /// Telomerase RNA component.
    TelomeraseRna,
    /// Vault RNA.
    VaultRna,
    /// An upstream biotype label this crate has no variant for. The raw
    /// string is stored verbatim so new gene types need no schema change.
    Other(String),
    /// Neither the accession prefix nor a gene-level `gene_biotype`
    /// attribute provided a biotype at build time.
    Unknown,
}

impl Biotype {
    /// Canonical string label used for this biotype in the on-disk store.
    ///
    /// Known variants yield a fixed label; [`Biotype::Other`] yields the raw
    /// label it wraps.
    pub fn as_str(&self) -> &str {
        match self {
            Biotype::Other(raw) => raw.as_str(),
            Biotype::Unknown => "unknown",
            Biotype::ProteinCoding => "protein_coding",
            Biotype::NonCodingRna => "non_coding_rna",
            Biotype::LncRna => "lncRNA",
            Biotype::AntisenseRna => "antisense_RNA",
            Biotype::SnoRna => "snoRNA",
            Biotype::SnRna => "snRNA",
            Biotype::RnaseMrpRna => "RNase_MRP_RNA",
            Biotype::TelomeraseRna => "telomerase_RNA",
            Biotype::VaultRna => "vault_RNA",
        }
    }

    /// Build a biotype from its canonical string label.
    ///
    /// This never fails. The empty string and `"unknown"` both map to
    /// [`Biotype::Unknown`]; a label with no dedicated variant is wrapped,
    /// unmodified, in [`Biotype::Other`] so the store can round-trip whatever
    /// MANE/RefSeq ships today or tomorrow. Matching is exact and
    /// case-sensitive.
    pub fn from_label(label: &str) -> Self {
        // Both "no label" spellings collapse to the same variant.
        if label.is_empty() || label == "unknown" {
            return Biotype::Unknown;
        }

        match label {
            "protein_coding" => Biotype::ProteinCoding,
            "non_coding_rna" => Biotype::NonCodingRna,
            "lncRNA" => Biotype::LncRna,
            "antisense_RNA" => Biotype::AntisenseRna,
            "snoRNA" => Biotype::SnoRna,
            "snRNA" => Biotype::SnRna,
            "RNase_MRP_RNA" => Biotype::RnaseMrpRna,
            "telomerase_RNA" => Biotype::TelomeraseRna,
            "vault_RNA" => Biotype::VaultRna,
            unrecognized => Biotype::Other(unrecognized.to_owned()),
        }
    }

    /// Whether this biotype is exactly [`Biotype::ProteinCoding`].
    ///
    /// A small convenience for downstream filters.
    pub fn is_protein_coding(&self) -> bool {
        *self == Biotype::ProteinCoding
    }
}
177
178impl Serialize for Biotype {
179    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
180        serializer.serialize_str(self.as_str())
181    }
182}
183
184impl<'de> Deserialize<'de> for Biotype {
185    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
186        // Deserialize as an owned `String` then map into the enum. Avoids the
187        // lifetime dance of `&str` for formats that might not hand out borrows
188        // (rmp-serde does, but JSON with escapes does not).
189        let raw = String::deserialize(deserializer)?;
190        Ok(Self::from_label(&raw))
191    }
192}
193
194/// A canonical transcript model sourced from MANE or RefSeq Select.
195///
196/// Every coordinate is 0-based half-open. Accessions include the version
197/// suffix (e.g., `"NM_006772.2"`) — version-less lookup is intentionally out
198/// of scope; resolving an unversioned HGVS string to "the latest" belongs
199/// with the consumer, not the raw store.
200#[derive(Debug, Clone, Serialize, Deserialize)]
201pub struct TranscriptModel {
202    /// RefSeq transcript accession including version, e.g. `"NM_006772.2"`.
203    pub accession: String,
204    /// RefSeq protein accession including version, e.g. `"NP_006763.2"`.
205    /// `None` for non-coding transcripts (`NR_*` accessions).
206    pub protein_accession: Option<String>,
207    /// HGNC-approved gene symbol, e.g. `"SYNGAP1"`.
208    pub gene_symbol: String,
209    /// HGNC identifier including the `HGNC:` prefix, e.g. `"HGNC:11497"`.
210    /// `None` if the parent gene row did not carry an HGNC cross-reference.
211    pub hgnc_id: Option<String>,
212    /// Ensembl transcript accession with version, e.g. `"ENST00000418600.6"`.
213    /// `None` if the Dbxref did not list an Ensembl cross-reference.
214    pub ensembl_accession: Option<String>,
215    /// Chromosome name in UCSC style (`"chr1"`, …, `"chrX"`, `"chrY"`, `"chrM"`).
216    /// For transcripts on GRCh38 patch sequences, this field holds the UCSC
217    /// contig name exactly as published by MANE GFF3 column 1 (e.g.
218    /// `"chr9_KN196479v1_fix"`, `"chr22_KI270879v1_alt"`). Against an NCBI
219    /// RefSeq FASTA, patch lookups work only when
220    /// [`crate::FastaReader::open_with_patch_aliases`] is supplied a
221    /// `patch_chrom_aliases.csv` that maps the UCSC form back to the
222    /// matching `NW_*`/`NT_*` RefSeq accession.
223    pub chrom: String,
224    /// Transcript strand orientation.
225    pub strand: Strand,
226    /// Genomic start of the entire transcript (0-based, inclusive).
227    /// Includes 5' and 3' UTRs.
228    pub tx_start: u64,
229    /// Genomic end of the entire transcript (0-based, exclusive).
230    /// Includes 5' and 3' UTRs.
231    pub tx_end: u64,
232    /// Lowest genomic coordinate of any CDS segment (0-based, inclusive).
233    ///
234    /// **This is a genomic interval span, not a transcript-relative start.**
235    /// For a minus-strand transcript, this is the 3' end of the protein in
236    /// transcript order. Use [`cds_segments`](Self::cds_segments) when you
237    /// need the true 5' coding start or per-exon CDS bounds. `None` for
238    /// non-coding transcripts.
239    pub cds_genomic_start: Option<u64>,
240    /// Highest genomic coordinate of any CDS segment (0-based, exclusive).
241    /// See [`cds_genomic_start`](Self::cds_genomic_start) for the interval
242    /// semantics caveat. `None` for non-coding transcripts.
243    pub cds_genomic_end: Option<u64>,
244    /// Exons ordered 5'→3' on the *transcript*.
245    ///
246    /// Invariant: for plus-strand transcripts, `exons[i].genomic_start <
247    /// exons[i+1].genomic_start`; for minus-strand transcripts, the inverse
248    /// holds (exon 1 has the highest genomic coordinates).
249    pub exons: Vec<Exon>,
250    /// CDS segments ordered 5'→3' on the *transcript*, matching
251    /// [`exons`](Self::exons). Empty for non-coding transcripts.
252    ///
253    /// Each segment carries the containing exon index (for O(1) exon lookup
254    /// without re-scanning) and the GFF3 column-8 phase (needed for codon
255    /// walking across exon boundaries and frameshift detection).
256    pub cds_segments: Vec<CdsSegment>,
257    /// MANE / RefSeq Select curation tier.
258    pub tier: TranscriptTier,
259    /// Biotype, either a known variant or a raw upstream label preserved
260    /// verbatim via [`Biotype::Other`].
261    pub biotype: Biotype,
262    /// Total number of exons — always equal to `exons.len()`.
263    ///
264    /// Stored for O(1) access without traversing the vec. `u16` suffices
265    /// (TTN, the longest known, is ~363).
266    pub exon_count: u16,
267}
268
#[cfg(test)]
mod tests {
    use super::*;

    /// Encode `value` as named MessagePack and immediately decode it again.
    fn msgpack_roundtrip(value: &Biotype) -> Biotype {
        let bytes = rmp_serde::to_vec_named(value).unwrap();
        rmp_serde::from_slice(&bytes).unwrap()
    }

    /// Every variant with a fixed label must survive a MessagePack round-trip
    /// unchanged.
    #[test]
    fn biotype_roundtrips_known_variants() {
        let known = [
            Biotype::ProteinCoding,
            Biotype::NonCodingRna,
            Biotype::LncRna,
            Biotype::AntisenseRna,
            Biotype::SnoRna,
            Biotype::SnRna,
            Biotype::RnaseMrpRna,
            Biotype::TelomeraseRna,
            Biotype::VaultRna,
            Biotype::Unknown,
        ];
        for variant in known {
            let decoded = msgpack_roundtrip(&variant);
            assert_eq!(decoded, variant);
        }
    }

    /// A label without a dedicated variant is kept verbatim in `Other`, both
    /// in memory and across a MessagePack round-trip.
    #[test]
    fn biotype_preserves_unknown_upstream_label() {
        let custom = Biotype::from_label("misc_RNA");
        assert_eq!(custom, Biotype::Other(String::from("misc_RNA")));

        let decoded = msgpack_roundtrip(&custom);
        assert_eq!(decoded, custom);
        assert_eq!(decoded.as_str(), "misc_RNA");
    }

    /// The empty string and the literal `"unknown"` both parse to `Unknown`.
    #[test]
    fn biotype_from_label_normalizes_empty_to_unknown() {
        for label in ["", "unknown"] {
            assert_eq!(Biotype::from_label(label), Biotype::Unknown);
        }
    }
}