vareffect/types.rs
//! Runtime types for the transcript model store.
//!
//! All coordinates use the **0-based, half-open** convention (BED/UCSC style).
//! The GFF3 parser in `vareffect-cli` converts NCBI's 1-based, fully-closed
//! coordinates on ingest, so consumers of this crate never see 1-based
//! genomic coordinates. (Exon *numbers* are ordinals and remain 1-based.)
//!
//! Every type in this module round-trips through MessagePack via
//! [`serde::Serialize`]/[`serde::Deserialize`]. [`Biotype`] has a hand-written
//! `Serialize`/`Deserialize` so the on-disk format stays a single flat string
//! and unrecognized upstream labels (e.g. `misc_RNA`, or any future biotype)
//! survive as [`Biotype::Other`] without schema changes.
12
13use serde::{Deserialize, Serialize};
14
15/// Transcript strand orientation relative to the reference genome.
16///
17/// Marked `#[non_exhaustive]` so future assemblies with unknown / ambiguous
18/// strand annotations can extend this enum without a SemVer break.
19#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
20#[non_exhaustive]
21pub enum Strand {
22 /// Plus strand (5'→3' runs in the direction of increasing genomic coordinate).
23 Plus,
24 /// Minus strand (5'→3' runs in the direction of decreasing genomic coordinate).
25 Minus,
26}
27
28/// Curation tier that produced a transcript model.
29///
30/// `ManeSelect` and `ManePlusClinical` are the primary clinical tiers;
31/// `RefSeqSelect` is provided as a fallback for genes without MANE coverage.
32/// `#[non_exhaustive]` leaves room for future tiers (Ensembl Canonical,
33/// CCDS-only, …) without breaking downstream matches.
34#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
35#[non_exhaustive]
36pub enum TranscriptTier {
37 /// NCBI/Ensembl jointly-curated MANE Select transcript (the default
38 /// clinical reference isoform for each gene).
39 ManeSelect,
40 /// MANE Plus Clinical — a second isoform curated for clinically
41 /// actionable variants not captured on the MANE Select isoform.
42 ManePlusClinical,
43 /// RefSeq Select — NCBI's fallback canonical transcript for genes
44 /// without MANE coverage.
45 RefSeqSelect,
46}
47
48/// A single exon within a [`TranscriptModel`].
49///
50/// Exons inside a `TranscriptModel` are ordered 5'→3' on the *transcript*,
51/// which for minus-strand genes is the reverse of genomic order.
52#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
53pub struct Exon {
54 /// 1-based exon number, counting from the 5' end of the transcript.
55 /// `u16` is sufficient — the largest known human transcript (TTN) has
56 /// ~363 exons, far below the 65,535 ceiling.
57 pub exon_number: u16,
58 /// Genomic start coordinate, 0-based inclusive.
59 pub genomic_start: u64,
60 /// Genomic end coordinate, 0-based exclusive (half-open).
61 pub genomic_end: u64,
62}
63
64/// A single CDS segment within a [`TranscriptModel`].
65///
66/// One `CdsSegment` corresponds to one GFF3 `CDS` row. Segments are ordered
67/// 5'→3' on the *transcript* (reversed for minus-strand genes), matching the
68/// `exons` vector on [`TranscriptModel`]. The per-segment [`phase`] captures
69/// the reading-frame offset that VEP needs for frameshift detection and p. HGVS
70/// notation across exon boundaries.
71///
72/// `exon_index` is the 0-based index into `TranscriptModel::exons` of the exon
73/// that contains this CDS segment — this lets downstream code walk codons
74/// without re-scanning the exon vector for every CDS row.
75///
76/// [`phase`]: Self::phase
77#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
78pub struct CdsSegment {
79 /// 0-based index into `TranscriptModel::exons` of the containing exon.
80 pub exon_index: u16,
81 /// Genomic start coordinate, 0-based inclusive.
82 pub genomic_start: u64,
83 /// Genomic end coordinate, 0-based exclusive (half-open).
84 pub genomic_end: u64,
85 /// GFF3 column-8 phase: `0`, `1`, or `2` — the number of bases at the
86 /// *transcript-5'* end of this CDS segment that belong to the final codon
87 /// of the previous segment. A missing GFF3 phase (`.`) is normalized to
88 /// `0` at build time. Any value > 2 is rejected as malformed input.
89 pub phase: u8,
90}
91
/// Transcript biotype.
///
/// Labels this crate recognizes are modelled as dedicated variants for type
/// safety. Any other upstream label (e.g. a future `Y_RNA` added to MANE) is
/// kept verbatim in [`Biotype::Other`], so it round-trips through the
/// MessagePack store without code changes.
///
/// On disk a biotype is a single flat string: known variants use their
/// canonical NCBI/Ensembl label and `Other` stores the raw upstream label.
/// The hand-written `Serialize`/`Deserialize` impls in this module (rather
/// than `#[serde(untagged)]`) guarantee that flat wire format.
///
/// Prefer [`Biotype::from_label`] over building `Other` by hand:
/// `Other("lncRNA".into())` compares unequal to [`Biotype::LncRna`] even
/// though both share the label `"lncRNA"`, and the former becomes the latter
/// after a label round trip.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Biotype {
    /// `NM_*` — protein-coding mRNA.
    ProteinCoding,
    /// Generic non-coding RNA — the default for `NR_*` accessions without a
    /// more specific gene-level biotype.
    NonCodingRna,
    /// Long non-coding RNA.
    LncRna,
    /// Antisense RNA.
    AntisenseRna,
    /// Small nucleolar RNA.
    SnoRna,
    /// Small nuclear RNA.
    SnRna,
    /// RNase MRP RNA.
    RnaseMrpRna,
    /// Telomerase RNA component.
    TelomeraseRna,
    /// Vault RNA.
    VaultRna,
    /// Upstream biotype label not recognized by this crate. The raw string is
    /// preserved verbatim so future gene types survive without a schema change.
    Other(String),
    /// No biotype signal was available at build time (neither the accession
    /// prefix nor a gene-level `gene_biotype` attribute).
    Unknown,
}

impl Biotype {
    /// Return the canonical on-disk label for this biotype.
    ///
    /// For [`Biotype::Other`] this is the preserved upstream label; for
    /// [`Biotype::Unknown`] it is the literal `"unknown"`.
    #[must_use]
    pub fn as_str(&self) -> &str {
        match self {
            Self::ProteinCoding => "protein_coding",
            Self::NonCodingRna => "non_coding_rna",
            Self::LncRna => "lncRNA",
            Self::AntisenseRna => "antisense_RNA",
            Self::SnoRna => "snoRNA",
            Self::SnRna => "snRNA",
            Self::RnaseMrpRna => "RNase_MRP_RNA",
            Self::TelomeraseRna => "telomerase_RNA",
            Self::VaultRna => "vault_RNA",
            Self::Other(raw) => raw.as_str(),
            Self::Unknown => "unknown",
        }
    }

    /// Parse a biotype from its canonical string form.
    ///
    /// Matching is exact and case-sensitive. The empty string and `"unknown"`
    /// both map to [`Biotype::Unknown`]. Any label that is not recognized is
    /// returned as [`Biotype::Other`] holding the raw string, never as an
    /// error: the transcript model store is meant to round-trip whatever
    /// MANE/RefSeq ships today or tomorrow.
    #[must_use]
    pub fn from_label(label: &str) -> Self {
        match label {
            "protein_coding" => Self::ProteinCoding,
            "non_coding_rna" => Self::NonCodingRna,
            "lncRNA" => Self::LncRna,
            "antisense_RNA" => Self::AntisenseRna,
            "snoRNA" => Self::SnoRna,
            "snRNA" => Self::SnRna,
            "RNase_MRP_RNA" => Self::RnaseMrpRna,
            "telomerase_RNA" => Self::TelomeraseRna,
            "vault_RNA" => Self::VaultRna,
            "unknown" | "" => Self::Unknown,
            unrecognized => Self::Other(unrecognized.to_string()),
        }
    }

    /// `true` only for [`Biotype::ProteinCoding`]. Convenience helper for
    /// downstream filters.
    #[must_use]
    pub fn is_protein_coding(&self) -> bool {
        matches!(self, Self::ProteinCoding)
    }
}
177
178impl Serialize for Biotype {
179 fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
180 serializer.serialize_str(self.as_str())
181 }
182}
183
184impl<'de> Deserialize<'de> for Biotype {
185 fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
186 // Deserialize as an owned `String` then map into the enum. Avoids the
187 // lifetime dance of `&str` for formats that might not hand out borrows
188 // (rmp-serde does, but JSON with escapes does not).
189 let raw = String::deserialize(deserializer)?;
190 Ok(Self::from_label(&raw))
191 }
192}
193
194/// A canonical transcript model sourced from MANE or RefSeq Select.
195///
196/// Every coordinate is 0-based half-open. Accessions include the version
197/// suffix (e.g., `"NM_006772.2"`) — version-less lookup is intentionally out
198/// of scope; resolving an unversioned HGVS string to "the latest" belongs
199/// with the consumer, not the raw store.
200#[derive(Debug, Clone, Serialize, Deserialize)]
201pub struct TranscriptModel {
202 /// RefSeq transcript accession including version, e.g. `"NM_006772.2"`.
203 pub accession: String,
204 /// RefSeq protein accession including version, e.g. `"NP_006763.2"`.
205 /// `None` for non-coding transcripts (`NR_*` accessions).
206 pub protein_accession: Option<String>,
207 /// HGNC-approved gene symbol, e.g. `"SYNGAP1"`.
208 pub gene_symbol: String,
209 /// HGNC identifier including the `HGNC:` prefix, e.g. `"HGNC:11497"`.
210 /// `None` if the parent gene row did not carry an HGNC cross-reference.
211 pub hgnc_id: Option<String>,
212 /// Ensembl transcript accession with version, e.g. `"ENST00000418600.6"`.
213 /// `None` if the Dbxref did not list an Ensembl cross-reference.
214 pub ensembl_accession: Option<String>,
215 /// Chromosome name in UCSC style (`"chr1"`, …, `"chrX"`, `"chrY"`, `"chrM"`).
216 /// For transcripts on GRCh38 patch sequences, this field holds the UCSC
217 /// contig name exactly as published by MANE GFF3 column 1 (e.g.
218 /// `"chr9_KN196479v1_fix"`, `"chr22_KI270879v1_alt"`). Against an NCBI
219 /// RefSeq FASTA, patch lookups work only when
220 /// [`crate::FastaReader::open_with_patch_aliases`] is supplied a
221 /// `patch_chrom_aliases.csv` that maps the UCSC form back to the
222 /// matching `NW_*`/`NT_*` RefSeq accession.
223 pub chrom: String,
224 /// Transcript strand orientation.
225 pub strand: Strand,
226 /// Genomic start of the entire transcript (0-based, inclusive).
227 /// Includes 5' and 3' UTRs.
228 pub tx_start: u64,
229 /// Genomic end of the entire transcript (0-based, exclusive).
230 /// Includes 5' and 3' UTRs.
231 pub tx_end: u64,
232 /// Lowest genomic coordinate of any CDS segment (0-based, inclusive).
233 ///
234 /// **This is a genomic interval span, not a transcript-relative start.**
235 /// For a minus-strand transcript, this is the 3' end of the protein in
236 /// transcript order. Use [`cds_segments`](Self::cds_segments) when you
237 /// need the true 5' coding start or per-exon CDS bounds. `None` for
238 /// non-coding transcripts.
239 pub cds_genomic_start: Option<u64>,
240 /// Highest genomic coordinate of any CDS segment (0-based, exclusive).
241 /// See [`cds_genomic_start`](Self::cds_genomic_start) for the interval
242 /// semantics caveat. `None` for non-coding transcripts.
243 pub cds_genomic_end: Option<u64>,
244 /// Exons ordered 5'→3' on the *transcript*.
245 ///
246 /// Invariant: for plus-strand transcripts, `exons[i].genomic_start <
247 /// exons[i+1].genomic_start`; for minus-strand transcripts, the inverse
248 /// holds (exon 1 has the highest genomic coordinates).
249 pub exons: Vec<Exon>,
250 /// CDS segments ordered 5'→3' on the *transcript*, matching
251 /// [`exons`](Self::exons). Empty for non-coding transcripts.
252 ///
253 /// Each segment carries the containing exon index (for O(1) exon lookup
254 /// without re-scanning) and the GFF3 column-8 phase (needed for codon
255 /// walking across exon boundaries and frameshift detection).
256 pub cds_segments: Vec<CdsSegment>,
257 /// MANE / RefSeq Select curation tier.
258 pub tier: TranscriptTier,
259 /// Biotype, either a known variant or a raw upstream label preserved
260 /// verbatim via [`Biotype::Other`].
261 pub biotype: Biotype,
262 /// Total number of exons — always equal to `exons.len()`.
263 ///
264 /// Stored for O(1) access without traversing the vec. `u16` suffices
265 /// (TTN, the longest known, is ~363).
266 pub exon_count: u16,
267}
268
#[cfg(test)]
mod tests {
    use super::*;

    /// Encode `value` as MessagePack and decode it straight back.
    fn msgpack_roundtrip(value: &Biotype) -> Biotype {
        let bytes = rmp_serde::to_vec_named(value).expect("Biotype must encode");
        rmp_serde::from_slice(&bytes).expect("Biotype must decode")
    }

    /// Every known variant survives both the MessagePack round trip and the
    /// plain label round trip (`as_str` → `from_label`).
    #[test]
    fn biotype_roundtrips_known_variants() {
        let known = [
            Biotype::ProteinCoding,
            Biotype::NonCodingRna,
            Biotype::LncRna,
            Biotype::AntisenseRna,
            Biotype::SnoRna,
            Biotype::SnRna,
            Biotype::RnaseMrpRna,
            Biotype::TelomeraseRna,
            Biotype::VaultRna,
            Biotype::Unknown,
        ];
        for variant in known {
            assert_eq!(msgpack_roundtrip(&variant), variant);
            assert_eq!(Biotype::from_label(variant.as_str()), variant);
        }
    }

    /// A label the crate does not recognize is kept verbatim in `Other` and
    /// comes back unchanged from the store.
    #[test]
    fn biotype_preserves_unknown_upstream_label() {
        let custom = Biotype::from_label("misc_RNA");
        assert!(matches!(&custom, Biotype::Other(s) if s == "misc_RNA"));

        let decoded = msgpack_roundtrip(&custom);
        assert_eq!(decoded, custom);
        assert_eq!(decoded.as_str(), "misc_RNA");
    }

    /// Both the empty string and the literal `"unknown"` parse to `Unknown`.
    #[test]
    fn biotype_from_label_normalizes_empty_to_unknown() {
        assert_eq!(Biotype::from_label(""), Biotype::Unknown);
        assert_eq!(Biotype::from_label("unknown"), Biotype::Unknown);
    }
}
308}