Skip to main content

cyanea_omics/
annotation.rs

1//! Gene, transcript, and exon annotation types.
2//!
3//! Hierarchical gene → transcript → exon model for representing genome
4//! annotations from sources like GENCODE, Ensembl, or RefSeq.
5
6use cyanea_core::{Annotated, Summarizable};
7
8use crate::genomic::{GenomicInterval, Strand};
9
/// Classification of a gene's biotype.
///
/// The named variants cover the biotypes this module distinguishes explicitly;
/// any other biotype label is carried verbatim in [`GeneType::Other`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum GeneType {
    /// Displayed as `protein_coding`.
    ProteinCoding,
    /// Displayed as `lncRNA`.
    LncRNA,
    /// Displayed as `miRNA`.
    MiRNA,
    /// Displayed as `rRNA`.
    RRNA,
    /// Displayed as `tRNA`.
    TRNA,
    /// Displayed as `pseudogene`.
    Pseudogene,
    /// Any other biotype; the wrapped string is displayed unchanged.
    Other(String),
}

impl core::fmt::Display for GeneType {
    /// Writes the biotype label (e.g. `protein_coding`, `lncRNA`).
    ///
    /// The label goes through [`core::fmt::Formatter::pad`], so width, fill,
    /// alignment and precision flags (`{:>12}`, `{:<8}`, ...) are honored the
    /// same way they are for a plain `&str`. With a bare `{}` the output is
    /// exactly the label.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let label = match self {
            GeneType::ProteinCoding => "protein_coding",
            GeneType::LncRNA => "lncRNA",
            GeneType::MiRNA => "miRNA",
            GeneType::RRNA => "rRNA",
            GeneType::TRNA => "tRNA",
            GeneType::Pseudogene => "pseudogene",
            GeneType::Other(custom) => custom.as_str(),
        };
        f.pad(label)
    }
}
36
/// An exon within a transcript (0-based, half-open coordinates).
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Exon {
    /// Ordinal label of the exon; stored as given and not interpreted here.
    pub exon_number: u32,
    /// 0-based start (inclusive).
    pub start: u64,
    /// 0-based end (exclusive).
    pub end: u64,
}

impl Exon {
    /// Length of the exon in bases.
    ///
    /// The fields are public and unvalidated, so `end < start` is possible.
    /// Such an inverted interval covers no bases and yields `0` instead of
    /// underflowing (which would panic in debug builds and wrap in release).
    pub fn len(&self) -> u64 {
        self.end.saturating_sub(self.start)
    }

    /// Whether the exon covers no bases, i.e. `len() == 0`.
    ///
    /// True for `start == end` and also for inverted intervals (`end < start`),
    /// keeping this consistent with [`Exon::len`].
    pub fn is_empty(&self) -> bool {
        self.end <= self.start
    }
}
59
60/// A transcript with optional CDS boundaries.
61#[derive(Debug, Clone, PartialEq, Eq)]
62#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
63pub struct Transcript {
64    pub transcript_id: String,
65    /// 0-based start (inclusive).
66    pub start: u64,
67    /// 0-based end (exclusive).
68    pub end: u64,
69    pub exons: Vec<Exon>,
70    /// CDS start (0-based inclusive), if protein-coding.
71    pub cds_start: Option<u64>,
72    /// CDS end (0-based exclusive), if protein-coding.
73    pub cds_end: Option<u64>,
74}
75
76impl Transcript {
77    /// Length of the transcript span in bases.
78    pub fn len(&self) -> u64 {
79        self.end - self.start
80    }
81
82    /// Whether the transcript has zero length.
83    pub fn is_empty(&self) -> bool {
84        self.start == self.end
85    }
86
87    /// Number of exons.
88    pub fn n_exons(&self) -> usize {
89        self.exons.len()
90    }
91
92    /// Total exonic length (sum of individual exon lengths).
93    pub fn exonic_length(&self) -> u64 {
94        self.exons.iter().map(|e| e.len()).sum()
95    }
96
97    /// Convert this transcript to a [`GenomicInterval`] on the given chromosome and strand.
98    pub fn to_genomic_interval(&self, chrom: &str, strand: Strand) -> GenomicInterval {
99        GenomicInterval {
100            chrom: chrom.into(),
101            start: self.start,
102            end: self.end,
103            strand,
104        }
105    }
106}
107
108/// A gene with its transcripts.
109#[derive(Debug, Clone)]
110#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
111pub struct Gene {
112    pub gene_id: String,
113    pub gene_name: String,
114    pub chrom: String,
115    /// 0-based start (inclusive).
116    pub start: u64,
117    /// 0-based end (exclusive).
118    pub end: u64,
119    pub strand: Strand,
120    pub gene_type: GeneType,
121    pub transcripts: Vec<Transcript>,
122}
123
124impl Gene {
125    /// Length of the gene span in bases.
126    pub fn len(&self) -> u64 {
127        self.end - self.start
128    }
129
130    /// Whether the gene has zero length.
131    pub fn is_empty(&self) -> bool {
132        self.start == self.end
133    }
134
135    /// Number of transcripts.
136    pub fn n_transcripts(&self) -> usize {
137        self.transcripts.len()
138    }
139
140    /// Convert to a [`GenomicInterval`].
141    pub fn to_genomic_interval(&self) -> GenomicInterval {
142        GenomicInterval {
143            chrom: self.chrom.clone(),
144            start: self.start,
145            end: self.end,
146            strand: self.strand,
147        }
148    }
149
150    /// Whether this gene is protein-coding.
151    pub fn is_protein_coding(&self) -> bool {
152        self.gene_type == GeneType::ProteinCoding
153    }
154}
155
156impl Annotated for Gene {
157    fn name(&self) -> &str {
158        &self.gene_name
159    }
160}
161
162impl Summarizable for Gene {
163    fn summary(&self) -> String {
164        format!(
165            "Gene: {} ({}:{}-{}, {}, {}, {} transcripts)",
166            self.gene_name,
167            self.chrom,
168            self.start,
169            self.end,
170            self.strand,
171            self.gene_type,
172            self.n_transcripts()
173        )
174    }
175}
176
#[cfg(test)]
mod tests {
    use super::*;

    /// TP53-like fixture: one three-exon coding transcript and one
    /// single-exon transcript without CDS boundaries.
    fn sample_gene() -> Gene {
        Gene {
            gene_id: "ENSG00000141510".into(),
            gene_name: "TP53".into(),
            chrom: "chr17".into(),
            start: 7668421,
            end: 7687490,
            strand: Strand::Reverse,
            gene_type: GeneType::ProteinCoding,
            transcripts: vec![
                Transcript {
                    transcript_id: "ENST00000269305".into(),
                    start: 7668421,
                    end: 7687490,
                    exons: vec![
                        Exon { exon_number: 1, start: 7668421, end: 7668586 },
                        Exon { exon_number: 2, start: 7670609, end: 7670715 },
                        Exon { exon_number: 3, start: 7673534, end: 7673608 },
                    ],
                    cds_start: Some(7668421),
                    cds_end: Some(7687490),
                },
                Transcript {
                    transcript_id: "ENST00000413465".into(),
                    start: 7669608,
                    end: 7687490,
                    exons: vec![
                        Exon { exon_number: 1, start: 7669608, end: 7669690 },
                    ],
                    cds_start: None,
                    cds_end: None,
                },
            ],
        }
    }

    #[test]
    fn test_exon_len() {
        let exon = Exon { exon_number: 1, start: 100, end: 300 };
        assert_eq!(exon.len(), 200);
        assert!(!exon.is_empty());
    }

    #[test]
    fn test_exon_is_empty() {
        let exon = Exon { exon_number: 1, start: 100, end: 100 };
        assert_eq!(exon.len(), 0);
        assert!(exon.is_empty());
    }

    #[test]
    fn test_transcript_len() {
        let gene = sample_gene();
        assert_eq!(gene.transcripts[0].len(), 19069);
        assert_eq!(gene.transcripts[1].len(), 17882);
        assert!(!gene.transcripts[0].is_empty());
    }

    #[test]
    fn test_transcript_exonic_length() {
        let gene = sample_gene();
        let tx = &gene.transcripts[0];
        // (7668586-7668421) + (7670715-7670609) + (7673608-7673534)
        // = 165 + 106 + 74 = 345
        assert_eq!(tx.exonic_length(), 345);
        // Single exon: 7669690 - 7669608 = 82
        assert_eq!(gene.transcripts[1].exonic_length(), 82);
    }

    #[test]
    fn test_transcript_exonic_length_without_exons() {
        let tx = Transcript {
            transcript_id: "TX_EMPTY".into(),
            start: 10,
            end: 20,
            exons: Vec::new(),
            cds_start: None,
            cds_end: None,
        };
        assert_eq!(tx.n_exons(), 0);
        assert_eq!(tx.exonic_length(), 0);
    }

    #[test]
    fn test_transcript_n_exons() {
        let gene = sample_gene();
        assert_eq!(gene.transcripts[0].n_exons(), 3);
        assert_eq!(gene.transcripts[1].n_exons(), 1);
    }

    #[test]
    fn test_transcript_to_interval() {
        let gene = sample_gene();
        let tx = &gene.transcripts[0];
        let iv = tx.to_genomic_interval("chr17", Strand::Reverse);
        assert_eq!(iv.chrom, "chr17");
        assert_eq!(iv.start, 7668421);
        assert_eq!(iv.end, 7687490);
        assert_eq!(iv.strand, Strand::Reverse);
    }

    #[test]
    fn test_gene_len() {
        let gene = sample_gene();
        assert_eq!(gene.len(), 7687490 - 7668421);
        assert!(!gene.is_empty());
    }

    #[test]
    fn test_gene_is_empty() {
        let gene = Gene { start: 500, end: 500, ..sample_gene() };
        assert_eq!(gene.len(), 0);
        assert!(gene.is_empty());
    }

    #[test]
    fn test_gene_n_transcripts() {
        let gene = sample_gene();
        assert_eq!(gene.n_transcripts(), 2);
    }

    #[test]
    fn test_gene_to_interval() {
        let gene = sample_gene();
        let iv = gene.to_genomic_interval();
        assert_eq!(iv.chrom, "chr17");
        assert_eq!(iv.start, 7668421);
        assert_eq!(iv.end, 7687490);
        assert_eq!(iv.strand, Strand::Reverse);
    }

    #[test]
    fn test_gene_is_protein_coding() {
        let gene = sample_gene();
        assert!(gene.is_protein_coding());

        let noncoding = Gene { gene_type: GeneType::LncRNA, ..sample_gene() };
        assert!(!noncoding.is_protein_coding());
    }

    #[test]
    fn test_annotated() {
        let gene = sample_gene();
        assert_eq!(gene.name(), "TP53");
    }

    #[test]
    fn test_summary() {
        let gene = sample_gene();
        assert_eq!(
            gene.summary(),
            "Gene: TP53 (chr17:7668421-7687490, -, protein_coding, 2 transcripts)"
        );
    }

    #[test]
    fn test_gene_type_display() {
        assert_eq!(GeneType::ProteinCoding.to_string(), "protein_coding");
        assert_eq!(GeneType::LncRNA.to_string(), "lncRNA");
        assert_eq!(GeneType::MiRNA.to_string(), "miRNA");
        assert_eq!(GeneType::RRNA.to_string(), "rRNA");
        assert_eq!(GeneType::TRNA.to_string(), "tRNA");
        assert_eq!(GeneType::Pseudogene.to_string(), "pseudogene");
        assert_eq!(GeneType::Other("snRNA".into()).to_string(), "snRNA");
    }
}