Skip to main content

ferro_hgvs/reference/
transcript.rs

1//! Transcript and exon models
2//!
3//! # Coordinate System
4//!
5//! All coordinates in this module are **1-based inclusive**:
6//!
7//! | Field | Basis | Notes |
8//! |-------|-------|-------|
9//! | `Exon.start`, `Exon.end` | 1-based | Transcript coordinates (inclusive) |
10//! | `Exon.genomic_start`, `Exon.genomic_end` | 1-based | Genomic coordinates (inclusive) |
11//! | `Intron.genomic_start`, `Intron.genomic_end` | 1-based | First/last intronic base |
12//! | `Transcript.cds_start`, `Transcript.cds_end` | 1-based | CDS boundaries in transcript space |
13//!
14//! For type-safe coordinate handling, see [`crate::coords`].
15
16use serde::{Deserialize, Serialize};
17use std::cmp::Ordering;
18use std::sync::OnceLock;
19
20/// Genome build/assembly version
21#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
22pub enum GenomeBuild {
23    /// GRCh37 / hg19
24    GRCh37,
25    /// GRCh38 / hg38
26    #[default]
27    GRCh38,
28    /// Unknown build
29    Unknown,
30}
31
32impl std::fmt::Display for GenomeBuild {
33    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
34        match self {
35            GenomeBuild::GRCh37 => write!(f, "GRCh37"),
36            GenomeBuild::GRCh38 => write!(f, "GRCh38"),
37            GenomeBuild::Unknown => write!(f, "Unknown"),
38        }
39    }
40}
41
42/// Strand orientation
43#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
44pub enum Strand {
45    #[serde(rename = "+")]
46    #[default]
47    Plus,
48    #[serde(rename = "-")]
49    Minus,
50}
51
52impl std::fmt::Display for Strand {
53    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
54        match self {
55            Strand::Plus => write!(f, "+"),
56            Strand::Minus => write!(f, "-"),
57        }
58    }
59}
60
61/// An exon in a transcript
62#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
63pub struct Exon {
64    /// Exon number (1-based)
65    pub number: u32,
66    /// Start position in transcript coordinates (1-based, inclusive)
67    pub start: u64,
68    /// End position in transcript coordinates (1-based, inclusive)
69    pub end: u64,
70    /// Genomic start position (1-based, inclusive)
71    #[serde(skip_serializing_if = "Option::is_none")]
72    pub genomic_start: Option<u64>,
73    /// Genomic end position (1-based, inclusive)
74    #[serde(skip_serializing_if = "Option::is_none")]
75    pub genomic_end: Option<u64>,
76}
77
78/// An intron between two exons
79#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
80pub struct Intron {
81    /// Intron number (1-based, intron 1 is between exon 1 and exon 2)
82    pub number: u32,
83    /// 5' exon number (upstream exon)
84    pub upstream_exon: u32,
85    /// 3' exon number (downstream exon)
86    pub downstream_exon: u32,
87    /// Genomic start position (1-based, first intronic base)
88    #[serde(skip_serializing_if = "Option::is_none")]
89    pub genomic_start: Option<u64>,
90    /// Genomic end position (1-based, last intronic base)
91    #[serde(skip_serializing_if = "Option::is_none")]
92    pub genomic_end: Option<u64>,
93    /// Transcript position of last exonic base before intron (5' boundary)
94    pub tx_5prime_boundary: u64,
95    /// Transcript position of first exonic base after intron (3' boundary)
96    pub tx_3prime_boundary: u64,
97}
98
99impl Intron {
100    /// Create a new intron
101    pub fn new(
102        number: u32,
103        upstream_exon: u32,
104        downstream_exon: u32,
105        tx_5prime_boundary: u64,
106        tx_3prime_boundary: u64,
107    ) -> Self {
108        Self {
109            number,
110            upstream_exon,
111            downstream_exon,
112            genomic_start: None,
113            genomic_end: None,
114            tx_5prime_boundary,
115            tx_3prime_boundary,
116        }
117    }
118
119    /// Create a new intron with genomic coordinates
120    pub fn with_genomic(
121        number: u32,
122        upstream_exon: u32,
123        downstream_exon: u32,
124        tx_5prime_boundary: u64,
125        tx_3prime_boundary: u64,
126        genomic_start: u64,
127        genomic_end: u64,
128    ) -> Self {
129        Self {
130            number,
131            upstream_exon,
132            downstream_exon,
133            genomic_start: Some(genomic_start),
134            genomic_end: Some(genomic_end),
135            tx_5prime_boundary,
136            tx_3prime_boundary,
137        }
138    }
139
140    /// Get the genomic length of the intron
141    ///
142    /// Uses saturating arithmetic to prevent overflow in edge cases.
143    pub fn genomic_length(&self) -> Option<u64> {
144        match (self.genomic_start, self.genomic_end) {
145            (Some(start), Some(end)) if end >= start => Some((end - start).saturating_add(1)),
146            _ => None,
147        }
148    }
149
150    /// Check if a genomic position is within this intron
151    pub fn contains_genomic(&self, pos: u64) -> bool {
152        match (self.genomic_start, self.genomic_end) {
153            (Some(start), Some(end)) => pos >= start && pos <= end,
154            _ => false,
155        }
156    }
157}
158
159impl Exon {
160    /// Create a new exon with transcript coordinates only
161    pub fn new(number: u32, start: u64, end: u64) -> Self {
162        Self {
163            number,
164            start,
165            end,
166            genomic_start: None,
167            genomic_end: None,
168        }
169    }
170
171    /// Create a new exon with both transcript and genomic coordinates
172    pub fn with_genomic(
173        number: u32,
174        start: u64,
175        end: u64,
176        genomic_start: u64,
177        genomic_end: u64,
178    ) -> Self {
179        Self {
180            number,
181            start,
182            end,
183            genomic_start: Some(genomic_start),
184            genomic_end: Some(genomic_end),
185        }
186    }
187
188    /// Length of the exon
189    pub fn len(&self) -> u64 {
190        if self.end >= self.start {
191            self.end - self.start + 1
192        } else {
193            0
194        }
195    }
196
197    pub fn is_empty(&self) -> bool {
198        self.len() == 0
199    }
200
201    /// Check if a position is within this exon
202    pub fn contains(&self, pos: u64) -> bool {
203        pos >= self.start && pos <= self.end
204    }
205}
206
207/// MANE (Matched Annotation from NCBI and EBI) transcript status
208#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
209pub enum ManeStatus {
210    /// Not a MANE transcript
211    #[default]
212    None,
213    /// MANE Select - single representative transcript per gene
214    Select,
215    /// MANE Plus Clinical - additional clinically relevant transcripts
216    PlusClinical,
217}
218
219impl ManeStatus {
220    /// Check if this is a MANE transcript of any type
221    pub fn is_mane(&self) -> bool {
222        !matches!(self, ManeStatus::None)
223    }
224
225    /// Check if this is MANE Select
226    pub fn is_select(&self) -> bool {
227        matches!(self, ManeStatus::Select)
228    }
229
230    /// Check if this is MANE Plus Clinical
231    pub fn is_plus_clinical(&self) -> bool {
232        matches!(self, ManeStatus::PlusClinical)
233    }
234
235    /// Get priority score for sorting (lower is better)
236    pub fn priority(&self) -> u8 {
237        match self {
238            ManeStatus::Select => 0,
239            ManeStatus::PlusClinical => 1,
240            ManeStatus::None => 2,
241        }
242    }
243}
244
245impl std::fmt::Display for ManeStatus {
246    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
247        match self {
248            ManeStatus::None => write!(f, ""),
249            ManeStatus::Select => write!(f, "MANE Select"),
250            ManeStatus::PlusClinical => write!(f, "MANE Plus Clinical"),
251        }
252    }
253}
254
255/// A transcript with its exon structure and sequence
256#[derive(Debug, Serialize, Deserialize)]
257pub struct Transcript {
258    /// Transcript accession (e.g., "NM_000088.3")
259    pub id: String,
260
261    /// Gene symbol (e.g., "COL1A1")
262    #[serde(skip_serializing_if = "Option::is_none")]
263    pub gene_symbol: Option<String>,
264
265    /// Strand orientation
266    pub strand: Strand,
267
268    /// Full transcript sequence
269    pub sequence: String,
270
271    /// CDS start position (1-based, in transcript coordinates)
272    #[serde(skip_serializing_if = "Option::is_none")]
273    pub cds_start: Option<u64>,
274
275    /// CDS end position (1-based, in transcript coordinates)
276    #[serde(skip_serializing_if = "Option::is_none")]
277    pub cds_end: Option<u64>,
278
279    /// List of exons
280    pub exons: Vec<Exon>,
281
282    /// Chromosome name (e.g., "chr1", "1", "X")
283    #[serde(skip_serializing_if = "Option::is_none")]
284    pub chromosome: Option<String>,
285
286    /// Genomic start position of transcript (1-based, inclusive)
287    #[serde(skip_serializing_if = "Option::is_none")]
288    pub genomic_start: Option<u64>,
289
290    /// Genomic end position of transcript (1-based, inclusive)
291    #[serde(skip_serializing_if = "Option::is_none")]
292    pub genomic_end: Option<u64>,
293
294    /// Genome build/assembly version
295    #[serde(default)]
296    pub genome_build: GenomeBuild,
297
298    /// MANE (Matched Annotation from NCBI and EBI) status
299    #[serde(default)]
300    pub mane_status: ManeStatus,
301
302    /// RefSeq accession matched to this transcript (for Ensembl transcripts)
303    #[serde(skip_serializing_if = "Option::is_none")]
304    pub refseq_match: Option<String>,
305
306    /// Ensembl accession matched to this transcript (for RefSeq transcripts)
307    #[serde(skip_serializing_if = "Option::is_none")]
308    pub ensembl_match: Option<String>,
309
310    /// Cached introns (computed lazily from exons)
311    /// This field is for internal use and will be initialized automatically.
312    #[serde(skip)]
313    pub(crate) cached_introns: OnceLock<Vec<Intron>>,
314}
315
316impl Clone for Transcript {
317    fn clone(&self) -> Self {
318        Self {
319            id: self.id.clone(),
320            gene_symbol: self.gene_symbol.clone(),
321            strand: self.strand,
322            sequence: self.sequence.clone(),
323            cds_start: self.cds_start,
324            cds_end: self.cds_end,
325            exons: self.exons.clone(),
326            chromosome: self.chromosome.clone(),
327            genomic_start: self.genomic_start,
328            genomic_end: self.genomic_end,
329            genome_build: self.genome_build,
330            mane_status: self.mane_status,
331            refseq_match: self.refseq_match.clone(),
332            ensembl_match: self.ensembl_match.clone(),
333            // Cache is reset on clone - will be lazily re-initialized
334            cached_introns: OnceLock::new(),
335        }
336    }
337}
338
339impl PartialEq for Transcript {
340    fn eq(&self, other: &Self) -> bool {
341        // Compare all fields except the cache
342        self.id == other.id
343            && self.gene_symbol == other.gene_symbol
344            && self.strand == other.strand
345            && self.sequence == other.sequence
346            && self.cds_start == other.cds_start
347            && self.cds_end == other.cds_end
348            && self.exons == other.exons
349            && self.chromosome == other.chromosome
350            && self.genomic_start == other.genomic_start
351            && self.genomic_end == other.genomic_end
352            && self.genome_build == other.genome_build
353            && self.mane_status == other.mane_status
354            && self.refseq_match == other.refseq_match
355            && self.ensembl_match == other.ensembl_match
356    }
357}
358
359impl Eq for Transcript {}
360
361impl Transcript {
362    /// Create a new Transcript with the given fields
363    #[allow(clippy::too_many_arguments)]
364    pub fn new(
365        id: String,
366        gene_symbol: Option<String>,
367        strand: Strand,
368        sequence: String,
369        cds_start: Option<u64>,
370        cds_end: Option<u64>,
371        exons: Vec<Exon>,
372        chromosome: Option<String>,
373        genomic_start: Option<u64>,
374        genomic_end: Option<u64>,
375        genome_build: GenomeBuild,
376        mane_status: ManeStatus,
377        refseq_match: Option<String>,
378        ensembl_match: Option<String>,
379    ) -> Self {
380        Self {
381            id,
382            gene_symbol,
383            strand,
384            sequence,
385            cds_start,
386            cds_end,
387            exons,
388            chromosome,
389            genomic_start,
390            genomic_end,
391            genome_build,
392            mane_status,
393            refseq_match,
394            ensembl_match,
395            cached_introns: OnceLock::new(),
396        }
397    }
398
399    /// Get the length of the transcript sequence
400    pub fn sequence_length(&self) -> u64 {
401        self.sequence.len() as u64
402    }
403
404    /// Check if this is a coding transcript
405    pub fn is_coding(&self) -> bool {
406        self.cds_start.is_some() && self.cds_end.is_some()
407    }
408
409    /// Get the CDS length
410    pub fn cds_length(&self) -> Option<u64> {
411        match (self.cds_start, self.cds_end) {
412            (Some(start), Some(end)) if end >= start => Some(end - start + 1),
413            _ => None,
414        }
415    }
416
417    /// Get sequence at a position range (0-based)
418    pub fn get_sequence(&self, start: u64, end: u64) -> Option<&str> {
419        let start = start as usize;
420        let end = end as usize;
421        if end <= self.sequence.len() && start < end {
422            Some(&self.sequence[start..end])
423        } else {
424            None
425        }
426    }
427
428    /// Find which exon contains a position using binary search
429    ///
430    /// Assumes exons are sorted by start position (which they typically are).
431    /// Falls back to linear search if exons are not sorted.
432    pub fn exon_at(&self, pos: u64) -> Option<&Exon> {
433        // Use binary search for O(log n) lookup
434        self.exons
435            .binary_search_by(|e| {
436                if pos < e.start {
437                    Ordering::Greater
438                } else if pos > e.end {
439                    Ordering::Less
440                } else {
441                    Ordering::Equal
442                }
443            })
444            .ok()
445            .map(|i| &self.exons[i])
446    }
447
448    /// Get the 5' UTR length
449    pub fn utr5_length(&self) -> Option<u64> {
450        self.cds_start.map(|s| s.saturating_sub(1))
451    }
452
453    /// Get the 3' UTR length
454    pub fn utr3_length(&self) -> Option<u64> {
455        self.cds_end
456            .map(|e| self.sequence_length().saturating_sub(e))
457    }
458
459    /// Check if this transcript has genomic coordinates
460    pub fn has_genomic_coords(&self) -> bool {
461        self.chromosome.is_some() && self.genomic_start.is_some() && self.genomic_end.is_some()
462    }
463
464    /// Get the genomic length (span) of the transcript
465    pub fn genomic_length(&self) -> Option<u64> {
466        match (self.genomic_start, self.genomic_end) {
467            (Some(start), Some(end)) if end >= start => Some(end - start + 1),
468            _ => None,
469        }
470    }
471
472    /// Check if a genomic position falls within this transcript's span
473    pub fn contains_genomic_pos(&self, pos: u64) -> bool {
474        match (self.genomic_start, self.genomic_end) {
475            (Some(start), Some(end)) => pos >= start && pos <= end,
476            _ => false,
477        }
478    }
479
480    /// Check if this is a MANE Select transcript
481    pub fn is_mane_select(&self) -> bool {
482        self.mane_status.is_select()
483    }
484
485    /// Check if this is a MANE Plus Clinical transcript
486    pub fn is_mane_plus_clinical(&self) -> bool {
487        self.mane_status.is_plus_clinical()
488    }
489
490    /// Check if this is any type of MANE transcript
491    pub fn is_mane(&self) -> bool {
492        self.mane_status.is_mane()
493    }
494
495    /// Get cached introns, computing them lazily if not already cached
496    ///
497    /// This method provides O(1) access to introns after the first call,
498    /// avoiding recalculation on every lookup.
499    pub fn introns(&self) -> &[Intron] {
500        self.cached_introns.get_or_init(|| self.compute_introns())
501    }
502
503    /// Calculate introns from exon boundaries
504    ///
505    /// Returns a vector of Intron structs derived from adjacent exon pairs.
506    /// Exons should be sorted by transcript position.
507    ///
508    /// Note: This method uses caching internally. For repeated access,
509    /// prefer `introns()` which returns a slice reference.
510    pub fn calculate_introns(&self) -> Vec<Intron> {
511        self.introns().to_vec()
512    }
513
514    /// Compute introns from exon boundaries (internal, uncached)
515    fn compute_introns(&self) -> Vec<Intron> {
516        let mut introns = Vec::new();
517
518        // Sort exons by transcript position
519        let mut sorted_exons: Vec<_> = self.exons.iter().collect();
520        sorted_exons.sort_by_key(|e| e.start);
521
522        for (i, window) in sorted_exons.windows(2).enumerate() {
523            let upstream = window[0];
524            let downstream = window[1];
525
526            let intron_number = (i + 1) as u32;
527
528            // Calculate genomic coordinates for the intron if available
529            let (genomic_start, genomic_end) = match self.strand {
530                Strand::Plus => {
531                    // Plus strand: intron starts after upstream exon ends genomically
532                    let g_start = upstream.genomic_end.map(|e| e + 1);
533                    let g_end = downstream.genomic_start.map(|s| s - 1);
534                    (g_start, g_end)
535                }
536                Strand::Minus => {
537                    // Minus strand: genomic coordinates are reversed
538                    // Intron is between downstream's genomic_end+1 and upstream's genomic_start-1
539                    let g_start = downstream.genomic_end.map(|e| e + 1);
540                    let g_end = upstream.genomic_start.map(|s| s - 1);
541                    (g_start, g_end)
542                }
543            };
544
545            let mut intron = Intron::new(
546                intron_number,
547                upstream.number,
548                downstream.number,
549                upstream.end,     // tx position of last base in upstream exon
550                downstream.start, // tx position of first base in downstream exon
551            );
552
553            if let (Some(gs), Some(ge)) = (genomic_start, genomic_end) {
554                if ge >= gs {
555                    intron.genomic_start = Some(gs);
556                    intron.genomic_end = Some(ge);
557                }
558            }
559
560            introns.push(intron);
561        }
562
563        introns
564    }
565
566    /// Find which intron contains a genomic position
567    ///
568    /// Returns the intron and the offset from the nearest exon boundary.
569    /// Positive offset means downstream from 5' boundary (c.N+offset notation).
570    /// Negative offset means upstream from 3' boundary (c.N-offset notation).
571    pub fn find_intron_at_genomic(&self, genomic_pos: u64) -> Option<(Intron, IntronPosition)> {
572        // Use cached introns for O(1) access after first call
573        for intron in self.introns() {
574            if intron.contains_genomic(genomic_pos) {
575                let (g_start, g_end) = (intron.genomic_start?, intron.genomic_end?);
576                // Use saturating_add to prevent overflow at u64::MAX
577                let intron_length = (g_end - g_start).saturating_add(1);
578
579                // Calculate distance from each boundary
580                // Use saturating arithmetic to prevent overflow
581                let (dist_to_5prime, dist_to_3prime) = match self.strand {
582                    Strand::Plus => {
583                        // Plus strand: genomic start is 5' boundary
584                        let from_5prime = (genomic_pos - g_start).saturating_add(1); // 1-based offset
585                        let from_3prime = (g_end - genomic_pos).saturating_add(1);
586                        (from_5prime, from_3prime)
587                    }
588                    Strand::Minus => {
589                        // Minus strand: genomic end is 5' boundary
590                        let from_5prime = (g_end - genomic_pos).saturating_add(1);
591                        let from_3prime = (genomic_pos - g_start).saturating_add(1);
592                        (from_5prime, from_3prime)
593                    }
594                };
595
596                // Determine which boundary is closer and create position
597                let position = if dist_to_5prime <= dist_to_3prime {
598                    // Closer to 5' boundary (or equal): use +offset notation
599                    IntronPosition {
600                        intron_number: intron.number,
601                        boundary: IntronBoundary::FivePrime,
602                        offset: dist_to_5prime as i64,
603                        tx_boundary_pos: intron.tx_5prime_boundary,
604                        intron_length,
605                    }
606                } else {
607                    // Closer to 3' boundary: use -offset notation
608                    IntronPosition {
609                        intron_number: intron.number,
610                        boundary: IntronBoundary::ThreePrime,
611                        offset: -(dist_to_3prime as i64),
612                        tx_boundary_pos: intron.tx_3prime_boundary,
613                        intron_length,
614                    }
615                };
616
617                return Some((intron.clone(), position));
618            }
619        }
620
621        None
622    }
623
624    /// Get the number of introns in this transcript
625    pub fn intron_count(&self) -> usize {
626        if self.exons.len() > 1 {
627            self.exons.len() - 1
628        } else {
629            0
630        }
631    }
632
633    /// Find an intron given a transcript boundary position and offset
634    ///
635    /// This is used to convert intronic positions like c.100+5 or c.200-10
636    /// to find which intron they're in.
637    ///
638    /// # Arguments
639    /// * `tx_boundary` - The transcript position of the exon boundary
640    /// * `offset` - The offset into the intron (positive = after exon, negative = before exon)
641    ///
642    /// # Returns
643    /// The intron if found, along with whether this is a 5' or 3' boundary reference
644    pub fn find_intron_at_tx_boundary(&self, tx_boundary: u64, offset: i64) -> Option<&Intron> {
645        for intron in self.introns() {
646            if offset > 0 && intron.tx_5prime_boundary == tx_boundary {
647                // Positive offset: c.N+offset means we're after the 5' boundary (end of upstream exon)
648                return Some(intron);
649            } else if offset < 0 && intron.tx_3prime_boundary == tx_boundary {
650                // Negative offset: c.N-offset means we're before the 3' boundary (start of downstream exon)
651                return Some(intron);
652            }
653        }
654        None
655    }
656
657    /// Convert an intronic position to a genomic coordinate
658    ///
659    /// # Arguments
660    /// * `tx_boundary` - The transcript position of the exon boundary (the base part of c.N+offset)
661    /// * `offset` - The offset into the intron
662    ///
663    /// # Returns
664    /// The genomic position if the transcript has genomic coordinates
665    pub fn intronic_to_genomic(&self, tx_boundary: u64, offset: i64) -> Option<u64> {
666        let intron = self.find_intron_at_tx_boundary(tx_boundary, offset)?;
667
668        // Get the genomic coordinates of the intron
669        let (g_start, g_end) = (intron.genomic_start?, intron.genomic_end?);
670
671        match self.strand {
672            Strand::Plus => {
673                if offset > 0 {
674                    // c.N+offset: offset bases after the 5' exon boundary
675                    // genomic = intron start + offset - 1 (offset is 1-based)
676                    Some(g_start + offset as u64 - 1)
677                } else {
678                    // c.N-offset: |offset| bases before the 3' exon boundary
679                    // genomic = intron end - |offset| + 1
680                    Some(g_end - (-offset) as u64 + 1)
681                }
682            }
683            Strand::Minus => {
684                // On minus strand, genomic coordinates are reversed relative to transcript
685                if offset > 0 {
686                    // c.N+offset: offset bases after the 5' exon boundary
687                    // For minus strand: intron end - offset + 1
688                    Some(g_end - offset as u64 + 1)
689                } else {
690                    // c.N-offset: |offset| bases before the 3' exon boundary
691                    // For minus strand: intron start + |offset| - 1
692                    Some(g_start + (-offset) as u64 - 1)
693                }
694            }
695        }
696    }
697
698    /// Convert a genomic position to intronic transcript notation
699    ///
700    /// # Arguments
701    /// * `genomic_pos` - The genomic position
702    ///
703    /// # Returns
704    /// A tuple of (tx_boundary_position, offset) where:
705    /// - tx_boundary_position is the CDS/transcript position of the nearest exon boundary
706    /// - offset is positive (c.N+offset) or negative (c.N-offset)
707    pub fn genomic_to_intronic(&self, genomic_pos: u64) -> Option<(u64, i64)> {
708        let (intron, intron_pos) = self.find_intron_at_genomic(genomic_pos)?;
709
710        match intron_pos.boundary {
711            IntronBoundary::FivePrime => {
712                // Reference the 5' exon boundary with positive offset
713                Some((intron.tx_5prime_boundary, intron_pos.offset))
714            }
715            IntronBoundary::ThreePrime => {
716                // Reference the 3' exon boundary with negative offset
717                Some((intron.tx_3prime_boundary, intron_pos.offset))
718            }
719        }
720    }
721}
722
723/// Position within an intron
724#[derive(Debug, Clone, Copy, PartialEq, Eq)]
725pub struct IntronPosition {
726    /// Intron number (1-based)
727    pub intron_number: u32,
728    /// Which exon boundary this position is relative to
729    pub boundary: IntronBoundary,
730    /// Offset from the boundary (positive for 5' boundary, negative for 3')
731    pub offset: i64,
732    /// Transcript position of the exon boundary
733    pub tx_boundary_pos: u64,
734    /// Total length of the intron in bases
735    pub intron_length: u64,
736}
737
738impl IntronPosition {
739    /// Check if this is a deep intronic position (>50bp from nearest exon)
740    pub fn is_deep_intronic(&self) -> bool {
741        self.offset.abs() > 50
742    }
743
744    /// Check if this is at a canonical splice site (within 2bp of exon)
745    pub fn is_canonical_splice_site(&self) -> bool {
746        self.offset.abs() <= 2
747    }
748
749    /// Check if this is near a splice site (within 10bp of exon)
750    pub fn is_near_splice_site(&self) -> bool {
751        self.offset.abs() <= 10
752    }
753
754    /// Check if this is in the extended splice region (within 20bp of exon)
755    pub fn is_extended_splice_region(&self) -> bool {
756        self.offset.abs() <= 20
757    }
758
759    /// Get the splice site type based on position
760    pub fn splice_site_type(&self) -> SpliceSiteType {
761        let abs_offset = self.offset.abs();
762        match self.boundary {
763            IntronBoundary::FivePrime => {
764                // 5' end of intron = splice donor site
765                if abs_offset <= 2 {
766                    SpliceSiteType::DonorCanonical
767                } else if abs_offset <= 6 {
768                    SpliceSiteType::DonorExtended
769                } else if abs_offset <= 20 {
770                    SpliceSiteType::DonorRegion
771                } else if abs_offset <= 50 {
772                    SpliceSiteType::NearSplice
773                } else {
774                    SpliceSiteType::DeepIntronic
775                }
776            }
777            IntronBoundary::ThreePrime => {
778                // 3' end of intron = splice acceptor site
779                if abs_offset <= 2 {
780                    SpliceSiteType::AcceptorCanonical
781                } else if abs_offset <= 12 {
782                    // Branch point region is typically -18 to -35
783                    SpliceSiteType::AcceptorExtended
784                } else if abs_offset <= 20 {
785                    SpliceSiteType::AcceptorRegion
786                } else if abs_offset <= 50 {
787                    SpliceSiteType::NearSplice
788                } else {
789                    SpliceSiteType::DeepIntronic
790                }
791            }
792        }
793    }
794
795    /// Get distance from splice donor (5' end of intron)
796    pub fn distance_from_donor(&self) -> Option<u64> {
797        match self.boundary {
798            IntronBoundary::FivePrime => Some(self.offset.unsigned_abs()),
799            IntronBoundary::ThreePrime => {
800                // Distance = intron_length - distance_from_3prime
801                Some(self.intron_length - self.offset.unsigned_abs())
802            }
803        }
804    }
805
806    /// Get distance from splice acceptor (3' end of intron)
807    pub fn distance_from_acceptor(&self) -> Option<u64> {
808        match self.boundary {
809            IntronBoundary::ThreePrime => Some(self.offset.unsigned_abs()),
810            IntronBoundary::FivePrime => {
811                // Distance = intron_length - distance_from_5prime
812                Some(self.intron_length - self.offset.unsigned_abs())
813            }
814        }
815    }
816}
817
818/// Which boundary of an intron a position is relative to
819#[derive(Debug, Clone, Copy, PartialEq, Eq)]
820pub enum IntronBoundary {
821    /// 5' boundary (end of upstream exon) - uses + offset notation
822    FivePrime,
823    /// 3' boundary (start of downstream exon) - uses - offset notation
824    ThreePrime,
825}
826
827/// Type of splice site position
828#[derive(Debug, Clone, Copy, PartialEq, Eq)]
829pub enum SpliceSiteType {
830    /// Canonical splice donor (GT, positions +1 to +2)
831    DonorCanonical,
832    /// Extended splice donor consensus (positions +3 to +6)
833    DonorExtended,
834    /// Splice donor region (positions +7 to +20)
835    DonorRegion,
836    /// Canonical splice acceptor (AG, positions -1 to -2)
837    AcceptorCanonical,
838    /// Extended splice acceptor including polypyrimidine tract (positions -3 to -12)
839    AcceptorExtended,
840    /// Splice acceptor region (positions -13 to -20)
841    AcceptorRegion,
842    /// Near splice site but not in critical region (21-50bp from exon)
843    NearSplice,
844    /// Deep intronic (>50bp from nearest exon)
845    DeepIntronic,
846}
847
848#[cfg(test)]
849mod tests {
850    use super::*;
851
852    fn make_test_transcript() -> Transcript {
853        Transcript {
854            id: "NM_000088.3".to_string(),
855            gene_symbol: Some("COL1A1".to_string()),
856            strand: Strand::Plus,
857            sequence: "ATGCATGCATGCATGCATGC".to_string(), // 20 bases
858            cds_start: Some(5),
859            cds_end: Some(15),
860            exons: vec![
861                Exon::new(1, 1, 7),
862                Exon::new(2, 8, 14),
863                Exon::new(3, 15, 20),
864            ],
865            chromosome: None,
866            genomic_start: None,
867            genomic_end: None,
868            genome_build: GenomeBuild::default(),
869            mane_status: ManeStatus::default(),
870            refseq_match: None,
871            ensembl_match: None,
872            cached_introns: OnceLock::new(),
873        }
874    }
875
876    fn make_test_transcript_with_genomic() -> Transcript {
877        Transcript {
878            id: "NM_000088.4".to_string(),
879            gene_symbol: Some("COL1A1".to_string()),
880            strand: Strand::Plus,
881            sequence: "ATGCATGCATGCATGCATGC".to_string(),
882            cds_start: Some(5),
883            cds_end: Some(15),
884            exons: vec![
885                Exon::with_genomic(1, 1, 7, 50189542, 50189548),
886                Exon::with_genomic(2, 8, 14, 50190100, 50190106),
887                Exon::with_genomic(3, 15, 20, 50190500, 50190505),
888            ],
889            chromosome: Some("chr17".to_string()),
890            genomic_start: Some(50189542),
891            genomic_end: Some(50190505),
892            genome_build: GenomeBuild::GRCh38,
893            mane_status: ManeStatus::Select,
894            refseq_match: None,
895            ensembl_match: Some("ENST00000123456.5".to_string()),
896            cached_introns: OnceLock::new(),
897        }
898    }
899
900    #[test]
901    fn test_transcript_sequence_length() {
902        let tx = make_test_transcript();
903        assert_eq!(tx.sequence_length(), 20);
904    }
905
906    #[test]
907    fn test_transcript_is_coding() {
908        let tx = make_test_transcript();
909        assert!(tx.is_coding());
910
911        let noncoding = Transcript {
912            cds_start: None,
913            cds_end: None,
914            ..tx
915        };
916        assert!(!noncoding.is_coding());
917    }
918
919    #[test]
920    fn test_transcript_cds_length() {
921        let tx = make_test_transcript();
922        assert_eq!(tx.cds_length(), Some(11));
923    }
924
925    #[test]
926    fn test_exon_contains() {
927        let exon = Exon::new(1, 10, 20);
928        assert!(exon.contains(10));
929        assert!(exon.contains(15));
930        assert!(exon.contains(20));
931        assert!(!exon.contains(9));
932        assert!(!exon.contains(21));
933    }
934
935    #[test]
936    fn test_exon_at() {
937        let tx = make_test_transcript();
938        let exon = tx.exon_at(10).unwrap();
939        assert_eq!(exon.number, 2);
940    }
941
942    #[test]
943    fn test_get_sequence() {
944        let tx = make_test_transcript();
945        assert_eq!(tx.get_sequence(0, 3), Some("ATG"));
946    }
947
948    #[test]
949    fn test_strand_display() {
950        assert_eq!(format!("{}", Strand::Plus), "+");
951        assert_eq!(format!("{}", Strand::Minus), "-");
952    }
953
954    #[test]
955    fn test_genome_build_display() {
956        assert_eq!(format!("{}", GenomeBuild::GRCh37), "GRCh37");
957        assert_eq!(format!("{}", GenomeBuild::GRCh38), "GRCh38");
958        assert_eq!(format!("{}", GenomeBuild::Unknown), "Unknown");
959    }
960
961    #[test]
962    fn test_genome_build_default() {
963        assert_eq!(GenomeBuild::default(), GenomeBuild::GRCh38);
964    }
965
966    #[test]
967    fn test_exon_with_genomic() {
968        let exon = Exon::with_genomic(1, 1, 100, 50000, 50099);
969        assert_eq!(exon.number, 1);
970        assert_eq!(exon.start, 1);
971        assert_eq!(exon.end, 100);
972        assert_eq!(exon.genomic_start, Some(50000));
973        assert_eq!(exon.genomic_end, Some(50099));
974    }
975
976    #[test]
977    fn test_transcript_has_genomic_coords() {
978        let tx = make_test_transcript();
979        assert!(!tx.has_genomic_coords());
980
981        let tx_genomic = make_test_transcript_with_genomic();
982        assert!(tx_genomic.has_genomic_coords());
983    }
984
985    #[test]
986    fn test_transcript_genomic_length() {
987        let tx = make_test_transcript();
988        assert_eq!(tx.genomic_length(), None);
989
990        let tx_genomic = make_test_transcript_with_genomic();
991        // 50190505 - 50189542 + 1 = 964
992        assert_eq!(tx_genomic.genomic_length(), Some(964));
993    }
994
995    #[test]
996    fn test_transcript_contains_genomic_pos() {
997        let tx = make_test_transcript();
998        assert!(!tx.contains_genomic_pos(50189542));
999
1000        let tx_genomic = make_test_transcript_with_genomic();
1001        assert!(tx_genomic.contains_genomic_pos(50189542)); // start
1002        assert!(tx_genomic.contains_genomic_pos(50190000)); // middle
1003        assert!(tx_genomic.contains_genomic_pos(50190505)); // end
1004        assert!(!tx_genomic.contains_genomic_pos(50189541)); // before
1005        assert!(!tx_genomic.contains_genomic_pos(50190506)); // after
1006    }
1007
1008    #[test]
1009    fn test_mane_status_default() {
1010        assert_eq!(ManeStatus::default(), ManeStatus::None);
1011    }
1012
1013    #[test]
1014    fn test_mane_status_methods() {
1015        assert!(!ManeStatus::None.is_mane());
1016        assert!(!ManeStatus::None.is_select());
1017        assert!(!ManeStatus::None.is_plus_clinical());
1018
1019        assert!(ManeStatus::Select.is_mane());
1020        assert!(ManeStatus::Select.is_select());
1021        assert!(!ManeStatus::Select.is_plus_clinical());
1022
1023        assert!(ManeStatus::PlusClinical.is_mane());
1024        assert!(!ManeStatus::PlusClinical.is_select());
1025        assert!(ManeStatus::PlusClinical.is_plus_clinical());
1026    }
1027
1028    #[test]
1029    fn test_mane_status_priority() {
1030        assert!(ManeStatus::Select.priority() < ManeStatus::PlusClinical.priority());
1031        assert!(ManeStatus::PlusClinical.priority() < ManeStatus::None.priority());
1032    }
1033
1034    #[test]
1035    fn test_mane_status_display() {
1036        assert_eq!(format!("{}", ManeStatus::None), "");
1037        assert_eq!(format!("{}", ManeStatus::Select), "MANE Select");
1038        assert_eq!(
1039            format!("{}", ManeStatus::PlusClinical),
1040            "MANE Plus Clinical"
1041        );
1042    }
1043
1044    #[test]
1045    fn test_transcript_mane_methods() {
1046        let tx = make_test_transcript();
1047        assert!(!tx.is_mane());
1048        assert!(!tx.is_mane_select());
1049        assert!(!tx.is_mane_plus_clinical());
1050
1051        let tx_mane = make_test_transcript_with_genomic();
1052        assert!(tx_mane.is_mane());
1053        assert!(tx_mane.is_mane_select());
1054        assert!(!tx_mane.is_mane_plus_clinical());
1055    }
1056
1057    #[test]
1058    fn test_transcript_matched_accessions() {
1059        let tx = make_test_transcript_with_genomic();
1060        assert_eq!(tx.refseq_match, None);
1061        assert_eq!(tx.ensembl_match, Some("ENST00000123456.5".to_string()));
1062    }
1063
1064    #[test]
1065    fn test_calculate_introns() {
1066        let tx = make_test_transcript_with_genomic();
1067        let introns = tx.calculate_introns();
1068
1069        // Should have 2 introns for 3 exons
1070        assert_eq!(introns.len(), 2);
1071
1072        // Intron 1 is between exon 1 and exon 2
1073        let intron1 = &introns[0];
1074        assert_eq!(intron1.number, 1);
1075        assert_eq!(intron1.upstream_exon, 1);
1076        assert_eq!(intron1.downstream_exon, 2);
1077        assert_eq!(intron1.tx_5prime_boundary, 7); // end of exon 1
1078        assert_eq!(intron1.tx_3prime_boundary, 8); // start of exon 2
1079
1080        // Intron 2 is between exon 2 and exon 3
1081        let intron2 = &introns[1];
1082        assert_eq!(intron2.number, 2);
1083        assert_eq!(intron2.upstream_exon, 2);
1084        assert_eq!(intron2.downstream_exon, 3);
1085    }
1086
1087    #[test]
1088    fn test_intron_genomic_coords() {
1089        let tx = make_test_transcript_with_genomic();
1090        let introns = tx.calculate_introns();
1091
1092        // Intron 1 genomic: 50189549 to 50190099 (between exon 1 end 50189548 and exon 2 start 50190100)
1093        let intron1 = &introns[0];
1094        assert_eq!(intron1.genomic_start, Some(50189549));
1095        assert_eq!(intron1.genomic_end, Some(50190099));
1096        assert!(intron1.genomic_length().is_some());
1097        assert_eq!(intron1.genomic_length().unwrap(), 551);
1098    }
1099
1100    #[test]
1101    fn test_intron_contains_genomic() {
1102        let tx = make_test_transcript_with_genomic();
1103        let introns = tx.calculate_introns();
1104        let intron1 = &introns[0];
1105
1106        // Position inside intron 1
1107        assert!(intron1.contains_genomic(50189600));
1108        assert!(intron1.contains_genomic(50190000));
1109
1110        // Position outside intron 1
1111        assert!(!intron1.contains_genomic(50189548)); // in exon 1
1112        assert!(!intron1.contains_genomic(50190100)); // in exon 2
1113    }
1114
1115    #[test]
1116    fn test_find_intron_at_genomic() {
1117        let tx = make_test_transcript_with_genomic();
1118
1119        // Position in intron 1 (close to 5' end)
1120        let result = tx.find_intron_at_genomic(50189550);
1121        assert!(result.is_some());
1122        let (intron, pos) = result.unwrap();
1123        assert_eq!(intron.number, 1);
1124        assert_eq!(pos.intron_number, 1);
1125        assert_eq!(pos.boundary, IntronBoundary::FivePrime);
1126        assert_eq!(pos.offset, 2); // 2 bases into intron
1127
1128        // Position in intron 1 (close to 3' end)
1129        let result = tx.find_intron_at_genomic(50190098);
1130        assert!(result.is_some());
1131        let (_, pos) = result.unwrap();
1132        assert_eq!(pos.boundary, IntronBoundary::ThreePrime);
1133        assert!(pos.offset < 0);
1134    }
1135
1136    #[test]
1137    fn test_intron_position_splice_site_type() {
1138        // Test canonical donor (+1, +2)
1139        let pos = IntronPosition {
1140            intron_number: 1,
1141            boundary: IntronBoundary::FivePrime,
1142            offset: 1,
1143            tx_boundary_pos: 100,
1144            intron_length: 500,
1145        };
1146        assert_eq!(pos.splice_site_type(), SpliceSiteType::DonorCanonical);
1147        assert!(pos.is_canonical_splice_site());
1148        assert!(!pos.is_deep_intronic());
1149
1150        // Test canonical acceptor (-1, -2)
1151        let pos = IntronPosition {
1152            intron_number: 1,
1153            boundary: IntronBoundary::ThreePrime,
1154            offset: -2,
1155            tx_boundary_pos: 200,
1156            intron_length: 500,
1157        };
1158        assert_eq!(pos.splice_site_type(), SpliceSiteType::AcceptorCanonical);
1159
1160        // Test deep intronic (>50bp)
1161        let pos = IntronPosition {
1162            intron_number: 1,
1163            boundary: IntronBoundary::FivePrime,
1164            offset: 100,
1165            tx_boundary_pos: 100,
1166            intron_length: 500,
1167        };
1168        assert_eq!(pos.splice_site_type(), SpliceSiteType::DeepIntronic);
1169        assert!(pos.is_deep_intronic());
1170    }
1171
1172    #[test]
1173    fn test_intron_position_distances() {
1174        let pos = IntronPosition {
1175            intron_number: 1,
1176            boundary: IntronBoundary::FivePrime,
1177            offset: 10,
1178            tx_boundary_pos: 100,
1179            intron_length: 500,
1180        };
1181
1182        // Distance from donor should be the offset
1183        assert_eq!(pos.distance_from_donor(), Some(10));
1184        // Distance from acceptor should be intron_length - offset
1185        assert_eq!(pos.distance_from_acceptor(), Some(490));
1186    }
1187
1188    #[test]
1189    fn test_intron_count() {
1190        let tx = make_test_transcript_with_genomic();
1191        assert_eq!(tx.intron_count(), 2);
1192
1193        // Single exon transcript should have 0 introns
1194        let single_exon = Transcript {
1195            id: "NR_TEST.1".to_string(),
1196            gene_symbol: None,
1197            strand: Strand::Plus,
1198            sequence: "A".repeat(100),
1199            cds_start: None,
1200            cds_end: None,
1201            exons: vec![Exon::new(1, 1, 100)],
1202            chromosome: None,
1203            genomic_start: None,
1204            genomic_end: None,
1205            genome_build: GenomeBuild::default(),
1206            mane_status: ManeStatus::default(),
1207            refseq_match: None,
1208            ensembl_match: None,
1209            cached_introns: std::sync::OnceLock::new(),
1210        };
1211        assert_eq!(single_exon.intron_count(), 0);
1212    }
1213}