Skip to main content

cyanea_seq/
orf.rs

1//! Open Reading Frame (ORF) finder.
2//!
3//! Scans nucleotide sequences for ORFs across all six reading frames
4//! (three forward, three reverse complement). An ORF begins at a start
5//! codon (ATG by default) and extends to the next in-frame stop codon
6//! (TAA, TAG, TGA) or the end of the sequence.
7
8/// Result of an ORF search.
9#[derive(Debug, Clone, PartialEq, Eq)]
10pub struct OrfResult {
11    /// Start position in the input sequence (0-indexed).
12    pub start: usize,
13    /// End position (exclusive) in the input sequence.
14    pub end: usize,
15    /// Reading frame (0, 1, or 2).
16    pub frame: usize,
17    /// Strand: `Forward` or `Reverse`.
18    pub strand: Strand,
19    /// The nucleotide sequence of the ORF.
20    pub sequence: Vec<u8>,
21}
22
23/// Strand orientation.
24#[derive(Debug, Clone, Copy, PartialEq, Eq)]
25pub enum Strand {
26    Forward,
27    Reverse,
28}
29
30const DEFAULT_STARTS: &[&[u8]] = &[b"ATG"];
31const DEFAULT_STOPS: &[&[u8]] = &[b"TAA", b"TAG", b"TGA"];
32
33/// Compute the reverse complement of a DNA sequence.
34///
35/// Complements each base (A<->T, C<->G) and reverses the result.
36/// Non-ACGT characters are left unchanged.
37fn reverse_complement(seq: &[u8]) -> Vec<u8> {
38    seq.iter()
39        .rev()
40        .map(|&b| match b {
41            b'A' => b'T',
42            b'T' => b'A',
43            b'C' => b'G',
44            b'G' => b'C',
45            other => other,
46        })
47        .collect()
48}
49
50/// Check whether `codon` matches any entry in `codons`.
51fn matches_codon(codon: &[u8], codons: &[&[u8]]) -> bool {
52    codons.iter().any(|c| c == &codon)
53}
54
55/// Find ORFs on the forward strand using configurable start/stop codons.
56///
57/// An ORF starts at a codon matching `start_codons` and extends to the
58/// next in-frame codon matching `stop_codons` (inclusive) or end of
59/// sequence. Only ORFs with nucleotide length >= `min_length` are returned.
60pub fn find_orfs_with_codons(
61    seq: &[u8],
62    min_length: usize,
63    start_codons: &[&[u8]],
64    stop_codons: &[&[u8]],
65) -> Vec<OrfResult> {
66    let mut results = Vec::new();
67    let len = seq.len();
68
69    for frame in 0..3 {
70        let mut pos = frame;
71        let mut orf_start: Option<usize> = None;
72
73        while pos + 3 <= len {
74            let codon = &seq[pos..pos + 3];
75
76            if orf_start.is_none() && matches_codon(codon, start_codons) {
77                orf_start = Some(pos);
78            } else if orf_start.is_some() && matches_codon(codon, stop_codons) {
79                let start = orf_start.unwrap();
80                let end = pos + 3;
81                if end - start >= min_length {
82                    results.push(OrfResult {
83                        start,
84                        end,
85                        frame,
86                        strand: Strand::Forward,
87                        sequence: seq[start..end].to_vec(),
88                    });
89                }
90                orf_start = None;
91            }
92
93            pos += 3;
94        }
95
96        // ORF extending to end of sequence (no stop codon found)
97        if let Some(start) = orf_start {
98            let end = len;
99            if end - start >= min_length {
100                results.push(OrfResult {
101                    start,
102                    end,
103                    frame,
104                    strand: Strand::Forward,
105                    sequence: seq[start..end].to_vec(),
106                });
107            }
108        }
109    }
110
111    results
112}
113
114/// Find ORFs in all three forward reading frames.
115///
116/// Uses the standard start codon (ATG) and stop codons (TAA, TAG, TGA).
117/// `min_length` is in nucleotides. Input should be uppercase DNA.
118pub fn find_orfs(seq: &[u8], min_length: usize) -> Vec<OrfResult> {
119    find_orfs_with_codons(seq, min_length, DEFAULT_STARTS, DEFAULT_STOPS)
120}
121
122/// Find ORFs in all six reading frames (forward + reverse complement).
123///
124/// For the reverse strand, the reverse complement is computed internally.
125/// Coordinates in `OrfResult` for reverse-strand ORFs refer to positions
126/// on the original (input) sequence.
127pub fn find_orfs_both_strands(seq: &[u8], min_length: usize) -> Vec<OrfResult> {
128    let mut results = find_orfs(seq, min_length);
129
130    let rc = reverse_complement(seq);
131    let rc_orfs = find_orfs(&rc, min_length);
132    let len = seq.len();
133
134    for mut orf in rc_orfs {
135        // Map reverse-complement positions back to the original sequence.
136        // Position `p` on the RC corresponds to `len - p` on the original.
137        let orig_start = len - orf.end;
138        let orig_end = len - orf.start;
139        orf.start = orig_start;
140        orf.end = orig_end;
141        orf.strand = Strand::Reverse;
142        // sequence stays as the ORF nucleotides on the RC strand
143        results.push(orf);
144    }
145
146    results
147}
148
149#[cfg(test)]
150mod tests {
151    use super::*;
152
153    #[test]
154    fn known_orf() {
155        // ATGAAATAA: ATG (start) + AAA + TAA (stop) = 9 nt
156        let orfs = find_orfs(b"ATGAAATAA", 1);
157        assert_eq!(orfs.len(), 1);
158        assert_eq!(orfs[0].start, 0);
159        assert_eq!(orfs[0].end, 9);
160        assert_eq!(orfs[0].frame, 0);
161        assert_eq!(orfs[0].strand, Strand::Forward);
162        assert_eq!(orfs[0].sequence, b"ATGAAATAA");
163    }
164
165    #[test]
166    fn multiple_orfs_different_frames() {
167        // Frame 0: ATG CCC TAA (pos 0..9)
168        // Frame 1: xATG AAA TGA x (pos 1..10)
169        let seq = b"AATGAAATGAX";
170        let orfs = find_orfs(seq, 1);
171        // Frame 0 starting at 0: no ATG at pos 0 (AAT), pos 3 (GAA), pos 6 (ATG) -> ATG at 6, then no stop -> extends to end
172        // Frame 1 starting at 1: pos 1 (ATG) -> start, pos 4 (AAA), pos 7 (TGA) -> stop => ORF 1..10
173        // Frame 2 starting at 2: pos 2 (TGA) no start, pos 5 (AAT), pos 8 (GAX) no start
174        // Also check frame 0: pos 0=AAT, pos 3=GAA, pos 6=ATG -> start at 6, no stop -> ORF 6..11
175        assert!(orfs.len() >= 2);
176        let frame1: Vec<_> = orfs.iter().filter(|o| o.frame == 1).collect();
177        assert_eq!(frame1.len(), 1);
178        assert_eq!(frame1[0].start, 1);
179        assert_eq!(frame1[0].end, 10);
180    }
181
182    #[test]
183    fn no_orfs_no_atg() {
184        let orfs = find_orfs(b"CCCGGGTTTAAA", 1);
185        assert!(orfs.is_empty());
186    }
187
188    #[test]
189    fn orf_extending_to_end() {
190        // ATG followed by non-stop codons, no stop before end
191        let orfs = find_orfs(b"ATGAAACCC", 1);
192        assert_eq!(orfs.len(), 1);
193        assert_eq!(orfs[0].start, 0);
194        assert_eq!(orfs[0].end, 9);
195        assert_eq!(orfs[0].sequence, b"ATGAAACCC");
196    }
197
198    #[test]
199    fn overlapping_orfs_same_frame() {
200        // Two ORFs in frame 0: ATG AAA TAA ATG CCC TAG
201        let seq = b"ATGAAATAAATGCCCTAG";
202        let orfs: Vec<_> = find_orfs(seq, 1)
203            .into_iter()
204            .filter(|o| o.frame == 0)
205            .collect();
206        assert_eq!(orfs.len(), 2);
207        assert_eq!(orfs[0].start, 0);
208        assert_eq!(orfs[0].end, 9);
209        assert_eq!(orfs[1].start, 9);
210        assert_eq!(orfs[1].end, 18);
211    }
212
213    #[test]
214    fn min_length_filtering() {
215        // ORF is 9 nt (ATG AAA TAA)
216        let orfs = find_orfs(b"ATGAAATAA", 10);
217        assert!(orfs.is_empty());
218
219        let orfs = find_orfs(b"ATGAAATAA", 9);
220        assert_eq!(orfs.len(), 1);
221    }
222
223    #[test]
224    fn both_strands() {
225        // Forward: ATG AAA TAA (ORF at 0..9)
226        // Reverse complement of ATGAAATAA is TTATTTTCAT — no ATG on RC
227        // Use a sequence that has ORFs on both strands.
228        // Forward: ...ATG CCC TAA...
229        // If we also want a reverse-strand ORF, the RC must contain ATG...stop.
230        // RC of TTA GGG CAT = ATG CCC TAA → so the original is TTA GGG CAT
231        // Combined: ATG CCC TAA TTA GGG CAT
232        let seq = b"ATGCCCTAATTAGGGCAT";
233        let orfs = find_orfs_both_strands(seq, 1);
234        let fwd: Vec<_> = orfs.iter().filter(|o| o.strand == Strand::Forward).collect();
235        let rev: Vec<_> = orfs.iter().filter(|o| o.strand == Strand::Reverse).collect();
236        assert!(!fwd.is_empty(), "expected forward ORFs");
237        assert!(!rev.is_empty(), "expected reverse ORFs");
238    }
239
240    #[test]
241    fn reverse_strand_coordinates() {
242        // Sequence: TTAGGGCAT (len 9)
243        // RC:       ATGCCCTAA — ORF at RC positions 0..9
244        // Mapped back: start = 9-9 = 0, end = 9-0 = 9
245        let seq = b"TTAGGGCAT";
246        let orfs = find_orfs_both_strands(seq, 1);
247        let rev: Vec<_> = orfs.iter().filter(|o| o.strand == Strand::Reverse).collect();
248        assert_eq!(rev.len(), 1);
249        assert_eq!(rev[0].start, 0);
250        assert_eq!(rev[0].end, 9);
251        assert_eq!(rev[0].sequence, b"ATGCCCTAA");
252    }
253
254    #[test]
255    fn configurable_start_stop_codons() {
256        // Use GTG as an alternative start codon
257        let seq = b"GTGAAATAA";
258        let orfs = find_orfs(seq, 1);
259        assert!(orfs.is_empty(), "standard ATG-only should miss GTG start");
260
261        let orfs = find_orfs_with_codons(seq, 1, &[b"ATG", b"GTG"], DEFAULT_STOPS);
262        assert_eq!(orfs.len(), 1);
263        assert_eq!(orfs[0].start, 0);
264        assert_eq!(orfs[0].end, 9);
265    }
266
267    #[test]
268    fn empty_sequence() {
269        let orfs = find_orfs(b"", 1);
270        assert!(orfs.is_empty());
271    }
272
273    #[test]
274    fn sequence_shorter_than_codon() {
275        let orfs = find_orfs(b"AT", 1);
276        assert!(orfs.is_empty());
277
278        let orfs = find_orfs(b"A", 1);
279        assert!(orfs.is_empty());
280    }
281
282    #[test]
283    fn reverse_complement_basic() {
284        assert_eq!(reverse_complement(b"ATGC"), b"GCAT");
285        assert_eq!(reverse_complement(b"AAAA"), b"TTTT");
286        assert_eq!(reverse_complement(b""), b"");
287    }
288}