1#[derive(Debug, Clone, PartialEq, Eq)]
10pub struct OrfResult {
11 pub start: usize,
13 pub end: usize,
15 pub frame: usize,
17 pub strand: Strand,
19 pub sequence: Vec<u8>,
21}
22
23#[derive(Debug, Clone, Copy, PartialEq, Eq)]
25pub enum Strand {
26 Forward,
27 Reverse,
28}
29
30const DEFAULT_STARTS: &[&[u8]] = &[b"ATG"];
31const DEFAULT_STOPS: &[&[u8]] = &[b"TAA", b"TAG", b"TGA"];
32
33fn reverse_complement(seq: &[u8]) -> Vec<u8> {
38 seq.iter()
39 .rev()
40 .map(|&b| match b {
41 b'A' => b'T',
42 b'T' => b'A',
43 b'C' => b'G',
44 b'G' => b'C',
45 other => other,
46 })
47 .collect()
48}
49
50fn matches_codon(codon: &[u8], codons: &[&[u8]]) -> bool {
52 codons.iter().any(|c| c == &codon)
53}
54
55pub fn find_orfs_with_codons(
61 seq: &[u8],
62 min_length: usize,
63 start_codons: &[&[u8]],
64 stop_codons: &[&[u8]],
65) -> Vec<OrfResult> {
66 let mut results = Vec::new();
67 let len = seq.len();
68
69 for frame in 0..3 {
70 let mut pos = frame;
71 let mut orf_start: Option<usize> = None;
72
73 while pos + 3 <= len {
74 let codon = &seq[pos..pos + 3];
75
76 if orf_start.is_none() && matches_codon(codon, start_codons) {
77 orf_start = Some(pos);
78 } else if orf_start.is_some() && matches_codon(codon, stop_codons) {
79 let start = orf_start.unwrap();
80 let end = pos + 3;
81 if end - start >= min_length {
82 results.push(OrfResult {
83 start,
84 end,
85 frame,
86 strand: Strand::Forward,
87 sequence: seq[start..end].to_vec(),
88 });
89 }
90 orf_start = None;
91 }
92
93 pos += 3;
94 }
95
96 if let Some(start) = orf_start {
98 let end = len;
99 if end - start >= min_length {
100 results.push(OrfResult {
101 start,
102 end,
103 frame,
104 strand: Strand::Forward,
105 sequence: seq[start..end].to_vec(),
106 });
107 }
108 }
109 }
110
111 results
112}
113
114pub fn find_orfs(seq: &[u8], min_length: usize) -> Vec<OrfResult> {
119 find_orfs_with_codons(seq, min_length, DEFAULT_STARTS, DEFAULT_STOPS)
120}
121
122pub fn find_orfs_both_strands(seq: &[u8], min_length: usize) -> Vec<OrfResult> {
128 let mut results = find_orfs(seq, min_length);
129
130 let rc = reverse_complement(seq);
131 let rc_orfs = find_orfs(&rc, min_length);
132 let len = seq.len();
133
134 for mut orf in rc_orfs {
135 let orig_start = len - orf.end;
138 let orig_end = len - orf.start;
139 orf.start = orig_start;
140 orf.end = orig_end;
141 orf.strand = Strand::Reverse;
142 results.push(orf);
144 }
145
146 results
147}
148
149#[cfg(test)]
150mod tests {
151 use super::*;
152
153 #[test]
154 fn known_orf() {
155 let orfs = find_orfs(b"ATGAAATAA", 1);
157 assert_eq!(orfs.len(), 1);
158 assert_eq!(orfs[0].start, 0);
159 assert_eq!(orfs[0].end, 9);
160 assert_eq!(orfs[0].frame, 0);
161 assert_eq!(orfs[0].strand, Strand::Forward);
162 assert_eq!(orfs[0].sequence, b"ATGAAATAA");
163 }
164
165 #[test]
166 fn multiple_orfs_different_frames() {
167 let seq = b"AATGAAATGAX";
170 let orfs = find_orfs(seq, 1);
171 assert!(orfs.len() >= 2);
176 let frame1: Vec<_> = orfs.iter().filter(|o| o.frame == 1).collect();
177 assert_eq!(frame1.len(), 1);
178 assert_eq!(frame1[0].start, 1);
179 assert_eq!(frame1[0].end, 10);
180 }
181
182 #[test]
183 fn no_orfs_no_atg() {
184 let orfs = find_orfs(b"CCCGGGTTTAAA", 1);
185 assert!(orfs.is_empty());
186 }
187
188 #[test]
189 fn orf_extending_to_end() {
190 let orfs = find_orfs(b"ATGAAACCC", 1);
192 assert_eq!(orfs.len(), 1);
193 assert_eq!(orfs[0].start, 0);
194 assert_eq!(orfs[0].end, 9);
195 assert_eq!(orfs[0].sequence, b"ATGAAACCC");
196 }
197
198 #[test]
199 fn overlapping_orfs_same_frame() {
200 let seq = b"ATGAAATAAATGCCCTAG";
202 let orfs: Vec<_> = find_orfs(seq, 1)
203 .into_iter()
204 .filter(|o| o.frame == 0)
205 .collect();
206 assert_eq!(orfs.len(), 2);
207 assert_eq!(orfs[0].start, 0);
208 assert_eq!(orfs[0].end, 9);
209 assert_eq!(orfs[1].start, 9);
210 assert_eq!(orfs[1].end, 18);
211 }
212
213 #[test]
214 fn min_length_filtering() {
215 let orfs = find_orfs(b"ATGAAATAA", 10);
217 assert!(orfs.is_empty());
218
219 let orfs = find_orfs(b"ATGAAATAA", 9);
220 assert_eq!(orfs.len(), 1);
221 }
222
223 #[test]
224 fn both_strands() {
225 let seq = b"ATGCCCTAATTAGGGCAT";
233 let orfs = find_orfs_both_strands(seq, 1);
234 let fwd: Vec<_> = orfs.iter().filter(|o| o.strand == Strand::Forward).collect();
235 let rev: Vec<_> = orfs.iter().filter(|o| o.strand == Strand::Reverse).collect();
236 assert!(!fwd.is_empty(), "expected forward ORFs");
237 assert!(!rev.is_empty(), "expected reverse ORFs");
238 }
239
240 #[test]
241 fn reverse_strand_coordinates() {
242 let seq = b"TTAGGGCAT";
246 let orfs = find_orfs_both_strands(seq, 1);
247 let rev: Vec<_> = orfs.iter().filter(|o| o.strand == Strand::Reverse).collect();
248 assert_eq!(rev.len(), 1);
249 assert_eq!(rev[0].start, 0);
250 assert_eq!(rev[0].end, 9);
251 assert_eq!(rev[0].sequence, b"ATGCCCTAA");
252 }
253
254 #[test]
255 fn configurable_start_stop_codons() {
256 let seq = b"GTGAAATAA";
258 let orfs = find_orfs(seq, 1);
259 assert!(orfs.is_empty(), "standard ATG-only should miss GTG start");
260
261 let orfs = find_orfs_with_codons(seq, 1, &[b"ATG", b"GTG"], DEFAULT_STOPS);
262 assert_eq!(orfs.len(), 1);
263 assert_eq!(orfs[0].start, 0);
264 assert_eq!(orfs[0].end, 9);
265 }
266
267 #[test]
268 fn empty_sequence() {
269 let orfs = find_orfs(b"", 1);
270 assert!(orfs.is_empty());
271 }
272
273 #[test]
274 fn sequence_shorter_than_codon() {
275 let orfs = find_orfs(b"AT", 1);
276 assert!(orfs.is_empty());
277
278 let orfs = find_orfs(b"A", 1);
279 assert!(orfs.is_empty());
280 }
281
282 #[test]
283 fn reverse_complement_basic() {
284 assert_eq!(reverse_complement(b"ATGC"), b"GCAT");
285 assert_eq!(reverse_complement(b"AAAA"), b"TTTT");
286 assert_eq!(reverse_complement(b""), b"");
287 }
288}