bio/seq_analysis/
orf.rs

1// Copyright 2014-2016 Johannes Köster, Martin Larralde.
2// Licensed under the MIT license (http://opensource.org/licenses/MIT)
3// This file may not be copied, modified, or distributed
4// except according to those terms.
5
6//! One-way open reading frame (ORF) finder algorithm.
7//!
8//! Complexity: O(n).
9//!
10//! # Example
11//!
12//! ```
13//! use bio::seq_analysis::orf::{Finder, Orf};
14//! let start_codons = vec![b"ATG"];
15//! let stop_codons = vec![b"TGA", b"TAG", b"TAA"];
16//! let min_len = 50;
17//! let finder = Finder::new(start_codons, stop_codons, min_len);
18//!
19//! let sequence = b"ACGGCTAGAAAAGGCTAGAAAA";
20//!
21//! for Orf { start, end, offset } in finder.find_all(sequence) {
22//!     let orf = &sequence[start..end];
23//!     //...do something with orf sequence...
24//! }
25//! ```
26//!
27//! Right now the only way to check the reverse strand for ORF is to use
28//! the `alphabet::dna::RevComp` struct and to check for both sequences.
29//! But that's not so performance friendly, as the reverse complementation and the ORF research
30//! could go on at the same time.
31
32use std::borrow::Borrow;
33use std::collections::VecDeque;
34use std::iter;
35
36/// An implementation of a naive algorithm finder
37// Implementation note:
38//
39// VecDeque is used rather than the obvious [u8; 3] to represent
40// codons because a VecDeque<u8> is used to represent a sliding codon
41// (see: State.codon) window which unfortunately, cannot be compared
42// to [u8; 3].
43#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)]
44pub struct Finder {
45    start_codons: Vec<VecDeque<u8>>,
46    stop_codons: Vec<VecDeque<u8>>,
47    min_len: usize,
48}
49
50impl Finder {
51    /// Create a new instance of a finder for the given start and stop codons and the minimum
52    /// length of an ORF.
53    pub fn new<'a>(
54        start_codons: Vec<&'a [u8; 3]>,
55        stop_codons: Vec<&'a [u8; 3]>,
56        min_len: usize,
57    ) -> Self {
58        Finder {
59            start_codons: start_codons
60                .iter()
61                .map(|x| x.iter().copied().collect::<VecDeque<u8>>())
62                .collect(),
63            stop_codons: stop_codons
64                .iter()
65                .map(|x| x.iter().copied().collect::<VecDeque<u8>>())
66                .collect(),
67            min_len,
68        }
69    }
70
71    /// Find all ORFs in the given sequence
72    pub fn find_all<C, T>(&self, seq: T) -> Matches<'_, C, T::IntoIter>
73    where
74        C: Borrow<u8>,
75        T: IntoIterator<Item = C>,
76    {
77        Matches {
78            finder: self,
79            state: State::new(),
80            seq: seq.into_iter().enumerate(),
81        }
82    }
83}
84
85/// An ORF representation with start and end position of said ORF,
86/// as well as offset of the reading frame (1,2,3) and strand location
87// (current: +, reverse complementary: -).
88#[derive(
89    Default, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize,
90)]
91pub struct Orf {
92    pub start: usize,
93    pub end: usize,
94    pub offset: i8,
95}
96
97/// The current algorithm state.
98#[derive(Clone, Debug)]
99struct State {
100    start_pos: [Vec<usize>; 3],
101    codon: VecDeque<u8>,
102    found: VecDeque<Orf>,
103}
104
105impl State {
106    /// Create new state.
107    pub fn new() -> Self {
108        State {
109            start_pos: [Vec::new(), Vec::new(), Vec::new()],
110            codon: VecDeque::new(),
111            found: VecDeque::new(),
112        }
113    }
114}
115
116/// Iterator over offset, start position, end position and sequence of matched ORFs.
117#[derive(Clone, Debug)]
118pub struct Matches<'a, C, T>
119where
120    C: Borrow<u8>,
121    T: Iterator<Item = C>,
122{
123    finder: &'a Finder,
124    state: State,
125    seq: iter::Enumerate<T>,
126}
127
128impl<'a, C, T> Iterator for Matches<'a, C, T>
129where
130    C: Borrow<u8>,
131    T: Iterator<Item = C>,
132{
133    type Item = Orf;
134
135    fn next(&mut self) -> Option<Orf> {
136        let mut offset: usize;
137
138        // return any orfs already found
139        if !self.state.found.is_empty() {
140            return self.state.found.pop_front();
141        }
142
143        for (index, nuc) in self.seq.by_ref() {
144            // update the codon
145            if self.state.codon.len() >= 3 {
146                self.state.codon.pop_front();
147            }
148            self.state.codon.push_back(*nuc.borrow());
149            offset = (index + 1) % 3;
150
151            // check if entering orf
152            if self.finder.start_codons.contains(&self.state.codon) {
153                self.state.start_pos[offset].push(index);
154            }
155            // inside orf
156            if !self.state.start_pos[offset].is_empty() {
157                // check if leaving orf
158                if self.finder.stop_codons.contains(&self.state.codon) {
159                    for start_pos in &self.state.start_pos[offset] {
160                        // check if length is sufficient
161                        if index + 1 - start_pos > self.finder.min_len {
162                            // build results
163                            self.state.found.push_back(Orf {
164                                start: start_pos - 2,
165                                end: index + 1,
166                                offset: offset as i8,
167                            });
168                        // if the first orf is too short, so are the others
169                        } else {
170                            break;
171                        }
172                    }
173                    // reinitialize
174                    self.state.start_pos[offset] = Vec::new();
175                }
176            }
177            if !self.state.found.is_empty() {
178                return self.state.found.pop_front();
179            }
180        }
181        None
182    }
183}
184
185#[cfg(test)]
186mod tests {
187    use super::*;
188
189    fn basic_finder() -> Finder {
190        let start_codons = vec![b"ATG"];
191        let stop_codons = vec![b"TGA", b"TAG", b"TAA"];
192        let min_len = 5;
193        Finder::new(start_codons, stop_codons, min_len)
194    }
195
196    #[test]
197    fn test_no_orf() {
198        let finder = basic_finder();
199        let sequence = b"ACGGCTAGAAAAGGCTAGAAAA";
200        assert!(finder.find_all(sequence).next().is_none());
201    }
202
203    #[test]
204    fn test_one_orf_no_offset() {
205        let finder = basic_finder();
206        let sequence = b"GGGATGGGGTGAGGG";
207        let expected = vec![Orf {
208            start: 3,
209            end: 12,
210            offset: 0,
211        }];
212        assert_eq!(expected, finder.find_all(sequence).collect::<Vec<Orf>>());
213    }
214
215    #[test]
216    fn test_one_orf_with_offset() {
217        let finder = basic_finder();
218        let sequence = b"AGGGATGGGGTGAGGG";
219        let expected = vec![Orf {
220            start: 4,
221            end: 13,
222            offset: 1,
223        }];
224        assert_eq!(expected, finder.find_all(sequence).collect::<Vec<Orf>>());
225    }
226
227    #[test]
228    fn test_two_orfs_different_offsets() {
229        let finder = basic_finder();
230        let sequence = b"ATGGGGTGAGGGGGATGGAAAAATAAG";
231        let expected = vec![
232            Orf {
233                start: 0,
234                end: 9,
235                offset: 0,
236            },
237            Orf {
238                start: 14,
239                end: 26,
240                offset: 2,
241            },
242        ];
243        assert_eq!(expected, finder.find_all(sequence).collect::<Vec<Orf>>());
244    }
245
246    #[test]
247    fn test_three_nested_and_offset_orfs() {
248        let finder = basic_finder();
249        let sequence = b"ATGGGGATGGGGGGATGGAAAAATAAGTAG";
250        let expected = vec![
251            Orf {
252                start: 14,
253                end: 26,
254                offset: 2,
255            },
256            Orf {
257                start: 0,
258                end: 30,
259                offset: 0,
260            },
261            Orf {
262                start: 6,
263                end: 30,
264                offset: 0,
265            },
266        ];
267        assert_eq!(expected, finder.find_all(sequence).collect::<Vec<Orf>>());
268    }
269}