use std::borrow::Borrow;
use std::collections::VecDeque;
use std::iter;
#[derive(Default, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)]
pub struct Finder {
start_codons: Vec<VecDeque<u8>>,
stop_codons: Vec<VecDeque<u8>>,
min_len: usize,
}
impl Finder {
pub fn new<'a>(
start_codons: Vec<&'a [u8; 3]>,
stop_codons: Vec<&'a [u8; 3]>,
min_len: usize,
) -> Self {
Finder {
start_codons: start_codons
.iter()
.map(|x| x.iter().copied().collect::<VecDeque<u8>>())
.collect(),
stop_codons: stop_codons
.iter()
.map(|x| x.iter().copied().collect::<VecDeque<u8>>())
.collect(),
min_len,
}
}
pub fn find_all<C, T>(&self, seq: T) -> Matches<'_, C, T::IntoIter>
where
C: Borrow<u8>,
T: IntoIterator<Item = C>,
{
Matches {
finder: self,
state: State::new(),
seq: seq.into_iter().enumerate(),
}
}
}
#[derive(
Default, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize,
)]
pub struct Orf {
pub start: usize,
pub end: usize,
pub offset: i8,
}
#[derive(Clone, Debug)]
struct State {
start_pos: [Vec<usize>; 3],
codon: VecDeque<u8>,
found: VecDeque<Orf>,
}
impl State {
pub fn new() -> Self {
State {
start_pos: [Vec::new(), Vec::new(), Vec::new()],
codon: VecDeque::new(),
found: VecDeque::new(),
}
}
}
#[derive(Clone, Debug)]
pub struct Matches<'a, C, T>
where
C: Borrow<u8>,
T: Iterator<Item = C>,
{
finder: &'a Finder,
state: State,
seq: iter::Enumerate<T>,
}
impl<'a, C, T> Iterator for Matches<'a, C, T>
where
C: Borrow<u8>,
T: Iterator<Item = C>,
{
type Item = Orf;
fn next(&mut self) -> Option<Orf> {
let mut offset: usize;
if !self.state.found.is_empty() {
return self.state.found.pop_front();
}
for (index, nuc) in self.seq.by_ref() {
if self.state.codon.len() >= 3 {
self.state.codon.pop_front();
}
self.state.codon.push_back(*nuc.borrow());
offset = (index + 1) % 3;
if self.finder.start_codons.contains(&self.state.codon) {
self.state.start_pos[offset].push(index);
}
if !self.state.start_pos[offset].is_empty() {
if self.finder.stop_codons.contains(&self.state.codon) {
for start_pos in &self.state.start_pos[offset] {
if index + 1 - start_pos > self.finder.min_len {
self.state.found.push_back(Orf {
start: start_pos - 2,
end: index + 1,
offset: offset as i8,
});
} else {
break;
}
}
self.state.start_pos[offset] = Vec::new();
}
}
if !self.state.found.is_empty() {
return self.state.found.pop_front();
}
}
None
}
}
#[cfg(test)]
mod tests {
use super::*;
fn basic_finder() -> Finder {
let start_codons = vec![b"ATG"];
let stop_codons = vec![b"TGA", b"TAG", b"TAA"];
let min_len = 5;
Finder::new(start_codons, stop_codons, min_len)
}
#[test]
fn test_no_orf() {
let finder = basic_finder();
let sequence = b"ACGGCTAGAAAAGGCTAGAAAA";
assert!(finder.find_all(sequence).next().is_none());
}
#[test]
fn test_one_orf_no_offset() {
let finder = basic_finder();
let sequence = b"GGGATGGGGTGAGGG";
let expected = vec![Orf {
start: 3,
end: 12,
offset: 0,
}];
assert_eq!(expected, finder.find_all(sequence).collect::<Vec<Orf>>());
}
#[test]
fn test_one_orf_with_offset() {
let finder = basic_finder();
let sequence = b"AGGGATGGGGTGAGGG";
let expected = vec![Orf {
start: 4,
end: 13,
offset: 1,
}];
assert_eq!(expected, finder.find_all(sequence).collect::<Vec<Orf>>());
}
#[test]
fn test_two_orfs_different_offsets() {
let finder = basic_finder();
let sequence = b"ATGGGGTGAGGGGGATGGAAAAATAAG";
let expected = vec![
Orf {
start: 0,
end: 9,
offset: 0,
},
Orf {
start: 14,
end: 26,
offset: 2,
},
];
assert_eq!(expected, finder.find_all(sequence).collect::<Vec<Orf>>());
}
#[test]
fn test_three_nested_and_offset_orfs() {
let finder = basic_finder();
let sequence = b"ATGGGGATGGGGGGATGGAAAAATAAGTAG";
let expected = vec![
Orf {
start: 14,
end: 26,
offset: 2,
},
Orf {
start: 0,
end: 30,
offset: 0,
},
Orf {
start: 6,
end: 30,
offset: 0,
},
];
assert_eq!(expected, finder.find_all(sequence).collect::<Vec<Orf>>());
}
}