fn is_good_base(chr: u8) -> bool {
match chr as char {
'a' | 'c' | 'g' | 't' | 'A' | 'C' | 'G' | 'T' => true,
_ => false,
}
}
pub struct Kmers<'a> {
k: u8,
start_pos: usize,
buffer: &'a [u8],
}
impl<'a> Kmers<'a> {
pub fn new(buffer: &'a [u8], k: u8) -> Self {
Kmers {
k,
start_pos: 0,
buffer,
}
}
}
impl<'a> Iterator for Kmers<'a> {
type Item = &'a [u8];
fn next(&mut self) -> Option<Self::Item> {
if self.start_pos + self.k as usize > self.buffer.len() {
return None;
}
let pos = self.start_pos;
self.start_pos += 1;
Some(&self.buffer[pos..pos + self.k as usize])
}
}
pub struct CanonicalKmers<'a> {
k: u8,
start_pos: usize,
buffer: &'a [u8],
rc_buffer: &'a [u8],
}
impl<'a> CanonicalKmers<'a> {
pub fn new(buffer: &'a [u8], rc_buffer: &'a [u8], k: u8) -> Self {
let mut nucl_kmers = CanonicalKmers {
k,
start_pos: 0,
buffer,
rc_buffer,
};
nucl_kmers.update_position(true);
nucl_kmers
}
fn update_position(&mut self, initial: bool) -> bool {
if self.start_pos + self.k as usize > self.buffer.len() {
return false;
}
let (mut kmer_len, stop_len) = if initial {
(0, (self.k - 1) as usize)
} else {
((self.k - 1) as usize, self.k as usize)
};
while kmer_len < stop_len {
if is_good_base(self.buffer[self.start_pos + kmer_len]) {
kmer_len += 1;
} else {
kmer_len = 0;
self.start_pos += kmer_len + 1;
if self.start_pos + self.k as usize > self.buffer.len() {
return false;
}
}
}
true
}
}
impl<'a> Iterator for CanonicalKmers<'a> {
type Item = (usize, &'a [u8], bool);
fn next(&mut self) -> Option<(usize, &'a [u8], bool)> {
if !self.update_position(false) {
return None;
}
let pos = self.start_pos;
self.start_pos += 1;
let result = &self.buffer[pos..pos + self.k as usize];
let rc_buffer = self.rc_buffer;
let rc_result = &rc_buffer[rc_buffer.len() - pos - self.k as usize..rc_buffer.len() - pos];
if result < rc_result {
Some((pos, result, false))
} else {
Some((pos, rc_result, true))
}
}
}
#[cfg(tests)]
mod tests {
use super::*;
#[test]
fn can_kmerize() {
let k_iter = Kmers::new(b"AGCT", 1);
for (i, k) in k_iter.enumerate() {
match i {
0 => assert_eq!(k, &b"A"[..]),
1 => assert_eq!(k, &b"G"[..]),
2 => assert_eq!(k, &b"C"[..]),
3 => assert_eq!(k, &b"T"[..]),
_ => unreachable!("Too many kmers"),
}
}
let k_iter = Kmers::new(b"AGNCT", 2);
for (i, k) in k_iter.enumerate() {
match i {
0 => assert_eq!(k, &b"AC"[..]),
1 => assert_eq!(k, &b"CN"[..]),
2 => assert_eq!(k, &b"NG"[..]),
3 => assert_eq!(k, &b"GT"[..]),
_ => unreachable!("Too many kmers"),
}
}
let k_iter = Kmers::new(b"AC", 2);
for k in k_iter {
assert_eq!(k, &b"AC"[..]);
}
}
#[test]
fn can_canonicalize() {
let seq = b"AGCT";
let c_iter = CanonicalKmers(seq, &seq.reverse_complement(), 1);
for (i, (_, k, is_c)) in c_iter.enumerate() {
match i {
0 => {
assert_eq!(k, &b"A"[..]);
assert_eq!(is_c, false);
}
1 => {
assert_eq!(k, &b"C"[..]);
assert_eq!(is_c, true);
}
2 => {
assert_eq!(k, &b"C"[..]);
assert_eq!(is_c, false);
}
3 => {
assert_eq!(k, &b"A"[..]);
assert_eq!(is_c, true);
}
_ => unreachable!("Too many kmers"),
}
}
let seq = b"AGCTA";
let c_iter = CanonicalKmers(seq, &seq.reverse_complement(), 2);
for (i, (_, k, _)) in c_iter.enumerate() {
match i {
0 => assert_eq!(k, &b"AG"[..]),
1 => assert_eq!(k, &b"GC"[..]),
2 => assert_eq!(k, &b"AG"[..]),
3 => assert_eq!(k, &b"TA"[..]),
_ => unreachable!("Too many kmers"),
}
}
let seq = b"AGNTA";
let c_iter = CanonicalKmers(seq, &seq.reverse_complement(), 2);
for (i, (ix, k, _)) in c_iter.enumerate() {
match i {
0 => {
assert_eq!(ix, 0);
assert_eq!(k, &b"AG"[..]);
}
1 => {
assert_eq!(ix, 3);
assert_eq!(k, &b"TA"[..]);
}
_ => unreachable!("Too many kmers"),
}
}
}
}