mod error;
pub use crate::error::{Error, Result};
pub(crate) const MAXIMUM_K_SIZE: usize = u32::max_value() as usize;
const H_LOOKUP: [u64; 256] = {
let mut lookup = [1; 256];
lookup[b'A' as usize] = 0x3c8b_fbb3_95c6_0474;
lookup[b'C' as usize] = 0x3193_c185_62a0_2b4c;
lookup[b'G' as usize] = 0x2032_3ed0_8257_2324;
lookup[b'T' as usize] = 0x2955_49f5_4be2_4456;
lookup[b'N' as usize] = 0;
lookup
};
const RC_LOOKUP: [u64; 256] = {
let mut lookup = [1; 256];
lookup[b'A' as usize] = 0x2955_49f5_4be2_4456;
lookup[b'C' as usize] = 0x2032_3ed0_8257_2324;
lookup[b'G' as usize] = 0x3193_c185_62a0_2b4c;
lookup[b'T' as usize] = 0x3c8b_fbb3_95c6_0474;
lookup[b'N' as usize] = 0;
lookup
};
#[inline(always)]
fn h(c: u8) -> u64 {
let val = H_LOOKUP[c as usize];
if val == 1 {
panic!("Non-ACGTN nucleotide encountered! {}", c as char)
}
val
}
#[inline(always)]
fn rc(nt: u8) -> u64 {
let val = RC_LOOKUP[nt as usize];
if val == 1 {
panic!("Non-ACGTN nucleotide encountered! {}", nt as char)
}
val
}
pub fn ntf64(s: &[u8], i: usize, k: usize) -> u64 {
let mut out = h(s[i + k - 1]);
for (idx, v) in s.iter().skip(i).take(k - 1).enumerate() {
out ^= h(*v).rotate_left((k - idx - 1) as u32);
}
out
}
pub fn ntr64(s: &[u8], i: usize, k: usize) -> u64 {
let mut out = rc(s[i]);
for (idx, v) in s.iter().skip(i + 1).take(k - 1).enumerate() {
out ^= rc(*v).rotate_left(idx as u32 + 1);
}
out
}
pub fn ntc64(s: &[u8], i: usize, ksize: usize) -> u64 {
u64::min(ntr64(s, i, ksize), ntf64(s, i, ksize))
}
pub fn nthash(seq: &[u8], ksize: usize) -> Vec<u64> {
seq.windows(ksize).map(|x| ntc64(x, 0, ksize)).collect()
}
#[derive(Debug)]
pub struct NtHashIterator<'a> {
seq: &'a [u8],
k: usize,
fh: u64,
rh: u64,
current_idx: usize,
max_idx: usize,
}
impl<'a> NtHashIterator<'a> {
pub fn new(seq: &'a [u8], k: usize) -> Result<NtHashIterator<'a>> {
if k > seq.len() {
return Err(Error::KSizeOutOfRange {
ksize: k,
seq_size: seq.len(),
});
}
if k > MAXIMUM_K_SIZE {
return Err(Error::KSizeTooBig(k));
}
let mut fh = 0;
for (i, v) in seq[0..k].iter().enumerate() {
fh ^= h(*v).rotate_left((k - i - 1) as u32);
}
let mut rh = 0;
for (i, v) in seq[0..k].iter().rev().enumerate() {
rh ^= rc(*v).rotate_left((k - i - 1) as u32);
}
Ok(NtHashIterator {
seq,
k,
fh,
rh,
current_idx: 0,
max_idx: seq.len() - k + 1,
})
}
}
impl<'a> Iterator for NtHashIterator<'a> {
type Item = u64;
fn next(&mut self) -> Option<u64> {
if self.current_idx == self.max_idx {
return None;
};
if self.current_idx != 0 {
let i = self.current_idx - 1;
let seqi = self.seq[i];
let seqk = self.seq[i + self.k];
self.fh = self.fh.rotate_left(1) ^ h(seqi).rotate_left(self.k as u32) ^ h(seqk);
self.rh = self.rh.rotate_right(1)
^ rc(seqi).rotate_right(1)
^ rc(seqk).rotate_left(self.k as u32 - 1);
}
self.current_idx += 1;
Some(u64::min(self.rh, self.fh))
}
fn size_hint(&self) -> (usize, Option<usize>) {
(self.max_idx, Some(self.max_idx))
}
}
impl<'a> ExactSizeIterator for NtHashIterator<'a> {}
#[derive(Debug)]
pub struct NtHashForwardIterator<'a> {
seq: &'a [u8],
k: usize,
fh: u64,
current_idx: usize,
max_idx: usize,
}
impl<'a> NtHashForwardIterator<'a> {
pub fn new(seq: &'a [u8], k: usize) -> Result<NtHashForwardIterator<'a>> {
if k > seq.len() {
return Err(Error::KSizeOutOfRange {
ksize: k,
seq_size: seq.len(),
});
}
if k > MAXIMUM_K_SIZE {
return Err(Error::KSizeTooBig(k));
}
let mut fh = 0;
for (i, v) in seq[0..k].iter().enumerate() {
fh ^= h(*v).rotate_left((k - i - 1) as u32);
}
Ok(NtHashForwardIterator {
seq,
k,
fh,
current_idx: 0,
max_idx: seq.len() - k + 1,
})
}
}
impl<'a> Iterator for NtHashForwardIterator<'a> {
type Item = u64;
fn next(&mut self) -> Option<u64> {
if self.current_idx == self.max_idx {
return None;
};
if self.current_idx != 0 {
let i = self.current_idx - 1;
let seqi = self.seq[i];
let seqk = self.seq[i + self.k];
self.fh = self.fh.rotate_left(1) ^ h(seqi).rotate_left(self.k as u32) ^ h(seqk);
}
self.current_idx += 1;
Some(self.fh)
}
fn size_hint(&self) -> (usize, Option<usize>) {
(self.max_idx, Some(self.max_idx))
}
}
impl<'a> ExactSizeIterator for NtHashForwardIterator<'a> {}