#![allow(clippy::unreadable_literal)]
pub mod errors;
use std::collections::{HashMap, VecDeque};
use std::iter::Peekable;
use std::str;
use bbhash::MPHF;
use failure::{Error, SyncFailure};
use lazy_static::lazy_static;
use nthash::{ntf64, NtHashForwardIterator};
use crate::errors::UKHSError;
lazy_static! {
static ref UKHS_HASHES: HashMap<(usize, usize), &'static str> = {
let mut map = HashMap::new();
map.insert((7, 20), include_str!("../data/res_7_20_4_0.txt"));
map.insert((9, 20), include_str!("../data/res_9_20_4_0.txt"));
map.insert((9, 30), include_str!("../data/res_9_30_4_0.txt"));
map
};
}
pub struct UKHS {
k: usize,
w: usize,
mphf: MPHF,
revmap: Vec<u64>,
kmers: Vec<String>,
kmers_hashes: Vec<u64>,
}
impl<'a> UKHS {
pub fn new(k: usize, w: usize) -> Result<UKHS, Error> {
if k > w {
return Err(UKHSError::KSizeOutOfWRange { ksize: k, wsize: w }.into());
}
let w_round = (w / 10) * 10;
let entries = UKHS_HASHES[&(k, w_round)];
let mut kmers: Vec<String> = entries
.split('\n')
.filter_map(|s| {
if s.len() == k {
Some(String::from(s))
} else {
None
}
})
.collect();
kmers.sort_unstable();
let kmers_hashes: Vec<u64> = kmers.iter().map(|h| ntf64(h.as_bytes(), 0, k)).collect();
let mphf = MPHF::new(kmers_hashes.clone(), 1, 1.0);
let mut revmap = vec![0; kmers_hashes.len()];
for hash in &kmers_hashes {
revmap[mphf.lookup(*hash).unwrap() as usize] = *hash;
}
Ok(UKHS {
k,
w,
mphf,
revmap,
kmers,
kmers_hashes,
})
}
pub fn len(&self) -> usize {
self.kmers_hashes.len()
}
pub fn k(&self) -> usize {
self.k
}
pub fn w(&self) -> usize {
self.w
}
pub fn query_bucket(&self, hash: u64) -> Option<usize> {
if let Some(pos) = self.mphf.lookup(hash) {
if self.revmap[pos as usize] == hash {
return Some(pos as usize);
}
}
None
}
pub fn iter_sequence(&'a self, seq: &'a [u8]) -> UKHSIterator<'a> {
let mut max_idx = seq.len() - self.k + 1;
if self.k > seq.len() || self.w > seq.len() {
max_idx = 0;
}
UKHSIterator {
seq,
ukhs: self,
current_k_idx: 0,
current_w_idx: 0,
max_idx,
}
}
pub fn hash_iter_sequence(&'a self, seq: &'a [u8]) -> Result<UKHSHashIterator<'a>, Error> {
if self.k > seq.len() {
return Err(UKHSError::KSizeOutOfRange {
ksize: self.k,
sequence: String::from_utf8(seq.to_vec()).unwrap(),
}
.into());
}
if self.w > seq.len() {
return Err(UKHSError::WSizeOutOfRange {
wsize: self.w,
sequence: String::from_utf8(seq.to_vec()).unwrap(),
}
.into());
}
let mut nthash_k_iter = NtHashForwardIterator::new(seq, self.k)
.map_err(SyncFailure::new)?
.peekable();
let mut nthash_w_iter = NtHashForwardIterator::new(seq, self.w)
.map_err(SyncFailure::new)?
.peekable();
let current_w_hash = nthash_w_iter.next().unwrap();
let mut current_unikmers = VecDeque::with_capacity(self.k);
for i in 0..=self.w - self.k {
let k_hash = nthash_k_iter.next().unwrap();
if self.contains(k_hash) {
current_unikmers.push_back((i, k_hash));
}
}
let current_w_idx = 0;
let current_k_idx = 0;
let max_k_pos = seq.len() - self.k + 1;
Ok(UKHSHashIterator {
nthash_k_iter,
nthash_w_iter,
ukhs: self,
current_w_hash,
current_w_idx,
current_k_idx,
max_k_pos,
current_unikmers,
})
}
pub fn contains(&self, hash: u64) -> bool {
if let Some(pos) = self.mphf.lookup(hash) {
if self.revmap[pos as usize] == hash {
return true;
}
}
false
}
pub fn contains_kmer(&self, kmer: &str) -> bool {
self.kmers.binary_search(&kmer.into()).is_ok()
}
pub fn kmer_for_ukhs_hash(&self, hash: u64) -> Option<String> {
if let Some(pos) = self.kmers_hashes.iter().position(|&x| x == hash) {
Some(self.kmers[pos].clone())
} else {
None
}
}
}
pub struct UKHSIterator<'a> {
seq: &'a [u8],
ukhs: &'a UKHS,
current_k_idx: usize,
current_w_idx: usize,
max_idx: usize,
}
impl<'a> Iterator for UKHSIterator<'a> {
type Item = (String, String);
fn next(&mut self) -> Option<Self::Item> {
while self.current_k_idx != self.max_idx {
let last_k_pos = self.current_w_idx + self.ukhs.w() - self.ukhs.k() + 1;
if self.current_k_idx == last_k_pos {
self.current_w_idx += 1;
self.current_k_idx = self.current_w_idx;
}
let current_kmer =
str::from_utf8(&self.seq[self.current_k_idx..self.current_k_idx + self.ukhs.k])
.unwrap();
self.current_k_idx += 1;
if self.ukhs.contains_kmer(current_kmer) {
let wmer_start = self.current_w_idx;
let current_wmer =
str::from_utf8(&self.seq[wmer_start..wmer_start + self.ukhs.w]).unwrap();
return Some((current_wmer.into(), current_kmer.into()));
};
}
None
}
fn size_hint(&self) -> (usize, Option<usize>) {
(self.max_idx, Some(self.max_idx))
}
}
impl<'a> ExactSizeIterator for UKHSIterator<'a> {}
pub struct UKHSHashIterator<'a> {
ukhs: &'a UKHS,
nthash_k_iter: Peekable<NtHashForwardIterator<'a>>,
nthash_w_iter: Peekable<NtHashForwardIterator<'a>>,
current_w_hash: u64,
current_w_idx: usize,
current_k_idx: usize,
max_k_pos: usize,
current_unikmers: VecDeque<(usize, u64)>,
}
impl<'a> Iterator for UKHSHashIterator<'a> {
type Item = (u64, u64);
fn next(&mut self) -> Option<Self::Item> {
loop {
if self.current_k_idx >= self.max_k_pos {
return None;
}
let last_k_pos = self.current_w_idx + self.ukhs.w() - self.ukhs.k() + 1;
if self.current_k_idx < last_k_pos {
self.current_k_idx += 1;
if let Some((pos, k_hash)) = self
.current_unikmers
.iter()
.find(|(p, _)| *p >= self.current_k_idx - 1)
{
self.current_k_idx = *pos + 1;
return Some((self.current_w_hash, *k_hash));
} else {
self.current_k_idx = last_k_pos;
}
} else {
self.current_w_idx += 1;
if let Some((pos, _)) = self.current_unikmers.front() {
if *pos < self.current_w_idx {
self.current_unikmers.pop_front();
}
}
let new_k_hash = self.nthash_k_iter.next().unwrap();
if self.ukhs.contains(new_k_hash) {
self.current_unikmers.push_back((last_k_pos, new_k_hash));
}
self.current_w_hash = self.nthash_w_iter.next().unwrap();
self.current_k_idx = self.current_w_idx;
}
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.nthash_k_iter.size_hint()
}
}
impl<'a> ExactSizeIterator for UKHSHashIterator<'a> {}
#[cfg(test)]
mod test {
use super::*;
use std::collections::HashSet;
use std::iter::FromIterator;
#[test]
fn basic_check() {
let seq = b"ACACCGTAGCCTCCAGATGC";
let w = 20;
let k = 7;
let ukhs = UKHS::new(k, w).unwrap();
let it = ukhs.iter_sequence(seq);
let mut unikmers: Vec<String> = it.map(|(_, x)| x).collect();
unikmers.sort_unstable();
assert_eq!(unikmers, ["ACACCGT", "AGCCTCC", "CCGTAGC", "GCCTCCA"]);
let it = ukhs.hash_iter_sequence(seq).unwrap();
let ukhs_hash: Vec<(u64, u64)> = it.collect();
assert!(ukhs_hash.len() >= seq.len() - w + 1);
let mut ukhs_unhash_set: HashSet<String> = ukhs_hash
.iter()
.map(|(_, hash)| ukhs.kmer_for_ukhs_hash(*hash).unwrap())
.collect();
let mut ukhs_unhash = Vec::<String>::from_iter(ukhs_unhash_set.into_iter());
ukhs_unhash.sort_unstable();
assert_eq!(unikmers, ukhs_unhash);
assert_eq!(
ukhs_hash,
[
(0x37137c91412bb512, 0xfbd9591aa929c685),
(0x37137c91412bb512, 0x9cd9a1bcb779d6ad),
(0x37137c91412bb512, 0x46fa47d28c0ffba5),
(0x37137c91412bb512, 0xf482addc6edbc920)
]
);
}
#[test]
fn longer_check() {
let seq = b"ACACCGTAGCCTCCAGATGCGTAG";
let k = 7;
let w = 20;
let ukhs = UKHS::new(k, w).unwrap();
let it = ukhs.iter_sequence(seq);
let mut unikmers: Vec<String> = it.map(|(_, x)| x).collect();
assert_eq!(
unikmers,
[
"ACACCGT", "CCGTAGC", "AGCCTCC", "GCCTCCA",
"CCGTAGC", "AGCCTCC", "GCCTCCA",
"CCGTAGC", "AGCCTCC", "GCCTCCA",
"CCGTAGC", "AGCCTCC", "GCCTCCA", "ATGCGTA",
"AGCCTCC", "GCCTCCA", "ATGCGTA"
]
);
let it = ukhs.hash_iter_sequence(seq).unwrap();
let ukhs_hash: Vec<(u64, u64)> = it.collect();
assert!(
ukhs_hash.len() >= seq.len() - w + 1,
"iter len {}, should be at least {}",
ukhs_hash.len(),
seq.len() - w + 1
);
let mut ukhs_unhash: Vec<String> = ukhs_hash
.iter()
.map(|(_, hash)| ukhs.kmer_for_ukhs_hash(*hash).unwrap())
.collect();
assert_eq!(unikmers, ukhs_unhash);
assert_eq!(
ukhs_hash,
[
(0x37137c91412bb512, 0xfbd9591aa929c685),
(0x37137c91412bb512, 0x9cd9a1bcb779d6ad),
(0x37137c91412bb512, 0x46fa47d28c0ffba5),
(0x37137c91412bb512, 0xf482addc6edbc920),
(0xf52d9b92474381bf, 0x9cd9a1bcb779d6ad),
(0xf52d9b92474381bf, 0x46fa47d28c0ffba5),
(0xf52d9b92474381bf, 0xf482addc6edbc920),
(0xdb5854d371a65e15, 0x9cd9a1bcb779d6ad),
(0xdb5854d371a65e15, 0x46fa47d28c0ffba5),
(0xdb5854d371a65e15, 0xf482addc6edbc920),
(0x31020e7531c970e0, 0x9cd9a1bcb779d6ad),
(0x31020e7531c970e0, 0x46fa47d28c0ffba5),
(0x31020e7531c970e0, 0xf482addc6edbc920),
(0x31020e7531c970e0, 0x6903a07436e4ffa1),
(0x5a6008385506dbd8, 0x46fa47d28c0ffba5),
(0x5a6008385506dbd8, 0xf482addc6edbc920),
(0x5a6008385506dbd8, 0x6903a07436e4ffa1)
]
);
}
}