mod anti_lex;
mod intrinsics;
mod nthash;
#[cfg(test)]
mod test;
pub use anti_lex::AntiLexHasher;
pub use nthash::{MulHasher, NtHasher};
pub use packed_seq;
use packed_seq::{ChunkIt, Delay, PackedNSeq, PaddedIt, Seq};
use std::iter::{repeat, zip};
/// SIMD vector type used by the `*_simd` methods of [`KmerHasher`]: eight `u32` lanes.
type S = wide::u32x8;
/// A hasher for the k-mers of a sequence.
///
/// Implementors provide the low-level closures ([`Self::in_out_mapper_scalar`],
/// [`Self::in_out_mapper_simd`] and [`Self::mapper`]). The provided methods
/// drive those closures over a sequence, either one character at a time
/// (`*_scalar`) or on SIMD vectors of type `S` (`*_simd`).
///
/// In the `*_ambiguous_*` and `hash_valid_*` variants, every k-mer that the
/// ambiguity mask of a [`PackedNSeq`] flags as ambiguous gets the hash
/// `u32::MAX` instead of its real hash.
pub trait KmerHasher {
    /// Compile-time flag returned by [`Self::is_canonical`].
    ///
    /// NOTE(review): presumably `true` means that a k-mer and its reverse
    /// complement hash to the same value; confirm against the implementors.
    const CANONICAL: bool;

    /// Creates a hasher for k-mers of length `k`.
    fn new(k: usize) -> Self;

    /// Returns [`Self::CANONICAL`].
    #[inline(always)]
    fn is_canonical(&self) -> bool {
        Self::CANONICAL
    }

    /// The k-mer length this hasher operates on.
    fn k(&self) -> usize;

    /// Lag between the character added and the character removed when feeding
    /// the `in_out_mapper_*` closures.
    ///
    /// Defaults to `k - 1`. The subtraction underflows for `k == 0`.
    #[inline(always)]
    fn delay(&self) -> Delay {
        Delay(self.k() - 1)
    }

    /// Returns a closure that is called with one `(added, removed)` character
    /// pair at a time and returns a `u32` hash.
    ///
    /// The provided methods pass `0` as the removed character for the first
    /// [`Self::delay`] calls. How `seq` is used is up to the implementor.
    fn in_out_mapper_scalar<'s>(&self, seq: impl Seq<'s>) -> impl FnMut((u8, u8)) -> u32;

    /// SIMD version of [`Self::in_out_mapper_scalar`]: the closure takes one
    /// `(added, removed)` pair of vectors and returns a vector of hashes.
    fn in_out_mapper_simd<'s>(&self, seq: impl Seq<'s>) -> impl FnMut((S, S)) -> S;

    /// Wraps [`Self::in_out_mapper_scalar`] so that ambiguous k-mers map to
    /// `u32::MAX`.
    ///
    /// The first `k - 1` calls return the underlying hash unchanged. From the
    /// k-th call on, each call consumes one flag from
    /// `nseq.ambiguous.iter_kmer_ambiguity(k)` and returns `u32::MAX` when the
    /// flag is set. This is the same alignment that
    /// [`Self::hash_valid_kmers_scalar`] uses.
    ///
    /// # Panics
    ///
    /// The returned closure panics when it is called after the ambiguity
    /// iterator is exhausted.
    fn in_out_mapper_ambiguous_scalar<'s>(
        &self,
        nseq: PackedNSeq<'s>,
    ) -> impl FnMut((u8, u8)) -> u32 {
        let k = self.k();
        let mut mapper = self.in_out_mapper_scalar(nseq.seq);
        let mut ambiguity = nseq.ambiguous.iter_kmer_ambiguity(k);
        // Number of calls made to the returned closure so far.
        let mut i = 0;
        move |(a, r)| {
            let hash = mapper((a, r));
            // The first k-mer is complete on the k-th call (`i == k - 1`), so
            // that call must consume the first ambiguity flag. Starting one
            // call later would shift every flag onto the following k-mer.
            let is_ambiguous = if i >= k - 1 {
                ambiguity.next().unwrap()
            } else {
                false
            };
            i += 1;
            if is_ambiguous { u32::MAX } else { hash }
        }
    }

    /// Wraps [`Self::in_out_mapper_simd`] so that ambiguous k-mers map to
    /// `S::MAX` in their lane.
    ///
    /// Every call consumes one mask from
    /// `nseq.ambiguous.par_iter_kmer_ambiguity(k, context, 0)` and blends
    /// `S::MAX` with the computed hash according to that mask.
    ///
    /// # Panics
    ///
    /// The returned closure panics when the mask iterator is exhausted.
    #[inline(always)]
    fn in_out_mapper_ambiguous_simd<'s>(
        &self,
        nseq: PackedNSeq<'s>,
        context: usize,
    ) -> impl FnMut((S, S)) -> S {
        let mut mapper = self.in_out_mapper_simd(nseq.seq);
        let mut ambiguous = nseq.ambiguous.par_iter_kmer_ambiguity(self.k(), context, 0);
        move |(a, r)| {
            let hash = mapper((a, r));
            let ambiguous = ambiguous.it.next().unwrap();
            ambiguous.blend(S::MAX, hash)
        }
    }

    /// Hashes the k-mers of `seq` one at a time.
    ///
    /// The mapper is warmed up with `k - 1` calls whose results are discarded:
    /// the first `delay` characters are fed with `0` as the removed character,
    /// followed by `k - 1 - delay` further `(added, removed)` pairs. Every
    /// remaining pair produces one output hash.
    #[inline(always)]
    fn hash_kmers_scalar<'s>(&self, seq: impl Seq<'s>) -> impl ExactSizeIterator<Item = u32> {
        let k = self.k();
        let delay = self.delay();
        let mut add = seq.iter_bp();
        let mut remove = seq.iter_bp();
        let mut mapper = self.in_out_mapper_scalar(seq);
        // Nothing to remove yet: pair the first `delay` characters with 0.
        zip(add.by_ref().take(delay.0), repeat(0)).for_each(|a| {
            mapper(a);
        });
        // Remaining warm-up calls; empty for the default delay of `k - 1`.
        zip(add.by_ref(), remove.by_ref())
            .take(k - 1 - delay.0)
            .for_each(|a| {
                mapper(a);
            });
        zip(add, remove).map(mapper)
    }

    /// SIMD counterpart of [`Self::hash_kmers_scalar`].
    ///
    /// Takes `(added, removed)` vectors from
    /// `seq.par_iter_bp_delayed(context + k - 1, delay)`, maps them through
    /// [`Self::in_out_mapper_simd`] and calls `advance(k - 1)` on the result
    /// (presumably to skip the `k - 1` warm-up outputs; confirm in
    /// `packed_seq`).
    #[inline(always)]
    fn hash_kmers_simd<'s>(&self, seq: impl Seq<'s>, context: usize) -> PaddedIt<impl ChunkIt<S>> {
        let k = self.k();
        let delay = self.delay();
        seq.par_iter_bp_delayed(context + k - 1, delay)
            .map(self.in_out_mapper_simd(seq))
            .advance(k - 1)
    }

    /// Like [`Self::hash_kmers_scalar`], but k-mers flagged by
    /// `nseq.ambiguous.iter_kmer_ambiguity(k)` yield `u32::MAX`.
    ///
    /// # Panics
    ///
    /// Panics when [`Self::delay`] is not smaller than `k`.
    #[inline(always)]
    fn hash_valid_kmers_scalar<'s>(
        &self,
        nseq: PackedNSeq<'s>,
    ) -> impl ExactSizeIterator<Item = u32> {
        let k = self.k();
        let delay = self.delay();
        assert!(delay.0 < k);
        let mut mapper = self.in_out_mapper_scalar(nseq.seq);
        let mut a = nseq.seq.iter_bp();
        let mut r = nseq.seq.iter_bp();
        // Warm-up, part 1: `delay` characters with nothing to remove.
        a.by_ref().take(delay.0).for_each(
            #[inline(always)]
            |a| {
                mapper((a, 0));
            },
        );
        // Warm-up, part 2: up to `k - 1` calls in total.
        zip(a.by_ref(), r.by_ref())
            .take((k - 1) - delay.0)
            .for_each(
                #[inline(always)]
                |(a, r)| {
                    mapper((a, r));
                },
            );
        // From the k-th call on, each hash is paired with one ambiguity flag.
        zip(zip(a, r), nseq.ambiguous.iter_kmer_ambiguity(k)).map(
            #[inline(always)]
            move |(ar, ambiguous)| {
                let hash = mapper(ar);
                if ambiguous { u32::MAX } else { hash }
            },
        )
    }

    /// SIMD counterpart of [`Self::hash_valid_kmers_scalar`].
    ///
    /// Each vector of hashes is blended with `S::MAX` according to the mask
    /// taken from `nseq.ambiguous.par_iter_kmer_ambiguity(k, context + k - 1, 0)`.
    #[inline(always)]
    fn hash_valid_kmers_simd<'s, 't>(
        &'t self,
        nseq: PackedNSeq<'s>,
        context: usize,
    ) -> PaddedIt<impl ChunkIt<S> + use<'s, 't, Self>> {
        let k = self.k();
        let delay = self.delay();
        let mut hash_mapper = self.in_out_mapper_simd(nseq.seq);
        let mut ambiguity_it = nseq
            .ambiguous
            .par_iter_kmer_ambiguity(k, context + k - 1, 0);
        nseq.seq
            .par_iter_bp_delayed_with_factor(context + k - 1, delay, 2)
            .map(
                #[inline(always)]
                move |(a, r)| {
                    // SAFETY: NOTE(review) this relies on the ambiguity iterator
                    // yielding at least as many items as the character iterator
                    // it is paired with (both are built from the same sequence
                    // with `context + k - 1`); confirm against `packed_seq`.
                    let is_ambiguous = unsafe { ambiguity_it.it.next().unwrap_unchecked() };
                    let hash = hash_mapper((a, r));
                    is_ambiguous.blend(S::MAX, hash)
                },
            )
            .advance(k - 1)
    }

    /// Returns a closure that is called with one character at a time and
    /// returns a `u32` hash after each call.
    ///
    /// How `seq` is used is up to the implementor.
    fn mapper<'s>(&self, seq: impl Seq<'s>) -> impl FnMut(u8) -> u32;

    /// Feeds every character of `seq` through [`Self::mapper`] and returns the
    /// last value produced, or `0` for an empty sequence.
    #[inline(always)]
    fn hash_seq<'s>(&self, seq: impl Seq<'s>) -> u32 {
        seq.iter_bp().map(self.mapper(seq)).last().unwrap_or(0)
    }

    /// Feeds every character of `seq` through [`Self::mapper`] and yields the
    /// value produced after each one.
    #[inline(always)]
    fn hash_prefixes<'s>(&self, seq: impl Seq<'s>) -> impl ExactSizeIterator<Item = u32> {
        seq.iter_bp().map(self.mapper(seq))
    }
}