#![allow(clippy::missing_transmute_annotations)]
mod canonical;
pub mod collect;
mod minimizers;
mod sliding_min;
pub mod syncmers;
mod intrinsics {
mod dedup;
pub use dedup::{append_filtered_vals, append_unique_vals, append_unique_vals_2};
}
#[cfg(test)]
mod test;
pub mod private {
pub mod canonical {
pub use crate::canonical::*;
}
pub mod minimizers {
pub use crate::minimizers::*;
}
pub mod sliding_min {
pub use crate::sliding_min::*;
}
pub use packed_seq::u32x8 as S;
}
use collect::CollectAndDedup;
use collect::collect_and_dedup_into_scalar;
use collect::collect_and_dedup_with_index_into_scalar;
use minimizers::canonical_minimizers_skip_ambiguous_windows;
pub use packed_seq;
use packed_seq::PackedNSeq;
use packed_seq::PackedSeq;
pub use seq_hash;
use minimizers::{
canonical_minimizers_seq_scalar, canonical_minimizers_seq_simd, minimizers_seq_scalar,
minimizers_seq_simd,
};
use packed_seq::Seq;
use packed_seq::u32x8 as S;
use seq_hash::KmerHasher;
pub use minimizers::one_minimizer;
use seq_hash::NtHasher;
pub use sliding_min::Cache;
use syncmers::CollectSyncmers;
use syncmers::collect_syncmers_scalar;
thread_local! {
    /// Per-thread scratch state reused across runs to avoid repeated allocation:
    /// the sliding-min [`Cache`] plus two `Vec<S>` scratch buffers
    /// (used by the skip-ambiguous-windows path, which takes the full triple).
    static CACHE: std::cell::RefCell<(Cache, Vec<S>, Vec<S>)> = std::cell::RefCell::new(Default::default());
}
/// Configuration for a minimizer / syncmer computation.
///
/// Const parameters:
/// - `CANONICAL`: when true, strand-canonical variants of the kernels are used.
/// - `SYNCMER`: 0 = plain minimizers, 1 = closed syncmers, 2 = open syncmers
///   (see the dispatch in `run_impl` and its `unreachable!` message).
///
/// `SkPos` is either `()` or `&mut Vec<u32>` receiving super-k-mer positions
/// (set via [`Builder::super_kmers`]).
pub struct Builder<'h, const CANONICAL: bool, H: KmerHasher, SkPos, const SYNCMER: u8> {
    /// K-mer length.
    k: usize,
    /// Window parameter `w` passed to the minimizer/syncmer kernels.
    w: usize,
    /// Optional user-supplied hasher; when `None`, `H::new(k)` is built per run.
    hasher: Option<&'h H>,
    /// Super-k-mer position sink, or `()` when not requested.
    sk_pos: SkPos,
}
/// Result of a run: the input sequence together with the collected positions,
/// allowing the k-mer values at those positions to be read back.
pub struct Output<'o, const CANONICAL: bool, S> {
    /// Length of the reported submers: `k` for minimizers, `k + w - 1` for
    /// syncmers (set by the `run_*` implementations).
    len: usize,
    /// The sequence the positions refer to.
    seq: S,
    /// Positions written by the run.
    /// NOTE(review): `&Vec<u32>` rather than `&[u32]`; kept as-is since the
    /// constructors coerce `&mut Vec<u32>` into it.
    min_pos: &'o Vec<u32>,
}
/// Create a [`Builder`] computing forward (non-canonical) minimizers with
/// parameters `k` and `w`, using the default [`NtHasher`].
#[must_use]
pub const fn minimizers(k: usize, w: usize) -> Builder<'static, false, NtHasher<false>, (), 0> {
    Builder {
        hasher: None,
        sk_pos: (),
        k,
        w,
    }
}
/// Create a [`Builder`] computing strand-canonical minimizers with
/// parameters `k` and `w`, using the default [`NtHasher`].
#[must_use]
pub const fn canonical_minimizers(
    k: usize,
    w: usize,
) -> Builder<'static, true, NtHasher<true>, (), 0> {
    Builder {
        hasher: None,
        sk_pos: (),
        k,
        w,
    }
}
/// Create a [`Builder`] computing forward (non-canonical) closed syncmers
/// with parameters `k` and `w`, using the default [`NtHasher`].
#[must_use]
pub const fn closed_syncmers(
    k: usize,
    w: usize,
) -> Builder<'static, false, NtHasher<false>, (), 1> {
    Builder {
        hasher: None,
        sk_pos: (),
        k,
        w,
    }
}
/// Create a [`Builder`] computing strand-canonical closed syncmers with
/// parameters `k` and `w`, using the default [`NtHasher`].
#[must_use]
pub const fn canonical_closed_syncmers(
    k: usize,
    w: usize,
) -> Builder<'static, true, NtHasher<true>, (), 1> {
    Builder {
        hasher: None,
        sk_pos: (),
        k,
        w,
    }
}
/// Create a [`Builder`] computing forward (non-canonical) open syncmers with
/// parameters `k` and `w`, using the default [`NtHasher`].
#[must_use]
pub const fn open_syncmers(k: usize, w: usize) -> Builder<'static, false, NtHasher<false>, (), 2> {
    Builder {
        hasher: None,
        sk_pos: (),
        k,
        w,
    }
}
/// Create a [`Builder`] computing strand-canonical open syncmers with
/// parameters `k` and `w`, using the default [`NtHasher`].
#[must_use]
pub const fn canonical_open_syncmers(
    k: usize,
    w: usize,
) -> Builder<'static, true, NtHasher<true>, (), 2> {
    Builder {
        hasher: None,
        sk_pos: (),
        k,
        w,
    }
}
impl<const CANONICAL: bool, const SYNCMERS: u8>
    Builder<'static, CANONICAL, NtHasher<CANONICAL>, (), SYNCMERS>
{
    /// Swap the default [`NtHasher`] for a caller-provided [`KmerHasher`].
    /// Returns a new builder borrowing `hasher`; `k` and `w` carry over.
    #[must_use]
    pub const fn hasher<'h, H2: KmerHasher>(
        &self,
        hasher: &'h H2,
    ) -> Builder<'h, CANONICAL, H2, (), SYNCMERS> {
        Builder {
            hasher: Some(hasher),
            sk_pos: (),
            k: self.k,
            w: self.w,
        }
    }
}
impl<'h, const CANONICAL: bool, H: KmerHasher> Builder<'h, CANONICAL, H, (), 0> {
    /// Additionally collect super-k-mer positions into `sk_pos`.
    /// Only available for plain minimizers (`SYNCMER == 0`).
    #[must_use]
    pub const fn super_kmers<'o2>(
        &self,
        sk_pos: &'o2 mut Vec<u32>,
    ) -> Builder<'h, CANONICAL, H, &'o2 mut Vec<u32>, 0> {
        Builder {
            sk_pos,
            hasher: self.hasher,
            k: self.k,
            w: self.w,
        }
    }
}
impl<'h, const CANONICAL: bool, H: KmerHasher, const SYNCMERS: u8>
    Builder<'h, CANONICAL, H, (), SYNCMERS>
{
    /// Run the scalar code path once and return the collected positions.
    pub fn run_scalar_once<'s, SEQ: Seq<'s>>(&self, seq: SEQ) -> Vec<u32> {
        let mut min_pos = vec![];
        self.run_impl::<false, _>(seq, &mut min_pos);
        min_pos
    }
    /// Run the SIMD code path once and return the collected positions.
    pub fn run_once<'s, SEQ: Seq<'s>>(&self, seq: SEQ) -> Vec<u32> {
        let mut min_pos = vec![];
        self.run_impl::<true, _>(seq, &mut min_pos);
        min_pos
    }
    /// Scalar run writing positions into the caller-provided `min_pos`.
    pub fn run_scalar<'s, 'o, SEQ: Seq<'s>>(
        &self,
        seq: SEQ,
        min_pos: &'o mut Vec<u32>,
    ) -> Output<'o, CANONICAL, SEQ> {
        self.run_impl::<false, _>(seq, min_pos)
    }
    /// SIMD run writing positions into the caller-provided `min_pos`.
    pub fn run<'s, 'o, SEQ: Seq<'s>>(
        &self,
        seq: SEQ,
        min_pos: &'o mut Vec<u32>,
    ) -> Output<'o, CANONICAL, SEQ> {
        self.run_impl::<true, _>(seq, min_pos)
    }
    /// Shared implementation: dispatches on the compile-time triple
    /// (`SIMD`, `CANONICAL`, `SYNCMERS`) to the matching kernel; the dead
    /// arms are eliminated at monomorphization time.
    fn run_impl<'s, 'o, const SIMD: bool, SEQ: Seq<'s>>(
        &self,
        seq: SEQ,
        min_pos: &'o mut Vec<u32>,
    ) -> Output<'o, CANONICAL, SEQ> {
        // Only build a default hasher when none was supplied via `hasher()`;
        // it must outlive the borrow taken in `unwrap_or_else` below.
        let default_hasher = self.hasher.is_none().then(|| H::new(self.k));
        let hasher = self
            .hasher
            .unwrap_or_else(|| default_hasher.as_ref().unwrap());
        // Thread-local scratch buffers are reused across calls; only the
        // sliding-min cache (`cache.0`) is needed on these paths.
        CACHE.with_borrow_mut(|cache| match (SIMD, CANONICAL, SYNCMERS) {
            // --- Scalar, forward strand ---
            (false, false, 0) => collect_and_dedup_into_scalar(
                minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0),
                min_pos,
            ),
            (false, false, 1) => collect_syncmers_scalar::<false>(
                self.w,
                minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0),
                min_pos,
            ),
            (false, false, 2) => collect_syncmers_scalar::<true>(
                self.w,
                minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0),
                min_pos,
            ),
            // --- Scalar, canonical ---
            (false, true, 0) => collect_and_dedup_into_scalar(
                canonical_minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0),
                min_pos,
            ),
            (false, true, 1) => collect_syncmers_scalar::<false>(
                self.w,
                canonical_minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0),
                min_pos,
            ),
            (false, true, 2) => collect_syncmers_scalar::<true>(
                self.w,
                canonical_minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0),
                min_pos,
            ),
            // --- SIMD, forward strand ---
            (true, false, 0) => minimizers_seq_simd(seq, hasher, self.w, &mut cache.0)
                .collect_and_dedup_into::<false>(min_pos),
            (true, false, 1) => minimizers_seq_simd(seq, hasher, self.w, &mut cache.0)
                .collect_syncmers_into::<false>(self.w, min_pos),
            (true, false, 2) => minimizers_seq_simd(seq, hasher, self.w, &mut cache.0)
                .collect_syncmers_into::<true>(self.w, min_pos),
            // --- SIMD, canonical ---
            (true, true, 0) => canonical_minimizers_seq_simd(seq, hasher, self.w, &mut cache.0)
                .collect_and_dedup_into::<false>(min_pos),
            (true, true, 1) => canonical_minimizers_seq_simd(seq, hasher, self.w, &mut cache.0)
                .collect_syncmers_into::<false>(self.w, min_pos),
            (true, true, 2) => canonical_minimizers_seq_simd(seq, hasher, self.w, &mut cache.0)
                .collect_syncmers_into::<true>(self.w, min_pos),
            // Only SYNCMERS values 0/1/2 are ever produced by the constructors.
            _ => unreachable!("SYNCMERS generic must be 0 (no syncmers), 1 (closed syncmers), or 2 (open syncmers)."),
        });
        Output {
            // Syncmer positions refer to (k+w-1)-mers; minimizer positions to k-mers.
            len: if SYNCMERS != 0 {
                self.k + self.w - 1
            } else {
                self.k
            },
            seq,
            min_pos,
        }
    }
}
impl<'h, H: KmerHasher, const SYNCMERS: u8> Builder<'h, true, H, (), SYNCMERS> {
    /// Canonical-only run that skips ambiguous windows (presumably windows
    /// containing `N` bases, given the [`PackedNSeq`] input — TODO confirm),
    /// returning the collected positions.
    pub fn run_skip_ambiguous_windows_once<'s>(&self, nseq: PackedNSeq<'s>) -> Vec<u32> {
        let mut min_pos = vec![];
        self.run_skip_ambiguous_windows(nseq, &mut min_pos);
        min_pos
    }
    /// Skip-ambiguous-windows run writing positions into the caller-provided
    /// `min_pos`, using the thread-local scratch buffers.
    pub fn run_skip_ambiguous_windows<'s, 'o>(
        &self,
        nseq: PackedNSeq<'s>,
        min_pos: &'o mut Vec<u32>,
    ) -> Output<'o, true, PackedSeq<'s>> {
        CACHE
            .with_borrow_mut(|cache| self.run_skip_ambiguous_windows_with_buf(nseq, min_pos, cache))
    }
    /// Skip-ambiguous-windows run taking the scratch buffers explicitly
    /// instead of borrowing the thread-local `CACHE`.
    pub fn run_skip_ambiguous_windows_with_buf<'s, 'o>(
        &self,
        nseq: PackedNSeq<'s>,
        min_pos: &'o mut Vec<u32>,
        cache: &mut (Cache, Vec<S>, Vec<S>),
    ) -> Output<'o, true, PackedSeq<'s>> {
        // Only build a default hasher when none was supplied via `hasher()`.
        let default_hasher = self.hasher.is_none().then(|| H::new(self.k));
        let hasher = self
            .hasher
            .unwrap_or_else(|| default_hasher.as_ref().unwrap());
        // Dispatch on the syncmer mode; note this path deduplicates with the
        // `<true>` variant of `collect_and_dedup_into`, unlike `run_impl`.
        match SYNCMERS {
            0 => canonical_minimizers_skip_ambiguous_windows(nseq, hasher, self.w, cache)
                .collect_and_dedup_into::<true>(min_pos),
            1 => canonical_minimizers_skip_ambiguous_windows(nseq, hasher, self.w, cache)
                .collect_syncmers_into::<false>(self.w, min_pos),
            2 => canonical_minimizers_skip_ambiguous_windows(nseq, hasher, self.w, cache)
                .collect_syncmers_into::<true>(self.w, min_pos),
            // Only SYNCMERS values 0/1/2 are ever produced by the constructors.
            _ => panic!(
                "SYNCMERS generic must be 0 (no syncmers), 1 (closed syncmers), or 2 (open syncmers)."
            ),
        }
        Output {
            // Syncmer positions refer to (k+w-1)-mers; minimizer positions to k-mers.
            len: if SYNCMERS != 0 {
                self.k + self.w - 1
            } else {
                self.k
            },
            seq: nseq.seq,
            min_pos,
        }
    }
}
impl<'h, 'o2, const CANONICAL: bool, H: KmerHasher>
    Builder<'h, CANONICAL, H, &'o2 mut Vec<u32>, 0>
{
    /// Scalar run (minimizers + super-k-mer positions) returning the
    /// minimizer positions; super-k-mer positions go to the `sk_pos` buffer
    /// captured by [`Builder::super_kmers`]. Consumes `self` because the
    /// `sk_pos` exclusive borrow is moved into the collector.
    pub fn run_scalar_once<'s, SEQ: Seq<'s>>(self, seq: SEQ) -> Vec<u32> {
        let mut min_pos = vec![];
        self.run_scalar(seq, &mut min_pos);
        min_pos
    }
    /// Scalar run writing minimizer positions into `min_pos` and super-k-mer
    /// positions into the captured `sk_pos` buffer.
    pub fn run_scalar<'s, 'o, SEQ: Seq<'s>>(
        self,
        seq: SEQ,
        min_pos: &'o mut Vec<u32>,
    ) -> Output<'o, CANONICAL, SEQ> {
        // Only build a default hasher when none was supplied via `hasher()`.
        let default_hasher = self.hasher.is_none().then(|| H::new(self.k));
        let hasher = self
            .hasher
            .unwrap_or_else(|| default_hasher.as_ref().unwrap());
        CACHE.with_borrow_mut(|cache| match CANONICAL {
            false => collect_and_dedup_with_index_into_scalar(
                minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0),
                min_pos,
                self.sk_pos,
            ),
            true => collect_and_dedup_with_index_into_scalar(
                canonical_minimizers_seq_scalar(seq, hasher, self.w, &mut cache.0),
                min_pos,
                self.sk_pos,
            ),
        });
        Output {
            // Super-k-mer collection only exists for plain minimizers, so the
            // reported submer length is always `k` here.
            len: self.k,
            seq,
            min_pos,
        }
    }
    /// SIMD run returning the minimizer positions; super-k-mer positions go
    /// to the captured `sk_pos` buffer.
    pub fn run_once<'s, SEQ: Seq<'s>>(self, seq: SEQ) -> Vec<u32> {
        let mut min_pos = vec![];
        self.run(seq, &mut min_pos);
        min_pos
    }
    /// SIMD run writing minimizer positions into `min_pos`, using the
    /// thread-local scratch cache.
    pub fn run<'s, 'o, SEQ: Seq<'s>>(
        self,
        seq: SEQ,
        min_pos: &'o mut Vec<u32>,
    ) -> Output<'o, CANONICAL, SEQ> {
        CACHE.with_borrow_mut(|cache| self.run_with_buf(seq, min_pos, &mut cache.0))
    }
    /// Shared SIMD implementation taking the sliding-min cache explicitly.
    #[inline(always)]
    fn run_with_buf<'s, 'o, SEQ: Seq<'s>>(
        self,
        seq: SEQ,
        min_pos: &'o mut Vec<u32>,
        cache: &mut Cache,
    ) -> Output<'o, CANONICAL, SEQ> {
        // Only build a default hasher when none was supplied via `hasher()`.
        let default_hasher = self.hasher.is_none().then(|| H::new(self.k));
        let hasher = self
            .hasher
            .unwrap_or_else(|| default_hasher.as_ref().unwrap());
        match CANONICAL {
            false => minimizers_seq_simd(seq, hasher, self.w, cache)
                .collect_and_dedup_with_index_into(min_pos, self.sk_pos),
            true => canonical_minimizers_seq_simd(seq, hasher, self.w, cache)
                .collect_and_dedup_with_index_into(min_pos, self.sk_pos),
        };
        Output {
            len: self.k,
            seq,
            min_pos,
        }
    }
}
impl<'s, 'o, const CANONICAL: bool, SEQ: Seq<'s>> Output<'o, CANONICAL, SEQ> {
    /// The `u64` value of the submer at each reported position.
    #[must_use]
    pub fn values_u64(&self) -> impl ExactSizeIterator<Item = u64> {
        self.pos_and_values_u64().map(|(_, value)| value)
    }
    /// The `u128` value of the submer at each reported position.
    #[must_use]
    pub fn values_u128(&self) -> impl ExactSizeIterator<Item = u128> {
        self.pos_and_values_u128().map(|(_, value)| value)
    }
    /// Each reported position paired with its `u64` submer value.
    /// When `CANONICAL`, the value is the minimum of the forward and
    /// reverse-complement readings.
    #[must_use]
    pub fn pos_and_values_u64(&self) -> impl ExactSizeIterator<Item = (u32, u64)> {
        self.min_pos.iter().map(
            #[inline(always)]
            move |&p| {
                let fwd = self.seq.read_kmer(self.len, p as usize);
                let value = if CANONICAL {
                    let rc = self.seq.read_revcomp_kmer(self.len, p as usize);
                    fwd.min(rc)
                } else {
                    fwd
                };
                (p, value)
            },
        )
    }
    /// Each reported position paired with its `u128` submer value.
    /// When `CANONICAL`, the value is the minimum of the forward and
    /// reverse-complement readings.
    #[must_use]
    pub fn pos_and_values_u128(&self) -> impl ExactSizeIterator<Item = (u32, u128)> {
        self.min_pos.iter().map(
            #[inline(always)]
            move |&p| {
                let fwd = self.seq.read_kmer_u128(self.len, p as usize);
                let value = if CANONICAL {
                    let rc = self.seq.read_revcomp_kmer_u128(self.len, p as usize);
                    fwd.min(rc)
                } else {
                    fwd
                };
                (p, value)
            },
        )
    }
}
/// Convenience wrapper: forward-strand minimizer positions of `seq` for the
/// given `k` and `w`. Equivalent to `minimizers(k, w).run_once(seq)`.
pub fn minimizer_positions<'s>(seq: impl Seq<'s>, k: usize, w: usize) -> Vec<u32> {
    let builder = minimizers(k, w);
    builder.run_once(seq)
}
/// Convenience wrapper: strand-canonical minimizer positions of `seq` for the
/// given `k` and `w`. Equivalent to `canonical_minimizers(k, w).run_once(seq)`.
pub fn canonical_minimizer_positions<'s>(seq: impl Seq<'s>, k: usize, w: usize) -> Vec<u32> {
    let builder = canonical_minimizers(k, w);
    builder.run_once(seq)
}