use crate::algorithm::{Minimizer, MinimizerAlgorithm, ModMinimizer};
use crate::iterator::*;
use core::hash::{BuildHasher, Hash};
use core::marker::PhantomData;
use minimizer_queue::DefaultHashBuilder;
use num_traits::PrimInt;
/// Builder collecting the parameters handed to the minimizer iterators.
///
/// Type parameters:
/// - `T`: integer type (default `u64`); its bit width bounds the value accepted
///   by the `minimizer_size` setter.
/// - `A`: minimizer algorithm marker (default `Minimizer`); selects which
///   `iter` / `iter_pos` implementations are available.
/// - `S`: hash builder type (default `DefaultHashBuilder`).
/// - `CANONICAL`: compile-time flag (default `false`) switched by
///   `canonical()` / `non_canonical()`; selects the canonical iterator types.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct MinimizerBuilder<
T: PrimInt = u64,
A: MinimizerAlgorithm = Minimizer,
S: BuildHasher = DefaultHashBuilder,
const CANONICAL: bool = false,
> {
/// Size of the minimizers (default 21) — presumably counted in bases.
minimizer_size: usize,
/// Width of the window (default `31 - 21 + 1`); canonical iterators require
/// it to be odd.
width: u16,
/// Hash builder forwarded to the iterator constructors.
hasher: S,
/// Lookup table indexed by input byte. Only the entries for `A`, `C`, `G`,
/// `T` (upper and lower case) are ever written in this file; all other
/// entries stay at 0.
encoding: [u8; 256],
/// Ties the otherwise unused `T` and `A` parameters to the struct.
_marker: PhantomData<(T, A)>,
}
impl<T: PrimInt + Hash> MinimizerBuilder<T> {
    /// Creates a builder for the plain `Minimizer` algorithm with the default
    /// hasher, non-canonical mode and the default parameters set by `_new`
    /// (minimizer size 21, width 11, `A=0, C=1, G=2, T=3` encoding).
    #[inline]
    pub fn new() -> Self {
        // All defaults live in the shared private constructor.
        Self::_new()
    }
}
impl<T: PrimInt + Hash> Default for MinimizerBuilder<T> {
    /// Returns the same builder as [`MinimizerBuilder::new`].
    #[inline]
    fn default() -> Self {
        Self::new()
    }
}
impl<T: PrimInt + Hash, S: BuildHasher> MinimizerBuilder<T, Minimizer, S, false> {
    /// Consumes the builder and creates a [`MinimizerIterator`] over `seq`.
    ///
    /// The configured minimizer size, window width, hasher and base encoding
    /// are forwarded unchanged to the iterator constructor.
    #[inline]
    pub fn iter(self, seq: &[u8]) -> MinimizerIterator<T, S> {
        let Self {
            minimizer_size,
            width,
            hasher,
            encoding,
            ..
        } = self;
        MinimizerIterator::new(seq, minimizer_size, width, hasher, encoding)
    }

    /// Consumes the builder and creates a [`MinimizerPosIterator`] over `seq`.
    ///
    /// Takes exactly the same parameters as [`Self::iter`]; only the iterator
    /// type differs (presumably it reports minimizer positions — see the
    /// iterator's own documentation).
    #[inline]
    pub fn iter_pos(self, seq: &[u8]) -> MinimizerPosIterator<T, S> {
        let Self {
            minimizer_size,
            width,
            hasher,
            encoding,
            ..
        } = self;
        MinimizerPosIterator::new(seq, minimizer_size, width, hasher, encoding)
    }
}
impl<T: PrimInt + Hash, S: BuildHasher> MinimizerBuilder<T, Minimizer, S, true> {
    /// Consumes the builder and creates a [`CanonicalMinimizerIterator`] over
    /// `seq`.
    ///
    /// # Panics
    ///
    /// Panics if the configured width is even.
    #[inline]
    pub fn iter(self, seq: &[u8]) -> CanonicalMinimizerIterator<T, S> {
        let Self {
            minimizer_size,
            width,
            hasher,
            encoding,
            ..
        } = self;
        // Checked before anything is constructed; an even width is rejected.
        assert_eq!(
            width % 2,
            1,
            "width must be odd to break ties between multiple minimizers"
        );
        CanonicalMinimizerIterator::new(seq, minimizer_size, width, hasher, encoding)
    }

    /// Consumes the builder and creates a [`CanonicalMinimizerPosIterator`]
    /// over `seq`.
    ///
    /// # Panics
    ///
    /// Panics if the configured width is even.
    #[inline]
    pub fn iter_pos(self, seq: &[u8]) -> CanonicalMinimizerPosIterator<T, S> {
        let Self {
            minimizer_size,
            width,
            hasher,
            encoding,
            ..
        } = self;
        // Checked before anything is constructed; an even width is rejected.
        assert_eq!(
            width % 2,
            1,
            "width must be odd to break ties between multiple minimizers"
        );
        CanonicalMinimizerPosIterator::new(seq, minimizer_size, width, hasher, encoding)
    }
}
/// Lower bound on `minimizer_size` for the mod-minimizer iterators, and the
/// base of the extra size argument `R + (minimizer_size - R) % width` that is
/// passed to the mod-sampling iterator constructors.
const R: usize = 4;
impl<T: PrimInt + Hash> MinimizerBuilder<T, ModMinimizer> {
    /// Creates a builder for the `ModMinimizer` algorithm with the default
    /// hasher, non-canonical mode and the default parameters set by `_new`.
    #[inline]
    pub fn new_mod() -> Self {
        // Same defaults as the plain minimizer builder; only the algorithm
        // marker type differs.
        Self::_new()
    }
}
impl<T: PrimInt + Hash, S: BuildHasher> MinimizerBuilder<T, ModMinimizer, S, false> {
    /// Consumes the builder and creates a [`ModSamplingIterator`] over `seq`.
    ///
    /// Besides the usual parameters, the iterator receives
    /// `t = R + (minimizer_size - R) % width` (presumably the size of the
    /// short substrings used by mod-sampling — see the iterator's docs).
    ///
    /// # Panics
    ///
    /// Panics if `minimizer_size < R` or if the width is zero.
    #[inline]
    pub fn iter(self, seq: &[u8]) -> ModSamplingIterator<T, S> {
        let Self {
            minimizer_size,
            width,
            hasher,
            encoding,
            ..
        } = self;
        assert!(
            minimizer_size >= R,
            "mod-minimizers require minimizer_size ≥ r={R}"
        );
        // Without this check a zero width would still panic, but with the
        // opaque "remainder with a divisor of zero" message.
        assert!(width > 0, "mod-minimizers require width > 0");
        let t = R + (minimizer_size - R) % usize::from(width);
        ModSamplingIterator::new(seq, minimizer_size, width, t, hasher, encoding)
    }

    /// Consumes the builder and creates a [`ModSamplingPosIterator`] over
    /// `seq`.
    ///
    /// Takes exactly the same parameters as [`Self::iter`]; only the iterator
    /// type differs.
    ///
    /// # Panics
    ///
    /// Panics if `minimizer_size < R` or if the width is zero.
    #[inline]
    pub fn iter_pos(self, seq: &[u8]) -> ModSamplingPosIterator<T, S> {
        let Self {
            minimizer_size,
            width,
            hasher,
            encoding,
            ..
        } = self;
        assert!(
            minimizer_size >= R,
            "mod-minimizers require minimizer_size ≥ r={R}"
        );
        // Without this check a zero width would still panic, but with the
        // opaque "remainder with a divisor of zero" message.
        assert!(width > 0, "mod-minimizers require width > 0");
        let t = R + (minimizer_size - R) % usize::from(width);
        ModSamplingPosIterator::new(seq, minimizer_size, width, t, hasher, encoding)
    }
}
impl<T: PrimInt + Hash, S: BuildHasher> MinimizerBuilder<T, ModMinimizer, S, true> {
    /// Consumes the builder and creates a [`CanonicalModSamplingIterator`]
    /// over `seq`.
    ///
    /// Besides the usual parameters, the iterator receives
    /// `R + (minimizer_size - R) % width` as an extra size argument.
    ///
    /// # Panics
    ///
    /// Panics if `minimizer_size < R`, or if the configured width is even.
    #[inline]
    pub fn iter(self, seq: &[u8]) -> CanonicalModSamplingIterator<T, S> {
        let Self {
            minimizer_size,
            width,
            hasher,
            encoding,
            ..
        } = self;
        assert!(
            minimizer_size >= R,
            "mod-minimizers require minimizer_size ≥ r={R}"
        );
        assert_eq!(
            width % 2,
            1,
            "width must be odd to break ties between multiple minimizers"
        );
        // The width is odd here, hence non-zero: the remainder cannot trap.
        let extra_size = R + ((minimizer_size - R) % width as usize);
        CanonicalModSamplingIterator::new(
            seq,
            minimizer_size,
            width,
            extra_size,
            hasher,
            encoding,
        )
    }

    /// Consumes the builder and creates a
    /// [`CanonicalModSamplingPosIterator`] over `seq`.
    ///
    /// Takes exactly the same parameters as [`Self::iter`]; only the iterator
    /// type differs.
    ///
    /// # Panics
    ///
    /// Panics if `minimizer_size < R`, or if the configured width is even.
    #[inline]
    pub fn iter_pos(self, seq: &[u8]) -> CanonicalModSamplingPosIterator<T, S> {
        let Self {
            minimizer_size,
            width,
            hasher,
            encoding,
            ..
        } = self;
        assert!(
            minimizer_size >= R,
            "mod-minimizers require minimizer_size ≥ r={R}"
        );
        assert_eq!(
            width % 2,
            1,
            "width must be odd to break ties between multiple minimizers"
        );
        // The width is odd here, hence non-zero: the remainder cannot trap.
        let extra_size = R + ((minimizer_size - R) % width as usize);
        CanonicalModSamplingPosIterator::new(
            seq,
            minimizer_size,
            width,
            extra_size,
            hasher,
            encoding,
        )
    }
}
impl<T: PrimInt + Hash, A: MinimizerAlgorithm> MinimizerBuilder<T, A, DefaultHashBuilder> {
    /// Shared private constructor holding every default value:
    /// minimizer size 21, width `31 - 21 + 1`, a default-constructed
    /// `DefaultHashBuilder`, and the encoding `A=00`, `C=01`, `G=10`, `T=11`
    /// (identical for lower-case letters, 0 for every other byte).
    ///
    /// NOTE(review): the default size is not checked against `T` here; for
    /// integer types narrower than 42 bits it exceeds the bound enforced by
    /// the `minimizer_size` setter.
    fn _new() -> Self {
        let mut encoding = [0u8; 256];
        for (base, code) in [(b'A', 0b00), (b'C', 0b01), (b'G', 0b10), (b'T', 0b11)] {
            encoding[usize::from(base)] = code;
            encoding[usize::from(base.to_ascii_lowercase())] = code;
        }
        Self {
            minimizer_size: 21,
            width: 31 - 21 + 1,
            hasher: DefaultHashBuilder::default(),
            encoding,
            _marker: PhantomData,
        }
    }
}

// `seed` is provided for both values of `CANONICAL`, so that it can be called
// before or after `canonical()` / `non_canonical()`.
impl<T: PrimInt + Hash, A: MinimizerAlgorithm, const CANONICAL: bool>
    MinimizerBuilder<T, A, DefaultHashBuilder, CANONICAL>
{
    /// Replaces the hasher with a `DefaultHashBuilder` created from `seed`.
    pub fn seed(mut self, seed: u64) -> Self {
        self.hasher = DefaultHashBuilder::with_seed(seed);
        self
    }
}
impl<T: PrimInt + Hash, A: MinimizerAlgorithm, S: BuildHasher, const CANONICAL: bool>
    MinimizerBuilder<T, A, S, CANONICAL>
{
    /// Sets the size of the minimizers.
    ///
    /// # Panics
    ///
    /// Panics if `minimizer_size` exceeds half the number of bits of `T`.
    pub fn minimizer_size(mut self, minimizer_size: usize) -> Self {
        // Zero has every bit cleared, so its zero count is the bit width of `T`.
        let bit_width = T::zero().count_zeros();
        let max_size = (bit_width / 2) as usize;
        assert!(
            minimizer_size <= max_size,
            "With this integer type, minimizer_size must be ≤ {max_size}. Please select a smaller size or a larger type."
        );
        self.minimizer_size = minimizer_size;
        self
    }

    /// Sets the width of the window.
    ///
    /// The value is stored as-is; the canonical iterators check that it is
    /// odd when they are created.
    pub const fn width(mut self, width: u16) -> Self {
        self.width = width;
        self
    }

    /// Replaces the hash builder, changing the builder's `S` type parameter
    /// to `H`. Every other setting is carried over.
    pub fn hasher<H: BuildHasher>(self, hasher: H) -> MinimizerBuilder<T, A, H, CANONICAL> {
        let Self {
            minimizer_size,
            width,
            encoding,
            _marker,
            ..
        } = self;
        MinimizerBuilder {
            minimizer_size,
            width,
            hasher,
            encoding,
            _marker,
        }
    }

    /// Sets the codes stored in the encoding table for the bases `A`, `C`,
    /// `G` and `T`; each code is written for both the upper-case and the
    /// lower-case letter. Other table entries are left untouched.
    pub fn encoding(mut self, a: u8, c: u8, g: u8, t: u8) -> Self {
        for (base, code) in [(b'A', a), (b'C', c), (b'G', g), (b'T', t)] {
            self.encoding[usize::from(base)] = code;
            self.encoding[usize::from(base.to_ascii_lowercase())] = code;
        }
        self
    }

    /// Switches the builder to canonical mode (`CANONICAL = true`), keeping
    /// every setting.
    pub fn canonical(self) -> MinimizerBuilder<T, A, S, true> {
        let Self {
            minimizer_size,
            width,
            hasher,
            encoding,
            _marker,
        } = self;
        MinimizerBuilder {
            minimizer_size,
            width,
            hasher,
            encoding,
            _marker,
        }
    }

    /// Switches the builder to non-canonical mode (`CANONICAL = false`),
    /// keeping every setting.
    pub fn non_canonical(self) -> MinimizerBuilder<T, A, S, false> {
        let Self {
            minimizer_size,
            width,
            hasher,
            encoding,
            _marker,
        } = self;
        MinimizerBuilder {
            minimizer_size,
            width,
            hasher,
            encoding,
            _marker,
        }
    }
}