use crate::kmer::Kmer;
use crate::hasher::DeterministicHasher;
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct MinimizerInfo {
pub value: u64,
pub position: u64,
pub pos_in_kmer: usize,
}
impl MinimizerInfo {
pub fn new(value: u64, position: u64, pos_in_kmer: usize) -> Self {
Self {
value,
position,
pos_in_kmer,
}
}
}
pub struct MinimizerIterator {
k: usize,
m: usize,
position: u64,
min_pos_in_kmer: usize,
min_value: u64,
min_position: u64,
min_hash: u64,
hasher: DeterministicHasher,
}
impl MinimizerIterator {
pub fn with_seed(k: usize, m: usize, seed: u64) -> Self {
assert!(k > 0 && m <= k, "k must be > 0 and m <= k");
let hasher = DeterministicHasher::new(seed);
let mut iter = Self {
k,
m,
position: 0,
min_pos_in_kmer: 0,
min_value: 0,
min_position: 0,
min_hash: u64::MAX,
hasher,
};
iter.reset();
iter
}
pub fn new(k: usize, m: usize, _position: u64) -> Self {
Self::with_seed(k, m, 1)
}
pub fn set_position(&mut self, position: u64) {
self.position = position;
self.reset();
}
fn reset(&mut self) {
self.min_pos_in_kmer = 0;
self.min_position = self.position.wrapping_sub(1);
self.min_hash = u64::MAX;
}
fn hash_u64(&self, value: u64) -> u64 {
self.hasher.hash_u64(value)
}
pub fn next<const K: usize>(&mut self, kmer: Kmer<K>) -> MinimizerInfo
where
Kmer<K>: crate::kmer::KmerBits,
{
assert_eq!(K, self.k, "k-mer size must match iterator k");
if self.min_pos_in_kmer == 0 {
self.position = self.min_position.wrapping_add(1);
self.rescan(kmer);
} else {
self.position += 1;
let mmer_value = self.extract_mmer(kmer, self.k - self.m);
let hash = self.hash_u64(mmer_value);
if hash < self.min_hash {
self.min_hash = hash;
self.min_value = mmer_value;
self.min_position = self.position;
self.min_pos_in_kmer = self.k - self.m;
} else {
self.min_pos_in_kmer -= 1;
}
}
MinimizerInfo::new(self.min_value, self.min_position, self.min_pos_in_kmer)
}
fn rescan<const K: usize>(&mut self, kmer: Kmer<K>)
where
Kmer<K>: crate::kmer::KmerBits,
{
self.min_hash = u64::MAX;
self.min_value = 0;
self.min_pos_in_kmer = 0;
let begin = self.position;
for i in 0..=(self.k - self.m) {
let mmer_value = self.extract_mmer(kmer, i);
let hash = self.hash_u64(mmer_value);
if hash < self.min_hash {
self.min_hash = hash;
self.min_value = mmer_value;
self.min_pos_in_kmer = i;
}
}
self.position = begin + (self.k - self.m) as u64;
self.min_position = begin + self.min_pos_in_kmer as u64;
}
#[inline]
fn extract_mmer<const K: usize>(&self, kmer: Kmer<K>, start_pos: usize) -> u64
where
Kmer<K>: crate::kmer::KmerBits,
{
let shift = start_pos * 2;
let mask = (1u64 << (self.m * 2)) - 1;
let bits_u128 = <Kmer<K> as crate::kmer::KmerBits>::to_u128(kmer.bits());
((bits_u128 >> shift) as u64) & mask
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::kmer::Kmer;
#[test]
fn test_minimizer_iterator_basic() {
let kmer: Kmer<15> = Kmer::from_str("ACGTACGTACGTACG").unwrap();
let mut iter = MinimizerIterator::new(15, 7, 0);
let mini = iter.next(kmer);
assert!(mini.pos_in_kmer <= 15 - 7); assert_eq!(mini.position, mini.pos_in_kmer as u64);
}
#[test]
fn test_minimizer_iterator_position() {
let kmer1: Kmer<9> = Kmer::from_str("ACGTACGTA").unwrap();
let kmer2: Kmer<9> = Kmer::from_str("CGTACGTAC").unwrap();
let mut iter = MinimizerIterator::new(9, 5, 0);
let mini1 = iter.next(kmer1);
let mini2 = iter.next(kmer2);
assert!(mini1.pos_in_kmer < 9);
assert!(mini2.pos_in_kmer < 9);
}
#[test]
fn test_minimizer_info_creation() {
let mini = MinimizerInfo::new(42, 100, 5);
assert_eq!(mini.value, 42);
assert_eq!(mini.position, 100);
assert_eq!(mini.pos_in_kmer, 5);
}
#[test]
fn test_minimizer_consistency_fresh_vs_sequential() {
let seq = "ATTTTCAGGATGTTTTCAGGTTCATCATCTCCCTTCTTTGCAGGATAGTAGATAAGATCGCTCATCAACGGATGTTGTGT";
let k = 31usize;
let m = 19usize;
let seed = 1u64;
let num_kmers = seq.len() - k + 1;
let mut seq_iter = MinimizerIterator::with_seed(k, m, seed);
seq_iter.set_position(0);
for kmer_pos in 0..num_kmers {
let kmer_str = &seq[kmer_pos..kmer_pos+k];
let kmer = Kmer::<31>::from_str(kmer_str).unwrap();
let seq_mini = seq_iter.next(kmer);
let mut fresh_iter = MinimizerIterator::with_seed(k, m, seed);
let fresh_mini = fresh_iter.next(kmer);
assert_eq!(
seq_mini.value, fresh_mini.value,
"Minimizer VALUE mismatch at kmer_pos={}: seq={} fresh={} kmer={}",
kmer_pos, seq_mini.value, fresh_mini.value, kmer_str
);
assert_eq!(
seq_mini.pos_in_kmer, fresh_mini.pos_in_kmer,
"Minimizer POS mismatch at kmer_pos={}: seq={} fresh={} kmer={}",
kmer_pos, seq_mini.pos_in_kmer, fresh_mini.pos_in_kmer, kmer_str
);
}
}
}