nthash_rs/
kmer.rs

1//! Canonical **ntHash** implementation for *contiguous* k‑mers.
2//!
3//! This module provides a panic‑free, idiomatic Rust port of the C++ `NtHash`
4//! class.  It computes rolling hashes over DNA k‑mers (A/C/G/T/N) in **O(1)**
5//! time per base (after an initial **O(k)** seed computation), skipping over
6//! any windows that contain ‘N’ exactly as the reference does.
7//!
8//! All heavy bit‑twiddling is delegated to the `tables` (split‑rotate) and
9//! `constants` (lookup tables) modules, plus `util::extend_hashes` for
10//! generating extra hash values per k‑mer.
11//!
12//! Additionally, a Rust‑idiomatic **builder + iterator** facade
13//! (`NtHashBuilder` / `NtHashIter`) is provided.
14
15use crate::{
16    constants::*,
17    tables::{srol, srol_n, srol_table, sror},
18    util::extend_hashes,
19    NtHashError, // unified crate-level error
20};
21
22/// Convenient alias for fallible operations in this module.
23pub type Result<T> = crate::Result<T>;
24
25/// Rolling k‑mer hasher over a contiguous DNA sequence.
26///
27/// - Initialization is deferred until the first valid k‑mer (skips any
28///   windows containing `N`).
29/// - `roll()` / `roll_back()` advance by one base, handling skips transparently.
30/// - Each valid k‑mer emits `num_hashes` values: the canonical hash plus
31///   extra mixes.
32pub struct NtHash<'a> {
33    seq: &'a [u8],
34    k: u16,
35    pos: usize,
36    initialized: bool,
37    fwd_hash: u64,
38    rev_hash: u64,
39    hashes: Vec<u64>,
40}
41
42impl<'a> NtHash<'a> {
43    /// Create a new `NtHash` starting at `pos`.
44    ///
45    /// # Arguments
46    ///
47    /// * `seq` – full DNA sequence (`A,C,G,T,N` recognized; others treated as `N`)
48    /// * `k` – k‑mer length (> 0)
49    /// * `num_hashes` – how many hash values per k‑mer
50    /// * `pos` – starting index
51    ///
52    /// # Errors
53    ///
54    /// Returns if `k == 0`, `seq.len() < k`, or `pos` too large.
55    pub fn new(seq: &'a [u8], k: u16, num_hashes: u8, pos: usize) -> Result<Self> {
56        if k == 0 {
57            return Err(NtHashError::InvalidK);
58        }
59        let len = seq.len();
60        let k_usz = k as usize;
61        if len < k_usz {
62            return Err(NtHashError::SequenceTooShort { seq_len: len, k });
63        }
64        if pos > len - k_usz {
65            return Err(NtHashError::PositionOutOfRange { pos, seq_len: len });
66        }
67        Ok(Self {
68            seq: seq,
69            k,
70            pos,
71            initialized: false,
72            fwd_hash: 0,
73            rev_hash: 0,
74            hashes: vec![0; num_hashes as usize],
75        })
76    }
77
78    /// Advance forward by one base, skipping over k‑mers with `N`.
79    /// Returns `true` if a new valid hash was produced.
80    pub fn roll(&mut self) -> bool {
81        if !self.initialized {
82            return self.init();
83        }
84        let k_usz = self.k as usize;
85        if self.pos >= self.seq.len() - k_usz {
86            return false;
87        }
88        let incoming = self.seq[self.pos + k_usz];
89        if SEED_TAB[incoming as usize] == SEED_N {
90            self.pos += k_usz;
91            return self.init();
92        }
93        let outgoing = self.seq[self.pos];
94        self.fwd_hash = next_forward_hash(self.fwd_hash, self.k, outgoing, incoming);
95        self.rev_hash = next_reverse_hash(self.rev_hash, self.k, outgoing, incoming);
96        self.update_hashes();
97        self.pos += 1;
98        true
99    }
100
101    /// Move backward by one base, skipping over k‑mers with `N`.
102    pub fn roll_back(&mut self) -> bool {
103        if !self.initialized && !self.init() {
104            return false;
105        }
106        if self.pos == 0 {
107            return false;
108        }
109        let incoming = self.seq[self.pos - 1];
110        if SEED_TAB[incoming as usize] == SEED_N {
111            if self.pos < self.k as usize {
112                return false;
113            }
114            self.pos -= self.k as usize;
115            return self.init();
116        }
117        let outgoing = self.seq[self.pos + self.k as usize - 1];
118        self.fwd_hash = prev_forward_hash(self.fwd_hash, self.k, outgoing, incoming);
119        self.rev_hash = prev_reverse_hash(self.rev_hash, self.k, outgoing, incoming);
120        self.update_hashes();
121        self.pos -= 1;
122        true
123    }
124
125    /// Peek the next k‑mer without mutating self.
126    pub fn peek(&mut self) -> bool {
127        if self.pos >= self.seq.len() - self.k as usize {
128            return false;
129        }
130        let incoming = self.seq[self.pos + self.k as usize];
131        self.peek_char(incoming)
132    }
133
134    /// Peek with an explicit incoming byte.
135    pub fn peek_char(&mut self, incoming: u8) -> bool {
136        if !self.initialized && !self.init() {
137            return false;
138        }
139        if SEED_TAB[incoming as usize] == SEED_N {
140            return false;
141        }
142        let outgoing = self.seq[self.pos];
143        let fwd = next_forward_hash(self.fwd_hash, self.k, outgoing, incoming);
144        let rev = next_reverse_hash(self.rev_hash, self.k, outgoing, incoming);
145        self.fill_hash_buffer(fwd, rev);
146        true
147    }
148
149    /// Peek the previous k‑mer without mutating self.
150    pub fn peek_back(&mut self) -> bool {
151        if self.pos == 0 {
152            return false;
153        }
154        let incoming = self.seq[self.pos - 1];
155        self.peek_back_char(incoming)
156    }
157
158    /// Peek backward with explicit incoming byte.
159    pub fn peek_back_char(&mut self, incoming: u8) -> bool {
160        if !self.initialized && !self.init() {
161            return false;
162        }
163        if SEED_TAB[incoming as usize] == SEED_N {
164            return false;
165        }
166        let outgoing = self.seq[self.pos + self.k as usize - 1];
167        let fwd = prev_forward_hash(self.fwd_hash, self.k, outgoing, incoming);
168        let rev = prev_reverse_hash(self.rev_hash, self.k, outgoing, incoming);
169        self.fill_hash_buffer(fwd, rev);
170        true
171    }
172
173    /// Returns the most recent hash buffer.
174    #[inline(always)]
175    pub fn hashes(&self) -> &[u64] {
176        &self.hashes
177    }
178
179    /// Returns the current k‑mer start index.
180    #[inline(always)]
181    pub fn pos(&self) -> usize {
182        self.pos
183    }
184
185    /// Returns the forward‑strand hash.
186    #[inline(always)]
187    pub fn forward_hash(&self) -> u64 {
188        self.fwd_hash
189    }
190
191    /// Returns the reverse‑complement hash.
192    #[inline(always)]
193    pub fn reverse_hash(&self) -> u64 {
194        self.rev_hash
195    }
196
197    /// Initialize on the first valid k‑mer.
198    fn init(&mut self) -> bool {
199        let k_usz = self.k as usize;
200        while self.pos <= self.seq.len() - k_usz {
201            let mut skip = 0;
202            if has_invalid_base(&self.seq[self.pos..], k_usz, &mut skip) {
203                self.pos += skip + 1;
204                continue;
205            }
206            self.fwd_hash = base_forward_hash(&self.seq[self.pos..], self.k);
207            self.rev_hash = base_reverse_hash(&self.seq[self.pos..], self.k);
208            self.update_hashes();
209            self.initialized = true;
210            return true;
211        }
212        false
213    }
214
215    #[inline(always)]
216    fn update_hashes(&mut self) {
217        extend_hashes(
218            self.fwd_hash,
219            self.rev_hash,
220            self.k as u32,
221            &mut self.hashes,
222        );
223    }
224
225    #[inline(always)]
226    fn fill_hash_buffer(&mut self, fwd: u64, rev: u64) {
227        extend_hashes(fwd, rev, self.k as u32, &mut self.hashes);
228    }
229}
230
231#[inline(always)]
232pub fn has_invalid_base(seq: &[u8], k: usize, pos_n: &mut usize) -> bool {
233    if let Some(idx) = seq[..k]
234        .iter()
235        .rposition(|&c| SEED_TAB[c as usize] == SEED_N)
236    {
237        *pos_n = idx;
238        true
239    } else {
240        false
241    }
242}
243
244#[inline]
245pub fn base_forward_hash(seq: &[u8], k: u16) -> u64 {
246    let k = k as usize;
247    let mut h = 0_u64;
248
249    for chunk in seq[..k - k % 4].chunks_exact(4) {
250        h = srol_n(h, 4);
251
252        // build 0‑255 index with 8‑bit wrapping
253        let idx = (CONVERT_TAB[chunk[0] as usize] as usize) * 64
254            + (CONVERT_TAB[chunk[1] as usize] as usize) * 16
255            + (CONVERT_TAB[chunk[2] as usize] as usize) * 4
256            + CONVERT_TAB[chunk[3] as usize] as usize;
257        h ^= TETRAMER_TAB[idx & 0xFF];
258    }
259
260    h = srol_n(h, (k % 4) as u32);
261    match k % 4 {
262        3 => {
263            let idx = (CONVERT_TAB[seq[k - 3] as usize] as usize) * 16
264                + (CONVERT_TAB[seq[k - 2] as usize] as usize) * 4
265                + CONVERT_TAB[seq[k - 1] as usize] as usize;
266            h ^= TRIMER_TAB[idx & 0x3F];
267        }
268        2 => {
269            let idx = (CONVERT_TAB[seq[k - 2] as usize] as usize) * 4
270                + CONVERT_TAB[seq[k - 1] as usize] as usize;
271            h ^= DIMER_TAB[idx & 0x0F];
272        }
273        1 => h ^= SEED_TAB[seq[k - 1] as usize],
274        _ => {}
275    }
276    h
277}
278
279#[inline]
280pub fn base_reverse_hash(seq: &[u8], k: u16) -> u64 {
281    let k = k as usize;
282    let mut h = 0_u64;
283
284    // Handle the ‘tail’ (k % 4 = 1,2,3)
285    match k % 4 {
286        3 => {
287            let idx = (RC_CONVERT_TAB[seq[k - 1] as usize] as usize) * 16
288                + (RC_CONVERT_TAB[seq[k - 2] as usize] as usize) * 4
289                + RC_CONVERT_TAB[seq[k - 3] as usize] as usize;
290            h ^= TRIMER_TAB[idx & 0x3F];
291        }
292        2 => {
293            let idx = (RC_CONVERT_TAB[seq[k - 1] as usize] as usize) * 4
294                + RC_CONVERT_TAB[seq[k - 2] as usize] as usize;
295            h ^= DIMER_TAB[idx & 0x0F];
296        }
297        1 => {
298            let c = seq[k - 1] & CP_OFF;
299            h ^= SEED_TAB[c as usize];
300        }
301        _ => {}
302    }
303
304    // Process full 4‑mer chunks in reverse order
305    let mut i = k - k % 4;
306    while i >= 4 {
307        // split‑rotate the accumulator by 4
308        h = srol_n(h, 4);
309
310        // build 4‑mer index, mask to 8 bits
311        let idx = (RC_CONVERT_TAB[seq[i - 1] as usize] as usize) * 64
312            + (RC_CONVERT_TAB[seq[i - 2] as usize] as usize) * 16
313            + (RC_CONVERT_TAB[seq[i - 3] as usize] as usize) * 4
314            + RC_CONVERT_TAB[seq[i - 4] as usize] as usize;
315        h ^= TETRAMER_TAB[idx & 0xFF];
316
317        i -= 4;
318    }
319    h
320}
321
322#[inline(always)]
323fn next_forward_hash(prev: u64, k: u16, char_out: u8, char_in: u8) -> u64 {
324    let mut h = srol(prev);
325    h ^= SEED_TAB[char_in as usize];
326    h ^= srol_table(char_out, k as u32);
327    h
328}
329
330#[inline(always)]
331fn prev_forward_hash(prev: u64, k: u16, char_out: u8, char_in: u8) -> u64 {
332    let mut h = prev ^ srol_table(char_in, k as u32);
333    h ^= SEED_TAB[char_out as usize];
334    sror(h)
335}
336
337#[inline(always)]
338fn next_reverse_hash(prev: u64, k: u16, char_out: u8, char_in: u8) -> u64 {
339    let mut h = prev ^ srol_table(char_in & CP_OFF, k as u32);
340    h ^= SEED_TAB[(char_out & CP_OFF) as usize];
341    sror(h)
342}
343
344#[inline(always)]
345fn prev_reverse_hash(prev: u64, k: u16, char_out: u8, char_in: u8) -> u64 {
346    let mut h = srol(prev);
347    h ^= SEED_TAB[(char_in & CP_OFF) as usize];
348    h ^= srol_table(char_out & CP_OFF, k as u32);
349    h
350}
351
352// -------------------------------------------------------------------------
353// Builder + Iterator facade
354// -------------------------------------------------------------------------
355
356/// Configure and consume a rolling‐hash computation as an iterator.
357pub struct NtHashBuilder<'a> {
358    seq: &'a [u8],
359    k: u16,
360    num_hashes: u8,
361    pos: usize,
362}
363
364impl<'a> NtHashBuilder<'a> {
365    /// Begin building over `seq`.
366    pub fn new(seq: &'a [u8]) -> Self {
367        NtHashBuilder {
368            seq,
369            k: 0,
370            num_hashes: 1,
371            pos: 0,
372        }
373    }
374
375    /// Set the k‑mer length.
376    pub fn k(mut self, k: u16) -> Self {
377        self.k = k;
378        self
379    }
380
381    /// Set how many hashes per k‑mer.
382    pub fn num_hashes(mut self, m: u8) -> Self {
383        self.num_hashes = m;
384        self
385    }
386
387    /// Set the starting position.
388    pub fn pos(mut self, pos: usize) -> Self {
389        self.pos = pos;
390        self
391    }
392
393    /// Finalize into an iterator.
394    pub fn finish(self) -> Result<NtHashIter<'a>> {
395        let hasher = NtHash::new(self.seq, self.k, self.num_hashes, self.pos)?;
396        Ok(NtHashIter {
397            hasher,
398            done: false,
399        })
400    }
401}
402
403/// Iterator yielding `(pos, Vec<u64>)` for each valid k‑mer.
404pub struct NtHashIter<'a> {
405    hasher: NtHash<'a>,
406    done: bool,
407}
408
409impl<'a> Iterator for NtHashIter<'a> {
410    type Item = (usize, Vec<u64>);
411
412    fn next(&mut self) -> Option<Self::Item> {
413        if self.done {
414            return None;
415        }
416        if !self.hasher.roll() {
417            self.done = true;
418            return None;
419        }
420        let out = (self.hasher.pos(), self.hasher.hashes().to_owned());
421        Some(out)
422    }
423}
424
425impl<'a> IntoIterator for NtHashBuilder<'a> {
426    type Item = (usize, Vec<u64>);
427    type IntoIter = NtHashIter<'a>;
428
429    fn into_iter(self) -> Self::IntoIter {
430        self.finish().expect("invalid NtHashBuilder configuration")
431    }
432}