nthash_rs/
blind.rs

1//! **Streaming (“blind”) ntHash** for *contiguous* k‑mers.
2//!
3//! Unlike [`kmer::NtHash`](crate::kmer), which owns an entire DNA string and
4//! skips windows containing  'N', **`BlindNtHash` works on a pre‑cleaned input
5//! where every window of length *k* is known to be valid**.  
6//!
7//! The hasher maintains an *exact* k‑base sliding window in a small ring
8//! buffer and lets the caller **feed the next / previous character** to move
9//! the window forward or backward.
10//!
11//! Heavy bit‑twiddling is delegated to the `tables` (split‑rotate) and
12//! `constants` (lookup tables) modules, plus `util::extend_hashes` for
13//! generating extra hash values per window.
14//!
15//! A Rust‑idiomatic **builder + iterator** facade
16//! (`BlindNtHashBuilder` / `BlindNtHashIter`) is included for ergonomic
17//! streaming over an already‑sanitised sequence.
18
19use std::collections::VecDeque;
20
21use crate::{
22    constants::*,
23    kmer::{base_forward_hash, base_reverse_hash},
24    tables::{srol, srol_table, sror},
25    util::extend_hashes,
26    NtHashError, Result,
27};
28
29/// Rolling hash over a *fixed‑width* window that the caller rolls manually.
30///
31/// The window is stored in a `VecDeque<u8>`:
32/// - `roll()` removes the **front** base and pushes a new base at the **back**.
33/// - `roll_back()` does the opposite.
34/// - `peek()` / `peek_back()` compute hashes for the next / previous window
35///   **without** mutating internal state.
36pub struct BlindNtHash {
37    window: VecDeque<u8>,
38    k: u16,
39    pos: isize,
40    fwd_hash: u64,
41    rev_hash: u64,
42    hashes: Vec<u64>,
43}
44
45impl BlindNtHash {
46    /// Create a new `BlindNtHash` whose initial window is `seq[pos..pos+k]`.
47    ///
48    /// * The caller must guarantee* that the slice contains **no ambiguous
49    /// bases (‘N’)** – the blind variant will not skip over invalid windows.
50    ///
51    /// # Errors
52    ///
53    /// Returns if `k == 0`, `seq.len() < k`, or `pos` too large.
54    pub fn new(seq: &[u8], k: u16, num_hashes: u8, pos: isize) -> Result<Self> {
55        if k == 0 {
56            return Err(NtHashError::InvalidK);
57        }
58        let len = seq.len();
59        let k_usz = k as usize;
60
61        if pos < 0 || (pos as usize) > len - k_usz {
62            return Err(NtHashError::PositionOutOfRange {
63                pos: pos as usize,
64                seq_len: len,
65            });
66        }
67
68        let slice = &seq[(pos as usize)..(pos as usize + k_usz)];
69        let mut window = VecDeque::with_capacity(k_usz);
70        window.extend(slice.iter().copied());
71
72        let fwd_hash = base_forward_hash(slice, k);
73        let rev_hash = base_reverse_hash(slice, k);
74
75        let mut hashes = vec![0; num_hashes as usize];
76        extend_hashes(fwd_hash, rev_hash, k as u32, &mut hashes);
77
78        Ok(Self {
79            window,
80            k,
81            pos,
82            fwd_hash,
83            rev_hash,
84            hashes,
85        })
86    }
87
88    /// Returns `true` if a new valid hash was produced.
89    pub fn roll(&mut self, char_in: u8) -> bool {
90        let char_out = self
91            .window
92            .pop_front()
93            .expect("window length is always k > 0");
94        self.window.push_back(char_in);
95
96        self.fwd_hash = next_forward_hash(self.fwd_hash, self.k, char_out, char_in);
97        self.rev_hash = next_reverse_hash(self.rev_hash, self.k, char_out, char_in);
98        extend_hashes(
99            self.fwd_hash,
100            self.rev_hash,
101            self.k as u32,
102            &mut self.hashes,
103        );
104        self.pos += 1;
105        true
106    }
107
108    pub fn roll_back(&mut self, char_in: u8) -> bool {
109        debug_assert_eq!(self.window.len(), self.k as usize);
110        let char_out = self
111            .window
112            .pop_back()
113            .expect("window length is always k > 0");
114        self.window.push_front(char_in);
115
116        self.fwd_hash = prev_forward_hash(self.fwd_hash, self.k, char_out, char_in);
117        self.rev_hash = prev_reverse_hash(self.rev_hash, self.k, char_out, char_in);
118        extend_hashes(
119            self.fwd_hash,
120            self.rev_hash,
121            self.k as u32,
122            &mut self.hashes,
123        );
124        self.pos -= 1;
125        true
126    }
127
128    /// Compute hashes for the **next** window without mutating `self`.
129    pub fn peek(&mut self, char_in: u8) {
130        let char_out = *self.window.front().unwrap();
131        let fwd = next_forward_hash(self.fwd_hash, self.k, char_out, char_in);
132        let rev = next_reverse_hash(self.rev_hash, self.k, char_out, char_in);
133        extend_hashes(fwd, rev, self.k as u32, &mut self.hashes);
134    }
135
136    pub fn peek_back(&mut self, char_in: u8) {
137        let char_out = *self.window.back().unwrap();
138        let fwd = prev_forward_hash(self.fwd_hash, self.k, char_out, char_in);
139        let rev = prev_reverse_hash(self.rev_hash, self.k, char_out, char_in);
140        extend_hashes(fwd, rev, self.k as u32, &mut self.hashes);
141    }
142
143    #[inline(always)]
144    pub fn hashes(&self) -> &[u64] {
145        &self.hashes
146    }
147
148    #[inline(always)]
149    pub fn pos(&self) -> isize {
150        self.pos
151    }
152
153    #[inline(always)]
154    pub fn forward_hash(&self) -> u64 {
155        self.fwd_hash
156    }
157
158    #[inline(always)]
159    pub fn reverse_hash(&self) -> u64 {
160        self.rev_hash
161    }
162}
163
164#[inline(always)]
165fn next_forward_hash(prev: u64, k: u16, char_out: u8, char_in: u8) -> u64 {
166    srol(prev) ^ SEED_TAB[char_in as usize] ^ srol_table(char_out, k as u32)
167}
168
169#[inline(always)]
170fn prev_forward_hash(prev: u64, k: u16, char_out: u8, char_in: u8) -> u64 {
171    sror(prev ^ srol_table(char_in, k as u32) ^ SEED_TAB[char_out as usize])
172}
173
174#[inline(always)]
175fn next_reverse_hash(prev: u64, k: u16, char_out: u8, char_in: u8) -> u64 {
176    sror(prev ^ srol_table(char_in & CP_OFF, k as u32) ^ SEED_TAB[(char_out & CP_OFF) as usize])
177}
178
179#[inline(always)]
180fn prev_reverse_hash(prev: u64, k: u16, char_out: u8, char_in: u8) -> u64 {
181    srol(prev) ^ SEED_TAB[(char_in & CP_OFF) as usize] ^ srol_table(char_out & CP_OFF, k as u32)
182}
183
184pub struct BlindNtHashBuilder<'a> {
185    seq: &'a [u8],
186    k: u16,
187    num_hashes: u8,
188    start_pos: usize,
189}
190
191impl<'a> BlindNtHashBuilder<'a> {
192    pub fn new(seq: &'a [u8]) -> Self {
193        Self {
194            seq,
195            k: 0,
196            num_hashes: 1,
197            start_pos: 0,
198        }
199    }
200
201    pub fn k(mut self, k: u16) -> Self {
202        self.k = k;
203        self
204    }
205
206    pub fn num_hashes(mut self, m: u8) -> Self {
207        self.num_hashes = m;
208        self
209    }
210
211    pub fn pos(mut self, pos: usize) -> Self {
212        self.start_pos = pos;
213        self
214    }
215
216    pub fn finish(self) -> Result<BlindNtHashIter<'a>> {
217        let hasher = BlindNtHash::new(self.seq, self.k, self.num_hashes, self.start_pos as isize)?;
218        let end = self.seq.len() - self.k as usize;
219        Ok(BlindNtHashIter {
220            seq: self.seq,
221            end,
222            hasher,
223            first: true,
224        })
225    }
226}
227
228pub struct BlindNtHashIter<'a> {
229    seq: &'a [u8],
230    end: usize,
231    hasher: BlindNtHash,
232    first: bool,
233}
234
235impl<'a> Iterator for BlindNtHashIter<'a> {
236    type Item = (usize, Vec<u64>);
237
238    fn next(&mut self) -> Option<Self::Item> {
239        if self.first {
240            self.first = false;
241            return Some((self.hasher.pos() as usize, self.hasher.hashes().to_vec()));
242        }
243
244        let cur = self.hasher.pos() as usize;
245        if cur >= self.end {
246            return None;
247        }
248
249        let incoming = self.seq[cur + self.hasher.k as usize];
250        self.hasher.roll(incoming);
251
252        Some((self.hasher.pos() as usize, self.hasher.hashes().to_vec()))
253    }
254}
255
256impl<'a> IntoIterator for BlindNtHashBuilder<'a> {
257    type Item = (usize, Vec<u64>);
258    type IntoIter = BlindNtHashIter<'a>;
259
260    fn into_iter(self) -> Self::IntoIter {
261        self.finish()
262            .expect("invalid BlindNtHashBuilder configuration")
263    }
264}