nthash_rs/seed.rs
1//! **Streaming spaced-seed ntHash** for *non-contiguous* k‑mers.
2//!
3//! **`SeedNtHash` computes hashes using spaced seeds**, where only selected
4//! positions in the k‑mer are considered (“care sites”).
5//!
6//! Hashes are re‑computed per window rather than rolled, allowing support
7//! for multiple seeds and arbitrary binary masks.
8//!
9//! Bit-level operations are delegated to `tables`, `constants`, and
10//! `util::extend_hashes` for efficient hash computation.
11//!
12//! A Rust‑idiomatic **builder + iterator** (`SeedNtHashBuilder` / `SeedNtHashIter`)
13//! provides ergonomic traversal over valid k‑mers.
14
15use crate::{
16 constants::{CP_OFF, SEED_N, SEED_TAB},
17 tables::srol_table,
18 util::extend_hashes,
19 NtHashError, Result,
20};
21
22/// Parses a spaced-seed mask string composed of '0' and '1' characters
23/// into a list of indices indicating which positions should be used ("care positions").
24///
25/// # Errors
26/// Returns an error if the mask length does not match `k`, or contains characters other than '0' or '1'.
27fn parse_seed_string(mask: &str, k: usize) -> Result<Vec<usize>> {
28 if mask.len() != k {
29 return Err(NtHashError::InvalidK);
30 }
31 if !mask.bytes().all(|b| b == b'0' || b == b'1') {
32 return Err(NtHashError::InvalidSequence);
33 }
34 Ok(mask
35 .bytes()
36 .enumerate()
37 .filter_map(|(i, b)| if b == b'1' { Some(i) } else { None })
38 .collect())
39}
40
41/// Computes the forward and reverse hash values for a given k-mer using a spaced seed.
42///
43/// # Arguments
44/// - `window`: The current k-mer slice from the sequence.
45/// - `care`: The positions to include in hashing (as defined by the spaced seed).
46/// - `k`: Length of the k-mer.
47///
48/// # Returns
49/// A tuple of (forward_hash, reverse_hash).
50#[inline]
51fn compute_pair(window: &[u8], care: &[usize], k: usize) -> (u64, u64) {
52 let mut fwd = 0u64;
53 let mut rev = 0u64;
54 for &p in care {
55 let c_f = window[p];
56 let c_r = c_f & CP_OFF; // Apply complement transformation
57
58 fwd ^= srol_table(c_f, (k - 1 - p) as u32); // Position-dependent rotation
59 rev ^= srol_table(c_r, p as u32);
60 }
61 (fwd, rev)
62}
63
64/// Struct for computing spaced-seed ntHash values in a re-computational manner.
65/// Can handle multiple seeds and generates multiple hashes per k-mer.
66pub struct SeedNtHash<'a> {
67 seq: &'a [u8], // Input nucleotide sequence
68 k: usize, // k-mer size
69 num_hashes: usize, // Number of hashes per seed
70 seeds: Vec<Vec<usize>>, // Care indices for each seed
71 pos: usize, // Current position in the sequence
72 hashes: Vec<u64>, // Hash results (flattened)
73 initialised: bool, // Whether the hasher has found the first valid k-mer
74}
75
76impl<'a> SeedNtHash<'a> {
77 /// Creates a new hasher from a sequence and spaced-seed masks.
78 ///
79 /// # Errors
80 /// Returns an error if `k` is zero, the sequence is too short, or a mask is invalid.
81 pub fn new(
82 seq: &'a [u8],
83 seed_masks: &[String],
84 num_hashes_per_seed: usize,
85 k: u16,
86 start_pos: usize,
87 ) -> Result<Self> {
88 if k == 0 {
89 return Err(NtHashError::InvalidK);
90 }
91 let k_usz = k as usize;
92 if seq.len() < k_usz {
93 return Err(NtHashError::SequenceTooShort {
94 seq_len: seq.len(),
95 k,
96 });
97 }
98 if start_pos > seq.len() - k_usz {
99 return Err(NtHashError::PositionOutOfRange {
100 pos: start_pos,
101 seq_len: seq.len(),
102 });
103 }
104
105 let mut seeds = Vec::with_capacity(seed_masks.len());
106 for m in seed_masks {
107 seeds.push(parse_seed_string(m, k_usz)?);
108 }
109
110 Ok(Self {
111 seq,
112 k: k_usz,
113 num_hashes: num_hashes_per_seed.max(1),
114 seeds,
115 pos: start_pos,
116 hashes: vec![0; seed_masks.len() * num_hashes_per_seed.max(1)],
117 initialised: false,
118 })
119 }
120
121 /// Alternative constructor using pre-parsed care indices (skips mask parsing).
122 pub fn from_care_indices(
123 seq: &'a [u8],
124 seeds: Vec<Vec<usize>>,
125 num_hashes_per_seed: usize,
126 k: u16,
127 start_pos: usize,
128 ) -> Result<Self> {
129 let k_usz = k as usize;
130 if seeds.iter().any(|v| v.iter().any(|&i| i >= k_usz)) {
131 return Err(NtHashError::InvalidWindowOffsets);
132 }
133 Self::new(
134 seq,
135 &vec![String::from_utf8(vec![b'0'; k_usz]).unwrap(); seeds.len()], // dummy masks
136 num_hashes_per_seed,
137 k,
138 start_pos,
139 )
140 .map(|mut s| {
141 s.seeds = seeds;
142 s
143 })
144 }
145
146 /// Returns the current position in the sequence.
147 #[inline(always)]
148 pub fn pos(&self) -> usize {
149 self.pos
150 }
151
152 /// Returns the current set of hash values.
153 #[inline(always)]
154 pub fn hashes(&self) -> &[u64] {
155 &self.hashes
156 }
157
158 /// Advances the iterator by one position.
159 /// On first call, searches for the first valid k-mer (initialization).
160 pub fn roll(&mut self) -> bool {
161 if !self.initialised {
162 return self.init();
163 }
164
165 if self.pos >= self.seq.len() - self.k {
166 return false; // End of sequence
167 }
168
169 self.pos += 1;
170 self.compute_current()
171 }
172
173 /// Computes hashes for the k-mer at the current position.
174 /// Returns false if any ambiguous base is found.
175 fn compute_current(&mut self) -> bool {
176 let win = &self.seq[self.pos..self.pos + self.k];
177 for care in &self.seeds {
178 if care.iter().any(|&p| SEED_TAB[win[p] as usize] == SEED_N) {
179 return false;
180 }
181 }
182
183 for (i_seed, care) in self.seeds.iter().enumerate() {
184 let (fwd, rev) = compute_pair(win, care, self.k);
185 let slice = &mut self.hashes[i_seed * self.num_hashes
186 ..(i_seed + 1) * self.num_hashes];
187 extend_hashes(fwd, rev, self.k as u32, slice);
188 }
189 true
190 }
191
192 /// Initializes by finding the first valid k-mer in the sequence.
193 fn init(&mut self) -> bool {
194 while self.pos <= self.seq.len() - self.k {
195 if self.compute_current() {
196 self.initialised = true;
197 return true;
198 }
199 self.pos += 1;
200 }
201 false
202 }
203}
204
205// ─────────────────────────────────────────────────────────────────────────────
206// Builder + Iterator façade for ergonomic traversal of spaced-seed hashes
207// ─────────────────────────────────────────────────────────────────────────────
208
209/// Builder for creating a `SeedNtHashIter`, providing ergonomic configuration.
210///
211/// Example:
212/// ```rust
213/// use nthash_rs::{SeedNtHashBuilder, Result};
214///
215/// # fn main() -> Result<()> {
216/// let seq = b"ATCGTACGATGCATGCATGCTGACG";
217/// let masks = vec!["000111", "010101"];
218///
219/// for (pos, hashes) in SeedNtHashBuilder::new(seq)
220/// .k(6)
221/// .masks(masks)
222/// .num_hashes(2)
223/// .finish()? {
224/// println!("{pos:2} {:016x}", hashes[0]);
225/// }
226/// # Ok(()) }
227/// ```
228pub struct SeedNtHashBuilder<'a> {
229 seq: &'a [u8],
230 masks: Vec<String>,
231 k: u16,
232 num_hashes: usize,
233 start_pos: usize,
234}
235
236impl<'a> SeedNtHashBuilder<'a> {
237 /// Starts building a new ntHash configuration from the given sequence.
238 pub fn new(seq: &'a [u8]) -> Self {
239 Self {
240 seq,
241 masks: Vec::new(),
242 k: 0,
243 num_hashes: 1,
244 start_pos: 0,
245 }
246 }
247
248 /// Sets the k-mer size.
249 pub fn k(mut self, k: u16) -> Self {
250 self.k = k;
251 self
252 }
253
254 /// Adds seed masks where '1' indicates positions to hash.
255 pub fn masks<S: Into<String>, I: IntoIterator<Item = S>>(mut self, m: I) -> Self {
256 self.masks = m.into_iter().map(Into::into).collect();
257 self
258 }
259
260 /// Specifies number of hashes per spaced seed.
261 pub fn num_hashes(mut self, n: usize) -> Self {
262 self.num_hashes = n;
263 self
264 }
265
266 /// Sets the start position in the sequence.
267 pub fn pos(mut self, p: usize) -> Self {
268 self.start_pos = p;
269 self
270 }
271
272 /// Finalizes the builder and returns an iterator over the hashes.
273 pub fn finish(self) -> Result<SeedNtHashIter<'a>> {
274 let hasher = SeedNtHash::new(
275 self.seq,
276 &self.masks,
277 self.num_hashes,
278 self.k,
279 self.start_pos,
280 )?;
281 Ok(SeedNtHashIter { hasher, done: false })
282 }
283}
284
285/// Iterator for traversing valid k-mers and yielding spaced-seed hashes.
286pub struct SeedNtHashIter<'a> {
287 hasher: SeedNtHash<'a>,
288 done: bool,
289}
290
291impl<'a> Iterator for SeedNtHashIter<'a> {
292 type Item = (usize, Vec<u64>);
293
294 fn next(&mut self) -> Option<Self::Item> {
295 if self.done {
296 return None;
297 }
298 if !self.hasher.roll() {
299 self.done = true;
300 return None;
301 }
302 Some((self.hasher.pos(), self.hasher.hashes().to_vec()))
303 }
304}
305
306impl<'a> IntoIterator for SeedNtHashBuilder<'a> {
307 type Item = (usize, Vec<u64>);
308 type IntoIter = SeedNtHashIter<'a>;
309
310 fn into_iter(self) -> Self::IntoIter {
311 self.finish()
312 .expect("invalid SeedNtHashBuilder configuration")
313 }
314}
315
316// ─────────────────────────────────────────────────────────────────────────────
317// Basic Unit Test
318// ─────────────────────────────────────────────────────────────────────────────
319#[cfg(test)]
320mod tests {
321 use super::*;
322
323 #[test]
324 fn basic_spaced_seed() {
325 let seq = b"ATCGTACGATGCATGCATGCTGACG";
326 let masks = vec!["000111".to_string(), "010101".to_string()];
327 let mut h = SeedNtHash::new(seq, &masks, 2, 6, 0).unwrap();
328 assert!(h.roll()); // first valid
329 let first = h.hashes()[0];
330 assert!(h.roll()); // next valid
331 assert_ne!(first, h.hashes()[0]); // hashes should differ
332 }
333}