nthash_rs/
seed.rs

1//! **Streaming spaced-seed ntHash** for *non-contiguous* k‑mers.
2//!
3//! **`SeedNtHash` computes hashes using spaced seeds**, where only selected
4//! positions in the k‑mer are considered (“care sites”).
5//!
6//! Hashes are re‑computed per window rather than rolled, allowing support
7//! for multiple seeds and arbitrary binary masks.
8//!
9//! Bit-level operations are delegated to `tables`, `constants`, and
10//! `util::extend_hashes` for efficient hash computation.
11//!
12//! A Rust‑idiomatic **builder + iterator** (`SeedNtHashBuilder` / `SeedNtHashIter`)
13//! provides ergonomic traversal over valid k‑mers.
14
15use crate::{
16    constants::{CP_OFF, SEED_N, SEED_TAB},
17    tables::srol_table,
18    util::extend_hashes,
19    NtHashError, Result,
20};
21
22/// Parses a spaced-seed mask string composed of '0' and '1' characters
23/// into a list of indices indicating which positions should be used ("care positions").
24/// 
25/// # Errors
26/// Returns an error if the mask length does not match `k`, or contains characters other than '0' or '1'.
27fn parse_seed_string(mask: &str, k: usize) -> Result<Vec<usize>> {
28    if mask.len() != k {
29        return Err(NtHashError::InvalidK);
30    }
31    if !mask.bytes().all(|b| b == b'0' || b == b'1') {
32        return Err(NtHashError::InvalidSequence);
33    }
34    Ok(mask
35        .bytes()
36        .enumerate()
37        .filter_map(|(i, b)| if b == b'1' { Some(i) } else { None })
38        .collect())
39}
40
41/// Computes the forward and reverse hash values for a given k-mer using a spaced seed.
42/// 
43/// # Arguments
44/// - `window`: The current k-mer slice from the sequence.
45/// - `care`: The positions to include in hashing (as defined by the spaced seed).
46/// - `k`: Length of the k-mer.
47/// 
48/// # Returns
49/// A tuple of (forward_hash, reverse_hash).
50#[inline]
51fn compute_pair(window: &[u8], care: &[usize], k: usize) -> (u64, u64) {
52    let mut fwd = 0u64;
53    let mut rev = 0u64;
54    for &p in care {
55        let c_f = window[p];
56        let c_r = c_f & CP_OFF; // Apply complement transformation
57
58        fwd ^= srol_table(c_f, (k - 1 - p) as u32); // Position-dependent rotation
59        rev ^= srol_table(c_r, p as u32);
60    }
61    (fwd, rev)
62}
63
64/// Struct for computing spaced-seed ntHash values in a re-computational manner.
65/// Can handle multiple seeds and generates multiple hashes per k-mer.
66pub struct SeedNtHash<'a> {
67    seq:      &'a [u8],        // Input nucleotide sequence
68    k:        usize,           // k-mer size
69    num_hashes: usize,         // Number of hashes per seed
70    seeds:    Vec<Vec<usize>>, // Care indices for each seed
71    pos:      usize,           // Current position in the sequence
72    hashes:   Vec<u64>,        // Hash results (flattened)
73    initialised: bool,         // Whether the hasher has found the first valid k-mer
74}
75
76impl<'a> SeedNtHash<'a> {
77    /// Creates a new hasher from a sequence and spaced-seed masks.
78    /// 
79    /// # Errors
80    /// Returns an error if `k` is zero, the sequence is too short, or a mask is invalid.
81    pub fn new(
82        seq: &'a [u8],
83        seed_masks: &[String],
84        num_hashes_per_seed: usize,
85        k: u16,
86        start_pos: usize,
87    ) -> Result<Self> {
88        if k == 0 {
89            return Err(NtHashError::InvalidK);
90        }
91        let k_usz = k as usize;
92        if seq.len() < k_usz {
93            return Err(NtHashError::SequenceTooShort {
94                seq_len: seq.len(),
95                k,
96            });
97        }
98        if start_pos > seq.len() - k_usz {
99            return Err(NtHashError::PositionOutOfRange {
100                pos: start_pos,
101                seq_len: seq.len(),
102            });
103        }
104
105        let mut seeds = Vec::with_capacity(seed_masks.len());
106        for m in seed_masks {
107            seeds.push(parse_seed_string(m, k_usz)?);
108        }
109
110        Ok(Self {
111            seq,
112            k: k_usz,
113            num_hashes: num_hashes_per_seed.max(1),
114            seeds,
115            pos: start_pos,
116            hashes: vec![0; seed_masks.len() * num_hashes_per_seed.max(1)],
117            initialised: false,
118        })
119    }
120
121    /// Alternative constructor using pre-parsed care indices (skips mask parsing).
122    pub fn from_care_indices(
123        seq: &'a [u8],
124        seeds: Vec<Vec<usize>>,
125        num_hashes_per_seed: usize,
126        k: u16,
127        start_pos: usize,
128    ) -> Result<Self> {
129        let k_usz = k as usize;
130        if seeds.iter().any(|v| v.iter().any(|&i| i >= k_usz)) {
131            return Err(NtHashError::InvalidWindowOffsets);
132        }
133        Self::new(
134            seq,
135            &vec![String::from_utf8(vec![b'0'; k_usz]).unwrap(); seeds.len()], // dummy masks
136            num_hashes_per_seed,
137            k,
138            start_pos,
139        )
140        .map(|mut s| {
141            s.seeds = seeds;
142            s
143        })
144    }
145
146    /// Returns the current position in the sequence.
147    #[inline(always)]
148    pub fn pos(&self) -> usize {
149        self.pos
150    }
151
152    /// Returns the current set of hash values.
153    #[inline(always)]
154    pub fn hashes(&self) -> &[u64] {
155        &self.hashes
156    }
157
158    /// Advances the iterator by one position.
159    /// On first call, searches for the first valid k-mer (initialization).
160    pub fn roll(&mut self) -> bool {
161        if !self.initialised {
162            return self.init();
163        }
164
165        if self.pos >= self.seq.len() - self.k {
166            return false; // End of sequence
167        }
168
169        self.pos += 1;
170        self.compute_current()
171    }
172
173    /// Computes hashes for the k-mer at the current position.
174    /// Returns false if any ambiguous base is found.
175    fn compute_current(&mut self) -> bool {
176        let win = &self.seq[self.pos..self.pos + self.k];
177        for care in &self.seeds {
178            if care.iter().any(|&p| SEED_TAB[win[p] as usize] == SEED_N) {
179                return false;
180            }
181        }
182
183        for (i_seed, care) in self.seeds.iter().enumerate() {
184            let (fwd, rev) = compute_pair(win, care, self.k);
185            let slice = &mut self.hashes[i_seed * self.num_hashes
186                ..(i_seed + 1) * self.num_hashes];
187            extend_hashes(fwd, rev, self.k as u32, slice);
188        }
189        true
190    }
191
192    /// Initializes by finding the first valid k-mer in the sequence.
193    fn init(&mut self) -> bool {
194        while self.pos <= self.seq.len() - self.k {
195            if self.compute_current() {
196                self.initialised = true;
197                return true;
198            }
199            self.pos += 1;
200        }
201        false
202    }
203}
204
205// ─────────────────────────────────────────────────────────────────────────────
206// Builder + Iterator façade for ergonomic traversal of spaced-seed hashes
207// ─────────────────────────────────────────────────────────────────────────────
208
209/// Builder for creating a `SeedNtHashIter`, providing ergonomic configuration.
210///
211/// Example:
212/// ```rust
213/// use nthash_rs::{SeedNtHashBuilder, Result};
214///
215/// # fn main() -> Result<()> {
216/// let seq   = b"ATCGTACGATGCATGCATGCTGACG";
217/// let masks = vec!["000111", "010101"];
218///
219/// for (pos, hashes) in SeedNtHashBuilder::new(seq)
220///                        .k(6)
221///                        .masks(masks)
222///                        .num_hashes(2)
223///                        .finish()? {
224///     println!("{pos:2}  {:016x}", hashes[0]);
225/// }
226/// # Ok(()) }
227/// ```
228pub struct SeedNtHashBuilder<'a> {
229    seq:        &'a [u8],
230    masks:      Vec<String>,
231    k:          u16,
232    num_hashes: usize,
233    start_pos:  usize,
234}
235
236impl<'a> SeedNtHashBuilder<'a> {
237    /// Starts building a new ntHash configuration from the given sequence.
238    pub fn new(seq: &'a [u8]) -> Self {
239        Self {
240            seq,
241            masks: Vec::new(),
242            k: 0,
243            num_hashes: 1,
244            start_pos: 0,
245        }
246    }
247
248    /// Sets the k-mer size.
249    pub fn k(mut self, k: u16) -> Self {
250        self.k = k;
251        self
252    }
253
254    /// Adds seed masks where '1' indicates positions to hash.
255    pub fn masks<S: Into<String>, I: IntoIterator<Item = S>>(mut self, m: I) -> Self {
256        self.masks = m.into_iter().map(Into::into).collect();
257        self
258    }
259
260    /// Specifies number of hashes per spaced seed.
261    pub fn num_hashes(mut self, n: usize) -> Self {
262        self.num_hashes = n;
263        self
264    }
265
266    /// Sets the start position in the sequence.
267    pub fn pos(mut self, p: usize) -> Self {
268        self.start_pos = p;
269        self
270    }
271
272    /// Finalizes the builder and returns an iterator over the hashes.
273    pub fn finish(self) -> Result<SeedNtHashIter<'a>> {
274        let hasher = SeedNtHash::new(
275            self.seq,
276            &self.masks,
277            self.num_hashes,
278            self.k,
279            self.start_pos,
280        )?;
281        Ok(SeedNtHashIter { hasher, done: false })
282    }
283}
284
285/// Iterator for traversing valid k-mers and yielding spaced-seed hashes.
286pub struct SeedNtHashIter<'a> {
287    hasher: SeedNtHash<'a>,
288    done:   bool,
289}
290
291impl<'a> Iterator for SeedNtHashIter<'a> {
292    type Item = (usize, Vec<u64>);
293
294    fn next(&mut self) -> Option<Self::Item> {
295        if self.done {
296            return None;
297        }
298        if !self.hasher.roll() {
299            self.done = true;
300            return None;
301        }
302        Some((self.hasher.pos(), self.hasher.hashes().to_vec()))
303    }
304}
305
306impl<'a> IntoIterator for SeedNtHashBuilder<'a> {
307    type Item = (usize, Vec<u64>);
308    type IntoIter = SeedNtHashIter<'a>;
309
310    fn into_iter(self) -> Self::IntoIter {
311        self.finish()
312            .expect("invalid SeedNtHashBuilder configuration")
313    }
314}
315
316// ─────────────────────────────────────────────────────────────────────────────
317// Basic Unit Test
318// ─────────────────────────────────────────────────────────────────────────────
319#[cfg(test)]
320mod tests {
321    use super::*;
322
323    #[test]
324    fn basic_spaced_seed() {
325        let seq = b"ATCGTACGATGCATGCATGCTGACG";
326        let masks = vec!["000111".to_string(), "010101".to_string()];
327        let mut h = SeedNtHash::new(seq, &masks, 2, 6, 0).unwrap();
328        assert!(h.roll()); // first valid
329        let first = h.hashes()[0];
330        assert!(h.roll()); // next valid
331        assert_ne!(first, h.hashes()[0]); // hashes should differ
332    }
333}