nthash_rs/
util.rs

1//! Miscellaneous helpers shared across all ntHash variants.
2//!
3//! This module provides two small yet critical building blocks used by
4//! every ntHash implementation (`NtHash`, `SeedNtHash`, `BlindNtHash`, etc.):
5//!
6//! - **`canonical`** — combine forward and reverse‐complement hashes into a
7//!   strand‐independent value by wrapping addition.
8//! - **`extend_hashes`** — generate a sequence of "extra" hash values from
9//!   one canonical base hash, matching the C++ reference’s multiplicative
10//!   mixing and shift scheme.
11//!
12//! These functions are marked `#[inline]` for zero‐overhead calls in hot paths,
13//! and the code is dependency‐free (only `core`/`std`), so it can be used
14//! in no‐std contexts if needed.
15
16use crate::constants::{MULTISEED, MULTISHIFT};
17
18/// Combine forward and reverse‐complement strand hashes into a single
19/// *canonical* k‑mer hash (strand‐independent).
20///
21/// The original ntHash definition simply **adds** the two 64‑bit words with
22/// wrapping arithmetic to remain well‐defined on overflow.
23///
24/// # Examples
25///
26/// ```
27/// # use nthash_rs::util::canonical;
28/// let fwd = 0xFFFF_FFFF_FFFF_FFFF;
29/// let rev = 1;
30/// // wraps around to 0
31/// assert_eq!(canonical(fwd, rev), 0);
32/// ```
33#[inline(always)]
34pub const fn canonical(fwd: u64, rev: u64) -> u64 {
35    fwd.wrapping_add(rev)
36}
37
38/// Expand a single canonical hash into a user‐provided slice of additional
39/// hash values.
40///
41/// This implements the same scheme as the C++ ntHash reference:
42/// each extra hash `h_i` (for `i ≥ 1`) is computed as:
43///
44/// ```text
45///   mix   = (i as u64) ^ (k as u64 * MULTISEED)
46///   h_i   = base.wrapping_mul(mix)
47///   h_i  ^= h_i >> MULTISHIFT
48/// ```
49///
50/// - `fwd`, `rev`  — forward and reverse‐complement strand hashes.
51/// - `k`           — k‑mer span or seed weight, used in the mixing step.
52/// - `hashes`      — output slice; the length determines how many values
53///                   (including the canonical hash at index 0) are generated.
54///
55/// If `hashes` is empty, this function returns immediately, avoiding any
56/// unnecessary branching in callers.
57///
58/// # Examples
59///
60/// ```
61/// # use nthash_rs::util::extend_hashes;
62/// let mut out = [0u64; 4];
63/// extend_hashes(0x1234, 0x5678, 5, &mut out);
64/// assert_eq!(out[0], 0x1234u64.wrapping_add(0x5678));
65/// // subsequent elements are nonzero mixes
66/// assert!(out[1] != out[0]);
67/// ```
68#[inline]
69pub fn extend_hashes(fwd: u64, rev: u64, k: u32, hashes: &mut [u64]) {
70    match hashes.len() {
71        0 => return,
72        1 => {
73            hashes[0] = canonical(fwd, rev);
74            return;
75        }
76        _ => {}
77    }
78
79    // Base (canonical) hash at index 0
80    let base = canonical(fwd, rev);
81    hashes[0] = base;
82
83    let seed = (k as u64).wrapping_mul(MULTISEED);
84
85    // Compute extra hashes for i = 1 .. len−1
86    for (i, slot) in hashes.iter_mut().enumerate().skip(1) {
87        let mut h = base.wrapping_mul((i as u64) ^ seed);
88        h ^= h >> MULTISHIFT;
89        *slot = h;
90    }
91}
92
93#[cfg(test)]
94mod tests {
95    use super::*;
96
97    #[test]
98    fn canonical_wraps_on_overflow() {
99        let max = u64::MAX;
100        assert_eq!(canonical(max, 1), 0);
101    }
102
103    #[test]
104    fn extend_zero_length_slice() {
105        let mut out: [u64; 0] = [];
106        extend_hashes(123, 456, 7, &mut out);
107        // no panic, no change
108    }
109
110    #[test]
111    fn extend_matches_cpp_reference() {
112        const F: u64 = 0x1234_5678_9ABC_DEF0;
113        const R: u64 = 0x0FED_CBA9_8765_4321;
114        const K: u32 = 21;
115        let mut v = [0u64; 8];
116        extend_hashes(F, R, K, &mut v);
117        let base = F.wrapping_add(R);
118        for i in 0..v.len() {
119            let expected = if i == 0 {
120                base
121            } else {
122                let mut t = base.wrapping_mul((i as u64) ^ (K as u64).wrapping_mul(MULTISEED));
123                t ^= t >> MULTISHIFT;
124                t
125            };
126            assert_eq!(v[i], expected);
127        }
128    }
129}