seq_hash/
nthash.rs

1//! NtHash the kmers in a sequence.
2use std::array::from_fn;
3use std::hash::BuildHasher;
4use std::hash::BuildHasherDefault;
5use std::hash::DefaultHasher;
6
7use super::intrinsics;
8use crate::KmerHasher;
9use crate::S;
10use packed_seq::Seq;
11use packed_seq::complement_base;
12use wide::u32x8;
13
14type SeedHasher = BuildHasherDefault<DefaultHasher>;
15
16/// Original ntHash seed values.
17// TODO: Update to guarantee unique hash values for k<=16?
18const HASHES_F: [u32; 4] = [
19    0x3c8b_fbb3_95c6_0474u64 as u32,
20    0x3193_c185_62a0_2b4cu64 as u32,
21    0x2032_3ed0_8257_2324u64 as u32,
22    0x2955_49f5_4be2_4456u64 as u32,
23];
24
25/// A helper trait that hashes a single character.
26///
27/// Can be either via [`NtHasher`], which only works for 2-bit alphabets,
28/// or [`MulHasher`], which always works but is slightly slower.
29pub trait CharHasher: Clone {
30    /// Whether the underlying hasher is invariant under reverse-complement.
31    const CANONICAL: bool;
32    /// The number of bits to rotate the hash after each character.
33    const R: u32;
34    /// The (maximum) number of bits each character may have.
35    const BITS_PER_CHAR: usize;
36
37    fn new(k: usize) -> Self {
38        Self::new_with_seed(k, None)
39    }
40    /// Seeded version.
41    fn new_with_seed(k: usize, seed: Option<u32>) -> Self;
42    /// The underlying value of `k`.
43    fn k(&self) -> usize;
44    /// Hash `b`.
45    fn f(&self, b: u8) -> u32;
46    /// Hash the reverse complement of `b`.
47    fn c(&self, b: u8) -> u32;
48    /// Hash `b`, left rotated by `(k-1)*R` steps.
49    fn f_rot(&self, b: u8) -> u32;
50    /// Hash the reverse complement of `b`, right rotated by `(k-1)*R` steps.
51    fn c_rot(&self, b: u8) -> u32;
52    /// SIMD-version of [`f()`], looking up 8 characters at a time.
53    fn simd_f(&self, b: u32x8) -> u32x8;
54    /// SIMD-version of [`c()`], looking up 8 characters at a time.
55    fn simd_c(&self, b: u32x8) -> u32x8;
56    /// SIMD-version of [`f_rot()`], looking up 8 characters at a time.
57    fn simd_f_rot(&self, b: u32x8) -> u32x8;
58    /// SIMD-version of [`c_rot()`], looking up 8 characters at a time.
59    fn simd_c_rot(&self, b: u32x8) -> u32x8;
60
61    fn fw_init(&self) -> u32;
62    fn rc_init(&self) -> u32;
63}
64
65/// `u32` variant of NtHash.
66///
67/// `CANONICAL` by default by summing forward and reverse-complement hash values.
68/// Instead of the classical 1-bit rotation, this rotates by `R=7` bits by default,
69/// to reduce correlation between high bits of consecutive hashes.
70#[derive(Clone)]
71pub struct NtHasher<const CANONICAL: bool = true, const R: u32 = 7> {
72    k: usize,
73    f: [u32; 4],
74    c: [u32; 4],
75    f_rot: [u32; 4],
76    c_rot: [u32; 4],
77    simd_f: u32x8,
78    simd_c: u32x8,
79    simd_f_rot: u32x8,
80    simd_c_rot: u32x8,
81    fw_init: u32,
82    rc_init: u32,
83}
84
85impl<const CANONICAL: bool, const R: u32> NtHasher<CANONICAL, R> {
86    #[inline(always)]
87    pub fn new(k: usize) -> Self {
88        CharHasher::new(k)
89    }
90    #[inline(always)]
91    pub fn new_with_seed(k: usize, seed: u32) -> Self {
92        CharHasher::new_with_seed(k, Some(seed))
93    }
94}
95
96impl<const CANONICAL: bool, const R: u32> CharHasher for NtHasher<CANONICAL, R> {
97    const CANONICAL: bool = CANONICAL;
98    const R: u32 = R;
99    const BITS_PER_CHAR: usize = 2;
100
101    #[inline(always)]
102    fn new_with_seed(k: usize, seed: Option<u32>) -> Self {
103        let rot = k as u32 - 1;
104        let hasher = SeedHasher::new();
105        let f = match seed {
106            None => HASHES_F,
107            Some(seed) => from_fn(|i| hasher.hash_one(HASHES_F[i] ^ seed) as u32),
108        };
109        let c = from_fn(|i| f[complement_base(i as u8) as usize]);
110        let f_rot = f.map(|h| h.rotate_left(rot * R));
111        let c_rot = c.map(|h| h.rotate_left(rot * R));
112        let idx = [0, 1, 2, 3, 0, 1, 2, 3];
113        let simd_f = idx.map(|i| f[i]).into();
114        let simd_c = idx.map(|i| c[i]).into();
115        let simd_f_rot = idx.map(|i| f_rot[i]).into();
116        let simd_c_rot = idx.map(|i| c_rot[i]).into();
117
118        // Initial value of hashing `k-1` zeros.
119        let mut fw_init = 0u32;
120        for _ in 0..k - 1 {
121            fw_init = fw_init.rotate_left(Self::R) ^ f[0];
122        }
123
124        // Initial value of reverse-complement-hashing `k-1` zeros.
125        let mut rc_init = 0u32;
126        for _ in 0..k - 1 {
127            rc_init = rc_init.rotate_right(Self::R) ^ c_rot[0];
128        }
129
130        Self {
131            k,
132            f,
133            c,
134            f_rot,
135            c_rot,
136            simd_f,
137            simd_c,
138            simd_f_rot,
139            simd_c_rot,
140            fw_init,
141            rc_init,
142        }
143    }
144
145    #[inline(always)]
146    fn k(&self) -> usize {
147        self.k
148    }
149
150    #[inline(always)]
151    fn f(&self, b: u8) -> u32 {
152        unsafe { *self.f.get_unchecked(b as usize) }
153    }
154    #[inline(always)]
155    fn c(&self, b: u8) -> u32 {
156        unsafe { *self.c.get_unchecked(b as usize) }
157    }
158    #[inline(always)]
159    fn f_rot(&self, b: u8) -> u32 {
160        unsafe { *self.f_rot.get_unchecked(b as usize) }
161    }
162    #[inline(always)]
163    fn c_rot(&self, b: u8) -> u32 {
164        unsafe { *self.c_rot.get_unchecked(b as usize) }
165    }
166
167    #[inline(always)]
168    fn simd_f(&self, b: u32x8) -> u32x8 {
169        intrinsics::table_lookup(self.simd_f, b)
170    }
171    #[inline(always)]
172    fn simd_c(&self, b: u32x8) -> u32x8 {
173        intrinsics::table_lookup(self.simd_c, b)
174    }
175    #[inline(always)]
176    fn simd_f_rot(&self, b: u32x8) -> u32x8 {
177        intrinsics::table_lookup(self.simd_f_rot, b)
178    }
179    #[inline(always)]
180    fn simd_c_rot(&self, b: u32x8) -> u32x8 {
181        intrinsics::table_lookup(self.simd_c_rot, b)
182    }
183    #[inline(always)]
184    fn fw_init(&self) -> u32 {
185        self.fw_init
186    }
187    #[inline(always)]
188    fn rc_init(&self) -> u32 {
189        self.rc_init
190    }
191}
192
193/// `MulHasher` multiplies each character by a constant and xor's them together under rotations.
194///
195/// `CANONICAL` by default by summing forward and reverse-complement hash values.
196/// Instead of the classical 1-bit rotation, this rotates by `R=7` bits by default,
197/// to reduce correlation between high bits of consecutive hashes.
198#[derive(Clone)]
199pub struct MulHasher<const CANONICAL: bool = true, const R: u32 = 7> {
200    k: usize,
201    rot: u32,
202    mul: u32,
203    fw_init: u32,
204    rc_init: u32,
205}
206
207impl<const CANONICAL: bool, const R: u32> MulHasher<CANONICAL, R> {
208    #[inline(always)]
209    pub fn new(k: usize) -> Self {
210        CharHasher::new(k)
211    }
212    #[inline(always)]
213    pub fn new_with_seed(k: usize, seed: u32) -> Self {
214        CharHasher::new_with_seed(k, Some(seed))
215    }
216}
217
218// Mixing constant.
219const C: u32 = 0x517cc1b727220a95u64 as u32;
220
221impl<const CANONICAL: bool, const R: u32> CharHasher for MulHasher<CANONICAL, R> {
222    const CANONICAL: bool = CANONICAL;
223    const R: u32 = R;
224    const BITS_PER_CHAR: usize = 8;
225
226    #[inline(always)]
227    fn new_with_seed(k: usize, seed: Option<u32>) -> Self {
228        let rot = (k as u32 - 1) % 32;
229        let mul = C ^ match seed {
230            None => 0,
231            // don't change parity,
232            Some(seed) => (SeedHasher::new().hash_one(seed) as u32) << 1,
233        };
234
235        // Initial value of hashing `k-1` zeros.
236        let mut fw_init = 0u32;
237        for _ in 0..k - 1 {
238            fw_init = fw_init.rotate_left(Self::R) ^ (0 as u32).wrapping_mul(mul);
239        }
240
241        // Initial value of reverse-complement-hashing `k-1` zeros.
242        let mut rc_init = 0u32;
243        for _ in 0..k - 1 {
244            rc_init = rc_init.rotate_right(Self::R)
245                ^ (complement_base(0) as u32)
246                    .wrapping_mul(mul)
247                    .rotate_left(rot * R);
248        }
249
250        Self {
251            k,
252            rot,
253            mul,
254            fw_init,
255            rc_init,
256        }
257    }
258
259    #[inline(always)]
260    fn k(&self) -> usize {
261        self.k
262    }
263
264    #[inline(always)]
265    fn f(&self, b: u8) -> u32 {
266        (b as u32).wrapping_mul(self.mul)
267    }
268    #[inline(always)]
269    fn c(&self, b: u8) -> u32 {
270        (complement_base(b) as u32).wrapping_mul(self.mul)
271    }
272    #[inline(always)]
273    fn f_rot(&self, b: u8) -> u32 {
274        (b as u32).wrapping_mul(self.mul).rotate_left(self.rot * R)
275    }
276    #[inline(always)]
277    fn c_rot(&self, b: u8) -> u32 {
278        (complement_base(b) as u32)
279            .wrapping_mul(self.mul)
280            .rotate_left(self.rot * R)
281    }
282
283    #[inline(always)]
284    fn simd_f(&self, b: u32x8) -> u32x8 {
285        b * self.mul.into()
286    }
287    #[inline(always)]
288    fn simd_c(&self, b: u32x8) -> u32x8 {
289        packed_seq::complement_base_simd(b) * self.mul.into()
290    }
291    #[inline(always)]
292    fn simd_f_rot(&self, b: u32x8) -> u32x8 {
293        let r = b * self.mul.into();
294        let rot = self.rot * R % 32;
295        (r << rot) | (r >> (32 - rot))
296    }
297    #[inline(always)]
298    fn simd_c_rot(&self, b: u32x8) -> u32x8 {
299        let r = packed_seq::complement_base_simd(b) * self.mul.into();
300        let rot = self.rot * R % 32;
301        (r << rot) | (r >> (32 - rot))
302    }
303    #[inline(always)]
304    fn fw_init(&self) -> u32 {
305        self.fw_init
306    }
307    #[inline(always)]
308    fn rc_init(&self) -> u32 {
309        self.rc_init
310    }
311}
312
313impl<CH: CharHasher> KmerHasher for CH {
314    const CANONICAL: bool = CH::CANONICAL;
315
316    fn new(k: usize) -> Self {
317        Self::new(k)
318    }
319
320    fn k(&self) -> usize {
321        self.k()
322    }
323
324    #[inline(always)]
325    fn in_out_mapper_scalar<'s>(&self, seq: impl Seq<'s>) -> impl FnMut((u8, u8)) -> u32 {
326        assert!(seq.bits_per_char() <= CH::BITS_PER_CHAR);
327
328        let mut fw = self.fw_init();
329        let mut rc = self.rc_init();
330
331        move |(a, r)| {
332            let fw_out = fw.rotate_left(CH::R) ^ self.f(a);
333            fw = fw_out ^ self.f_rot(r);
334            if Self::CANONICAL {
335                let rc_out = rc.rotate_right(CH::R) ^ self.c_rot(a);
336                rc = rc_out ^ self.c(r);
337                fw_out.wrapping_add(rc_out)
338            } else {
339                fw_out
340            }
341        }
342    }
343
344    #[inline(always)]
345    fn in_out_mapper_simd<'s>(&self, seq: impl Seq<'s>) -> impl FnMut((S, S)) -> S {
346        assert!(seq.bits_per_char() <= CH::BITS_PER_CHAR);
347        let mut fw = S::splat(self.fw_init());
348        let mut rc = S::splat(self.rc_init());
349        let shl = S::splat(CH::R);
350        let shr = S::splat(32 - CH::R);
351
352        move |(a, r)| {
353            let fw_out = ((fw << shl) | (fw >> shr)) ^ self.simd_f(a);
354            fw = fw_out ^ self.simd_f_rot(r);
355            if Self::CANONICAL {
356                let rc_out = ((rc >> shl) | (rc << shr)) ^ self.simd_c_rot(a);
357                rc = rc_out ^ self.simd_c(r);
358                // Wrapping SIMD add
359                fw_out + rc_out
360            } else {
361                fw_out
362            }
363        }
364    }
365
366    #[inline(always)]
367    fn mapper<'s>(&self, seq: impl Seq<'s>) -> impl FnMut(u8) -> u32 {
368        assert!(seq.bits_per_char() <= CH::BITS_PER_CHAR);
369
370        let mut fw = 0u32;
371        let mut rc = 0u32;
372        move |a| {
373            fw = fw.rotate_left(CH::R) ^ self.f(a);
374            if Self::CANONICAL {
375                rc = rc.rotate_right(CH::R) ^ self.c_rot(a);
376                fw.wrapping_add(rc)
377            } else {
378                fw
379            }
380        }
381    }
382}