seq_hash/
nthash.rs

1//! NtHash the kmers in a sequence.
2use std::array::from_fn;
3use std::hash::BuildHasher;
4use std::hash::BuildHasherDefault;
5use std::hash::DefaultHasher;
6
7use super::intrinsics;
8use crate::KmerHasher;
9use crate::S;
10use packed_seq::Seq;
11use packed_seq::complement_base;
12use wide::u32x8;
13
14type SeedHasher = BuildHasherDefault<DefaultHasher>;
15
16/// Original ntHash seed values.
17// TODO: Update to guarantee unique hash values for k<=16?
18const HASHES_F: [u32; 4] = [
19    0x3c8b_fbb3_95c6_0474u64 as u32,
20    0x3193_c185_62a0_2b4cu64 as u32,
21    0x2032_3ed0_8257_2324u64 as u32,
22    0x2955_49f5_4be2_4456u64 as u32,
23];
24
25/// A helper trait that hashes a single character.
26///
27/// Can be either via [`NtHasher`], which only works for 2-bit alphabets,
28/// or [`MulHasher`], which always works but is slightly slower.
29pub trait CharHasher: Clone {
30    /// Whether the underlying hasher is invariant under reverse-complement.
31    const CANONICAL: bool;
32    /// The number of bits to rotate the hash after each character.
33    const R: u32;
34    /// The (maximum) number of bits each character may have.
35    const BITS_PER_CHAR: usize;
36
37    fn new(k: usize) -> Self {
38        Self::new_with_seed(k, None)
39    }
40    /// Seeded version.
41    fn new_with_seed(k: usize, seed: Option<u32>) -> Self;
42    /// The underlying value of `k`.
43    fn k(&self) -> usize;
44    /// Hash `b`.
45    fn f(&self, b: u8) -> u32;
46    /// Hash the reverse complement of `b`.
47    fn c(&self, b: u8) -> u32;
48    /// Hash `b`, left rotated by `(k-1)*R` steps.
49    fn f_rot(&self, b: u8) -> u32;
50    /// Hash the reverse complement of `b`, right rotated by `(k-1)*R` steps.
51    fn c_rot(&self, b: u8) -> u32;
52    /// SIMD-version of [`f()`], looking up 8 characters at a time.
53    fn simd_f(&self, b: u32x8) -> u32x8;
54    /// SIMD-version of [`c()`], looking up 8 characters at a time.
55    fn simd_c(&self, b: u32x8) -> u32x8;
56    /// SIMD-version of [`f_rot()`], looking up 8 characters at a time.
57    fn simd_f_rot(&self, b: u32x8) -> u32x8;
58    /// SIMD-version of [`c_rot()`], looking up 8 characters at a time.
59    fn simd_c_rot(&self, b: u32x8) -> u32x8;
60
61    /// Initial value of hashing `k-1` zeros.
62    #[inline(always)]
63    fn fw_init(&self) -> u32 {
64        let mut fw = 0u32;
65        for _ in 0..self.k() - 1 {
66            fw = fw.rotate_left(Self::R) ^ self.f(0);
67        }
68        fw
69    }
70
71    /// Initial value of reverse-complement-hashing `k-1` zeros.
72    #[inline(always)]
73    fn rc_init(&self) -> u32 {
74        let mut rc = 0u32;
75        for _ in 0..self.k() - 1 {
76            rc = rc.rotate_right(Self::R) ^ self.c_rot(0);
77        }
78        rc
79    }
80}
81
82/// `u32` variant of NtHash.
83///
84/// `CANONICAL` by default by summing forward and reverse-complement hash values.
85/// Instead of the classical 1-bit rotation, this rotates by `R=7` bits by default,
86/// to reduce correlation between high bits of consecutive hashes.
87#[derive(Clone)]
88pub struct NtHasher<const CANONICAL: bool = true, const R: u32 = 7> {
89    k: usize,
90    f: [u32; 4],
91    c: [u32; 4],
92    f_rot: [u32; 4],
93    c_rot: [u32; 4],
94    simd_f: u32x8,
95    simd_c: u32x8,
96    simd_f_rot: u32x8,
97    simd_c_rot: u32x8,
98}
99
100impl<const CANONICAL: bool, const R: u32> NtHasher<CANONICAL, R> {
101    #[inline(always)]
102    pub fn new(k: usize) -> Self {
103        CharHasher::new(k)
104    }
105    #[inline(always)]
106    pub fn new_with_seed(k: usize, seed: u32) -> Self {
107        CharHasher::new_with_seed(k, Some(seed))
108    }
109}
110
111impl<const CANONICAL: bool, const R: u32> CharHasher for NtHasher<CANONICAL, R> {
112    const CANONICAL: bool = CANONICAL;
113    const R: u32 = R;
114    const BITS_PER_CHAR: usize = 2;
115
116    #[inline(always)]
117    fn new_with_seed(k: usize, seed: Option<u32>) -> Self {
118        let rot = k as u32 - 1;
119        let hasher = SeedHasher::new();
120        let f = match seed {
121            None => HASHES_F,
122            Some(seed) => from_fn(|i| hasher.hash_one(HASHES_F[i] ^ seed) as u32),
123        };
124        let c = from_fn(|i| f[complement_base(i as u8) as usize]);
125        let f_rot = f.map(|h| h.rotate_left(rot * R));
126        let c_rot = c.map(|h| h.rotate_left(rot * R));
127        let idx = [0, 1, 2, 3, 0, 1, 2, 3];
128        let simd_f = idx.map(|i| f[i]).into();
129        let simd_c = idx.map(|i| c[i]).into();
130        let simd_f_rot = idx.map(|i| f_rot[i]).into();
131        let simd_c_rot = idx.map(|i| c_rot[i]).into();
132
133        Self {
134            k,
135            f,
136            c,
137            f_rot,
138            c_rot,
139            simd_f,
140            simd_c,
141            simd_f_rot,
142            simd_c_rot,
143        }
144    }
145
146    #[inline(always)]
147    fn k(&self) -> usize {
148        self.k
149    }
150
151    #[inline(always)]
152    fn f(&self, b: u8) -> u32 {
153        unsafe { *self.f.get_unchecked(b as usize) }
154    }
155    #[inline(always)]
156    fn c(&self, b: u8) -> u32 {
157        unsafe { *self.c.get_unchecked(b as usize) }
158    }
159    #[inline(always)]
160    fn f_rot(&self, b: u8) -> u32 {
161        unsafe { *self.f_rot.get_unchecked(b as usize) }
162    }
163    #[inline(always)]
164    fn c_rot(&self, b: u8) -> u32 {
165        unsafe { *self.c_rot.get_unchecked(b as usize) }
166    }
167
168    #[inline(always)]
169    fn simd_f(&self, b: u32x8) -> u32x8 {
170        intrinsics::table_lookup(self.simd_f, b)
171    }
172    #[inline(always)]
173    fn simd_c(&self, b: u32x8) -> u32x8 {
174        intrinsics::table_lookup(self.simd_c, b)
175    }
176    #[inline(always)]
177    fn simd_f_rot(&self, b: u32x8) -> u32x8 {
178        intrinsics::table_lookup(self.simd_f_rot, b)
179    }
180    #[inline(always)]
181    fn simd_c_rot(&self, b: u32x8) -> u32x8 {
182        intrinsics::table_lookup(self.simd_c_rot, b)
183    }
184}
185
186/// `MulHasher` multiplies each character by a constant and xor's them together under rotations.
187///
188/// `CANONICAL` by default by summing forward and reverse-complement hash values.
189/// Instead of the classical 1-bit rotation, this rotates by `R=7` bits by default,
190/// to reduce correlation between high bits of consecutive hashes.
191#[derive(Clone)]
192pub struct MulHasher<const CANONICAL: bool = true, const R: u32 = 7> {
193    k: usize,
194    rot: u32,
195    mul: u32,
196}
197
198impl<const CANONICAL: bool, const R: u32> MulHasher<CANONICAL, R> {
199    #[inline(always)]
200    pub fn new(k: usize) -> Self {
201        CharHasher::new(k)
202    }
203    #[inline(always)]
204    pub fn new_with_seed(k: usize, seed: u32) -> Self {
205        CharHasher::new_with_seed(k, Some(seed))
206    }
207}
208
209// Mixing constant.
210const C: u32 = 0x517cc1b727220a95u64 as u32;
211
212impl<const CANONICAL: bool, const R: u32> CharHasher for MulHasher<CANONICAL, R> {
213    const CANONICAL: bool = CANONICAL;
214    const R: u32 = R;
215    const BITS_PER_CHAR: usize = 8;
216
217    #[inline(always)]
218    fn new_with_seed(k: usize, seed: Option<u32>) -> Self {
219        Self {
220            k,
221            rot: (k as u32 - 1) % 32,
222            mul: C ^ match seed {
223                None => 0,
224                // don't change parity,
225                Some(seed) => (SeedHasher::new().hash_one(seed) as u32) << 1,
226            },
227        }
228    }
229
230    #[inline(always)]
231    fn k(&self) -> usize {
232        self.k
233    }
234
235    #[inline(always)]
236    fn f(&self, b: u8) -> u32 {
237        (b as u32).wrapping_mul(self.mul)
238    }
239    #[inline(always)]
240    fn c(&self, b: u8) -> u32 {
241        (complement_base(b) as u32).wrapping_mul(self.mul)
242    }
243    #[inline(always)]
244    fn f_rot(&self, b: u8) -> u32 {
245        (b as u32).wrapping_mul(self.mul).rotate_left(self.rot * R)
246    }
247    #[inline(always)]
248    fn c_rot(&self, b: u8) -> u32 {
249        (complement_base(b) as u32)
250            .wrapping_mul(self.mul)
251            .rotate_left(self.rot * R)
252    }
253
254    #[inline(always)]
255    fn simd_f(&self, b: u32x8) -> u32x8 {
256        b * self.mul.into()
257    }
258    #[inline(always)]
259    fn simd_c(&self, b: u32x8) -> u32x8 {
260        packed_seq::complement_base_simd(b) * self.mul.into()
261    }
262    #[inline(always)]
263    fn simd_f_rot(&self, b: u32x8) -> u32x8 {
264        let r = b * self.mul.into();
265        let rot = self.rot * R % 32;
266        (r << rot) | (r >> (32 - rot))
267    }
268    #[inline(always)]
269    fn simd_c_rot(&self, b: u32x8) -> u32x8 {
270        let r = packed_seq::complement_base_simd(b) * self.mul.into();
271        let rot = self.rot * R % 32;
272        (r << rot) | (r >> (32 - rot))
273    }
274}
275
276impl<CH: CharHasher> KmerHasher for CH {
277    const CANONICAL: bool = CH::CANONICAL;
278
279    fn new(k: usize) -> Self {
280        Self::new(k)
281    }
282
283    fn k(&self) -> usize {
284        self.k()
285    }
286
287    #[inline(always)]
288    fn in_out_mapper_scalar<'s>(&self, seq: impl Seq<'s>) -> impl FnMut((u8, u8)) -> u32 {
289        assert!(seq.bits_per_char() <= CH::BITS_PER_CHAR);
290
291        let mut fw = self.fw_init();
292        let mut rc = self.rc_init();
293
294        move |(a, r)| {
295            let fw_out = fw.rotate_left(CH::R) ^ self.f(a);
296            fw = fw_out ^ self.f_rot(r);
297            if Self::CANONICAL {
298                let rc_out = rc.rotate_right(CH::R) ^ self.c_rot(a);
299                rc = rc_out ^ self.c(r);
300                fw_out.wrapping_add(rc_out)
301            } else {
302                fw_out
303            }
304        }
305    }
306
307    #[inline(always)]
308    fn in_out_mapper_simd<'s>(&self, seq: impl Seq<'s>) -> impl FnMut((S, S)) -> S {
309        assert!(seq.bits_per_char() <= CH::BITS_PER_CHAR);
310        let mut fw = S::splat(self.fw_init());
311        let mut rc = S::splat(self.rc_init());
312
313        move |(a, r)| {
314            let fw_out = ((fw << CH::R) | (fw >> (32 - CH::R))) ^ self.simd_f(a);
315            fw = fw_out ^ self.simd_f_rot(r);
316            if Self::CANONICAL {
317                let rc_out = ((rc >> CH::R) | (rc << (32 - CH::R))) ^ self.simd_c_rot(a);
318                rc = rc_out ^ self.simd_c(r);
319                // Wrapping SIMD add
320                fw_out + rc_out
321            } else {
322                fw_out
323            }
324        }
325    }
326
327    #[inline(always)]
328    fn mapper<'s>(&self, seq: impl Seq<'s>) -> impl FnMut(u8) -> u32 {
329        assert!(seq.bits_per_char() <= CH::BITS_PER_CHAR);
330
331        let mut fw = 0u32;
332        let mut rc = 0u32;
333        move |a| {
334            fw = fw.rotate_left(CH::R) ^ self.f(a);
335            if Self::CANONICAL {
336                rc = rc.rotate_right(CH::R) ^ self.c_rot(a);
337                fw.wrapping_add(rc)
338            } else {
339                fw
340            }
341        }
342    }
343}