seq_hash/
lib.rs

1//! A crate for streaming hashing of k-mers via `KmerHasher`.
2//!
3//! This builds on [`packed_seq`] and is used by e.g. [`simd_minimizers`].
4//!
5//! The default [`NtHasher`] is canonical.
6//! If that's not needed, [`NtHasher<false>`] will be slightly faster.
7//! For non-DNA sequences with >2-bit alphabets, use [`MulHasher`] instead.
8//!
9//! Note that [`KmerHasher`] objects need `k` on their construction, so that they can precompute required constants.
10//! Prefer reusing the same [`KmerHasher`].
11//!
12//! This crate also includes [`AntiLexHasher`], see [this blogpost](https://curiouscoding.nl/posts/practical-minimizers/).
13//!
14//! ## Typical usage
15//!
16//! Construct a default [`NtHasher`] via `let hasher = <NtHasher>::new(k)`.
17//! Then call either `hasher.hash_kmers_simd(seq, context)`,
18//! or use the underlying 'mapper' via `hasher.in_out_mapper_simd(seq)`.
19//! ```
20//! use packed_seq::{AsciiSeqVec, PackedSeqVec, SeqVec};
21//! use seq_hash::{KmerHasher, NtHasher};
22//! let k = 3;
23//!
24//! // Default `NtHasher` is canonical.
25//! let hasher = <NtHasher>::new(k);
26//! let kmer = PackedSeqVec::from_ascii(b"ACG");
27//! let kmer_rc = PackedSeqVec::from_ascii(b"CGT");
28//! // Normally, prefer `hash_kmers_simd` over `hash_seq`.
29//! assert_eq!(
30//!     hasher.hash_seq(kmer.as_slice()),
31//!     hasher.hash_seq(kmer_rc.as_slice())
32//! );
33//!
34//! let fwd_hasher = NtHasher::<false>::new(k);
35//! assert_ne!(
36//!     fwd_hasher.hash_seq(kmer.as_slice()),
37//!     fwd_hasher.hash_seq(kmer_rc.as_slice())
38//! );
39//!
40//! let seq = b"ACGGCAGCGCATATGTAGT";
41//! let ascii_seq = AsciiSeqVec::from_ascii(seq);
42//! let packed_seq = PackedSeqVec::from_ascii(seq);
43//!
44//! // hasher.hash_kmers_scalar(seq.as_slice()); // Panics since `NtHasher` does not support ASCII.
45//! let hashes_1: Vec<_> = hasher.hash_kmers_scalar(ascii_seq.as_slice()).collect();
46//! let hashes_2: Vec<_> = hasher.hash_kmers_scalar(packed_seq.as_slice()).collect();
47//! // Hashes are equal for [`packed_seq::AsciiSeq`] and [`packed_seq::PackedSeq`].
48//! assert_eq!(hashes_1, hashes_2);
49//! assert_eq!(hashes_1.len(), seq.len() - (k-1));
50//!
51//! // Consider a 'context' of a single kmer.
52//! let hashes_3: Vec<_> = hasher.hash_kmers_simd(ascii_seq.as_slice(), 1).collect();
53//! let hashes_4: Vec<_> = hasher.hash_kmers_simd(packed_seq.as_slice(), 1).collect();
54//! assert_eq!(hashes_1, hashes_3);
55//! assert_eq!(hashes_1, hashes_4);
56//! ```
57
58mod anti_lex;
59mod intrinsics;
60mod nthash;
61#[cfg(test)]
62mod test;
63
64pub use anti_lex::AntiLexHasher;
65pub use nthash::{MulHasher, NtHasher};
66
67/// Re-export of the `packed-seq` crate.
68pub use packed_seq;
69
70use packed_seq::{ChunkIt, Delay, PackedNSeq, PaddedIt, Seq};
71use std::iter::{repeat, zip};
72
73type S = wide::u32x8;
74
75/// A hasher that can hash all k-mers in a string.
76///
77/// Note that a `KmerHasher` must be initialized with a specific `k`,
78/// so that it can precompute associated constants.
79pub trait KmerHasher {
80    /// True when the hash function is invariant under reverse-complement.
81    const CANONICAL: bool;
82
83    fn new(k: usize) -> Self;
84
85    /// Helper function returning [`Self::CANONICAL`].
86    #[inline(always)]
87    fn is_canonical(&self) -> bool {
88        Self::CANONICAL
89    }
90
91    /// The value of `k` for this hasher.
92    fn k(&self) -> usize;
93
94    /// The delay of the 'out' character passed to the `in_out_mapper` functions.
95    /// Defaults to `k-1`.
96    #[inline(always)]
97    fn delay(&self) -> Delay {
98        Delay(self.k() - 1)
99    }
100
101    /// A scalar mapper function that should be called with each `(in, out)` base.
102    ///
103    /// The delay should be [`Self::delay()`]. The first `delay` calls should have `out=0`.
104    /// `seq` is only used to ensure that the hasher can handle the underlying alphabet.
105    fn in_out_mapper_scalar<'s>(&self, seq: impl Seq<'s>) -> impl FnMut((u8, u8)) -> u32;
106    /// A SIMD mapper function that should be called with a `(in, out)` base per lane.
107    ///
108    /// The delay should be [`Self::delay()`]. The first `delay` calls should have `out=u32x8::splat(0)`.
109    /// `seq` is only used to ensure that the hasher can handle the underlying alphabet.
110    fn in_out_mapper_simd<'s>(&self, seq: impl Seq<'s>) -> impl FnMut((S, S)) -> S;
111
112    fn in_out_mapper_ambiguous_scalar<'s>(
113        &self,
114        nseq: PackedNSeq<'s>,
115    ) -> impl FnMut((u8, u8)) -> u32 {
116        let mut mapper = self.in_out_mapper_scalar(nseq.seq);
117        let mut ambiguous = nseq.ambiguous.iter_kmer_ambiguity(self.k());
118        let k = self.k();
119        let mut i = 0;
120        move |(a, r)| {
121            let hash = mapper((a, r));
122            let ambiguous = if i > k - 1 {
123                ambiguous.next().unwrap()
124            } else {
125                false
126            };
127            i += 1;
128            if ambiguous { u32::MAX } else { hash }
129        }
130    }
131
132    #[inline(always)]
133    fn in_out_mapper_ambiguous_simd<'s>(
134        &self,
135        nseq: PackedNSeq<'s>,
136        context: usize,
137    ) -> impl FnMut((S, S)) -> S {
138        let mut mapper = self.in_out_mapper_simd(nseq.seq);
139        let mut ambiguous = nseq.ambiguous.par_iter_kmer_ambiguity(self.k(), context, 0);
140        move |(a, r)| {
141            let hash = mapper((a, r));
142            let ambiguous = ambiguous.it.next().unwrap();
143            ambiguous.blend(S::MAX, hash)
144        }
145    }
146
147    /// A scalar iterator over all k-mer hashes in `seq`.
148    #[inline(always)]
149    fn hash_kmers_scalar<'s>(&self, seq: impl Seq<'s>) -> impl ExactSizeIterator<Item = u32> {
150        let k = self.k();
151        let delay = self.delay();
152        let mut add = seq.iter_bp();
153        let mut remove = seq.iter_bp();
154        let mut mapper = self.in_out_mapper_scalar(seq);
155        zip(add.by_ref().take(delay.0), repeat(0)).for_each(|a| {
156            mapper(a);
157        });
158        zip(add.by_ref(), remove.by_ref())
159            .take(k - 1 - delay.0)
160            .for_each(|a| {
161                mapper(a);
162            });
163        zip(add, remove).map(mapper)
164    }
165
166    /// A SIMD-parallel iterator over all k-mer hashes in `seq`.
167    #[inline(always)]
168    fn hash_kmers_simd<'s>(&self, seq: impl Seq<'s>, context: usize) -> PaddedIt<impl ChunkIt<S>> {
169        let k = self.k();
170        let delay = self.delay();
171        seq.par_iter_bp_delayed(context + k - 1, delay)
172            .map(self.in_out_mapper_simd(seq))
173            .advance(k - 1)
174    }
175
176    /// An iterator over all k-mer hashes in `seq`.
177    /// Ambiguous kmers get hash `u32::MAX`.
178    #[inline(always)]
179    fn hash_valid_kmers_scalar<'s>(
180        &self,
181        nseq: PackedNSeq<'s>,
182    ) -> impl ExactSizeIterator<Item = u32> {
183        let k = self.k();
184        let delay = self.delay();
185        assert!(delay.0 < k);
186
187        let mut mapper = self.in_out_mapper_scalar(nseq.seq);
188
189        let mut a = nseq.seq.iter_bp();
190        let mut r = nseq.seq.iter_bp();
191
192        a.by_ref().take(delay.0).for_each(
193            #[inline(always)]
194            |a| {
195                mapper((a, 0));
196            },
197        );
198
199        zip(a.by_ref(), r.by_ref())
200            .take((k - 1) - delay.0)
201            .for_each(
202                #[inline(always)]
203                |(a, r)| {
204                    mapper((a, r));
205                },
206            );
207
208        zip(zip(a, r), nseq.ambiguous.iter_kmer_ambiguity(k)).map(
209            #[inline(always)]
210            move |(ar, ambiguous)| {
211                let hash = mapper(ar);
212                if ambiguous { u32::MAX } else { hash }
213            },
214        )
215    }
216
217    /// A SIMD-parallel iterator over all k-mer hashes in `seq`.
218    /// Ambiguous kmers get hash `u32::MAX`.
219    #[inline(always)]
220    fn hash_valid_kmers_simd<'s, 't>(
221        &'t self,
222        nseq: PackedNSeq<'s>,
223        context: usize,
224    ) -> PaddedIt<impl ChunkIt<S> + use<'s, 't, Self>> {
225        let k = self.k();
226        let delay = self.delay();
227        let mut hash_mapper = self.in_out_mapper_simd(nseq.seq);
228        nseq.seq
229            .par_iter_bp_delayed_with_factor(context + k - 1, delay, 2)
230            .zip(
231                nseq.ambiguous
232                    .par_iter_kmer_ambiguity(k, context + k - 1, 0),
233            )
234            .map(
235                #[inline(always)]
236                move |((a, r), is_ambiguous)| {
237                    let hash = hash_mapper((a, r));
238                    is_ambiguous.blend(S::MAX, hash)
239                },
240            )
241            .advance(k - 1)
242    }
243
244    /// Hash a sequence one character at a time. Ignores `k`.
245    ///
246    /// `seq` is only used to ensure that the hasher can handle the underlying alphabet.
247    fn mapper<'s>(&self, seq: impl Seq<'s>) -> impl FnMut(u8) -> u32;
248
249    /// Hash the given sequence. Ignores `k`.
250    ///
251    /// This is slightly inefficient because it recomputes the constants based on the sequence length.
252    #[inline(always)]
253    fn hash_seq<'s>(&self, seq: impl Seq<'s>) -> u32 {
254        seq.iter_bp().map(self.mapper(seq)).last().unwrap_or(0)
255    }
256    /// Hash all non-empty prefixes of the given sequence. Ignores `k`.
257    #[inline(always)]
258    fn hash_prefixes<'s>(&self, seq: impl Seq<'s>) -> impl ExactSizeIterator<Item = u32> {
259        seq.iter_bp().map(self.mapper(seq))
260    }
261}