find_simdoc/lsh/
minhash.rs

1//! 1-bit minwise hashing for the Jaccard similarity.
2use rand_xoshiro::rand_core::{RngCore, SeedableRng};
3
4/// [1-bit minwise hashing](https://dl.acm.org/doi/abs/10.1145/1772690.1772759) for the Jaccard similarity.
5pub struct MinHasher {
6    seed: u64,
7}
8
9impl MinHasher {
10    /// Creates an instance.
11    pub const fn new(seed: u64) -> Self {
12        Self { seed }
13    }
14
15    /// Creates an iterator to generate sketches from an input feature.
16    pub fn iter<'a>(&self, feature: &'a [u64]) -> MinHashIter<'a> {
17        MinHashIter {
18            feature,
19            seeder: rand_xoshiro::SplitMix64::seed_from_u64(self.seed),
20        }
21    }
22}
23
24/// Iterator to generate sketches with the 1-bit minwise hashing.
25pub struct MinHashIter<'a> {
26    feature: &'a [u64],
27    seeder: rand_xoshiro::SplitMix64,
28}
29
30impl<'a> Iterator for MinHashIter<'a> {
31    type Item = u64;
32
33    fn next(&mut self) -> Option<Self::Item> {
34        let mut x = 0;
35        for _ in 0..64 {
36            let seed = self.seeder.next_u64();
37            let h = self
38                .feature
39                .iter()
40                .map(|&i| crate::lsh::hash_u64(i, seed))
41                .min()
42                .unwrap();
43            x = (x << 1) | (h & 1);
44        }
45        Some(x)
46    }
47}