find_simdoc/lsh/
simhash.rs

1//! Simplified simhash for the Cosine similarity.
2use rand_xoshiro::rand_core::{RngCore, SeedableRng};
3
/// [Simplified simhash](https://dl.acm.org/doi/10.1145/2063576.2063737) for Cosine similarity.
///
/// The hasher itself only stores a seed; sketches are produced lazily by the
/// iterator returned from `iter`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SimHasher {
    /// Seed for the generator that supplies one hashing seed per sketch.
    seed: u64,
}
8
9impl SimHasher {
10    /// Creates an instance.
11    pub const fn new(seed: u64) -> Self {
12        Self { seed }
13    }
14
15    /// Creates an iterator to generate sketches from an input feature.
16    pub fn iter<'a>(&self, feature: &'a [(u64, f64)]) -> SimHashIter<'a> {
17        SimHashIter {
18            feature,
19            seeder: rand_xoshiro::SplitMix64::seed_from_u64(self.seed),
20            weights: [0.; 64],
21        }
22    }
23}
24
25/// Iterator to generate sketches with the simplified simhash.
26pub struct SimHashIter<'a> {
27    feature: &'a [(u64, f64)],
28    seeder: rand_xoshiro::SplitMix64,
29    weights: [f64; 64],
30}
31
32impl<'a> Iterator for SimHashIter<'a> {
33    type Item = u64;
34
35    fn next(&mut self) -> Option<Self::Item> {
36        self.weights.fill(0.);
37        let seed = self.seeder.next_u64();
38        for (h, x) in self
39            .feature
40            .iter()
41            .map(|&(i, x)| (crate::lsh::hash_u64(i, seed), x))
42        {
43            for (j, w) in self.weights.iter_mut().enumerate() {
44                if (h >> j) & 1 == 0 {
45                    *w += x;
46                } else {
47                    *w -= x;
48                }
49            }
50        }
51        Some(
52            self.weights
53                .iter()
54                .fold(0, |acc, w| if *w >= 0. { (acc << 1) | 1 } else { acc << 1 }),
55        )
56    }
57}