superbit/simhash/
sim_hash.rs

1use super::SimHashBits;
2use super::sim_hasher::SimHasher;
3use std::hash::Hash;
4use std::marker::PhantomData;
5
6pub struct SimHash<H, S, const L: usize>
7where
8    H: SimHasher<T = S>,
9    S: SimHashBits,
10{
11    hasher: H,
12    marker: PhantomData<(S, H)>,
13}
14
15impl<H, S, const L: usize> SimHash<H, S, L>
16where
17    H: SimHasher<T = S>,
18    S: SimHashBits,
19{
20    pub fn new(hasher: H) -> Self {
21        SimHash {
22            hasher,
23            marker: PhantomData,
24        }
25    }
26
27    pub fn create_signature<T, U>(&self, iter: T) -> S
28    where
29        T: Iterator<Item = U>,
30        U: Hash,
31    {
32        let mut counts = [0i64; L];
33
34        for mut hash in iter.map(|item| self.hasher.hash(&item)) {
35            for (_i, count) in counts.iter_mut().enumerate() {
36                if hash & S::one() == S::zero() {
37                    *count += 1;
38                } else {
39                    *count -= 1;
40                }
41                hash >>= 1;
42            }
43        }
44
45        let mut result = S::zero();
46        for i in 0..L {
47            if counts[i] > 0 {
48                result |= S::one() << i;
49            }
50        }
51        result
52    }
53
54    /// Weighted SimHash: each feature contributes ±w instead of ±1.
55    /// If the i-th hash bit is 1, add -w; if 0, add +w. Final bit = sign(count).
56    pub fn create_signature_weighted<T, U, W>(&self, iter: T) -> S
57    where
58        T: IntoIterator<Item = (U, W)>,
59        U: std::hash::Hash,
60        W: Into<f32> + Copy,
61    {
62        let mut counts = [0f32; L];
63
64        for (item, w0) in iter {
65            let mut hash = self.hasher.hash(&item);
66            let w = w0.into();
67
68            for i in 0..L {
69                if (hash & S::one()) == S::zero() {
70                    counts[i] += w;
71                } else {
72                    counts[i] -= w;
73                }
74                hash = hash >> 1;
75            }
76        }
77
78        let mut out = S::zero();
79        for i in 0..L {
80            if counts[i] > 0.0 {
81                out |= S::one() << i;
82            }
83        }
84        out
85    }
86
87    pub fn create_centroid<T>(&self, signatures: T) -> S
88    where
89        T: Iterator<Item = S>,
90    {
91        let mut counts = [0u64; L];
92        let mut len = 0;
93        for signature in signatures {
94            for i in 0..L {
95                if signature >> i & S::one() == S::one() {
96                    counts[i] += 1;
97                }
98            }
99            len += 1;
100        }
101        let mut centroid = S::zero();
102        for i in 0..L {
103            if counts[i] > len / 2 {
104                centroid |= S::one() << i;
105            }
106        }
107        centroid
108    }
109}
110
111#[cfg(test)]
112mod tests {
113    use super::super::sim_hasher::{SimSipHasher64, SimSipHasher128, Xxh3Hasher64, Xxh3Hasher128};
114    use super::SimHash;
115    use super::SimHashBits;
116
117    fn whitespace_split(s: &str) -> impl Iterator<Item = &str> {
118        s.split_whitespace()
119    }
120
121    static S1: &str = "SimHash is a technique used for detecting near-duplicates or for locality sensitive hashing. It was developed by Moses Charikar and is often used in large-scale applications to reduce the dimensionality of high-dimensional data, making it easier to process";
122    static S2: &str = "SimHash is a technique used for detecting near-duplicates or for locality sensitive hashing. It was developed by Moses Charikar and is often utilized in large-scale applications to reduce the dimensionality of high-dimensional data, making it easier to analyze";
123
124    #[test]
125    pub fn test_sim_hash_sip64() {
126        let sim_hash = SimHash::<SimSipHasher64, u64, 64>::new(SimSipHasher64::new(1, 2));
127        let s1 = sim_hash.create_signature(whitespace_split(S1));
128        let s2 = sim_hash.create_signature(whitespace_split(S2));
129        assert!(s1.hamming_distance(&s2) < 20);
130    }
131
132    #[test]
133    pub fn test_sim_hash_sip128() {
134        let sim_hash = SimHash::<SimSipHasher128, u128, 128>::new(SimSipHasher128::new(1, 2));
135        let s1 = sim_hash.create_signature(whitespace_split(S1));
136        let s2 = sim_hash.create_signature(whitespace_split(S2));
137        assert!(s1.hamming_distance(&s2) < 40);
138    }
139
140    #[test]
141    pub fn test_sim_hash_xxh3_64() {
142        let sim_hash = SimHash::<Xxh3Hasher64, u64, 64>::new(Xxh3Hasher64::new());
143        let s1 = sim_hash.create_signature(whitespace_split(S1));
144        let s2 = sim_hash.create_signature(whitespace_split(S2));
145        assert!(s1.hamming_distance(&s2) < 20);
146    }
147
148    #[test]
149    pub fn test_sim_hash_xxh3_128() {
150        let sim_hash = SimHash::<Xxh3Hasher128, u128, 128>::new(Xxh3Hasher128::new());
151        let s1 = sim_hash.create_signature(whitespace_split(S1));
152        let s2 = sim_hash.create_signature(whitespace_split(S2));
153        assert!(s1.hamming_distance(&s2) < 40);
154    }
155
156    // cargo test --release simhash_large -- --nocapture
157    #[test]
158    fn simhash_large() {
159        use rand::rngs::StdRng;
160        use rand::{Rng, SeedableRng};
161
162        const N: usize = 100_000;
163        let mut rng = StdRng::seed_from_u64(42);
164
165        // Make two vectors that differ in ~25% of bits.
166        let data1: Vec<u8> = (0..N).map(|_| rng.gen_range(0..2)).collect();
167        let mut data2 = data1.clone();
168        for i in (0..N).step_by(4) {
169            data2[i] = 1 - data2[i];
170        }
171
172        let sim_hash = SimHash::<Xxh3Hasher128, u128, 128>::new(Xxh3Hasher128::new());
173        let h1 = sim_hash.create_signature((0..N).map(|i| (i as u64, data1[i])));
174        let h2 = sim_hash.create_signature((0..N).map(|i| (i as u64, data2[i])));
175        let hd = h1.hamming_distance(&h2);
176
177        // ~12–13% of 128 ≈ 15–17 in expectation; allow slack
178        assert!(hd < 40);
179    }
180}