superbit/simhash/
sim_hash.rs1use super::SimHashBits;
2use super::sim_hasher::SimHasher;
3use std::hash::Hash;
4use std::marker::PhantomData;
5
6pub struct SimHash<H, S, const L: usize>
7where
8 H: SimHasher<T = S>,
9 S: SimHashBits,
10{
11 hasher: H,
12 marker: PhantomData<(S, H)>,
13}
14
15impl<H, S, const L: usize> SimHash<H, S, L>
16where
17 H: SimHasher<T = S>,
18 S: SimHashBits,
19{
20 pub fn new(hasher: H) -> Self {
21 SimHash {
22 hasher,
23 marker: PhantomData,
24 }
25 }
26
27 pub fn create_signature<T, U>(&self, iter: T) -> S
28 where
29 T: Iterator<Item = U>,
30 U: Hash,
31 {
32 let mut counts = [0i64; L];
33
34 for mut hash in iter.map(|item| self.hasher.hash(&item)) {
35 for (_i, count) in counts.iter_mut().enumerate() {
36 if hash & S::one() == S::zero() {
37 *count += 1;
38 } else {
39 *count -= 1;
40 }
41 hash >>= 1;
42 }
43 }
44
45 let mut result = S::zero();
46 for i in 0..L {
47 if counts[i] > 0 {
48 result |= S::one() << i;
49 }
50 }
51 result
52 }
53
54 pub fn create_signature_weighted<T, U, W>(&self, iter: T) -> S
57 where
58 T: IntoIterator<Item = (U, W)>,
59 U: std::hash::Hash,
60 W: Into<f32> + Copy,
61 {
62 let mut counts = [0f32; L];
63
64 for (item, w0) in iter {
65 let mut hash = self.hasher.hash(&item);
66 let w = w0.into();
67
68 for i in 0..L {
69 if (hash & S::one()) == S::zero() {
70 counts[i] += w;
71 } else {
72 counts[i] -= w;
73 }
74 hash = hash >> 1;
75 }
76 }
77
78 let mut out = S::zero();
79 for i in 0..L {
80 if counts[i] > 0.0 {
81 out |= S::one() << i;
82 }
83 }
84 out
85 }
86
87 pub fn create_centroid<T>(&self, signatures: T) -> S
88 where
89 T: Iterator<Item = S>,
90 {
91 let mut counts = [0u64; L];
92 let mut len = 0;
93 for signature in signatures {
94 for i in 0..L {
95 if signature >> i & S::one() == S::one() {
96 counts[i] += 1;
97 }
98 }
99 len += 1;
100 }
101 let mut centroid = S::zero();
102 for i in 0..L {
103 if counts[i] > len / 2 {
104 centroid |= S::one() << i;
105 }
106 }
107 centroid
108 }
109}
110
111#[cfg(test)]
112mod tests {
113 use super::super::sim_hasher::{SimSipHasher64, SimSipHasher128, Xxh3Hasher64, Xxh3Hasher128};
114 use super::SimHash;
115 use super::SimHashBits;
116
117 fn whitespace_split(s: &str) -> impl Iterator<Item = &str> {
118 s.split_whitespace()
119 }
120
121 static S1: &str = "SimHash is a technique used for detecting near-duplicates or for locality sensitive hashing. It was developed by Moses Charikar and is often used in large-scale applications to reduce the dimensionality of high-dimensional data, making it easier to process";
122 static S2: &str = "SimHash is a technique used for detecting near-duplicates or for locality sensitive hashing. It was developed by Moses Charikar and is often utilized in large-scale applications to reduce the dimensionality of high-dimensional data, making it easier to analyze";
123
124 #[test]
125 pub fn test_sim_hash_sip64() {
126 let sim_hash = SimHash::<SimSipHasher64, u64, 64>::new(SimSipHasher64::new(1, 2));
127 let s1 = sim_hash.create_signature(whitespace_split(S1));
128 let s2 = sim_hash.create_signature(whitespace_split(S2));
129 assert!(s1.hamming_distance(&s2) < 20);
130 }
131
132 #[test]
133 pub fn test_sim_hash_sip128() {
134 let sim_hash = SimHash::<SimSipHasher128, u128, 128>::new(SimSipHasher128::new(1, 2));
135 let s1 = sim_hash.create_signature(whitespace_split(S1));
136 let s2 = sim_hash.create_signature(whitespace_split(S2));
137 assert!(s1.hamming_distance(&s2) < 40);
138 }
139
140 #[test]
141 pub fn test_sim_hash_xxh3_64() {
142 let sim_hash = SimHash::<Xxh3Hasher64, u64, 64>::new(Xxh3Hasher64::new());
143 let s1 = sim_hash.create_signature(whitespace_split(S1));
144 let s2 = sim_hash.create_signature(whitespace_split(S2));
145 assert!(s1.hamming_distance(&s2) < 20);
146 }
147
148 #[test]
149 pub fn test_sim_hash_xxh3_128() {
150 let sim_hash = SimHash::<Xxh3Hasher128, u128, 128>::new(Xxh3Hasher128::new());
151 let s1 = sim_hash.create_signature(whitespace_split(S1));
152 let s2 = sim_hash.create_signature(whitespace_split(S2));
153 assert!(s1.hamming_distance(&s2) < 40);
154 }
155
156 #[test]
158 fn simhash_large() {
159 use rand::rngs::StdRng;
160 use rand::{Rng, SeedableRng};
161
162 const N: usize = 100_000;
163 let mut rng = StdRng::seed_from_u64(42);
164
165 let data1: Vec<u8> = (0..N).map(|_| rng.gen_range(0..2)).collect();
167 let mut data2 = data1.clone();
168 for i in (0..N).step_by(4) {
169 data2[i] = 1 - data2[i];
170 }
171
172 let sim_hash = SimHash::<Xxh3Hasher128, u128, 128>::new(Xxh3Hasher128::new());
173 let h1 = sim_hash.create_signature((0..N).map(|i| (i as u64, data1[i])));
174 let h2 = sim_hash.create_signature((0..N).map(|i| (i as u64, data2[i])));
175 let hd = h1.hamming_distance(&h2);
176
177 assert!(hd < 40);
179 }
180}