hermes_core/segment/builder/
simhash.rs1#[inline]
4pub(crate) fn stafford_mix(dim_id: u32) -> u64 {
5 let mut h = dim_id as u64;
6 h = h.wrapping_mul(0x517cc1b727220a95);
7 h ^= h >> 32;
8 h = h.wrapping_mul(0x6c62272e07bb0142);
9 h
10}
11
12#[inline]
22pub fn simhash_from_sparse_vector(entries: &[(u32, f32)]) -> u64 {
23 let mut acc = [0.0f64; 64];
24 for &(dim, weight) in entries {
25 let h = stafford_mix(dim);
26 let w = weight as f64;
27 let mut mask = h;
29 for chunk in acc.chunks_exact_mut(4) {
30 chunk[0] += if mask & 1 != 0 { w } else { -w };
31 chunk[1] += if mask & 2 != 0 { w } else { -w };
32 chunk[2] += if mask & 4 != 0 { w } else { -w };
33 chunk[3] += if mask & 8 != 0 { w } else { -w };
34 mask >>= 4;
35 }
36 }
37 let mut hash = 0u64;
38 for (bit, &a) in acc.iter().enumerate() {
39 if a > 0.0 {
40 hash |= 1u64 << bit;
41 }
42 }
43 hash
44}
45
46pub fn majority_simhash(hashes: &[u64]) -> u64 {
53 if hashes.is_empty() {
54 return 0;
55 }
56 let threshold = hashes.len() / 2;
57 let mut result: u64 = 0;
58 for bit in 0..64 {
59 let count = hashes.iter().filter(|&&h| h & (1u64 << bit) != 0).count();
60 if count > threshold {
61 result |= 1u64 << bit;
62 }
63 }
64 result
65}
66
67#[cfg(test)]
68mod tests {
69 use super::*;
70
71 fn hamming_distance(a: u64, b: u64) -> u32 {
72 (a ^ b).count_ones()
73 }
74
75 #[test]
77 fn test_simhash_identical() {
78 let v = vec![(10, 1.0), (20, 0.5), (100, 2.0)];
79 let h1 = simhash_from_sparse_vector(&v);
80 let h2 = simhash_from_sparse_vector(&v);
81 assert_eq!(h1, h2);
82 }
83
84 #[test]
86 fn test_simhash_similar_vectors_close() {
87 let base: Vec<(u32, f32)> = (100..200).map(|d| (d, 1.0)).collect();
89
90 let mut similar = base[..90].to_vec();
92 similar.extend((200..210).map(|d| (d, 1.0)));
93
94 let dissimilar: Vec<(u32, f32)> = (5000..5100).map(|d| (d, 1.0)).collect();
96
97 let h_base = simhash_from_sparse_vector(&base);
98 let h_similar = simhash_from_sparse_vector(&similar);
99 let h_dissimilar = simhash_from_sparse_vector(&dissimilar);
100
101 let dist_similar = hamming_distance(h_base, h_similar);
102 let dist_dissimilar = hamming_distance(h_base, h_dissimilar);
103
104 assert!(
105 dist_similar < dist_dissimilar,
106 "Similar vectors should have smaller Hamming distance: similar={}, dissimilar={}",
107 dist_similar,
108 dist_dissimilar,
109 );
110 assert!(
112 dist_similar <= 16,
113 "90% overlap should produce Hamming distance ≤ 16, got {}",
114 dist_similar,
115 );
116 }
117
118 #[test]
120 fn test_simhash_topic_clustering() {
121 let mut rng_seed = 42u64;
122 let mut next = || -> u32 {
123 rng_seed = rng_seed.wrapping_mul(6364136223846793005).wrapping_add(1);
124 (rng_seed >> 33) as u32
125 };
126
127 let make_topic_a = |next: &mut dyn FnMut() -> u32| -> Vec<(u32, f32)> {
129 let mut v: Vec<(u32, f32)> = (0..80)
130 .map(|_| {
131 let d = next() % 500;
132 (d, 0.5 + (next() % 100) as f32 / 100.0)
133 })
134 .collect();
135 v.sort_by_key(|&(d, _)| d);
136 v.dedup_by_key(|e| e.0);
137 v
138 };
139
140 let make_topic_b = |next: &mut dyn FnMut() -> u32| -> Vec<(u32, f32)> {
142 let mut v: Vec<(u32, f32)> = (0..80)
143 .map(|_| {
144 let d = 10000 + next() % 500;
145 (d, 0.5 + (next() % 100) as f32 / 100.0)
146 })
147 .collect();
148 v.sort_by_key(|&(d, _)| d);
149 v.dedup_by_key(|e| e.0);
150 v
151 };
152
153 let topic_a: Vec<u64> = (0..10)
155 .map(|_| simhash_from_sparse_vector(&make_topic_a(&mut next)))
156 .collect();
157 let topic_b: Vec<u64> = (0..10)
158 .map(|_| simhash_from_sparse_vector(&make_topic_b(&mut next)))
159 .collect();
160
161 let mut intra_dists = Vec::new();
163 for i in 0..topic_a.len() {
164 for j in (i + 1)..topic_a.len() {
165 intra_dists.push(hamming_distance(topic_a[i], topic_a[j]));
166 }
167 for j in (i + 1)..topic_b.len() {
168 intra_dists.push(hamming_distance(topic_b[i], topic_b[j]));
169 }
170 }
171
172 let mut inter_dists = Vec::new();
174 for &ha in &topic_a {
175 for &hb in &topic_b {
176 inter_dists.push(hamming_distance(ha, hb));
177 }
178 }
179
180 let avg_intra = intra_dists.iter().sum::<u32>() as f64 / intra_dists.len() as f64;
181 let avg_inter = inter_dists.iter().sum::<u32>() as f64 / inter_dists.len() as f64;
182
183 assert!(
185 avg_intra < avg_inter,
186 "Intra-topic avg Hamming ({:.1}) should be less than inter-topic ({:.1})",
187 avg_intra,
188 avg_inter,
189 );
190 assert!(
192 avg_inter > 28.0,
193 "Inter-topic avg Hamming should be > 28 (near-random), got {:.1}",
194 avg_inter,
195 );
196 }
197
198 #[test]
200 fn test_simhash_weight_sensitivity() {
201 let mut v1: Vec<(u32, f32)> = (0..50).map(|d| (d, 0.1)).collect();
203 v1.push((100, 5.0)); let mut v2: Vec<(u32, f32)> = (0..50).map(|d| (d, 0.1)).collect();
206 v2.push((200, 5.0)); let mut v3: Vec<(u32, f32)> = (0..50).map(|d| (d, 0.1)).collect();
209 v3.push((100, 5.0)); let h1 = simhash_from_sparse_vector(&v1);
212 let h2 = simhash_from_sparse_vector(&v2);
213 let h3 = simhash_from_sparse_vector(&v3);
214
215 assert_eq!(h1, h3, "Identical vectors should produce identical hashes");
216 assert!(
217 hamming_distance(h1, h2) > 0,
218 "Different heavy dims should produce different hashes"
219 );
220 }
221
222 #[test]
224 fn test_simhash_empty() {
225 assert_eq!(simhash_from_sparse_vector(&[]), 0);
226 }
227
228 #[test]
230 fn test_majority_simhash_basic() {
231 let hashes = vec![0xFF00FF00_FF00FF00u64; 5];
233 assert_eq!(majority_simhash(&hashes), 0xFF00FF00_FF00FF00);
234
235 let hashes = vec![0b1, 0b1, 0b1, 0b0, 0b0];
237 assert_eq!(majority_simhash(&hashes) & 1, 1);
238
239 let hashes = vec![0b1, 0b1, 0b0, 0b0, 0b0];
241 assert_eq!(majority_simhash(&hashes) & 1, 0);
242 }
243
244 #[test]
246 fn test_stafford_mix_avalanche() {
247 let h0 = stafford_mix(0);
248 let h1 = stafford_mix(1);
249 let h2 = stafford_mix(2);
250
251 assert!(hamming_distance(h0, h1) > 20, "Poor avalanche: 0 vs 1");
253 assert!(hamming_distance(h1, h2) > 20, "Poor avalanche: 1 vs 2");
254
255 assert_ne!(h0, h1);
257 assert_ne!(h1, h2);
258 assert_ne!(h0, h2);
259 }
260}