hermes_core/segment/builder/
simhash.rs1#[inline]
7pub(crate) fn stafford_mix(dim_id: u32) -> u64 {
8 let mut h = dim_id as u64 + 1; h ^= h >> 30;
10 h = h.wrapping_mul(0xbf58476d1ce4e5b9);
11 h ^= h >> 27;
12 h = h.wrapping_mul(0x94d049bb133111eb); h ^= h >> 31;
14 h
15}
16
17#[inline]
22pub(crate) fn simhash_accumulate(acc: &mut [i32; 64], dim_id: u32, impact: i32) {
23 let mut mask = stafford_mix(dim_id);
24 for chunk in acc.chunks_exact_mut(4) {
25 chunk[0] += if mask & 1 != 0 { impact } else { -impact };
26 chunk[1] += if mask & 2 != 0 { impact } else { -impact };
27 chunk[2] += if mask & 4 != 0 { impact } else { -impact };
28 chunk[3] += if mask & 8 != 0 { impact } else { -impact };
29 mask >>= 4;
30 }
31}
32
33#[inline]
36pub(crate) fn simhash_finalize(acc: &[i32; 64]) -> u64 {
37 let mut hash = 0u64;
38 for (bit, &a) in acc.iter().enumerate() {
39 if a > 0 {
40 hash |= 1u64 << bit;
41 }
42 }
43 hash
44}
45
46#[inline]
53pub fn simhash_from_sparse_vector(
54 entries: &[(u32, f32)],
55 weight_threshold: f32,
56 max_weight: f32,
57) -> u64 {
58 let mut acc = [0i32; 64];
59 for &(dim, weight) in entries {
60 let abs_w = weight.abs();
61 if abs_w < weight_threshold {
62 continue;
63 }
64 let impact = if max_weight <= 0.0 {
66 0u8
67 } else {
68 (abs_w / max_weight * 255.0).round().clamp(0.0, 255.0) as u8
69 };
70 if impact == 0 {
71 continue;
72 }
73 simhash_accumulate(&mut acc, dim, impact as i32);
74 }
75 simhash_finalize(&acc)
76}
77
78pub fn majority_simhash(hashes: &[u64]) -> u64 {
85 if hashes.is_empty() {
86 return 0;
87 }
88 let threshold = hashes.len() / 2;
89 let mut result: u64 = 0;
90 for bit in 0..64 {
91 let count = hashes.iter().filter(|&&h| h & (1u64 << bit) != 0).count();
92 if count > threshold {
93 result |= 1u64 << bit;
94 }
95 }
96 result
97}
98
99#[cfg(test)]
100mod tests {
101 use super::*;
102
103 fn hamming_distance(a: u64, b: u64) -> u32 {
104 (a ^ b).count_ones()
105 }
106
107 #[test]
109 fn test_simhash_identical() {
110 let v = vec![(10, 1.0), (20, 0.5), (100, 2.0)];
111 let h1 = simhash_from_sparse_vector(&v, 0.0, 5.0);
112 let h2 = simhash_from_sparse_vector(&v, 0.0, 5.0);
113 assert_eq!(h1, h2);
114 }
115
116 #[test]
118 fn test_simhash_similar_vectors_close() {
119 let base: Vec<(u32, f32)> = (100..200).map(|d| (d, 1.0)).collect();
121
122 let mut similar = base[..90].to_vec();
124 similar.extend((200..210).map(|d| (d, 1.0)));
125
126 let dissimilar: Vec<(u32, f32)> = (5000..5100).map(|d| (d, 1.0)).collect();
128
129 let h_base = simhash_from_sparse_vector(&base, 0.0, 5.0);
130 let h_similar = simhash_from_sparse_vector(&similar, 0.0, 5.0);
131 let h_dissimilar = simhash_from_sparse_vector(&dissimilar, 0.0, 5.0);
132
133 let dist_similar = hamming_distance(h_base, h_similar);
134 let dist_dissimilar = hamming_distance(h_base, h_dissimilar);
135
136 assert!(
137 dist_similar < dist_dissimilar,
138 "Similar vectors should have smaller Hamming distance: similar={}, dissimilar={}",
139 dist_similar,
140 dist_dissimilar,
141 );
142 assert!(
144 dist_similar <= 16,
145 "90% overlap should produce Hamming distance ≤ 16, got {}",
146 dist_similar,
147 );
148 }
149
150 #[test]
152 fn test_simhash_topic_clustering() {
153 let mut rng_seed = 42u64;
154 let mut next = || -> u32 {
155 rng_seed = rng_seed.wrapping_mul(6364136223846793005).wrapping_add(1);
156 (rng_seed >> 33) as u32
157 };
158
159 let make_topic_a = |next: &mut dyn FnMut() -> u32| -> Vec<(u32, f32)> {
161 let mut v: Vec<(u32, f32)> = (0..80)
162 .map(|_| {
163 let d = next() % 500;
164 (d, 0.5 + (next() % 100) as f32 / 100.0)
165 })
166 .collect();
167 v.sort_by_key(|&(d, _)| d);
168 v.dedup_by_key(|e| e.0);
169 v
170 };
171
172 let make_topic_b = |next: &mut dyn FnMut() -> u32| -> Vec<(u32, f32)> {
174 let mut v: Vec<(u32, f32)> = (0..80)
175 .map(|_| {
176 let d = 10000 + next() % 500;
177 (d, 0.5 + (next() % 100) as f32 / 100.0)
178 })
179 .collect();
180 v.sort_by_key(|&(d, _)| d);
181 v.dedup_by_key(|e| e.0);
182 v
183 };
184
185 let topic_a: Vec<u64> = (0..10)
187 .map(|_| simhash_from_sparse_vector(&make_topic_a(&mut next), 0.0, 5.0))
188 .collect();
189 let topic_b: Vec<u64> = (0..10)
190 .map(|_| simhash_from_sparse_vector(&make_topic_b(&mut next), 0.0, 5.0))
191 .collect();
192
193 let mut intra_dists = Vec::new();
195 for i in 0..topic_a.len() {
196 for j in (i + 1)..topic_a.len() {
197 intra_dists.push(hamming_distance(topic_a[i], topic_a[j]));
198 }
199 for j in (i + 1)..topic_b.len() {
200 intra_dists.push(hamming_distance(topic_b[i], topic_b[j]));
201 }
202 }
203
204 let mut inter_dists = Vec::new();
206 for &ha in &topic_a {
207 for &hb in &topic_b {
208 inter_dists.push(hamming_distance(ha, hb));
209 }
210 }
211
212 let avg_intra = intra_dists.iter().sum::<u32>() as f64 / intra_dists.len() as f64;
213 let avg_inter = inter_dists.iter().sum::<u32>() as f64 / inter_dists.len() as f64;
214
215 assert!(
217 avg_intra < avg_inter,
218 "Intra-topic avg Hamming ({:.1}) should be less than inter-topic ({:.1})",
219 avg_intra,
220 avg_inter,
221 );
222 assert!(
224 avg_inter > 28.0,
225 "Inter-topic avg Hamming should be > 28 (near-random), got {:.1}",
226 avg_inter,
227 );
228 }
229
230 #[test]
232 fn test_simhash_weight_sensitivity() {
233 let mut v1: Vec<(u32, f32)> = (0..50).map(|d| (d, 0.1)).collect();
235 v1.push((100, 5.0)); let mut v2: Vec<(u32, f32)> = (0..50).map(|d| (d, 0.1)).collect();
238 v2.push((200, 5.0)); let mut v3: Vec<(u32, f32)> = (0..50).map(|d| (d, 0.1)).collect();
241 v3.push((100, 5.0)); let h1 = simhash_from_sparse_vector(&v1, 0.0, 5.0);
244 let h2 = simhash_from_sparse_vector(&v2, 0.0, 5.0);
245 let h3 = simhash_from_sparse_vector(&v3, 0.0, 5.0);
246
247 assert_eq!(h1, h3, "Identical vectors should produce identical hashes");
248 assert!(
249 hamming_distance(h1, h2) > 0,
250 "Different heavy dims should produce different hashes"
251 );
252 }
253
254 #[test]
256 fn test_simhash_empty() {
257 assert_eq!(simhash_from_sparse_vector(&[], 0.0, 5.0), 0);
258 }
259
260 #[test]
262 fn test_majority_simhash_basic() {
263 let hashes = vec![0xFF00FF00_FF00FF00u64; 5];
265 assert_eq!(majority_simhash(&hashes), 0xFF00FF00_FF00FF00);
266
267 let hashes = vec![0b1, 0b1, 0b1, 0b0, 0b0];
269 assert_eq!(majority_simhash(&hashes) & 1, 1);
270
271 let hashes = vec![0b1, 0b1, 0b0, 0b0, 0b0];
273 assert_eq!(majority_simhash(&hashes) & 1, 0);
274 }
275
276 #[test]
278 fn test_stafford_mix_avalanche() {
279 let h0 = stafford_mix(0);
280 let h1 = stafford_mix(1);
281 let h2 = stafford_mix(2);
282
283 assert!(hamming_distance(h0, h1) > 20, "Poor avalanche: 0 vs 1");
285 assert!(hamming_distance(h1, h2) > 20, "Poor avalanche: 1 vs 2");
286
287 assert_ne!(h0, h1);
289 assert_ne!(h1, h2);
290 assert_ne!(h0, h2);
291 }
292
293 #[test]
295 fn test_stafford_mix_no_fixed_point() {
296 assert_ne!(
297 stafford_mix(0),
298 0,
299 "stafford_mix(0) must not be a fixed point"
300 );
301
302 let bit0_set = (0..100).filter(|&d| stafford_mix(d) & 1 != 0).count();
304 assert!(
305 bit0_set > 30,
306 "Bit 0 should be set ~50% of the time, got {}/100",
307 bit0_set
308 );
309 }
310}