hermes_core/segment/builder/
simhash.rs1#[inline]
7pub(crate) fn stafford_mix(dim_id: u32) -> u64 {
8 let mut h = dim_id as u64 + 1; h ^= h >> 30;
10 h = h.wrapping_mul(0xbf58476d1ce4e5b9);
11 h ^= h >> 27;
12 h = h.wrapping_mul(0x94d049bb133111eb); h ^= h >> 31;
14 h
15}
16
17#[inline]
29pub fn simhash_from_sparse_vector(
30 entries: &[(u32, f32)],
31 weight_threshold: f32,
32 max_weight: f32,
33) -> u64 {
34 let mut acc = [0i32; 64];
35 for &(dim, weight) in entries {
36 let abs_w = weight.abs();
37 if abs_w < weight_threshold {
38 continue;
39 }
40 let impact = if max_weight <= 0.0 {
42 0u8
43 } else {
44 (abs_w / max_weight * 255.0).round().clamp(0.0, 255.0) as u8
45 };
46 if impact == 0 {
47 continue;
48 }
49 let h = stafford_mix(dim);
50 let w = impact as i32;
51 let mut mask = h;
52 for chunk in acc.chunks_exact_mut(4) {
53 chunk[0] += if mask & 1 != 0 { w } else { -w };
54 chunk[1] += if mask & 2 != 0 { w } else { -w };
55 chunk[2] += if mask & 4 != 0 { w } else { -w };
56 chunk[3] += if mask & 8 != 0 { w } else { -w };
57 mask >>= 4;
58 }
59 }
60 let mut hash = 0u64;
61 for (bit, &a) in acc.iter().enumerate() {
62 if a > 0 {
63 hash |= 1u64 << bit;
64 }
65 }
66 hash
67}
68
69pub fn majority_simhash(hashes: &[u64]) -> u64 {
76 if hashes.is_empty() {
77 return 0;
78 }
79 let threshold = hashes.len() / 2;
80 let mut result: u64 = 0;
81 for bit in 0..64 {
82 let count = hashes.iter().filter(|&&h| h & (1u64 << bit) != 0).count();
83 if count > threshold {
84 result |= 1u64 << bit;
85 }
86 }
87 result
88}
89
90#[cfg(test)]
91mod tests {
92 use super::*;
93
94 fn hamming_distance(a: u64, b: u64) -> u32 {
95 (a ^ b).count_ones()
96 }
97
98 #[test]
100 fn test_simhash_identical() {
101 let v = vec![(10, 1.0), (20, 0.5), (100, 2.0)];
102 let h1 = simhash_from_sparse_vector(&v, 0.0, 5.0);
103 let h2 = simhash_from_sparse_vector(&v, 0.0, 5.0);
104 assert_eq!(h1, h2);
105 }
106
107 #[test]
109 fn test_simhash_similar_vectors_close() {
110 let base: Vec<(u32, f32)> = (100..200).map(|d| (d, 1.0)).collect();
112
113 let mut similar = base[..90].to_vec();
115 similar.extend((200..210).map(|d| (d, 1.0)));
116
117 let dissimilar: Vec<(u32, f32)> = (5000..5100).map(|d| (d, 1.0)).collect();
119
120 let h_base = simhash_from_sparse_vector(&base, 0.0, 5.0);
121 let h_similar = simhash_from_sparse_vector(&similar, 0.0, 5.0);
122 let h_dissimilar = simhash_from_sparse_vector(&dissimilar, 0.0, 5.0);
123
124 let dist_similar = hamming_distance(h_base, h_similar);
125 let dist_dissimilar = hamming_distance(h_base, h_dissimilar);
126
127 assert!(
128 dist_similar < dist_dissimilar,
129 "Similar vectors should have smaller Hamming distance: similar={}, dissimilar={}",
130 dist_similar,
131 dist_dissimilar,
132 );
133 assert!(
135 dist_similar <= 16,
136 "90% overlap should produce Hamming distance ≤ 16, got {}",
137 dist_similar,
138 );
139 }
140
141 #[test]
143 fn test_simhash_topic_clustering() {
144 let mut rng_seed = 42u64;
145 let mut next = || -> u32 {
146 rng_seed = rng_seed.wrapping_mul(6364136223846793005).wrapping_add(1);
147 (rng_seed >> 33) as u32
148 };
149
150 let make_topic_a = |next: &mut dyn FnMut() -> u32| -> Vec<(u32, f32)> {
152 let mut v: Vec<(u32, f32)> = (0..80)
153 .map(|_| {
154 let d = next() % 500;
155 (d, 0.5 + (next() % 100) as f32 / 100.0)
156 })
157 .collect();
158 v.sort_by_key(|&(d, _)| d);
159 v.dedup_by_key(|e| e.0);
160 v
161 };
162
163 let make_topic_b = |next: &mut dyn FnMut() -> u32| -> Vec<(u32, f32)> {
165 let mut v: Vec<(u32, f32)> = (0..80)
166 .map(|_| {
167 let d = 10000 + next() % 500;
168 (d, 0.5 + (next() % 100) as f32 / 100.0)
169 })
170 .collect();
171 v.sort_by_key(|&(d, _)| d);
172 v.dedup_by_key(|e| e.0);
173 v
174 };
175
176 let topic_a: Vec<u64> = (0..10)
178 .map(|_| simhash_from_sparse_vector(&make_topic_a(&mut next), 0.0, 5.0))
179 .collect();
180 let topic_b: Vec<u64> = (0..10)
181 .map(|_| simhash_from_sparse_vector(&make_topic_b(&mut next), 0.0, 5.0))
182 .collect();
183
184 let mut intra_dists = Vec::new();
186 for i in 0..topic_a.len() {
187 for j in (i + 1)..topic_a.len() {
188 intra_dists.push(hamming_distance(topic_a[i], topic_a[j]));
189 }
190 for j in (i + 1)..topic_b.len() {
191 intra_dists.push(hamming_distance(topic_b[i], topic_b[j]));
192 }
193 }
194
195 let mut inter_dists = Vec::new();
197 for &ha in &topic_a {
198 for &hb in &topic_b {
199 inter_dists.push(hamming_distance(ha, hb));
200 }
201 }
202
203 let avg_intra = intra_dists.iter().sum::<u32>() as f64 / intra_dists.len() as f64;
204 let avg_inter = inter_dists.iter().sum::<u32>() as f64 / inter_dists.len() as f64;
205
206 assert!(
208 avg_intra < avg_inter,
209 "Intra-topic avg Hamming ({:.1}) should be less than inter-topic ({:.1})",
210 avg_intra,
211 avg_inter,
212 );
213 assert!(
215 avg_inter > 28.0,
216 "Inter-topic avg Hamming should be > 28 (near-random), got {:.1}",
217 avg_inter,
218 );
219 }
220
221 #[test]
223 fn test_simhash_weight_sensitivity() {
224 let mut v1: Vec<(u32, f32)> = (0..50).map(|d| (d, 0.1)).collect();
226 v1.push((100, 5.0)); let mut v2: Vec<(u32, f32)> = (0..50).map(|d| (d, 0.1)).collect();
229 v2.push((200, 5.0)); let mut v3: Vec<(u32, f32)> = (0..50).map(|d| (d, 0.1)).collect();
232 v3.push((100, 5.0)); let h1 = simhash_from_sparse_vector(&v1, 0.0, 5.0);
235 let h2 = simhash_from_sparse_vector(&v2, 0.0, 5.0);
236 let h3 = simhash_from_sparse_vector(&v3, 0.0, 5.0);
237
238 assert_eq!(h1, h3, "Identical vectors should produce identical hashes");
239 assert!(
240 hamming_distance(h1, h2) > 0,
241 "Different heavy dims should produce different hashes"
242 );
243 }
244
245 #[test]
247 fn test_simhash_empty() {
248 assert_eq!(simhash_from_sparse_vector(&[], 0.0, 5.0), 0);
249 }
250
251 #[test]
253 fn test_majority_simhash_basic() {
254 let hashes = vec![0xFF00FF00_FF00FF00u64; 5];
256 assert_eq!(majority_simhash(&hashes), 0xFF00FF00_FF00FF00);
257
258 let hashes = vec![0b1, 0b1, 0b1, 0b0, 0b0];
260 assert_eq!(majority_simhash(&hashes) & 1, 1);
261
262 let hashes = vec![0b1, 0b1, 0b0, 0b0, 0b0];
264 assert_eq!(majority_simhash(&hashes) & 1, 0);
265 }
266
267 #[test]
269 fn test_stafford_mix_avalanche() {
270 let h0 = stafford_mix(0);
271 let h1 = stafford_mix(1);
272 let h2 = stafford_mix(2);
273
274 assert!(hamming_distance(h0, h1) > 20, "Poor avalanche: 0 vs 1");
276 assert!(hamming_distance(h1, h2) > 20, "Poor avalanche: 1 vs 2");
277
278 assert_ne!(h0, h1);
280 assert_ne!(h1, h2);
281 assert_ne!(h0, h2);
282 }
283
284 #[test]
286 fn test_stafford_mix_no_fixed_point() {
287 assert_ne!(
288 stafford_mix(0),
289 0,
290 "stafford_mix(0) must not be a fixed point"
291 );
292
293 let bit0_set = (0..100).filter(|&d| stafford_mix(d) & 1 != 0).count();
295 assert!(
296 bit0_set > 30,
297 "Bit 0 should be set ~50% of the time, got {}/100",
298 bit0_set
299 );
300 }
301}