//! SimHash utilities for the segment builder.
//!
//! Path: `hermes_core/segment/builder/simhash.rs`
/// SplitMix64 finalizer — good avalanche for 32→64 bit expansion.
/// Used to derive SimHash bit patterns from dimension IDs.
///
/// Based on the standard splitmix64 output function (Steele et al.).
/// Starts from `dim_id + 1` to avoid the fixed point at zero.
#[inline]
pub(crate) fn stafford_mix(dim_id: u32) -> u64 {
    // Offset by one so dim_id == 0 does not hit the all-zero fixed point
    // of the mixer (stafford_mix(0) != 0).
    let mut x = u64::from(dim_id) + 1;
    x = (x ^ (x >> 30)).wrapping_mul(0xbf58476d1ce4e5b9);
    // Second odd multiplier keeps all 64 output bits active.
    x = (x ^ (x >> 27)).wrapping_mul(0x94d049bb133111eb);
    x ^ (x >> 31)
}

17/// Compute SimHash from a sparse vector using quantized u8 impacts.
18///
19/// This matches exactly what `reorder_bmp_blob` computes from BMP block data,
20/// ensuring build-time and reorder-time SimHash are identical. Entries below
21/// `weight_threshold` or that quantize to zero impact are excluded — same as
22/// what survives into the BMP block postings.
23///
24/// Each surviving (dim_id, impact) pair contributes to a 64-bit fingerprint:
25/// - Hash dim_id with splitmix64 to get a 64-bit pseudo-random mask
26/// - For each bit, accumulate +impact or -impact based on that bit
27/// - Final hash: bit i = 1 iff accumulator[i] > 0
28#[inline]
29pub fn simhash_from_sparse_vector(
30    entries: &[(u32, f32)],
31    weight_threshold: f32,
32    max_weight: f32,
33) -> u64 {
34    let mut acc = [0i32; 64];
35    for &(dim, weight) in entries {
36        let abs_w = weight.abs();
37        if abs_w < weight_threshold {
38            continue;
39        }
40        // Same quantization as build_bmp_blob / quantize_weight
41        let impact = if max_weight <= 0.0 {
42            0u8
43        } else {
44            (abs_w / max_weight * 255.0).round().clamp(0.0, 255.0) as u8
45        };
46        if impact == 0 {
47            continue;
48        }
49        let h = stafford_mix(dim);
50        let w = impact as i32;
51        let mut mask = h;
52        for chunk in acc.chunks_exact_mut(4) {
53            chunk[0] += if mask & 1 != 0 { w } else { -w };
54            chunk[1] += if mask & 2 != 0 { w } else { -w };
55            chunk[2] += if mask & 4 != 0 { w } else { -w };
56            chunk[3] += if mask & 8 != 0 { w } else { -w };
57            mask >>= 4;
58        }
59    }
60    let mut hash = 0u64;
61    for (bit, &a) in acc.iter().enumerate() {
62        if a > 0 {
63            hash |= 1u64 << bit;
64        }
65    }
66    hash
67}
68
/// Compute per-bit majority vote SimHash from a set of hashes.
///
/// For each of the 64 bit positions, sets the output bit to 1 if more than
/// half the input hashes have that bit set. This produces the "centroid"
/// in SimHash space — the representative that minimizes Hamming distance
/// to the input set.
pub fn majority_simhash(hashes: &[u64]) -> u64 {
    if hashes.is_empty() {
        return 0;
    }
    // A bit wins only on a strict majority; an exact half-split stays 0.
    let needed = hashes.len() / 2;
    (0..64).fold(0u64, |acc, bit| {
        let votes = hashes.iter().filter(|&&h| (h >> bit) & 1 == 1).count();
        if votes > needed {
            acc | (1u64 << bit)
        } else {
            acc
        }
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Number of bit positions where `a` and `b` disagree.
    fn hamming_distance(a: u64, b: u64) -> u32 {
        (a ^ b).count_ones()
    }

    /// Identical vectors produce the same hash.
    #[test]
    fn test_simhash_identical() {
        let v = vec![(10, 1.0), (20, 0.5), (100, 2.0)];
        let first = simhash_from_sparse_vector(&v, 0.0, 5.0);
        let second = simhash_from_sparse_vector(&v, 0.0, 5.0);
        assert_eq!(first, second);
    }

    /// Similar vectors (large overlap) produce close hashes (small Hamming distance).
    #[test]
    fn test_simhash_similar_vectors_close() {
        // Base vector: dims 100..200 with weight 1.0.
        let base: Vec<(u32, f32)> = (100..200).map(|d| (d, 1.0)).collect();

        // Similar: 90% overlap (dims 100..190 shared, 10 new dims).
        let mut similar = base[..90].to_vec();
        similar.extend((200..210).map(|d| (d, 1.0)));

        // Dissimilar: completely different dims.
        let dissimilar: Vec<(u32, f32)> = (5000..5100).map(|d| (d, 1.0)).collect();

        let hash_base = simhash_from_sparse_vector(&base, 0.0, 5.0);
        let hash_similar = simhash_from_sparse_vector(&similar, 0.0, 5.0);
        let hash_dissimilar = simhash_from_sparse_vector(&dissimilar, 0.0, 5.0);

        let dist_similar = hamming_distance(hash_base, hash_similar);
        let dist_dissimilar = hamming_distance(hash_base, hash_dissimilar);

        assert!(
            dist_similar < dist_dissimilar,
            "Similar vectors should have smaller Hamming distance: similar={}, dissimilar={}",
            dist_similar,
            dist_dissimilar,
        );
        // 90% overlap should yield very close hashes (< 16 out of 64 bits differ)
        assert!(
            dist_similar <= 16,
            "90% overlap should produce Hamming distance ≤ 16, got {}",
            dist_similar,
        );
    }

    /// Vectors from the same "topic" (shared dim range) cluster together.
    #[test]
    fn test_simhash_topic_clustering() {
        // Tiny deterministic LCG so the test needs no external RNG crate.
        let mut state = 42u64;
        let mut next = || -> u32 {
            state = state.wrapping_mul(6364136223846793005).wrapping_add(1);
            (state >> 33) as u32
        };

        // Draw 80 (dim, weight) pairs with dims in [lo, lo + 500), then sort
        // and dedup by dim. Draw order (dim, then weight) matches the RNG
        // consumption of the original topic generators.
        let sample = |lo: u32, next: &mut dyn FnMut() -> u32| -> Vec<(u32, f32)> {
            let mut v: Vec<(u32, f32)> = Vec::with_capacity(80);
            for _ in 0..80 {
                let d = lo + next() % 500;
                let w = 0.5 + (next() % 100) as f32 / 100.0;
                v.push((d, w));
            }
            v.sort_by_key(|&(d, _)| d);
            v.dedup_by_key(|e| e.0);
            v
        };

        // Generate 10 vectors from each topic: A uses dims 0..500,
        // B uses dims 10000..10500 (disjoint ranges).
        let topic_a: Vec<u64> = (0..10)
            .map(|_| simhash_from_sparse_vector(&sample(0, &mut next), 0.0, 5.0))
            .collect();
        let topic_b: Vec<u64> = (0..10)
            .map(|_| simhash_from_sparse_vector(&sample(10000, &mut next), 0.0, 5.0))
            .collect();

        // Intra-topic distances should be small.
        let mut intra_dists = Vec::new();
        for hashes in [&topic_a, &topic_b] {
            for i in 0..hashes.len() {
                for j in (i + 1)..hashes.len() {
                    intra_dists.push(hamming_distance(hashes[i], hashes[j]));
                }
            }
        }

        // Inter-topic distances should be large.
        let inter_dists: Vec<u32> = topic_a
            .iter()
            .flat_map(|&ha| topic_b.iter().map(move |&hb| hamming_distance(ha, hb)))
            .collect();

        let avg_intra = intra_dists.iter().sum::<u32>() as f64 / intra_dists.len() as f64;
        let avg_inter = inter_dists.iter().sum::<u32>() as f64 / inter_dists.len() as f64;

        // Key property: same-topic vectors are closer than cross-topic vectors
        assert!(
            avg_intra < avg_inter,
            "Intra-topic avg Hamming ({:.1}) should be less than inter-topic ({:.1})",
            avg_intra,
            avg_inter,
        );
        // Inter-topic with disjoint dim ranges should be near-random (~32 bits)
        assert!(
            avg_inter > 28.0,
            "Inter-topic avg Hamming should be > 28 (near-random), got {:.1}",
            avg_inter,
        );
    }

    /// Weight magnitude affects SimHash — heavy dims dominate.
    #[test]
    fn test_simhash_weight_sensitivity() {
        // Shared light dims 0..50 plus one heavy dim that varies per vector.
        let with_heavy = |heavy_dim: u32| -> Vec<(u32, f32)> {
            let mut v: Vec<(u32, f32)> = (0..50).map(|d| (d, 0.1)).collect();
            v.push((heavy_dim, 5.0));
            v
        };

        let h1 = simhash_from_sparse_vector(&with_heavy(100), 0.0, 5.0); // heavy dim A
        let h2 = simhash_from_sparse_vector(&with_heavy(200), 0.0, 5.0); // heavy dim B
        let h3 = simhash_from_sparse_vector(&with_heavy(100), 0.0, 5.0); // heavy dim A again

        assert_eq!(h1, h3, "Identical vectors should produce identical hashes");
        assert!(
            hamming_distance(h1, h2) > 0,
            "Different heavy dims should produce different hashes"
        );
    }

    /// Empty vector produces zero hash.
    #[test]
    fn test_simhash_empty() {
        assert_eq!(simhash_from_sparse_vector(&[], 0.0, 5.0), 0);
    }

    /// Majority SimHash: basic correctness.
    #[test]
    fn test_majority_simhash_basic() {
        // All same hash → majority = that hash
        assert_eq!(
            majority_simhash(&[0xFF00FF00_FF00FF00u64; 5]),
            0xFF00FF00_FF00FF00
        );

        // 3 votes for bit 0, 2 votes against → bit 0 set
        assert_eq!(majority_simhash(&[0b1, 0b1, 0b1, 0b0, 0b0]) & 1, 1);

        // 2 votes for bit 0, 3 votes against → bit 0 unset
        assert_eq!(majority_simhash(&[0b1, 0b1, 0b0, 0b0, 0b0]) & 1, 0);
    }

    /// Stafford mix produces distinct values for adjacent dim IDs.
    #[test]
    fn test_stafford_mix_avalanche() {
        let hashes: Vec<u64> = (0u32..3).map(stafford_mix).collect();

        // Adjacent inputs should produce very different outputs (good avalanche)
        assert!(hamming_distance(hashes[0], hashes[1]) > 20, "Poor avalanche: 0 vs 1");
        assert!(hamming_distance(hashes[1], hashes[2]) > 20, "Poor avalanche: 1 vs 2");

        // All distinct
        assert_ne!(hashes[0], hashes[1]);
        assert_ne!(hashes[1], hashes[2]);
        assert_ne!(hashes[0], hashes[2]);
    }

    /// stafford_mix(0) must not be zero (fixed-point bug) and bit 0 must be active.
    #[test]
    fn test_stafford_mix_no_fixed_point() {
        assert_ne!(
            stafford_mix(0),
            0,
            "stafford_mix(0) must not be a fixed point"
        );

        // Bit 0 should be set for at least some dim_ids (not always zero)
        let bit0_set = (0u32..100).filter(|&d| stafford_mix(d) & 1 != 0).count();
        assert!(
            bit0_set > 30,
            "Bit 0 should be set ~50% of the time, got {}/100",
            bit0_set
        );
    }
}