//! hermes_core/segment/builder/simhash.rs
//!
//! SimHash fingerprinting for sparse vectors: per-dimension bit-mask hashing,
//! weighted bit accumulation, and per-bit majority voting — used for
//! block-reorder clustering of similar documents.
/// SplitMix64 finalizer (Stafford "mix13" constants) — expands a 32-bit
/// dimension ID into a 64-bit pseudo-random mask with full avalanche.
/// Used to derive SimHash bit patterns from dimension IDs.
///
/// The golden-ratio offset before mixing ensures `dim_id == 0` does not
/// collapse to an all-zero mask (a multiply-only mixer fixes 0, which would
/// make dimension 0 deterministically vote -weight on every SimHash bit).
/// The trailing xor-shift completes the avalanche so the *low* mask bits —
/// which the SimHash loop consumes first — are as well mixed as the high
/// ones (a multiply alone leaves its low output bits weakly mixed).
#[inline]
pub(crate) fn stafford_mix(dim_id: u32) -> u64 {
    // SplitMix64 golden-ratio increment; guarantees a nonzero mixer input.
    let mut h = (dim_id as u64).wrapping_add(0x9e37_79b9_7f4a_7c15);
    // Stafford mix13: xorshift-multiply twice, then a final xorshift.
    h = (h ^ (h >> 30)).wrapping_mul(0xbf58_476d_1ce4_e5b9);
    h = (h ^ (h >> 27)).wrapping_mul(0x94d0_49bb_1331_11eb);
    h ^ (h >> 31)
}
11
12/// Compute SimHash from a sparse vector's dimension IDs and weights.
13///
14/// Each (dim_id, weight) pair contributes to a 64-bit fingerprint:
15/// - Hash dim_id with Stafford mix13 to get a 64-bit pseudo-random mask
16/// - For each bit, accumulate +weight or -weight based on that bit
17/// - Final hash: bit i = 1 iff accumulator[i] > 0
18///
19/// Documents with similar dimension sets produce similar hashes
20/// (small Hamming distance), enabling block-reorder clustering.
21#[inline]
22pub fn simhash_from_sparse_vector(entries: &[(u32, f32)]) -> u64 {
23    let mut acc = [0.0f64; 64];
24    for &(dim, weight) in entries {
25        let h = stafford_mix(dim);
26        let w = weight as f64;
27        // Process 4 bits at a time to reduce loop overhead
28        let mut mask = h;
29        for chunk in acc.chunks_exact_mut(4) {
30            chunk[0] += if mask & 1 != 0 { w } else { -w };
31            chunk[1] += if mask & 2 != 0 { w } else { -w };
32            chunk[2] += if mask & 4 != 0 { w } else { -w };
33            chunk[3] += if mask & 8 != 0 { w } else { -w };
34            mask >>= 4;
35        }
36    }
37    let mut hash = 0u64;
38    for (bit, &a) in acc.iter().enumerate() {
39        if a > 0.0 {
40            hash |= 1u64 << bit;
41        }
42    }
43    hash
44}
45
/// Per-bit majority vote over a set of SimHashes.
///
/// Bit `i` of the result is set when strictly more than half of the input
/// hashes have bit `i` set; ties on even-sized inputs resolve to 0. The
/// result is the "centroid" in SimHash space — the representative that
/// minimizes total Hamming distance to the input set. An empty slice
/// yields 0.
pub fn majority_simhash(hashes: &[u64]) -> u64 {
    if hashes.is_empty() {
        return 0;
    }
    // Strict majority: a bit needs more than half the votes.
    let half = hashes.len() / 2;
    (0..64u32).fold(0u64, |acc, bit| {
        let votes = hashes.iter().filter(|&&h| (h >> bit) & 1 == 1).count();
        if votes > half {
            acc | (1u64 << bit)
        } else {
            acc
        }
    })
}
66
#[cfg(test)]
mod tests {
    use super::*;

    /// Number of differing bits between two 64-bit hashes.
    fn hamming_distance(a: u64, b: u64) -> u32 {
        (a ^ b).count_ones()
    }

    /// Identical vectors produce the same hash.
    #[test]
    fn test_simhash_identical() {
        let v = vec![(10, 1.0), (20, 0.5), (100, 2.0)];
        let h1 = simhash_from_sparse_vector(&v);
        let h2 = simhash_from_sparse_vector(&v);
        assert_eq!(h1, h2);
    }

    /// Similar vectors (large overlap) produce close hashes (small Hamming distance).
    #[test]
    fn test_simhash_similar_vectors_close() {
        // Base vector: dims 100..200 with weight 1.0
        let base: Vec<(u32, f32)> = (100..200).map(|d| (d, 1.0)).collect();

        // Similar: 90% overlap (dims 100..190 shared, 10 new dims)
        let mut similar = base[..90].to_vec();
        similar.extend((200..210).map(|d| (d, 1.0)));

        // Dissimilar: completely different dims
        let dissimilar: Vec<(u32, f32)> = (5000..5100).map(|d| (d, 1.0)).collect();

        let h_base = simhash_from_sparse_vector(&base);
        let h_similar = simhash_from_sparse_vector(&similar);
        let h_dissimilar = simhash_from_sparse_vector(&dissimilar);

        let dist_similar = hamming_distance(h_base, h_similar);
        let dist_dissimilar = hamming_distance(h_base, h_dissimilar);

        assert!(
            dist_similar < dist_dissimilar,
            "Similar vectors should have smaller Hamming distance: similar={}, dissimilar={}",
            dist_similar,
            dist_dissimilar,
        );
        // 90% overlap should yield very close hashes (< 16 out of 64 bits differ)
        assert!(
            dist_similar <= 16,
            "90% overlap should produce Hamming distance ≤ 16, got {}",
            dist_similar,
        );
    }

    /// Vectors from the same "topic" (shared dim range) cluster together.
    #[test]
    fn test_simhash_topic_clustering() {
        // Deterministic LCG (MMIX/Knuth multiplier) so the test needs no
        // external rng crate and is fully reproducible across runs.
        let mut rng_seed = 42u64;
        let mut next = || -> u32 {
            rng_seed = rng_seed.wrapping_mul(6364136223846793005).wrapping_add(1);
            // Discard the weak low bits of the LCG state.
            (rng_seed >> 33) as u32
        };

        // Topic A: dims 0..500
        // The factory takes `&mut dyn FnMut` so both topic factories can
        // share the single LCG closure (which uniquely borrows `rng_seed`).
        let make_topic_a = |next: &mut dyn FnMut() -> u32| -> Vec<(u32, f32)> {
            let mut v: Vec<(u32, f32)> = (0..80)
                .map(|_| {
                    let d = next() % 500;
                    (d, 0.5 + (next() % 100) as f32 / 100.0)
                })
                .collect();
            // Random draws may repeat a dim; sort + dedup keeps the first
            // weight per dim and yields a canonical sparse vector.
            v.sort_by_key(|&(d, _)| d);
            v.dedup_by_key(|e| e.0);
            v
        };

        // Topic B: dims 10000..10500 (disjoint from topic A's range)
        let make_topic_b = |next: &mut dyn FnMut() -> u32| -> Vec<(u32, f32)> {
            let mut v: Vec<(u32, f32)> = (0..80)
                .map(|_| {
                    let d = 10000 + next() % 500;
                    (d, 0.5 + (next() % 100) as f32 / 100.0)
                })
                .collect();
            v.sort_by_key(|&(d, _)| d);
            v.dedup_by_key(|e| e.0);
            v
        };

        // Generate 10 vectors from each topic
        let topic_a: Vec<u64> = (0..10)
            .map(|_| simhash_from_sparse_vector(&make_topic_a(&mut next)))
            .collect();
        let topic_b: Vec<u64> = (0..10)
            .map(|_| simhash_from_sparse_vector(&make_topic_b(&mut next)))
            .collect();

        // Intra-topic distances should be small
        // NOTE: the inner `topic_b` loop reuses index `i`, which relies on
        // topic_a.len() == topic_b.len() (both are 10 here).
        let mut intra_dists = Vec::new();
        for i in 0..topic_a.len() {
            for j in (i + 1)..topic_a.len() {
                intra_dists.push(hamming_distance(topic_a[i], topic_a[j]));
            }
            for j in (i + 1)..topic_b.len() {
                intra_dists.push(hamming_distance(topic_b[i], topic_b[j]));
            }
        }

        // Inter-topic distances should be large
        let mut inter_dists = Vec::new();
        for &ha in &topic_a {
            for &hb in &topic_b {
                inter_dists.push(hamming_distance(ha, hb));
            }
        }

        let avg_intra = intra_dists.iter().sum::<u32>() as f64 / intra_dists.len() as f64;
        let avg_inter = inter_dists.iter().sum::<u32>() as f64 / inter_dists.len() as f64;

        // Key property: same-topic vectors are closer than cross-topic vectors
        assert!(
            avg_intra < avg_inter,
            "Intra-topic avg Hamming ({:.1}) should be less than inter-topic ({:.1})",
            avg_intra,
            avg_inter,
        );
        // Inter-topic with disjoint dim ranges should be near-random (~32 bits)
        assert!(
            avg_inter > 28.0,
            "Inter-topic avg Hamming should be > 28 (near-random), got {:.1}",
            avg_inter,
        );
    }

    /// Weight magnitude affects SimHash — heavy dims dominate.
    #[test]
    fn test_simhash_weight_sensitivity() {
        // Two vectors share dims 0..50 but differ on dim 100
        let mut v1: Vec<(u32, f32)> = (0..50).map(|d| (d, 0.1)).collect();
        v1.push((100, 5.0)); // heavy dim A

        let mut v2: Vec<(u32, f32)> = (0..50).map(|d| (d, 0.1)).collect();
        v2.push((200, 5.0)); // heavy dim B

        let mut v3: Vec<(u32, f32)> = (0..50).map(|d| (d, 0.1)).collect();
        v3.push((100, 5.0)); // heavy dim A (same as v1)

        let h1 = simhash_from_sparse_vector(&v1);
        let h2 = simhash_from_sparse_vector(&v2);
        let h3 = simhash_from_sparse_vector(&v3);

        assert_eq!(h1, h3, "Identical vectors should produce identical hashes");
        // The heavy dim (weight 5.0) outweighs the 50 light dims (0.1 each)
        // on the bits where their votes conflict, so swapping it must flip
        // at least one output bit.
        assert!(
            hamming_distance(h1, h2) > 0,
            "Different heavy dims should produce different hashes"
        );
    }

    /// Empty vector produces zero hash.
    #[test]
    fn test_simhash_empty() {
        assert_eq!(simhash_from_sparse_vector(&[]), 0);
    }

    /// Majority SimHash: basic correctness.
    #[test]
    fn test_majority_simhash_basic() {
        // All same hash → majority = that hash
        let hashes = vec![0xFF00FF00_FF00FF00u64; 5];
        assert_eq!(majority_simhash(&hashes), 0xFF00FF00_FF00FF00);

        // 3 votes for bit 0, 2 votes against → bit 0 set
        let hashes = vec![0b1, 0b1, 0b1, 0b0, 0b0];
        assert_eq!(majority_simhash(&hashes) & 1, 1);

        // 2 votes for bit 0, 3 votes against → bit 0 unset
        let hashes = vec![0b1, 0b1, 0b0, 0b0, 0b0];
        assert_eq!(majority_simhash(&hashes) & 1, 0);
    }

    /// Stafford mix produces distinct values for adjacent dim IDs.
    #[test]
    fn test_stafford_mix_avalanche() {
        let h0 = stafford_mix(0);
        let h1 = stafford_mix(1);
        let h2 = stafford_mix(2);

        // Adjacent inputs should produce very different outputs (good avalanche)
        assert!(hamming_distance(h0, h1) > 20, "Poor avalanche: 0 vs 1");
        assert!(hamming_distance(h1, h2) > 20, "Poor avalanche: 1 vs 2");

        // All distinct
        assert_ne!(h0, h1);
        assert_ne!(h1, h2);
        assert_ne!(h0, h2);
    }
}