Skip to main content

iscc_lib/
minhash.rs

1//! MinHash algorithm for similarity-preserving hashing.
2//!
3//! Implements 64-dimensional MinHash with universal hash functions and
4//! bit-interleaved compression, ported from `bio-codes/iscc-sum`.
5
6/// Maximum 64-bit unsigned value (2^64 - 1).
7const MAXI64: u64 = u64::MAX;
8
9/// Mersenne prime 2^61 - 1.
10const MPRIME: u64 = (1 << 61) - 1;
11
12/// Maximum hash value mask (2^32 - 1).
13const MAXH: u64 = (1 << 32) - 1;
14
15/// MinHash permutation parameter A (64 universal hash function multipliers).
16const MPA: [u64; 64] = [
17    853146490016488653,
18    1849332765672628665,
19    1131688930666554379,
20    1936485333668353377,
21    890837126813020267,
22    1988249303247129861,
23    1408894512544874755,
24    2140251716176616185,
25    1755124413189049421,
26    1355916793659431597,
27    546586563822844083,
28    497603761441203021,
29    2000709902557454173,
30    1057597903350092207,
31    1576204252850880253,
32    2078784234495706739,
33    1022616668454863635,
34    2150082342606334489,
35    712341150087765807,
36    1511757510246096559,
37    1525853819909660573,
38    1263771796138990131,
39    1215963627200985263,
40    590069150281426443,
41    130824646248385081,
42    962725325544728503,
43    1702561325943522847,
44    296074222435072629,
45    490211158716051523,
46    1255327197241792767,
47    699458998727907367,
48    32930168991409845,
49    1985097843455124585,
50    362027841570125531,
51    1903252144040897835,
52    900391845076405289,
53    547470123601853551,
54    1689373724032359119,
55    845594231933442371,
56    400331968021206285,
57    174967108345233429,
58    876513700861085019,
59    505848386844809885,
60    1920468508342256199,
61    1292611725303815789,
62    963317239501343903,
63    1730880032297268007,
64    284614929850059717,
65    1185026248283273081,
66    2167288823816985197,
67    1214905315086686483,
68    1555253098157439857,
69    1048013650291539723,
70    1238618594841147605,
71    1213502582686547311,
72    286300733803129311,
73    1250358511639043529,
74    407534797452854371,
75    960869149538623787,
76    1722699901467253087,
77    1325704236119824319,
78    196979859428570839,
79    1669408735473259699,
80    781336617016068757,
81];
82
83/// MinHash permutation parameter B (64 universal hash function addends).
84const MPB: [u64; 64] = [
85    1089606993368836715,
86    726972438868274737,
87    66204585613901025,
88    1078410179646709132,
89    1343470117098523467,
90    698653121981343911,
91    1248486536592473639,
92    1447963007834012793,
93    1034598851883537815,
94    1474008409379745934,
95    793773480906057541,
96    980501101461882479,
97    963941556313537655,
98    233651787311327325,
99    243905121737149907,
100    570269452476776142,
101    297633284648631084,
102    1516796967247398557,
103    1494795672066692649,
104    1728741177365151059,
105    1029197538967983408,
106    1660732464170610344,
107    1399769594446678069,
108    506465470557005705,
109    1279720146829545181,
110    860096419955634036,
111    411519685280832908,
112    69539191273403207,
113    1960489729088056217,
114    605092075716397684,
115    1017496016211653149,
116    1304834535101321372,
117    949013511180032347,
118    1142776242221098779,
119    576980004709031232,
120    1071272177143100544,
121    1494527341093835499,
122    1073290814142727850,
123    1285904200674942617,
124    1277176606329477335,
125    343788427301735585,
126    2100915269685487331,
127    1227711252031557450,
128    18593166391963377,
129    2101884148332688233,
130    191808277534686888,
131    2170124912729392024,
132    918430470748151293,
133    1831024560113812361,
134    1951365515851067694,
135    744352348473654499,
136    1921518311887826722,
137    2020165648600700886,
138    1764930142256726985,
139    1903893374912839788,
140    1449378957774802122,
141    1435825328374066345,
142    833197549717762813,
143    2238991044337210799,
144    748955638857938366,
145    1834583747494146901,
146    222012292803592982,
147    901238460725547841,
148    1501611130776083278,
149];
150
151/// Compute a 64-dimensional MinHash from 32-bit integer features.
152///
153/// Uses 64 universal hash functions parameterized by MPA/MPB to compute
154/// the minimum hash value for each dimension. Returns `MAXH` for each
155/// dimension when features is empty.
156fn minhash(features: &[u32]) -> Vec<u64> {
157    MPA.iter()
158        .zip(MPB.iter())
159        .map(|(&a, &b)| {
160            features
161                .iter()
162                .map(|&f| (((a.wrapping_mul(f as u64).wrapping_add(b)) & MAXI64) % MPRIME) & MAXH)
163                .min()
164                .unwrap_or(MAXH)
165        })
166        .collect()
167}
168
169/// Compress a MinHash vector by extracting and interleaving LSB bits.
170///
171/// Extracts `lsb` least-significant bits from each hash value. Iterates
172/// bit positions 0..lsb, then over all hash values, packing bits MSB-first
173/// into output bytes.
174fn minhash_compress(mhash: &[u64], lsb: u32) -> Vec<u8> {
175    let total_bits = mhash.len() * lsb as usize;
176    let mut bits = vec![0u8; total_bits];
177    let mut bit_index = 0;
178    for bitpos in 0..lsb {
179        for &h in mhash {
180            bits[bit_index] = ((h >> bitpos) & 1) as u8;
181            bit_index += 1;
182        }
183    }
184    let total_bytes = total_bits.div_ceil(8);
185    let mut out = vec![0u8; total_bytes];
186    for (i, &bit) in bits.iter().enumerate() {
187        if bit != 0 {
188            let byte_index = i / 8;
189            let bit_in_byte = 7 - (i % 8);
190            out[byte_index] |= 1 << bit_in_byte;
191        }
192    }
193    out
194}
195
196/// Compute a 256-bit MinHash digest from 32-bit integer features.
197///
198/// Calls `minhash` to get 64-dimensional hash vector, then compresses
199/// with `lsb=4` to produce 32 bytes (64 × 4 bits = 256 bits).
200pub fn alg_minhash_256(features: &[u32]) -> Vec<u8> {
201    let mhash = minhash(features);
202    minhash_compress(&mhash, 4)
203}
204
205#[cfg(test)]
206mod tests {
207    use super::*;
208
209    #[test]
210    fn test_minhash_empty_features() {
211        let result = minhash(&[]);
212        assert_eq!(result.len(), 64);
213        assert!(result.iter().all(|&v| v == MAXH));
214    }
215
216    #[test]
217    fn test_minhash_single_feature() {
218        let result = minhash(&[42]);
219        assert_eq!(result.len(), 64);
220        // Each dimension should have a specific hash value
221        assert!(result.iter().all(|&v| v <= MAXH));
222    }
223
224    #[test]
225    fn test_minhash_compress_basic() {
226        // With lsb=1, extracting 1 bit from each of 64 hash values → 8 bytes
227        let mhash = vec![0u64; 64];
228        let result = minhash_compress(&mhash, 1);
229        assert_eq!(result.len(), 8);
230        assert!(result.iter().all(|&b| b == 0));
231    }
232
233    #[test]
234    fn test_minhash_compress_all_ones() {
235        // All hash values are 0xFFFFFFFF, lsb=4 → 32 bytes, all bits set
236        let mhash = vec![MAXH; 64];
237        let result = minhash_compress(&mhash, 4);
238        assert_eq!(result.len(), 32);
239        assert!(result.iter().all(|&b| b == 0xFF));
240    }
241
242    #[test]
243    fn test_alg_minhash_256_empty() {
244        let result = alg_minhash_256(&[]);
245        assert_eq!(result.len(), 32);
246        // Empty features → all MAXH → all bits set in LSB 4 → all 0xFF
247        assert!(result.iter().all(|&b| b == 0xFF));
248    }
249
250    #[test]
251    fn test_alg_minhash_256_single() {
252        let result = alg_minhash_256(&[1]);
253        assert_eq!(result.len(), 32);
254    }
255
256    #[test]
257    fn test_alg_minhash_256_deterministic() {
258        let features = vec![100, 200, 300, 400, 500];
259        let result1 = alg_minhash_256(&features);
260        let result2 = alg_minhash_256(&features);
261        assert_eq!(result1, result2);
262    }
263}