1const MAXI64: u64 = u64::MAX;
8
9const MPRIME: u64 = (1 << 61) - 1;
11
12const MAXH: u64 = (1 << 32) - 1;
14
15const MPA: [u64; 64] = [
17 853146490016488653,
18 1849332765672628665,
19 1131688930666554379,
20 1936485333668353377,
21 890837126813020267,
22 1988249303247129861,
23 1408894512544874755,
24 2140251716176616185,
25 1755124413189049421,
26 1355916793659431597,
27 546586563822844083,
28 497603761441203021,
29 2000709902557454173,
30 1057597903350092207,
31 1576204252850880253,
32 2078784234495706739,
33 1022616668454863635,
34 2150082342606334489,
35 712341150087765807,
36 1511757510246096559,
37 1525853819909660573,
38 1263771796138990131,
39 1215963627200985263,
40 590069150281426443,
41 130824646248385081,
42 962725325544728503,
43 1702561325943522847,
44 296074222435072629,
45 490211158716051523,
46 1255327197241792767,
47 699458998727907367,
48 32930168991409845,
49 1985097843455124585,
50 362027841570125531,
51 1903252144040897835,
52 900391845076405289,
53 547470123601853551,
54 1689373724032359119,
55 845594231933442371,
56 400331968021206285,
57 174967108345233429,
58 876513700861085019,
59 505848386844809885,
60 1920468508342256199,
61 1292611725303815789,
62 963317239501343903,
63 1730880032297268007,
64 284614929850059717,
65 1185026248283273081,
66 2167288823816985197,
67 1214905315086686483,
68 1555253098157439857,
69 1048013650291539723,
70 1238618594841147605,
71 1213502582686547311,
72 286300733803129311,
73 1250358511639043529,
74 407534797452854371,
75 960869149538623787,
76 1722699901467253087,
77 1325704236119824319,
78 196979859428570839,
79 1669408735473259699,
80 781336617016068757,
81];
82
83const MPB: [u64; 64] = [
85 1089606993368836715,
86 726972438868274737,
87 66204585613901025,
88 1078410179646709132,
89 1343470117098523467,
90 698653121981343911,
91 1248486536592473639,
92 1447963007834012793,
93 1034598851883537815,
94 1474008409379745934,
95 793773480906057541,
96 980501101461882479,
97 963941556313537655,
98 233651787311327325,
99 243905121737149907,
100 570269452476776142,
101 297633284648631084,
102 1516796967247398557,
103 1494795672066692649,
104 1728741177365151059,
105 1029197538967983408,
106 1660732464170610344,
107 1399769594446678069,
108 506465470557005705,
109 1279720146829545181,
110 860096419955634036,
111 411519685280832908,
112 69539191273403207,
113 1960489729088056217,
114 605092075716397684,
115 1017496016211653149,
116 1304834535101321372,
117 949013511180032347,
118 1142776242221098779,
119 576980004709031232,
120 1071272177143100544,
121 1494527341093835499,
122 1073290814142727850,
123 1285904200674942617,
124 1277176606329477335,
125 343788427301735585,
126 2100915269685487331,
127 1227711252031557450,
128 18593166391963377,
129 2101884148332688233,
130 191808277534686888,
131 2170124912729392024,
132 918430470748151293,
133 1831024560113812361,
134 1951365515851067694,
135 744352348473654499,
136 1921518311887826722,
137 2020165648600700886,
138 1764930142256726985,
139 1903893374912839788,
140 1449378957774802122,
141 1435825328374066345,
142 833197549717762813,
143 2238991044337210799,
144 748955638857938366,
145 1834583747494146901,
146 222012292803592982,
147 901238460725547841,
148 1501611130776083278,
149];
150
151fn minhash(features: &[u32]) -> Vec<u64> {
157 MPA.iter()
158 .zip(MPB.iter())
159 .map(|(&a, &b)| {
160 features
161 .iter()
162 .map(|&f| (((a.wrapping_mul(f as u64).wrapping_add(b)) & MAXI64) % MPRIME) & MAXH)
163 .min()
164 .unwrap_or(MAXH)
165 })
166 .collect()
167}
168
169fn minhash_compress(mhash: &[u64], lsb: u32) -> Vec<u8> {
175 let total_bits = mhash.len() * lsb as usize;
176 let mut bits = vec![0u8; total_bits];
177 let mut bit_index = 0;
178 for bitpos in 0..lsb {
179 for &h in mhash {
180 bits[bit_index] = ((h >> bitpos) & 1) as u8;
181 bit_index += 1;
182 }
183 }
184 let total_bytes = total_bits.div_ceil(8);
185 let mut out = vec![0u8; total_bytes];
186 for (i, &bit) in bits.iter().enumerate() {
187 if bit != 0 {
188 let byte_index = i / 8;
189 let bit_in_byte = 7 - (i % 8);
190 out[byte_index] |= 1 << bit_in_byte;
191 }
192 }
193 out
194}
195
196pub fn alg_minhash_256(features: &[u32]) -> Vec<u8> {
201 let mhash = minhash(features);
202 minhash_compress(&mhash, 4)
203}
204
205#[cfg(test)]
206mod tests {
207 use super::*;
208
209 #[test]
210 fn test_minhash_empty_features() {
211 let result = minhash(&[]);
212 assert_eq!(result.len(), 64);
213 assert!(result.iter().all(|&v| v == MAXH));
214 }
215
216 #[test]
217 fn test_minhash_single_feature() {
218 let result = minhash(&[42]);
219 assert_eq!(result.len(), 64);
220 assert!(result.iter().all(|&v| v <= MAXH));
222 }
223
224 #[test]
225 fn test_minhash_compress_basic() {
226 let mhash = vec![0u64; 64];
228 let result = minhash_compress(&mhash, 1);
229 assert_eq!(result.len(), 8);
230 assert!(result.iter().all(|&b| b == 0));
231 }
232
233 #[test]
234 fn test_minhash_compress_all_ones() {
235 let mhash = vec![MAXH; 64];
237 let result = minhash_compress(&mhash, 4);
238 assert_eq!(result.len(), 32);
239 assert!(result.iter().all(|&b| b == 0xFF));
240 }
241
242 #[test]
243 fn test_alg_minhash_256_empty() {
244 let result = alg_minhash_256(&[]);
245 assert_eq!(result.len(), 32);
246 assert!(result.iter().all(|&b| b == 0xFF));
248 }
249
250 #[test]
251 fn test_alg_minhash_256_single() {
252 let result = alg_minhash_256(&[1]);
253 assert_eq!(result.len(), 32);
254 }
255
256 #[test]
257 fn test_alg_minhash_256_deterministic() {
258 let features = vec![100, 200, 300, 400, 500];
259 let result1 = alg_minhash_256(&features);
260 let result2 = alg_minhash_256(&features);
261 assert_eq!(result1, result2);
262 }
263}