1use alloc::collections::BTreeSet;
2use alloc::vec;
3use alloc::vec::Vec;
4
5#[derive(Debug, Clone, Copy)]
6pub struct FastCoverParams {
7 pub k: usize,
8 pub d: usize,
9 pub f: u32,
10 pub accel: usize,
11}
12
13#[derive(Debug, Clone, Copy)]
14pub struct FastCoverTuned {
15 pub k: usize,
16 pub d: usize,
17 pub f: u32,
18 pub accel: usize,
19 pub score: usize,
20}
21
22pub const DEFAULT_K_CANDIDATES: &[usize] = &[64, 128, 256, 512, 1024, 2048];
23pub const DEFAULT_D_CANDIDATES: &[usize] = &[6, 8, 12, 16];
24pub const DEFAULT_F_CANDIDATES: &[u32] = &[16, 18, 20];
25
26const PRIME_4_BYTES: u32 = 2_654_435_761;
30const PRIME_5_BYTES: u64 = 889_523_592_379;
31const PRIME_6_BYTES: u64 = 227_718_039_650_203;
32const PRIME_7_BYTES: u64 = 58_295_818_150_454_627;
33const PRIME_8_BYTES: u64 = 0xCF1B_BCDC_B7A5_6463;
34
35#[inline]
39fn dmer_read_len(d: usize) -> usize {
40 d.max(8)
41}
42
43#[inline]
47fn hash_dmer_index(sample: &[u8], pos: usize, f: u32, d: usize) -> usize {
48 if d.min(8) == 4 {
49 let v = u32::from_le_bytes(sample[pos..pos + 4].try_into().unwrap());
50 return (v.wrapping_mul(PRIME_4_BYTES) >> (32 - f)) as usize;
51 }
52 let v = u64::from_le_bytes(sample[pos..pos + 8].try_into().unwrap());
53 let h = match d.min(8) {
54 5 => (v << 24).wrapping_mul(PRIME_5_BYTES),
55 6 => (v << 16).wrapping_mul(PRIME_6_BYTES),
56 7 => (v << 8).wrapping_mul(PRIME_7_BYTES),
57 _ => v.wrapping_mul(PRIME_8_BYTES),
58 };
59 (h >> (64 - f)) as usize
60}
61
62fn clamp_table_bits(f: u32) -> u32 {
63 f.clamp(8, 20)
64}
65
66pub(crate) fn normalize_fastcover_params(mut params: FastCoverParams) -> FastCoverParams {
67 params.d = params.d.clamp(4, 32);
68 params.k = params.k.max(params.d).max(16);
69 params.f = clamp_table_bits(params.f);
70 params.accel = params.accel.clamp(1, 10);
71 params
72}
73
74fn build_frequency_table(sample: &[u8], d: usize, f: u32, accel: usize) -> Vec<u32> {
75 let bits = clamp_table_bits(f);
76 let size = 1usize << bits;
77 let step = accel.max(1);
80 let mut table = vec![0u32; size];
81
82 let read_len = dmer_read_len(d);
83 if sample.len() < read_len {
84 return table;
85 }
86
87 let mut i = 0usize;
88 while i + read_len <= sample.len() {
89 table[hash_dmer_index(sample, i, bits, d)] += 1;
92 i += step;
93 }
94 table
95}
96
97fn build_raw_dict(sample: &[u8], dict_size: usize, params: FastCoverParams) -> Vec<u8> {
98 if sample.is_empty() || dict_size == 0 {
99 return Vec::new();
100 }
101
102 let params = normalize_fastcover_params(params);
103 let k = params.k;
104 let d = params.d;
105 let f = clamp_table_bits(params.f);
106 let read_len = dmer_read_len(d);
107 if sample.len() < read_len {
108 return Vec::new();
111 }
112
113 let nb_dmers = sample.len() - read_len + 1;
125 let mut freqs = build_frequency_table(sample, d, f, params.accel);
126 let dmers_in_k = k - d + 1; let min_epoch_size = k * 10;
132 let mut epoch_count = (dict_size / k).max(1);
133 let mut epoch_size = nb_dmers / epoch_count;
134 if epoch_size < min_epoch_size {
135 epoch_size = min_epoch_size.min(nb_dmers);
136 epoch_count = (nb_dmers / epoch_size).max(1);
137 }
138
139 let mut segment_freqs = vec![0u16; 1usize << f];
142 let mut out = vec![0u8; dict_size];
145 let mut tail = dict_size;
146 const MAX_ZERO_SCORE_RUN: usize = 10;
147 let mut zero_score_run = 0usize;
148 let mut epoch = 0usize;
149
150 while tail > 0 {
151 let epoch_begin = epoch * epoch_size;
152 let epoch_end = epoch_begin + epoch_size;
153 epoch = (epoch + 1) % epoch_count;
154
155 let mut best_begin = 0usize;
158 let mut best_end = 0usize;
159 let mut best_score = 0u64;
160 let mut active_begin = epoch_begin;
161 let mut active_end = epoch_begin;
162 let mut active_score = 0u64;
163 while active_end < epoch_end {
164 let idx = hash_dmer_index(sample, active_end, f, d);
165 if segment_freqs[idx] == 0 {
166 active_score += u64::from(freqs[idx]);
167 }
168 active_end += 1;
169 segment_freqs[idx] += 1;
170 if active_end - active_begin == dmers_in_k + 1 {
171 let del = hash_dmer_index(sample, active_begin, f, d);
172 segment_freqs[del] -= 1;
173 if segment_freqs[del] == 0 {
174 active_score -= u64::from(freqs[del]);
175 }
176 active_begin += 1;
177 }
178 if active_score > best_score {
179 best_begin = active_begin;
180 best_end = active_end;
181 best_score = active_score;
182 }
183 }
184 while active_begin < epoch_end {
186 let del = hash_dmer_index(sample, active_begin, f, d);
187 segment_freqs[del] -= 1;
188 active_begin += 1;
189 }
190 for pos in best_begin..best_end {
192 freqs[hash_dmer_index(sample, pos, f, d)] = 0;
193 }
194
195 if best_score == 0 {
196 zero_score_run += 1;
199 if zero_score_run >= MAX_ZERO_SCORE_RUN {
200 break;
201 }
202 continue;
203 }
204 zero_score_run = 0;
205
206 let segment_size = (best_end - best_begin + d - 1).min(tail);
207 if segment_size < d {
208 break;
209 }
210 tail -= segment_size;
211 out[tail..tail + segment_size]
212 .copy_from_slice(&sample[best_begin..best_begin + segment_size]);
213 }
214
215 out.drain(..tail);
216 out
217}
218
219fn coverage_score(dict: &[u8], eval: &[u8], d: usize, accel: usize) -> usize {
220 let read_len = dmer_read_len(d);
221 if dict.len() < read_len || eval.len() < read_len || d == 0 {
222 return 0;
223 }
224 const COVERAGE_F: u32 = 20;
225 let mut seen = BTreeSet::new();
226 for i in 0..=(dict.len() - read_len) {
227 seen.insert(hash_dmer_index(dict, i, COVERAGE_F, d));
228 }
229
230 let mut hits = 0usize;
231 let step = accel.max(1);
232 let mut i = 0usize;
233 while i + read_len <= eval.len() {
234 if seen.contains(&hash_dmer_index(eval, i, COVERAGE_F, d)) {
235 hits += 1;
236 }
237 i += step;
238 }
239 hits
240}
241
242pub fn train_fastcover_raw(sample: &[u8], dict_size: usize, params: FastCoverParams) -> Vec<u8> {
243 build_raw_dict(sample, dict_size, params)
244}
245
246pub fn optimize_fastcover_raw(
247 sample: &[u8],
248 dict_size: usize,
249 split_point: f64,
250 accel: usize,
251 d_candidates: &[usize],
252 f_candidates: &[u32],
253 k_values: &[usize],
254) -> (Vec<u8>, FastCoverTuned) {
255 let d_values = if d_candidates.is_empty() {
256 DEFAULT_D_CANDIDATES
257 } else {
258 d_candidates
259 };
260 let f_values = if f_candidates.is_empty() {
261 DEFAULT_F_CANDIDATES
262 } else {
263 f_candidates
264 };
265 let k_candidates = if k_values.is_empty() {
266 DEFAULT_K_CANDIDATES
267 } else {
268 k_values
269 };
270
271 if sample.len() < 2 {
272 let params = normalize_fastcover_params(FastCoverParams {
273 k: k_candidates[0],
274 d: d_values[0],
275 f: f_values[0],
276 accel,
277 });
278 let mut dict = build_raw_dict(sample, dict_size, params);
279 if dict.is_empty() && dict_size > 0 {
280 let take = sample.len().min(dict_size);
281 dict.extend_from_slice(&sample[..take]);
282 }
283 return (
284 dict,
285 FastCoverTuned {
286 k: params.k,
287 d: params.d,
288 f: params.f,
289 accel: params.accel,
290 score: 0,
291 },
292 );
293 }
294
295 let split = split_point.clamp(0.1, 0.95);
296 let split_idx = ((sample.len() as f64) * split) as usize;
297 let split_idx = split_idx.clamp(1, sample.len().saturating_sub(1));
298 let (train, eval) = sample.split_at(split_idx);
299
300 let mut best_dict = Vec::new();
301 let mut best = FastCoverTuned {
302 k: 0,
303 d: 0,
304 f: 0,
305 accel: accel.clamp(1, 10),
306 score: 0,
307 };
308
309 for &f in f_values {
310 for &d in d_values {
311 for &k in k_candidates {
312 let params = normalize_fastcover_params(FastCoverParams { k, d, f, accel });
313 let dict = build_raw_dict(train, dict_size, params);
314 let score = coverage_score(dict.as_slice(), eval, params.d, params.accel);
315 if best_dict.is_empty() || score > best.score {
316 best.score = score;
317 best.k = params.k;
318 best.d = params.d;
319 best.f = params.f;
320 best.accel = params.accel;
321 best_dict = dict;
322 }
323 }
324 }
325 }
326
327 (best_dict, best)
328}
329
330#[cfg(test)]
331mod tests {
332 use super::*;
333 use std::format;
334
335 fn corpus() -> Vec<u8> {
336 let mut data = Vec::new();
337 for i in 0..500u32 {
338 data.extend_from_slice(
339 format!("tenant=demo table=orders key={i} region=eu payload=aaaaabbbbbccccdddd\n")
340 .as_bytes(),
341 );
342 }
343 data
344 }
345
346 #[test]
347 fn fastcover_raw_produces_non_empty_dict() {
348 let sample = corpus();
349 let dict = train_fastcover_raw(
350 sample.as_slice(),
351 4096,
352 FastCoverParams {
353 k: 256,
354 d: 8,
355 f: 20,
356 accel: 1,
357 },
358 );
359 assert!(!dict.is_empty());
360 assert!(dict.len() <= 4096);
361 }
362
363 #[test]
364 fn fastcover_raw_returns_empty_for_empty_or_zero_budget() {
365 let sample = corpus();
366 let params = FastCoverParams {
367 k: 256,
368 d: 8,
369 f: 20,
370 accel: 1,
371 };
372 assert!(train_fastcover_raw(&[], 1024, params).is_empty());
373 assert!(train_fastcover_raw(sample.as_slice(), 0, params).is_empty());
374 }
375
376 #[test]
377 fn fastcover_optimizer_selects_valid_params() {
378 let sample = corpus();
379 let (dict, tuned) = optimize_fastcover_raw(
380 sample.as_slice(),
381 4096,
382 0.75,
383 1,
384 &[6, 8],
385 &[18, 20],
386 &[128, 256],
387 );
388 assert!(!dict.is_empty());
389 assert!([6, 8].contains(&tuned.d));
390 assert!([18, 20].contains(&tuned.f));
391 assert!([128, 256].contains(&tuned.k));
392 }
393
394 #[test]
395 fn fastcover_optimizer_falls_back_when_k_candidates_empty() {
396 let sample = corpus();
397 let (dict, tuned) =
398 optimize_fastcover_raw(sample.as_slice(), 4096, 0.75, 1, &[6, 8], &[18, 20], &[]);
399 assert!(!dict.is_empty());
400 assert!(DEFAULT_K_CANDIDATES.contains(&tuned.k));
401 }
402
403 #[test]
404 fn fastcover_optimizer_handles_one_byte_sample_without_panic() {
405 let sample = [0xAB];
406 let (dict, tuned) = optimize_fastcover_raw(&sample, 16, 0.75, 1, &[], &[], &[]);
407 assert!(!dict.is_empty());
408 assert!(dict.len() <= 16);
409 assert!(DEFAULT_K_CANDIDATES.contains(&tuned.k));
410 assert!(DEFAULT_D_CANDIDATES.contains(&tuned.d));
411 assert!(DEFAULT_F_CANDIDATES.contains(&tuned.f));
412 }
413
414 #[test]
415 fn fastcover_optimizer_seeds_winner_when_all_scores_are_zero() {
416 let sample = b"abcdefghijklmnopqrst";
417 let (dict, tuned) = optimize_fastcover_raw(sample, 16, 0.9, 1, &[6], &[16], &[8]);
418 assert!(!dict.is_empty());
419 assert_eq!(tuned.k, 16);
420 assert_eq!(tuned.d, 6);
421 assert_eq!(tuned.f, 16);
422 assert_eq!(tuned.score, 0);
423 }
424
425 #[test]
426 fn fastcover_optimizer_handles_zero_dict_budget() {
427 let sample = corpus();
428 let (dict, tuned) = optimize_fastcover_raw(
429 sample.as_slice(),
430 0,
431 0.75,
432 1,
433 &[6, 8],
434 &[18, 20],
435 &[128, 256],
436 );
437 assert!(dict.is_empty());
438 assert!([6, 8].contains(&tuned.d));
439 assert!([18, 20].contains(&tuned.f));
440 assert!([128, 256].contains(&tuned.k));
441 }
442
443 #[test]
444 fn fastcover_optimizer_clamps_extreme_split_points() {
445 let sample = corpus();
446 let (dict_low, tuned_low) =
447 optimize_fastcover_raw(sample.as_slice(), 2048, 0.0, 1, &[6], &[18], &[128]);
448 let (dict_high, tuned_high) =
449 optimize_fastcover_raw(sample.as_slice(), 2048, 1.0, 1, &[6], &[18], &[128]);
450 assert!(!dict_low.is_empty());
451 assert!(!dict_high.is_empty());
452 assert_eq!(tuned_low.k, 128);
453 assert_eq!(tuned_high.k, 128);
454 }
455
456 #[test]
457 fn fastcover_optimizer_reports_normalized_params() {
458 let sample = corpus();
459 let (dict, tuned) =
460 optimize_fastcover_raw(sample.as_slice(), 1024, 0.75, 1, &[64], &[42], &[8]);
461 assert!(!dict.is_empty());
462 assert_eq!(tuned.d, 32);
463 assert_eq!(tuned.f, 20);
464 assert_eq!(tuned.k, 32);
465 }
466}