1use alloc::vec;
2use alloc::vec::Vec;
3
4#[derive(Debug, Clone, Copy)]
5pub struct FastCoverParams {
6 pub k: usize,
7 pub d: usize,
8 pub f: u32,
9 pub accel: usize,
10}
11
12#[derive(Debug, Clone, Copy)]
13pub struct FastCoverTuned {
14 pub k: usize,
15 pub d: usize,
16 pub f: u32,
17 pub accel: usize,
18 pub score: usize,
19}
20
21pub const DEFAULT_K_CANDIDATES: &[usize] = &[64, 128, 256, 512, 1024, 2048];
22pub const DEFAULT_D_CANDIDATES: &[usize] = &[6, 8, 12, 16];
23pub const DEFAULT_F_CANDIDATES: &[u32] = &[16, 18, 20];
24
25fn hash_dmer(dmer: &[u8]) -> u64 {
26 let mut h = 0xcbf29ce484222325u64;
28 for &b in dmer {
29 h ^= u64::from(b);
30 h = h.wrapping_mul(0x100000001b3);
31 }
32 h
33}
34
35fn clamp_table_bits(f: u32) -> u32 {
36 f.clamp(8, 20)
37}
38
39pub(crate) fn normalize_fastcover_params(mut params: FastCoverParams) -> FastCoverParams {
40 params.d = params.d.clamp(4, 32);
41 params.k = params.k.max(params.d).max(16);
42 params.f = clamp_table_bits(params.f);
43 params.accel = params.accel.clamp(1, 10);
44 params
45}
46
47fn build_frequency_table(sample: &[u8], d: usize, f: u32, accel: usize) -> Vec<u32> {
48 let bits = clamp_table_bits(f);
49 let size = 1usize << bits;
50 let mask = size - 1;
51 let step = accel.max(1);
52 let mut table = vec![0u32; size];
53
54 if sample.len() < d || d == 0 {
55 return table;
56 }
57
58 let mut i = 0usize;
59 while i + d <= sample.len() {
60 let slot = (hash_dmer(&sample[i..i + d]) as usize) & mask;
61 table[slot] = table[slot].saturating_add(1);
62 i += step;
63 }
64 table
65}
66
67fn score_segment(segment: &[u8], d: usize, mask: usize, table: &[u32]) -> usize {
68 if segment.len() < d || d == 0 {
69 return 0;
70 }
71 let mut score = 0usize;
72 for i in 0..=(segment.len() - d) {
73 let slot = (hash_dmer(&segment[i..i + d]) as usize) & mask;
74 score += table[slot] as usize;
75 }
76 score
77}
78
79fn build_raw_dict(sample: &[u8], dict_size: usize, params: FastCoverParams) -> Vec<u8> {
80 if sample.is_empty() || dict_size == 0 {
81 return Vec::new();
82 }
83
84 let params = normalize_fastcover_params(params);
85 let k = params.k;
86 let d = params.d;
87 let table = build_frequency_table(sample, d, params.f, params.accel);
88 let mask = table.len().saturating_sub(1);
89
90 let mut segments: Vec<(usize, &[u8])> = sample
91 .chunks(k)
92 .filter(|seg| seg.len() >= d)
93 .map(|seg| (score_segment(seg, d, mask, &table), seg))
94 .collect();
95 segments.sort_by_key(|segment| core::cmp::Reverse(segment.0));
96
97 let mut out = Vec::with_capacity(dict_size);
98 for (_, seg) in segments {
99 if out.len() >= dict_size {
100 break;
101 }
102 let remaining = dict_size - out.len();
103 if seg.len() <= remaining {
104 out.extend_from_slice(seg);
105 } else {
106 out.extend_from_slice(&seg[..remaining]);
107 }
108 }
109 out
110}
111
112fn coverage_score(dict: &[u8], eval: &[u8], d: usize, accel: usize) -> usize {
113 if dict.len() < d || eval.len() < d || d == 0 {
114 return 0;
115 }
116 let mut seen = std::collections::HashSet::with_capacity(dict.len() / d + 1);
117 for i in 0..=(dict.len() - d) {
118 seen.insert(hash_dmer(&dict[i..i + d]));
119 }
120
121 let mut hits = 0usize;
122 let step = accel.max(1);
123 let mut i = 0usize;
124 while i + d <= eval.len() {
125 if seen.contains(&hash_dmer(&eval[i..i + d])) {
126 hits += 1;
127 }
128 i += step;
129 }
130 hits
131}
132
133pub fn train_fastcover_raw(sample: &[u8], dict_size: usize, params: FastCoverParams) -> Vec<u8> {
134 build_raw_dict(sample, dict_size, params)
135}
136
137pub fn optimize_fastcover_raw(
138 sample: &[u8],
139 dict_size: usize,
140 split_point: f64,
141 accel: usize,
142 d_candidates: &[usize],
143 f_candidates: &[u32],
144 k_values: &[usize],
145) -> (Vec<u8>, FastCoverTuned) {
146 let d_values = if d_candidates.is_empty() {
147 DEFAULT_D_CANDIDATES
148 } else {
149 d_candidates
150 };
151 let f_values = if f_candidates.is_empty() {
152 DEFAULT_F_CANDIDATES
153 } else {
154 f_candidates
155 };
156 let k_candidates = if k_values.is_empty() {
157 DEFAULT_K_CANDIDATES
158 } else {
159 k_values
160 };
161
162 if sample.len() < 2 {
163 let params = normalize_fastcover_params(FastCoverParams {
164 k: k_candidates[0],
165 d: d_values[0],
166 f: f_values[0],
167 accel,
168 });
169 let mut dict = build_raw_dict(sample, dict_size, params);
170 if dict.is_empty() && dict_size > 0 {
171 let take = sample.len().min(dict_size);
172 dict.extend_from_slice(&sample[..take]);
173 }
174 return (
175 dict,
176 FastCoverTuned {
177 k: params.k,
178 d: params.d,
179 f: params.f,
180 accel: params.accel,
181 score: 0,
182 },
183 );
184 }
185
186 let split = split_point.clamp(0.1, 0.95);
187 let split_idx = ((sample.len() as f64) * split) as usize;
188 let split_idx = split_idx.clamp(1, sample.len().saturating_sub(1));
189 let (train, eval) = sample.split_at(split_idx);
190
191 let mut best_dict = Vec::new();
192 let mut best = FastCoverTuned {
193 k: 0,
194 d: 0,
195 f: 0,
196 accel: accel.clamp(1, 10),
197 score: 0,
198 };
199
200 for &f in f_values {
201 for &d in d_values {
202 for &k in k_candidates {
203 let params = normalize_fastcover_params(FastCoverParams { k, d, f, accel });
204 let dict = build_raw_dict(train, dict_size, params);
205 let score = coverage_score(dict.as_slice(), eval, params.d, params.accel);
206 if best_dict.is_empty() || score > best.score {
207 best.score = score;
208 best.k = params.k;
209 best.d = params.d;
210 best.f = params.f;
211 best.accel = params.accel;
212 best_dict = dict;
213 }
214 }
215 }
216 }
217
218 (best_dict, best)
219}
220
221#[cfg(test)]
222mod tests {
223 use super::*;
224 use std::format;
225
226 fn corpus() -> Vec<u8> {
227 let mut data = Vec::new();
228 for i in 0..500u32 {
229 data.extend_from_slice(
230 format!("tenant=demo table=orders key={i} region=eu payload=aaaaabbbbbccccdddd\n")
231 .as_bytes(),
232 );
233 }
234 data
235 }
236
237 #[test]
238 fn fastcover_raw_produces_non_empty_dict() {
239 let sample = corpus();
240 let dict = train_fastcover_raw(
241 sample.as_slice(),
242 4096,
243 FastCoverParams {
244 k: 256,
245 d: 8,
246 f: 20,
247 accel: 1,
248 },
249 );
250 assert!(!dict.is_empty());
251 assert!(dict.len() <= 4096);
252 }
253
254 #[test]
255 fn fastcover_raw_returns_empty_for_empty_or_zero_budget() {
256 let sample = corpus();
257 let params = FastCoverParams {
258 k: 256,
259 d: 8,
260 f: 20,
261 accel: 1,
262 };
263 assert!(train_fastcover_raw(&[], 1024, params).is_empty());
264 assert!(train_fastcover_raw(sample.as_slice(), 0, params).is_empty());
265 }
266
267 #[test]
268 fn fastcover_optimizer_selects_valid_params() {
269 let sample = corpus();
270 let (dict, tuned) = optimize_fastcover_raw(
271 sample.as_slice(),
272 4096,
273 0.75,
274 1,
275 &[6, 8],
276 &[18, 20],
277 &[128, 256],
278 );
279 assert!(!dict.is_empty());
280 assert!([6, 8].contains(&tuned.d));
281 assert!([18, 20].contains(&tuned.f));
282 assert!([128, 256].contains(&tuned.k));
283 }
284
285 #[test]
286 fn fastcover_optimizer_falls_back_when_k_candidates_empty() {
287 let sample = corpus();
288 let (dict, tuned) =
289 optimize_fastcover_raw(sample.as_slice(), 4096, 0.75, 1, &[6, 8], &[18, 20], &[]);
290 assert!(!dict.is_empty());
291 assert!(DEFAULT_K_CANDIDATES.contains(&tuned.k));
292 }
293
294 #[test]
295 fn fastcover_optimizer_handles_one_byte_sample_without_panic() {
296 let sample = [0xAB];
297 let (dict, tuned) = optimize_fastcover_raw(&sample, 16, 0.75, 1, &[], &[], &[]);
298 assert!(!dict.is_empty());
299 assert!(dict.len() <= 16);
300 assert!(DEFAULT_K_CANDIDATES.contains(&tuned.k));
301 assert!(DEFAULT_D_CANDIDATES.contains(&tuned.d));
302 assert!(DEFAULT_F_CANDIDATES.contains(&tuned.f));
303 }
304
305 #[test]
306 fn fastcover_optimizer_seeds_winner_when_all_scores_are_zero() {
307 let sample = b"abcdefghijklmnopqrst";
308 let (dict, tuned) = optimize_fastcover_raw(sample, 16, 0.9, 1, &[6], &[16], &[8]);
309 assert!(!dict.is_empty());
310 assert_eq!(tuned.k, 16);
311 assert_eq!(tuned.d, 6);
312 assert_eq!(tuned.f, 16);
313 assert_eq!(tuned.score, 0);
314 }
315
316 #[test]
317 fn fastcover_optimizer_handles_zero_dict_budget() {
318 let sample = corpus();
319 let (dict, tuned) = optimize_fastcover_raw(
320 sample.as_slice(),
321 0,
322 0.75,
323 1,
324 &[6, 8],
325 &[18, 20],
326 &[128, 256],
327 );
328 assert!(dict.is_empty());
329 assert!([6, 8].contains(&tuned.d));
330 assert!([18, 20].contains(&tuned.f));
331 assert!([128, 256].contains(&tuned.k));
332 }
333
334 #[test]
335 fn fastcover_optimizer_clamps_extreme_split_points() {
336 let sample = corpus();
337 let (dict_low, tuned_low) =
338 optimize_fastcover_raw(sample.as_slice(), 2048, 0.0, 1, &[6], &[18], &[128]);
339 let (dict_high, tuned_high) =
340 optimize_fastcover_raw(sample.as_slice(), 2048, 1.0, 1, &[6], &[18], &[128]);
341 assert!(!dict_low.is_empty());
342 assert!(!dict_high.is_empty());
343 assert_eq!(tuned_low.k, 128);
344 assert_eq!(tuned_high.k, 128);
345 }
346
347 #[test]
348 fn fastcover_optimizer_reports_normalized_params() {
349 let sample = corpus();
350 let (dict, tuned) =
351 optimize_fastcover_raw(sample.as_slice(), 1024, 0.75, 1, &[64], &[42], &[8]);
352 assert!(!dict.is_empty());
353 assert_eq!(tuned.d, 32);
354 assert_eq!(tuned.f, 20);
355 assert_eq!(tuned.k, 32);
356 }
357}