Skip to main content

structured_zstd/dictionary/
fastcover.rs

1use alloc::vec;
2use alloc::vec::Vec;
3
4#[derive(Debug, Clone, Copy)]
5pub struct FastCoverParams {
6    pub k: usize,
7    pub d: usize,
8    pub f: u32,
9    pub accel: usize,
10}
11
12#[derive(Debug, Clone, Copy)]
13pub struct FastCoverTuned {
14    pub k: usize,
15    pub d: usize,
16    pub f: u32,
17    pub accel: usize,
18    pub score: usize,
19}
20
21pub const DEFAULT_K_CANDIDATES: &[usize] = &[64, 128, 256, 512, 1024, 2048];
22pub const DEFAULT_D_CANDIDATES: &[usize] = &[6, 8, 12, 16];
23pub const DEFAULT_F_CANDIDATES: &[u32] = &[16, 18, 20];
24
25fn hash_dmer(dmer: &[u8]) -> u64 {
26    // 64-bit FNV-1a, deterministic and cheap for d-mer hashing.
27    let mut h = 0xcbf29ce484222325u64;
28    for &b in dmer {
29        h ^= u64::from(b);
30        h = h.wrapping_mul(0x100000001b3);
31    }
32    h
33}
34
35fn clamp_table_bits(f: u32) -> u32 {
36    f.clamp(8, 20)
37}
38
39pub(crate) fn normalize_fastcover_params(mut params: FastCoverParams) -> FastCoverParams {
40    params.d = params.d.clamp(4, 32);
41    params.k = params.k.max(params.d).max(16);
42    params.f = clamp_table_bits(params.f);
43    params.accel = params.accel.clamp(1, 10);
44    params
45}
46
47fn build_frequency_table(sample: &[u8], d: usize, f: u32, accel: usize) -> Vec<u32> {
48    let bits = clamp_table_bits(f);
49    let size = 1usize << bits;
50    let mask = size - 1;
51    let step = accel.max(1);
52    let mut table = vec![0u32; size];
53
54    if sample.len() < d || d == 0 {
55        return table;
56    }
57
58    let mut i = 0usize;
59    while i + d <= sample.len() {
60        let slot = (hash_dmer(&sample[i..i + d]) as usize) & mask;
61        table[slot] = table[slot].saturating_add(1);
62        i += step;
63    }
64    table
65}
66
67fn score_segment(segment: &[u8], d: usize, mask: usize, table: &[u32]) -> usize {
68    if segment.len() < d || d == 0 {
69        return 0;
70    }
71    let mut score = 0usize;
72    for i in 0..=(segment.len() - d) {
73        let slot = (hash_dmer(&segment[i..i + d]) as usize) & mask;
74        score += table[slot] as usize;
75    }
76    score
77}
78
79fn build_raw_dict(sample: &[u8], dict_size: usize, params: FastCoverParams) -> Vec<u8> {
80    if sample.is_empty() || dict_size == 0 {
81        return Vec::new();
82    }
83
84    let params = normalize_fastcover_params(params);
85    let k = params.k;
86    let d = params.d;
87    let table = build_frequency_table(sample, d, params.f, params.accel);
88    let mask = table.len().saturating_sub(1);
89
90    let mut segments: Vec<(usize, &[u8])> = sample
91        .chunks(k)
92        .filter(|seg| seg.len() >= d)
93        .map(|seg| (score_segment(seg, d, mask, &table), seg))
94        .collect();
95    segments.sort_by_key(|segment| core::cmp::Reverse(segment.0));
96
97    let mut out = Vec::with_capacity(dict_size);
98    for (_, seg) in segments {
99        if out.len() >= dict_size {
100            break;
101        }
102        let remaining = dict_size - out.len();
103        if seg.len() <= remaining {
104            out.extend_from_slice(seg);
105        } else {
106            out.extend_from_slice(&seg[..remaining]);
107        }
108    }
109    out
110}
111
112fn coverage_score(dict: &[u8], eval: &[u8], d: usize, accel: usize) -> usize {
113    if dict.len() < d || eval.len() < d || d == 0 {
114        return 0;
115    }
116    let mut seen = std::collections::HashSet::with_capacity(dict.len() / d + 1);
117    for i in 0..=(dict.len() - d) {
118        seen.insert(hash_dmer(&dict[i..i + d]));
119    }
120
121    let mut hits = 0usize;
122    let step = accel.max(1);
123    let mut i = 0usize;
124    while i + d <= eval.len() {
125        if seen.contains(&hash_dmer(&eval[i..i + d])) {
126            hits += 1;
127        }
128        i += step;
129    }
130    hits
131}
132
133pub fn train_fastcover_raw(sample: &[u8], dict_size: usize, params: FastCoverParams) -> Vec<u8> {
134    build_raw_dict(sample, dict_size, params)
135}
136
137pub fn optimize_fastcover_raw(
138    sample: &[u8],
139    dict_size: usize,
140    split_point: f64,
141    accel: usize,
142    d_candidates: &[usize],
143    f_candidates: &[u32],
144    k_values: &[usize],
145) -> (Vec<u8>, FastCoverTuned) {
146    let d_values = if d_candidates.is_empty() {
147        DEFAULT_D_CANDIDATES
148    } else {
149        d_candidates
150    };
151    let f_values = if f_candidates.is_empty() {
152        DEFAULT_F_CANDIDATES
153    } else {
154        f_candidates
155    };
156    let k_candidates = if k_values.is_empty() {
157        DEFAULT_K_CANDIDATES
158    } else {
159        k_values
160    };
161
162    if sample.len() < 2 {
163        let params = normalize_fastcover_params(FastCoverParams {
164            k: k_candidates[0],
165            d: d_values[0],
166            f: f_values[0],
167            accel,
168        });
169        let mut dict = build_raw_dict(sample, dict_size, params);
170        if dict.is_empty() && dict_size > 0 {
171            let take = sample.len().min(dict_size);
172            dict.extend_from_slice(&sample[..take]);
173        }
174        return (
175            dict,
176            FastCoverTuned {
177                k: params.k,
178                d: params.d,
179                f: params.f,
180                accel: params.accel,
181                score: 0,
182            },
183        );
184    }
185
186    let split = split_point.clamp(0.1, 0.95);
187    let split_idx = ((sample.len() as f64) * split) as usize;
188    let split_idx = split_idx.clamp(1, sample.len().saturating_sub(1));
189    let (train, eval) = sample.split_at(split_idx);
190
191    let mut best_dict = Vec::new();
192    let mut best = FastCoverTuned {
193        k: 0,
194        d: 0,
195        f: 0,
196        accel: accel.clamp(1, 10),
197        score: 0,
198    };
199
200    for &f in f_values {
201        for &d in d_values {
202            for &k in k_candidates {
203                let params = normalize_fastcover_params(FastCoverParams { k, d, f, accel });
204                let dict = build_raw_dict(train, dict_size, params);
205                let score = coverage_score(dict.as_slice(), eval, params.d, params.accel);
206                if best_dict.is_empty() || score > best.score {
207                    best.score = score;
208                    best.k = params.k;
209                    best.d = params.d;
210                    best.f = params.f;
211                    best.accel = params.accel;
212                    best_dict = dict;
213                }
214            }
215        }
216    }
217
218    (best_dict, best)
219}
220
221#[cfg(test)]
222mod tests {
223    use super::*;
224    use std::format;
225
226    fn corpus() -> Vec<u8> {
227        let mut data = Vec::new();
228        for i in 0..500u32 {
229            data.extend_from_slice(
230                format!("tenant=demo table=orders key={i} region=eu payload=aaaaabbbbbccccdddd\n")
231                    .as_bytes(),
232            );
233        }
234        data
235    }
236
237    #[test]
238    fn fastcover_raw_produces_non_empty_dict() {
239        let sample = corpus();
240        let dict = train_fastcover_raw(
241            sample.as_slice(),
242            4096,
243            FastCoverParams {
244                k: 256,
245                d: 8,
246                f: 20,
247                accel: 1,
248            },
249        );
250        assert!(!dict.is_empty());
251        assert!(dict.len() <= 4096);
252    }
253
254    #[test]
255    fn fastcover_raw_returns_empty_for_empty_or_zero_budget() {
256        let sample = corpus();
257        let params = FastCoverParams {
258            k: 256,
259            d: 8,
260            f: 20,
261            accel: 1,
262        };
263        assert!(train_fastcover_raw(&[], 1024, params).is_empty());
264        assert!(train_fastcover_raw(sample.as_slice(), 0, params).is_empty());
265    }
266
267    #[test]
268    fn fastcover_optimizer_selects_valid_params() {
269        let sample = corpus();
270        let (dict, tuned) = optimize_fastcover_raw(
271            sample.as_slice(),
272            4096,
273            0.75,
274            1,
275            &[6, 8],
276            &[18, 20],
277            &[128, 256],
278        );
279        assert!(!dict.is_empty());
280        assert!([6, 8].contains(&tuned.d));
281        assert!([18, 20].contains(&tuned.f));
282        assert!([128, 256].contains(&tuned.k));
283    }
284
285    #[test]
286    fn fastcover_optimizer_falls_back_when_k_candidates_empty() {
287        let sample = corpus();
288        let (dict, tuned) =
289            optimize_fastcover_raw(sample.as_slice(), 4096, 0.75, 1, &[6, 8], &[18, 20], &[]);
290        assert!(!dict.is_empty());
291        assert!(DEFAULT_K_CANDIDATES.contains(&tuned.k));
292    }
293
294    #[test]
295    fn fastcover_optimizer_handles_one_byte_sample_without_panic() {
296        let sample = [0xAB];
297        let (dict, tuned) = optimize_fastcover_raw(&sample, 16, 0.75, 1, &[], &[], &[]);
298        assert!(!dict.is_empty());
299        assert!(dict.len() <= 16);
300        assert!(DEFAULT_K_CANDIDATES.contains(&tuned.k));
301        assert!(DEFAULT_D_CANDIDATES.contains(&tuned.d));
302        assert!(DEFAULT_F_CANDIDATES.contains(&tuned.f));
303    }
304
305    #[test]
306    fn fastcover_optimizer_seeds_winner_when_all_scores_are_zero() {
307        let sample = b"abcdefghijklmnopqrst";
308        let (dict, tuned) = optimize_fastcover_raw(sample, 16, 0.9, 1, &[6], &[16], &[8]);
309        assert!(!dict.is_empty());
310        assert_eq!(tuned.k, 16);
311        assert_eq!(tuned.d, 6);
312        assert_eq!(tuned.f, 16);
313        assert_eq!(tuned.score, 0);
314    }
315
316    #[test]
317    fn fastcover_optimizer_handles_zero_dict_budget() {
318        let sample = corpus();
319        let (dict, tuned) = optimize_fastcover_raw(
320            sample.as_slice(),
321            0,
322            0.75,
323            1,
324            &[6, 8],
325            &[18, 20],
326            &[128, 256],
327        );
328        assert!(dict.is_empty());
329        assert!([6, 8].contains(&tuned.d));
330        assert!([18, 20].contains(&tuned.f));
331        assert!([128, 256].contains(&tuned.k));
332    }
333
334    #[test]
335    fn fastcover_optimizer_clamps_extreme_split_points() {
336        let sample = corpus();
337        let (dict_low, tuned_low) =
338            optimize_fastcover_raw(sample.as_slice(), 2048, 0.0, 1, &[6], &[18], &[128]);
339        let (dict_high, tuned_high) =
340            optimize_fastcover_raw(sample.as_slice(), 2048, 1.0, 1, &[6], &[18], &[128]);
341        assert!(!dict_low.is_empty());
342        assert!(!dict_high.is_empty());
343        assert_eq!(tuned_low.k, 128);
344        assert_eq!(tuned_high.k, 128);
345    }
346
347    #[test]
348    fn fastcover_optimizer_reports_normalized_params() {
349        let sample = corpus();
350        let (dict, tuned) =
351            optimize_fastcover_raw(sample.as_slice(), 1024, 0.75, 1, &[64], &[42], &[8]);
352        assert!(!dict.is_empty());
353        assert_eq!(tuned.d, 32);
354        assert_eq!(tuned.f, 20);
355        assert_eq!(tuned.k, 32);
356    }
357}