Skip to main content

structured_zstd/dictionary/
fastcover.rs

1use alloc::collections::BTreeSet;
2use alloc::vec;
3use alloc::vec::Vec;
4
5#[derive(Debug, Clone, Copy)]
6pub struct FastCoverParams {
7    pub k: usize,
8    pub d: usize,
9    pub f: u32,
10    pub accel: usize,
11}
12
13#[derive(Debug, Clone, Copy)]
14pub struct FastCoverTuned {
15    pub k: usize,
16    pub d: usize,
17    pub f: u32,
18    pub accel: usize,
19    pub score: usize,
20}
21
22pub const DEFAULT_K_CANDIDATES: &[usize] = &[64, 128, 256, 512, 1024, 2048];
23pub const DEFAULT_D_CANDIDATES: &[usize] = &[6, 8, 12, 16];
24pub const DEFAULT_F_CANDIDATES: &[u32] = &[16, 18, 20];
25
26// Donor multiplicative hash primes (`ZSTD_hashXPtr` family,
27// `zstd/lib/common/zstd_internal.h`): one unaligned read + one multiply per
28// dmer instead of a per-byte FNV loop.
29const PRIME_4_BYTES: u32 = 2_654_435_761;
30const PRIME_5_BYTES: u64 = 889_523_592_379;
31const PRIME_6_BYTES: u64 = 227_718_039_650_203;
32const PRIME_7_BYTES: u64 = 58_295_818_150_454_627;
33const PRIME_8_BYTES: u64 = 0xCF1B_BCDC_B7A5_6463;
34
35/// Bytes a dmer hash reads at a position: the hash covers the first
36/// `min(d, 8)` bytes but the wide read is always 8 (donor
37/// `readLength = MAX(d, 8)`), except the pure 4-byte hash.
38#[inline]
39fn dmer_read_len(d: usize) -> usize {
40    d.max(8)
41}
42
43/// Donor `FASTCOVER_hashPtrToIndex`: hash the first `min(d, 8)` bytes of the
44/// dmer at `pos` into an `f`-bit table index. Caller guarantees
45/// `pos + dmer_read_len(d) <= sample.len()`.
46#[inline]
47fn hash_dmer_index(sample: &[u8], pos: usize, f: u32, d: usize) -> usize {
48    if d.min(8) == 4 {
49        let v = u32::from_le_bytes(sample[pos..pos + 4].try_into().unwrap());
50        return (v.wrapping_mul(PRIME_4_BYTES) >> (32 - f)) as usize;
51    }
52    let v = u64::from_le_bytes(sample[pos..pos + 8].try_into().unwrap());
53    let h = match d.min(8) {
54        5 => (v << 24).wrapping_mul(PRIME_5_BYTES),
55        6 => (v << 16).wrapping_mul(PRIME_6_BYTES),
56        7 => (v << 8).wrapping_mul(PRIME_7_BYTES),
57        _ => v.wrapping_mul(PRIME_8_BYTES),
58    };
59    (h >> (64 - f)) as usize
60}
61
62fn clamp_table_bits(f: u32) -> u32 {
63    f.clamp(8, 20)
64}
65
66pub(crate) fn normalize_fastcover_params(mut params: FastCoverParams) -> FastCoverParams {
67    params.d = params.d.clamp(4, 32);
68    params.k = params.k.max(params.d).max(16);
69    params.f = clamp_table_bits(params.f);
70    params.accel = params.accel.clamp(1, 10);
71    params
72}
73
74fn build_frequency_table(sample: &[u8], d: usize, f: u32, accel: usize) -> Vec<u32> {
75    let bits = clamp_table_bits(f);
76    let size = 1usize << bits;
77    // Donor accel table: `skip = accel - 1` dmers between counted dmers
78    // (`FASTCOVER_defaultAccelParameters`), i.e. a stride of `accel`.
79    let step = accel.max(1);
80    let mut table = vec![0u32; size];
81
82    let read_len = dmer_read_len(d);
83    if sample.len() < read_len {
84        return table;
85    }
86
87    let mut i = 0usize;
88    while i + read_len <= sample.len() {
89        // A count is bounded by the dmer count (`sample.len()`), far below
90        // `u32::MAX` for any trainable corpus — plain increment.
91        table[hash_dmer_index(sample, i, bits, d)] += 1;
92        i += step;
93    }
94    table
95}
96
97fn build_raw_dict(sample: &[u8], dict_size: usize, params: FastCoverParams) -> Vec<u8> {
98    if sample.is_empty() || dict_size == 0 {
99        return Vec::new();
100    }
101
102    let params = normalize_fastcover_params(params);
103    let k = params.k;
104    let d = params.d;
105    let f = clamp_table_bits(params.f);
106    let read_len = dmer_read_len(d);
107    if sample.len() < read_len {
108        // Too short for even one wide-read dmer: no trainable content.
109        // Callers treat an empty raw dict as "sample too small".
110        return Vec::new();
111    }
112
113    // Donor `FASTCOVER_buildDictionary` epoch model: split the corpus into
114    // epochs of dmers and round-robin them, taking the best k-byte segment
115    // per visit. A segment's score is the sum of frequencies of its DISTINCT
116    // dmers, maintained incrementally while the candidate window slides
117    // (O(1) per position via the `segment_freqs` occurrence counts), and a
118    // chosen segment's dmer frequencies are zeroed so later picks value only
119    // new coverage. This replaced a global greedy set-cover with a per-
120    // segment inverted index (`BTreeMap` per segment + slot lists): that
121    // shape allocated millions of map nodes on a 1 MiB corpus and ran an
122    // order of magnitude slower than the reference trainer at equal
123    // coverage quality.
124    let nb_dmers = sample.len() - read_len + 1;
125    let mut freqs = build_frequency_table(sample, d, f, params.accel);
126    let dmers_in_k = k - d + 1; // `normalize` guarantees k >= d
127
128    // Donor `COVER_computeEpochs` (passes = 1): target one selection per
129    // epoch, with a floor so epochs stay large enough to contain useful
130    // segments.
131    let min_epoch_size = k * 10;
132    let mut epoch_count = (dict_size / k).max(1);
133    let mut epoch_size = nb_dmers / epoch_count;
134    if epoch_size < min_epoch_size {
135        epoch_size = min_epoch_size.min(nb_dmers);
136        epoch_count = (nb_dmers / epoch_size).max(1);
137    }
138
139    // Per-window dmer occurrence counts (donor `segmentFreqs`, u16: a window
140    // holds at most `dmers_in_k` <= k occurrences of one index).
141    let mut segment_freqs = vec![0u16; 1usize << f];
142    // Fill from the back (donor layout) so the best segments sit at the end
143    // of the dictionary and get referenced with the smallest offsets.
144    let mut out = vec![0u8; dict_size];
145    let mut tail = dict_size;
146    const MAX_ZERO_SCORE_RUN: usize = 10;
147    let mut zero_score_run = 0usize;
148    let mut epoch = 0usize;
149
150    while tail > 0 {
151        let epoch_begin = epoch * epoch_size;
152        let epoch_end = epoch_begin + epoch_size;
153        epoch = (epoch + 1) % epoch_count;
154
155        // Slide the candidate window across the epoch, tracking the best
156        // segment (donor `FASTCOVER_selectSegment`).
157        let mut best_begin = 0usize;
158        let mut best_end = 0usize;
159        let mut best_score = 0u64;
160        let mut active_begin = epoch_begin;
161        let mut active_end = epoch_begin;
162        let mut active_score = 0u64;
163        while active_end < epoch_end {
164            let idx = hash_dmer_index(sample, active_end, f, d);
165            if segment_freqs[idx] == 0 {
166                active_score += u64::from(freqs[idx]);
167            }
168            active_end += 1;
169            segment_freqs[idx] += 1;
170            if active_end - active_begin == dmers_in_k + 1 {
171                let del = hash_dmer_index(sample, active_begin, f, d);
172                segment_freqs[del] -= 1;
173                if segment_freqs[del] == 0 {
174                    active_score -= u64::from(freqs[del]);
175                }
176                active_begin += 1;
177            }
178            if active_score > best_score {
179                best_begin = active_begin;
180                best_end = active_end;
181                best_score = active_score;
182            }
183        }
184        // Reset the window counts for the next epoch.
185        while active_begin < epoch_end {
186            let del = hash_dmer_index(sample, active_begin, f, d);
187            segment_freqs[del] -= 1;
188            active_begin += 1;
189        }
190        // Zero the chosen segment's frequencies: its dmers are covered.
191        for pos in best_begin..best_end {
192            freqs[hash_dmer_index(sample, pos, f, d)] = 0;
193        }
194
195        if best_score == 0 {
196            // This epoch has no uncovered content left; other epochs may.
197            // Give up after a run of empty epochs (donor `maxZeroScoreRun`).
198            zero_score_run += 1;
199            if zero_score_run >= MAX_ZERO_SCORE_RUN {
200                break;
201            }
202            continue;
203        }
204        zero_score_run = 0;
205
206        let segment_size = (best_end - best_begin + d - 1).min(tail);
207        if segment_size < d {
208            break;
209        }
210        tail -= segment_size;
211        out[tail..tail + segment_size]
212            .copy_from_slice(&sample[best_begin..best_begin + segment_size]);
213    }
214
215    out.drain(..tail);
216    out
217}
218
219fn coverage_score(dict: &[u8], eval: &[u8], d: usize, accel: usize) -> usize {
220    let read_len = dmer_read_len(d);
221    if dict.len() < read_len || eval.len() < read_len || d == 0 {
222        return 0;
223    }
224    const COVERAGE_F: u32 = 20;
225    let mut seen = BTreeSet::new();
226    for i in 0..=(dict.len() - read_len) {
227        seen.insert(hash_dmer_index(dict, i, COVERAGE_F, d));
228    }
229
230    let mut hits = 0usize;
231    let step = accel.max(1);
232    let mut i = 0usize;
233    while i + read_len <= eval.len() {
234        if seen.contains(&hash_dmer_index(eval, i, COVERAGE_F, d)) {
235            hits += 1;
236        }
237        i += step;
238    }
239    hits
240}
241
242pub fn train_fastcover_raw(sample: &[u8], dict_size: usize, params: FastCoverParams) -> Vec<u8> {
243    build_raw_dict(sample, dict_size, params)
244}
245
246pub fn optimize_fastcover_raw(
247    sample: &[u8],
248    dict_size: usize,
249    split_point: f64,
250    accel: usize,
251    d_candidates: &[usize],
252    f_candidates: &[u32],
253    k_values: &[usize],
254) -> (Vec<u8>, FastCoverTuned) {
255    let d_values = if d_candidates.is_empty() {
256        DEFAULT_D_CANDIDATES
257    } else {
258        d_candidates
259    };
260    let f_values = if f_candidates.is_empty() {
261        DEFAULT_F_CANDIDATES
262    } else {
263        f_candidates
264    };
265    let k_candidates = if k_values.is_empty() {
266        DEFAULT_K_CANDIDATES
267    } else {
268        k_values
269    };
270
271    if sample.len() < 2 {
272        let params = normalize_fastcover_params(FastCoverParams {
273            k: k_candidates[0],
274            d: d_values[0],
275            f: f_values[0],
276            accel,
277        });
278        let mut dict = build_raw_dict(sample, dict_size, params);
279        if dict.is_empty() && dict_size > 0 {
280            let take = sample.len().min(dict_size);
281            dict.extend_from_slice(&sample[..take]);
282        }
283        return (
284            dict,
285            FastCoverTuned {
286                k: params.k,
287                d: params.d,
288                f: params.f,
289                accel: params.accel,
290                score: 0,
291            },
292        );
293    }
294
295    let split = split_point.clamp(0.1, 0.95);
296    let split_idx = ((sample.len() as f64) * split) as usize;
297    let split_idx = split_idx.clamp(1, sample.len().saturating_sub(1));
298    let (train, eval) = sample.split_at(split_idx);
299
300    let mut best_dict = Vec::new();
301    let mut best = FastCoverTuned {
302        k: 0,
303        d: 0,
304        f: 0,
305        accel: accel.clamp(1, 10),
306        score: 0,
307    };
308
309    for &f in f_values {
310        for &d in d_values {
311            for &k in k_candidates {
312                let params = normalize_fastcover_params(FastCoverParams { k, d, f, accel });
313                let dict = build_raw_dict(train, dict_size, params);
314                let score = coverage_score(dict.as_slice(), eval, params.d, params.accel);
315                if best_dict.is_empty() || score > best.score {
316                    best.score = score;
317                    best.k = params.k;
318                    best.d = params.d;
319                    best.f = params.f;
320                    best.accel = params.accel;
321                    best_dict = dict;
322                }
323            }
324        }
325    }
326
327    (best_dict, best)
328}
329
330#[cfg(test)]
331mod tests {
332    use super::*;
333    use std::format;
334
335    fn corpus() -> Vec<u8> {
336        let mut data = Vec::new();
337        for i in 0..500u32 {
338            data.extend_from_slice(
339                format!("tenant=demo table=orders key={i} region=eu payload=aaaaabbbbbccccdddd\n")
340                    .as_bytes(),
341            );
342        }
343        data
344    }
345
346    #[test]
347    fn fastcover_raw_produces_non_empty_dict() {
348        let sample = corpus();
349        let dict = train_fastcover_raw(
350            sample.as_slice(),
351            4096,
352            FastCoverParams {
353                k: 256,
354                d: 8,
355                f: 20,
356                accel: 1,
357            },
358        );
359        assert!(!dict.is_empty());
360        assert!(dict.len() <= 4096);
361    }
362
363    #[test]
364    fn fastcover_raw_returns_empty_for_empty_or_zero_budget() {
365        let sample = corpus();
366        let params = FastCoverParams {
367            k: 256,
368            d: 8,
369            f: 20,
370            accel: 1,
371        };
372        assert!(train_fastcover_raw(&[], 1024, params).is_empty());
373        assert!(train_fastcover_raw(sample.as_slice(), 0, params).is_empty());
374    }
375
376    #[test]
377    fn fastcover_optimizer_selects_valid_params() {
378        let sample = corpus();
379        let (dict, tuned) = optimize_fastcover_raw(
380            sample.as_slice(),
381            4096,
382            0.75,
383            1,
384            &[6, 8],
385            &[18, 20],
386            &[128, 256],
387        );
388        assert!(!dict.is_empty());
389        assert!([6, 8].contains(&tuned.d));
390        assert!([18, 20].contains(&tuned.f));
391        assert!([128, 256].contains(&tuned.k));
392    }
393
394    #[test]
395    fn fastcover_optimizer_falls_back_when_k_candidates_empty() {
396        let sample = corpus();
397        let (dict, tuned) =
398            optimize_fastcover_raw(sample.as_slice(), 4096, 0.75, 1, &[6, 8], &[18, 20], &[]);
399        assert!(!dict.is_empty());
400        assert!(DEFAULT_K_CANDIDATES.contains(&tuned.k));
401    }
402
403    #[test]
404    fn fastcover_optimizer_handles_one_byte_sample_without_panic() {
405        let sample = [0xAB];
406        let (dict, tuned) = optimize_fastcover_raw(&sample, 16, 0.75, 1, &[], &[], &[]);
407        assert!(!dict.is_empty());
408        assert!(dict.len() <= 16);
409        assert!(DEFAULT_K_CANDIDATES.contains(&tuned.k));
410        assert!(DEFAULT_D_CANDIDATES.contains(&tuned.d));
411        assert!(DEFAULT_F_CANDIDATES.contains(&tuned.f));
412    }
413
414    #[test]
415    fn fastcover_optimizer_seeds_winner_when_all_scores_are_zero() {
416        let sample = b"abcdefghijklmnopqrst";
417        let (dict, tuned) = optimize_fastcover_raw(sample, 16, 0.9, 1, &[6], &[16], &[8]);
418        assert!(!dict.is_empty());
419        assert_eq!(tuned.k, 16);
420        assert_eq!(tuned.d, 6);
421        assert_eq!(tuned.f, 16);
422        assert_eq!(tuned.score, 0);
423    }
424
425    #[test]
426    fn fastcover_optimizer_handles_zero_dict_budget() {
427        let sample = corpus();
428        let (dict, tuned) = optimize_fastcover_raw(
429            sample.as_slice(),
430            0,
431            0.75,
432            1,
433            &[6, 8],
434            &[18, 20],
435            &[128, 256],
436        );
437        assert!(dict.is_empty());
438        assert!([6, 8].contains(&tuned.d));
439        assert!([18, 20].contains(&tuned.f));
440        assert!([128, 256].contains(&tuned.k));
441    }
442
443    #[test]
444    fn fastcover_optimizer_clamps_extreme_split_points() {
445        let sample = corpus();
446        let (dict_low, tuned_low) =
447            optimize_fastcover_raw(sample.as_slice(), 2048, 0.0, 1, &[6], &[18], &[128]);
448        let (dict_high, tuned_high) =
449            optimize_fastcover_raw(sample.as_slice(), 2048, 1.0, 1, &[6], &[18], &[128]);
450        assert!(!dict_low.is_empty());
451        assert!(!dict_high.is_empty());
452        assert_eq!(tuned_low.k, 128);
453        assert_eq!(tuned_high.k, 128);
454    }
455
456    #[test]
457    fn fastcover_optimizer_reports_normalized_params() {
458        let sample = corpus();
459        let (dict, tuned) =
460            optimize_fastcover_raw(sample.as_slice(), 1024, 0.75, 1, &[64], &[42], &[8]);
461        assert!(!dict.is_empty());
462        assert_eq!(tuned.d, 32);
463        assert_eq!(tuned.f, 20);
464        assert_eq!(tuned.k, 32);
465    }
466}