structured-zstd 0.0.40

Pure Rust zstd implementation — managed fork of ruzstd. Dictionary decompression, no FFI.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
use alloc::collections::BTreeSet;
use alloc::vec;
use alloc::vec::Vec;

#[derive(Debug, Clone, Copy)]
pub struct FastCoverParams {
    pub k: usize,
    pub d: usize,
    pub f: u32,
    pub accel: usize,
}

#[derive(Debug, Clone, Copy)]
pub struct FastCoverTuned {
    pub k: usize,
    pub d: usize,
    pub f: u32,
    pub accel: usize,
    pub score: usize,
}

pub const DEFAULT_K_CANDIDATES: &[usize] = &[64, 128, 256, 512, 1024, 2048];
pub const DEFAULT_D_CANDIDATES: &[usize] = &[6, 8, 12, 16];
pub const DEFAULT_F_CANDIDATES: &[u32] = &[16, 18, 20];

// Donor multiplicative hash primes (`ZSTD_hashXPtr` family,
// `zstd/lib/common/zstd_internal.h`): one unaligned read + one multiply per
// dmer instead of a per-byte FNV loop.
const PRIME_4_BYTES: u32 = 2_654_435_761;
const PRIME_5_BYTES: u64 = 889_523_592_379;
const PRIME_6_BYTES: u64 = 227_718_039_650_203;
const PRIME_7_BYTES: u64 = 58_295_818_150_454_627;
const PRIME_8_BYTES: u64 = 0xCF1B_BCDC_B7A5_6463;

/// Bytes a dmer hash reads at a position: the hash covers the first
/// `min(d, 8)` bytes but the wide read is always 8 (donor
/// `readLength = MAX(d, 8)`), except the pure 4-byte hash.
#[inline]
fn dmer_read_len(d: usize) -> usize {
    d.max(8)
}

/// Donor `FASTCOVER_hashPtrToIndex`: hash the first `min(d, 8)` bytes of the
/// dmer at `pos` into an `f`-bit table index. Caller guarantees
/// `pos + dmer_read_len(d) <= sample.len()`.
#[inline]
fn hash_dmer_index(sample: &[u8], pos: usize, f: u32, d: usize) -> usize {
    if d.min(8) == 4 {
        let v = u32::from_le_bytes(sample[pos..pos + 4].try_into().unwrap());
        return (v.wrapping_mul(PRIME_4_BYTES) >> (32 - f)) as usize;
    }
    let v = u64::from_le_bytes(sample[pos..pos + 8].try_into().unwrap());
    let h = match d.min(8) {
        5 => (v << 24).wrapping_mul(PRIME_5_BYTES),
        6 => (v << 16).wrapping_mul(PRIME_6_BYTES),
        7 => (v << 8).wrapping_mul(PRIME_7_BYTES),
        _ => v.wrapping_mul(PRIME_8_BYTES),
    };
    (h >> (64 - f)) as usize
}

fn clamp_table_bits(f: u32) -> u32 {
    f.clamp(8, 20)
}

pub(crate) fn normalize_fastcover_params(mut params: FastCoverParams) -> FastCoverParams {
    params.d = params.d.clamp(4, 32);
    params.k = params.k.max(params.d).max(16);
    params.f = clamp_table_bits(params.f);
    params.accel = params.accel.clamp(1, 10);
    params
}

fn build_frequency_table(sample: &[u8], d: usize, f: u32, accel: usize) -> Vec<u32> {
    let bits = clamp_table_bits(f);
    let size = 1usize << bits;
    // Donor accel table: `skip = accel - 1` dmers between counted dmers
    // (`FASTCOVER_defaultAccelParameters`), i.e. a stride of `accel`.
    let step = accel.max(1);
    let mut table = vec![0u32; size];

    let read_len = dmer_read_len(d);
    if sample.len() < read_len {
        return table;
    }

    let mut i = 0usize;
    while i + read_len <= sample.len() {
        // A count is bounded by the dmer count (`sample.len()`), far below
        // `u32::MAX` for any trainable corpus — plain increment.
        table[hash_dmer_index(sample, i, bits, d)] += 1;
        i += step;
    }
    table
}

fn build_raw_dict(sample: &[u8], dict_size: usize, params: FastCoverParams) -> Vec<u8> {
    if sample.is_empty() || dict_size == 0 {
        return Vec::new();
    }

    let params = normalize_fastcover_params(params);
    let k = params.k;
    let d = params.d;
    let f = clamp_table_bits(params.f);
    let read_len = dmer_read_len(d);
    if sample.len() < read_len {
        // Too short for even one wide-read dmer: no trainable content.
        // Callers treat an empty raw dict as "sample too small".
        return Vec::new();
    }

    // Donor `FASTCOVER_buildDictionary` epoch model: split the corpus into
    // epochs of dmers and round-robin them, taking the best k-byte segment
    // per visit. A segment's score is the sum of frequencies of its DISTINCT
    // dmers, maintained incrementally while the candidate window slides
    // (O(1) per position via the `segment_freqs` occurrence counts), and a
    // chosen segment's dmer frequencies are zeroed so later picks value only
    // new coverage. This replaced a global greedy set-cover with a per-
    // segment inverted index (`BTreeMap` per segment + slot lists): that
    // shape allocated millions of map nodes on a 1 MiB corpus and ran an
    // order of magnitude slower than the reference trainer at equal
    // coverage quality.
    let nb_dmers = sample.len() - read_len + 1;
    let mut freqs = build_frequency_table(sample, d, f, params.accel);
    let dmers_in_k = k - d + 1; // `normalize` guarantees k >= d

    // Donor `COVER_computeEpochs` (passes = 1): target one selection per
    // epoch, with a floor so epochs stay large enough to contain useful
    // segments.
    let min_epoch_size = k * 10;
    let mut epoch_count = (dict_size / k).max(1);
    let mut epoch_size = nb_dmers / epoch_count;
    if epoch_size < min_epoch_size {
        epoch_size = min_epoch_size.min(nb_dmers);
        epoch_count = (nb_dmers / epoch_size).max(1);
    }

    // Per-window dmer occurrence counts (donor `segmentFreqs`, u16: a window
    // holds at most `dmers_in_k` <= k occurrences of one index).
    let mut segment_freqs = vec![0u16; 1usize << f];
    // Fill from the back (donor layout) so the best segments sit at the end
    // of the dictionary and get referenced with the smallest offsets.
    let mut out = vec![0u8; dict_size];
    let mut tail = dict_size;
    const MAX_ZERO_SCORE_RUN: usize = 10;
    let mut zero_score_run = 0usize;
    let mut epoch = 0usize;

    while tail > 0 {
        let epoch_begin = epoch * epoch_size;
        let epoch_end = epoch_begin + epoch_size;
        epoch = (epoch + 1) % epoch_count;

        // Slide the candidate window across the epoch, tracking the best
        // segment (donor `FASTCOVER_selectSegment`).
        let mut best_begin = 0usize;
        let mut best_end = 0usize;
        let mut best_score = 0u64;
        let mut active_begin = epoch_begin;
        let mut active_end = epoch_begin;
        let mut active_score = 0u64;
        while active_end < epoch_end {
            let idx = hash_dmer_index(sample, active_end, f, d);
            if segment_freqs[idx] == 0 {
                active_score += u64::from(freqs[idx]);
            }
            active_end += 1;
            segment_freqs[idx] += 1;
            if active_end - active_begin == dmers_in_k + 1 {
                let del = hash_dmer_index(sample, active_begin, f, d);
                segment_freqs[del] -= 1;
                if segment_freqs[del] == 0 {
                    active_score -= u64::from(freqs[del]);
                }
                active_begin += 1;
            }
            if active_score > best_score {
                best_begin = active_begin;
                best_end = active_end;
                best_score = active_score;
            }
        }
        // Reset the window counts for the next epoch.
        while active_begin < epoch_end {
            let del = hash_dmer_index(sample, active_begin, f, d);
            segment_freqs[del] -= 1;
            active_begin += 1;
        }
        // Zero the chosen segment's frequencies: its dmers are covered.
        for pos in best_begin..best_end {
            freqs[hash_dmer_index(sample, pos, f, d)] = 0;
        }

        if best_score == 0 {
            // This epoch has no uncovered content left; other epochs may.
            // Give up after a run of empty epochs (donor `maxZeroScoreRun`).
            zero_score_run += 1;
            if zero_score_run >= MAX_ZERO_SCORE_RUN {
                break;
            }
            continue;
        }
        zero_score_run = 0;

        let segment_size = (best_end - best_begin + d - 1).min(tail);
        if segment_size < d {
            break;
        }
        tail -= segment_size;
        out[tail..tail + segment_size]
            .copy_from_slice(&sample[best_begin..best_begin + segment_size]);
    }

    out.drain(..tail);
    out
}

fn coverage_score(dict: &[u8], eval: &[u8], d: usize, accel: usize) -> usize {
    let read_len = dmer_read_len(d);
    if dict.len() < read_len || eval.len() < read_len || d == 0 {
        return 0;
    }
    const COVERAGE_F: u32 = 20;
    let mut seen = BTreeSet::new();
    for i in 0..=(dict.len() - read_len) {
        seen.insert(hash_dmer_index(dict, i, COVERAGE_F, d));
    }

    let mut hits = 0usize;
    let step = accel.max(1);
    let mut i = 0usize;
    while i + read_len <= eval.len() {
        if seen.contains(&hash_dmer_index(eval, i, COVERAGE_F, d)) {
            hits += 1;
        }
        i += step;
    }
    hits
}

pub fn train_fastcover_raw(sample: &[u8], dict_size: usize, params: FastCoverParams) -> Vec<u8> {
    build_raw_dict(sample, dict_size, params)
}

pub fn optimize_fastcover_raw(
    sample: &[u8],
    dict_size: usize,
    split_point: f64,
    accel: usize,
    d_candidates: &[usize],
    f_candidates: &[u32],
    k_values: &[usize],
) -> (Vec<u8>, FastCoverTuned) {
    let d_values = if d_candidates.is_empty() {
        DEFAULT_D_CANDIDATES
    } else {
        d_candidates
    };
    let f_values = if f_candidates.is_empty() {
        DEFAULT_F_CANDIDATES
    } else {
        f_candidates
    };
    let k_candidates = if k_values.is_empty() {
        DEFAULT_K_CANDIDATES
    } else {
        k_values
    };

    if sample.len() < 2 {
        let params = normalize_fastcover_params(FastCoverParams {
            k: k_candidates[0],
            d: d_values[0],
            f: f_values[0],
            accel,
        });
        let mut dict = build_raw_dict(sample, dict_size, params);
        if dict.is_empty() && dict_size > 0 {
            let take = sample.len().min(dict_size);
            dict.extend_from_slice(&sample[..take]);
        }
        return (
            dict,
            FastCoverTuned {
                k: params.k,
                d: params.d,
                f: params.f,
                accel: params.accel,
                score: 0,
            },
        );
    }

    let split = split_point.clamp(0.1, 0.95);
    let split_idx = ((sample.len() as f64) * split) as usize;
    let split_idx = split_idx.clamp(1, sample.len().saturating_sub(1));
    let (train, eval) = sample.split_at(split_idx);

    let mut best_dict = Vec::new();
    let mut best = FastCoverTuned {
        k: 0,
        d: 0,
        f: 0,
        accel: accel.clamp(1, 10),
        score: 0,
    };

    for &f in f_values {
        for &d in d_values {
            for &k in k_candidates {
                let params = normalize_fastcover_params(FastCoverParams { k, d, f, accel });
                let dict = build_raw_dict(train, dict_size, params);
                let score = coverage_score(dict.as_slice(), eval, params.d, params.accel);
                if best_dict.is_empty() || score > best.score {
                    best.score = score;
                    best.k = params.k;
                    best.d = params.d;
                    best.f = params.f;
                    best.accel = params.accel;
                    best_dict = dict;
                }
            }
        }
    }

    (best_dict, best)
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::format;

    fn corpus() -> Vec<u8> {
        let mut data = Vec::new();
        for i in 0..500u32 {
            data.extend_from_slice(
                format!("tenant=demo table=orders key={i} region=eu payload=aaaaabbbbbccccdddd\n")
                    .as_bytes(),
            );
        }
        data
    }

    #[test]
    fn fastcover_raw_produces_non_empty_dict() {
        let sample = corpus();
        let dict = train_fastcover_raw(
            sample.as_slice(),
            4096,
            FastCoverParams {
                k: 256,
                d: 8,
                f: 20,
                accel: 1,
            },
        );
        assert!(!dict.is_empty());
        assert!(dict.len() <= 4096);
    }

    #[test]
    fn fastcover_raw_returns_empty_for_empty_or_zero_budget() {
        let sample = corpus();
        let params = FastCoverParams {
            k: 256,
            d: 8,
            f: 20,
            accel: 1,
        };
        assert!(train_fastcover_raw(&[], 1024, params).is_empty());
        assert!(train_fastcover_raw(sample.as_slice(), 0, params).is_empty());
    }

    #[test]
    fn fastcover_optimizer_selects_valid_params() {
        let sample = corpus();
        let (dict, tuned) = optimize_fastcover_raw(
            sample.as_slice(),
            4096,
            0.75,
            1,
            &[6, 8],
            &[18, 20],
            &[128, 256],
        );
        assert!(!dict.is_empty());
        assert!([6, 8].contains(&tuned.d));
        assert!([18, 20].contains(&tuned.f));
        assert!([128, 256].contains(&tuned.k));
    }

    #[test]
    fn fastcover_optimizer_falls_back_when_k_candidates_empty() {
        let sample = corpus();
        let (dict, tuned) =
            optimize_fastcover_raw(sample.as_slice(), 4096, 0.75, 1, &[6, 8], &[18, 20], &[]);
        assert!(!dict.is_empty());
        assert!(DEFAULT_K_CANDIDATES.contains(&tuned.k));
    }

    #[test]
    fn fastcover_optimizer_handles_one_byte_sample_without_panic() {
        let sample = [0xAB];
        let (dict, tuned) = optimize_fastcover_raw(&sample, 16, 0.75, 1, &[], &[], &[]);
        assert!(!dict.is_empty());
        assert!(dict.len() <= 16);
        assert!(DEFAULT_K_CANDIDATES.contains(&tuned.k));
        assert!(DEFAULT_D_CANDIDATES.contains(&tuned.d));
        assert!(DEFAULT_F_CANDIDATES.contains(&tuned.f));
    }

    #[test]
    fn fastcover_optimizer_seeds_winner_when_all_scores_are_zero() {
        let sample = b"abcdefghijklmnopqrst";
        let (dict, tuned) = optimize_fastcover_raw(sample, 16, 0.9, 1, &[6], &[16], &[8]);
        assert!(!dict.is_empty());
        assert_eq!(tuned.k, 16);
        assert_eq!(tuned.d, 6);
        assert_eq!(tuned.f, 16);
        assert_eq!(tuned.score, 0);
    }

    #[test]
    fn fastcover_optimizer_handles_zero_dict_budget() {
        let sample = corpus();
        let (dict, tuned) = optimize_fastcover_raw(
            sample.as_slice(),
            0,
            0.75,
            1,
            &[6, 8],
            &[18, 20],
            &[128, 256],
        );
        assert!(dict.is_empty());
        assert!([6, 8].contains(&tuned.d));
        assert!([18, 20].contains(&tuned.f));
        assert!([128, 256].contains(&tuned.k));
    }

    #[test]
    fn fastcover_optimizer_clamps_extreme_split_points() {
        let sample = corpus();
        let (dict_low, tuned_low) =
            optimize_fastcover_raw(sample.as_slice(), 2048, 0.0, 1, &[6], &[18], &[128]);
        let (dict_high, tuned_high) =
            optimize_fastcover_raw(sample.as_slice(), 2048, 1.0, 1, &[6], &[18], &[128]);
        assert!(!dict_low.is_empty());
        assert!(!dict_high.is_empty());
        assert_eq!(tuned_low.k, 128);
        assert_eq!(tuned_high.k, 128);
    }

    #[test]
    fn fastcover_optimizer_reports_normalized_params() {
        let sample = corpus();
        let (dict, tuned) =
            optimize_fastcover_raw(sample.as_slice(), 1024, 0.75, 1, &[64], &[42], &[8]);
        assert!(!dict.is_empty());
        assert_eq!(tuned.d, 32);
        assert_eq!(tuned.f, 20);
        assert_eq!(tuned.k, 32);
    }
}