Skip to main content

structured_zstd/dictionary/
mod.rs

1//! Code for creating a separate content dictionary.
2//!
3//! Effective dictionaries are up to 1% the size of the complete training body,
4//! and are trained on many examples of the original data.
5//!
6//! Implemented following the paper "Effective construction of
7//! Relative Lempel-Ziv Dictionaries", by Kewen Liao, Matthias Petri,
8//! Alistair Moffat, and Anthony Wirth
9
10// The algorithm is summarized here
11// 1. The text is split into "epochs", or chunks from the original source
12// 2. From within each epoch, we select the "segment", or 1 KiB contiguous section
13//    that's predicted to be the best option to include in the dictionary. Concatenated,
14//    these segments form the dictionary.
15//
16// This segment scoring algorithm operates as follows:
17// For a given epoch:
18//  - Run a reservoir sampler over the entire epoch, creating a
19//    reservoir of n/t, where `t` is the desired number of occurrences
20//    we want the most common k-mers to have
21//  - Have the ability to estimate
22//    the frequency of a given k-mer: `f(w: k-mer)` calculates
23//    the frequency of w in the reservoir using a rolling karp-rabin hash
24//  - The score of a segment is the sum of `f(w)` called on every kmer within the segment
25mod cover;
26mod fastcover;
27mod frequency;
28mod reservoir;
29
30use crate::bit_io::BitWriter;
31use crate::blocks::sequence_section::{
32    MAX_LITERAL_LENGTH_CODE, MAX_MATCH_LENGTH_CODE, MAX_OFFSET_CODE,
33};
34use crate::decoding::dictionary::MAGIC_NUM as DICT_MAGIC_NUM;
35use crate::decoding::sequence_section_decoder::{LL_MAX_LOG, ML_MAX_LOG, OF_MAX_LOG};
36use crate::dictionary::reservoir::create_sample;
37use crate::fse::fse_encoder::{self, build_table_from_symbol_counts};
38use crate::huff0::HuffmanTable as HuffmanDecoderTable;
39use crate::huff0::huff0_encoder::{HuffmanEncoder, HuffmanTable as HuffmanEncoderTable};
40use core::cmp::Reverse;
41use cover::*;
42pub use fastcover::{
43    DEFAULT_D_CANDIDATES, DEFAULT_F_CANDIDATES, DEFAULT_K_CANDIDATES, FastCoverParams,
44    FastCoverTuned,
45};
46use std::{
47    boxed::Box,
48    collections::{BinaryHeap, HashMap},
49    format,
50    fs::{self, File},
51    io::{self, Read},
52    path::{Path, PathBuf},
53    // `vec` import covers the `vec![..]` macro used below: this crate is
54    // no_std-with-std-feature, so the std prelude isn't pulled in implicitly
55    // for top-level items in this module. Removing this import fails the
56    // build with `cannot find macro 'vec' in this scope` — verified.
57    vec,
58    vec::Vec,
59};
60
61const MAX_TRAINING_PREALLOC_BYTES: usize = 8 * 1024 * 1024;
62const MAX_HUFFMAN_STATS_BYTES: usize = 64 * 1024;
63
64/// Tuning knobs for pure-Rust FastCOVER training.
65#[derive(Debug, Clone)]
66pub struct FastCoverOptions {
67    pub optimize: bool,
68    pub split_point: f64,
69    pub accel: usize,
70    pub k: usize,
71    pub d: usize,
72    pub f: u32,
73    pub k_candidates: Vec<usize>,
74    pub d_candidates: Vec<usize>,
75    pub f_candidates: Vec<u32>,
76}
77
78impl Default for FastCoverOptions {
79    fn default() -> Self {
80        Self {
81            optimize: true,
82            split_point: 0.75,
83            accel: 1,
84            k: 256,
85            d: 8,
86            f: 20,
87            k_candidates: DEFAULT_K_CANDIDATES.to_vec(),
88            d_candidates: DEFAULT_D_CANDIDATES.to_vec(),
89            f_candidates: DEFAULT_F_CANDIDATES.to_vec(),
90        }
91    }
92}
93
94#[derive(Debug, Clone, Copy, Default)]
95pub struct FinalizeOptions {
96    pub dict_id: Option<u32>,
97}
98
99/// A set of values that are used during dictionary construction.
100///
101/// Changing these values can improve the resulting dictionary size for certain datasets.
102// TODO: move `k` here.
103pub(super) struct DictParams {
104    /// Segment size.
105    ///
106    /// As found under "4. Experiments - Varying Segment Size" in the original paper, a
107    /// segment size of 2 kiB was effective.
108    ///
109    /// "We explored a range of \[`segment_size`\] values and found the performance of LMC is insensitive
110    /// to \[`segment_size`\]. We fix \[`segment_size`\] to 2kiB
111    ///
112    /// Reasonable range: [16, 2048+]
113    pub segment_size: u32,
114}
115
116/// Creates a "raw content" dictionary, training off of every file in this directory and all
117/// sub-directories.
118///
119/// The resulting dictionary will be approximately `dict_size` or less, and written to `output`.
120///
121/// # Errors
122/// This function returns `Ok(())` if the dictionary was created successfully, and an
123/// `Err(io::Error)` if an error was encountered reading the input directory or
124/// writing dictionary bytes to `output`.
125///
126/// # Examples
127/// ```no_run
128/// use std::fs::File;
129/// // Create a roughly 1mb dictionary, training off of file in `sample_files`
130/// let input_folder = "sample_files/";
131/// let mut output = File::create("output.dict").unwrap();
132/// structured_zstd::dictionary::create_raw_dict_from_dir(input_folder, &mut output, 1_000_000)
133///     .expect("dictionary training from sample_files should succeed");
134/// ```
135pub fn create_raw_dict_from_dir<P: AsRef<Path>, W: io::Write>(
136    path: P,
137    output: &mut W,
138    dict_size: usize,
139) -> Result<(), io::Error> {
140    // Collect a list of a path to every file in the directory into `file_paths`
141    let mut file_paths: Vec<PathBuf> = Vec::new();
142    let dir: fs::ReadDir = fs::read_dir(path)?;
143    fn recurse_read(dir: fs::ReadDir, file_paths: &mut Vec<PathBuf>) -> Result<(), io::Error> {
144        for entry in dir {
145            let entry = entry?;
146            if entry.file_type()?.is_dir() {
147                recurse_read(fs::read_dir(entry.path())?, file_paths)?;
148            } else {
149                file_paths.push(entry.path());
150            }
151        }
152        Ok(())
153    }
154    recurse_read(dir, &mut file_paths)?;
155
156    // Open each file and chain the readers together
157    let mut total_file_len: u64 = 0;
158    let mut file_handles: Vec<fs::File> = Vec::new();
159    for path in file_paths {
160        let handle = File::open(path)?;
161        total_file_len += handle.metadata()?.len();
162        file_handles.push(handle);
163    }
164    let empty_reader: Box<dyn Read> = Box::new(io::empty());
165    let chained_files = file_handles
166        .iter()
167        .fold(empty_reader, |acc, reader| Box::new(acc.chain(reader)));
168
169    // Create a dict using the new reader
170    create_raw_dict_from_source(chained_files, total_file_len as usize, output, dict_size)?;
171    Ok(())
172}
173
174/// Read from `source` to create a "raw content" dictionary of `dict_size`.
175/// The completed dictionary is written to `output`.
176///
177/// - `source` will be used as training data for the entire dictionary.
178/// - `source_size` is used only as a preallocation hint before reading `source` and
179///   does not affect sampling once all data has been buffered.
180/// - `output` is where the completed dictionary will be written.
181/// - `dict_size` determines how large the complete dictionary should be. The completed
182///   dictionary will be this size or smaller.
183///
184/// This function reads the entire `source` into an in-memory `Vec<u8>` before building
185/// the dictionary. The provided reader need not be buffered, but callers should avoid
186/// sources too large to fit comfortably in memory.
187///
188/// # API note
189/// This public API returns `io::Result<()>` and propagates source/output I/O failures.
190pub fn create_raw_dict_from_source<R: io::Read, W: io::Write>(
191    mut source: R,
192    source_size: usize,
193    output: &mut W,
194    dict_size: usize,
195) -> io::Result<()> {
196    if dict_size == 0 {
197        return Ok(());
198    }
199    let prealloc = source_size.min(MAX_TRAINING_PREALLOC_BYTES);
200    let mut all = Vec::with_capacity(prealloc);
201    source.read_to_end(&mut all)?;
202    if all.is_empty() {
203        return Ok(());
204    }
205
206    if all.len() < K {
207        let keep = usize::min(all.len(), dict_size);
208        output.write_all(&all[all.len() - keep..])?;
209        return Ok(());
210    }
211
212    let source_size = all.len();
213    vprintln!("create_dict: creating {dict_size} byte dict from {source_size} byte source");
214
215    let params = DictParams { segment_size: 2048 };
216    let num_segments = usize::max(1, source_size / params.segment_size as usize);
217    // According to 4. Experiments - Varying Reservoir Sampler Thresholds,
218    // setting reservoir size to collection size / min{collection size / (2 * number of segments),
219    // 256} was effective
220    let denom = usize::max(1, source_size / (2 * num_segments));
221    let sample_scale = usize::max(1, usize::min(denom, 256));
222    let mut sample_size = source_size / sample_scale;
223    sample_size = usize::max(sample_size, usize::min(source_size, 16));
224    vprintln!("create_dict: creating {sample_size} byte sample of collection");
225    let mut sample_reader = all.as_slice();
226    let collection_sample = create_sample(&mut sample_reader, sample_size);
227
228    // A collection of segments to be used in the final dictionary.
229    //
230    // Contains the best segment from every epoch.
231    // Reverse is used because we want a min heap, where
232    // the lowest scoring items come first
233    let mut pool: BinaryHeap<Reverse<Segment>> = BinaryHeap::new();
234    let (num_epochs, epoch_size_kmers) = compute_epoch_info(&params, dict_size, source_size / K);
235    // Plain `*`/`+` throughout the epoch walk below: epochs partition the
236    // training source, so `epoch_size_kmers * K`, `epoch_idx * epoch_size`, and
237    // `start + epoch_size` are all bounded by the source length (<= isize::MAX)
238    // and cannot overflow usize.
239    let epoch_size = usize::max(K, epoch_size_kmers * K);
240    vprintln!("create_dict: computed epoch info, using {num_epochs} epochs of {epoch_size} bytes");
241    let mut epoch_counter = 0;
242    let mut ctx = Context {
243        frequencies: HashMap::with_capacity(epoch_size / K),
244    };
245    // Score each segment in each planned epoch and select the highest-scoring
246    // segment for the pool. Keep exactly `num_epochs` windows to avoid
247    // emitting more segments than the requested dictionary budget allows.
248    for epoch_idx in 0..num_epochs {
249        let start = epoch_idx * epoch_size;
250        if start >= all.len() {
251            break;
252        }
253        let end = if epoch_idx + 1 == num_epochs {
254            all.len()
255        } else {
256            usize::min(start + epoch_size, all.len())
257        };
258        let epoch = &all[start..end];
259        epoch_counter += 1;
260        let best_segment = pick_best_segment(&params, &mut ctx, epoch, &collection_sample);
261        vprintln!(
262            "\tcreate_dict: epoch {epoch_counter}/{num_epochs} has best segment score {}",
263            best_segment.score
264        );
265        pool.push(Reverse(best_segment));
266        // Wipe frequency list for next epoch
267        ctx.frequencies.clear();
268    }
269    vprintln!(
270        "create_dict: {epoch_counter} epochs written, writing {} segments",
271        pool.len()
272    );
273    // Write the dictionary with the highest scoring segment last because
274    // closer items can be represented with a smaller offset
275    while let Some(segment) = pool.pop() {
276        output.write_all(&segment.0.raw)?;
277    }
278    Ok(())
279}
280
281fn serialize_huffman_table(sample_data: &[u8], raw_content: &[u8]) -> io::Result<Vec<u8>> {
282    fn bounded_huffman_stats(data: &[u8]) -> Vec<u8> {
283        if data.len() <= MAX_HUFFMAN_STATS_BYTES {
284            return data.to_vec();
285        }
286
287        let mut stats = Vec::with_capacity(MAX_HUFFMAN_STATS_BYTES);
288        for i in 0..MAX_HUFFMAN_STATS_BYTES {
289            let idx = i * data.len() / MAX_HUFFMAN_STATS_BYTES;
290            stats.push(data[idx]);
291        }
292        stats
293    }
294
295    let source = if sample_data.len() >= 2 {
296        sample_data
297    } else {
298        raw_content
299    };
300    let mut stats = bounded_huffman_stats(source);
301    if stats.len() < 2 || stats.iter().all(|b| *b == stats[0]) {
302        stats = (0u8..=255).collect();
303    }
304
305    let table = HuffmanEncoderTable::build_from_data(stats.as_slice());
306    let mut writer = BitWriter::new();
307    let mut encoder = HuffmanEncoder::new(&table, &mut writer);
308    encoder.encode(&[stats[0]], true);
309    let encoded = writer.dump();
310
311    let mut decoder = HuffmanDecoderTable::new();
312    let table_size = decoder
313        .build_decoder(encoded.as_slice())
314        .map_err(|e| io::Error::other(format!("failed to decode generated huffman table: {e}")))?;
315    Ok(encoded[..table_size as usize].to_vec())
316}
317
318fn serialize_fse_table(table: &fse_encoder::FSETable) -> Vec<u8> {
319    let mut writer = BitWriter::new();
320    table.write_table(&mut writer);
321    writer.dump()
322}
323
324fn bounded_fse_symbols(data: &[u8], max_symbol: u8) -> Vec<u8> {
325    let modulo = u16::from(max_symbol) + 1;
326    if data.is_empty() {
327        return Vec::from([0u8]);
328    }
329    if data.len() <= MAX_HUFFMAN_STATS_BYTES {
330        return data
331            .iter()
332            .map(|b| (u16::from(*b) % modulo) as u8)
333            .collect();
334    }
335
336    let mut out = Vec::with_capacity(MAX_HUFFMAN_STATS_BYTES);
337    for i in 0..MAX_HUFFMAN_STATS_BYTES {
338        let idx = i * data.len() / MAX_HUFFMAN_STATS_BYTES;
339        out.push((u16::from(data[idx]) % modulo) as u8);
340    }
341    out
342}
343
344fn serialize_fse_table_from_corpus(
345    sample_data: &[u8],
346    raw_content: &[u8],
347    max_symbol: u8,
348    max_log: u8,
349) -> io::Result<Vec<u8>> {
350    fn counts_total_for_source(source: &[u8], max_symbol: u8, counts: &mut [usize]) -> usize {
351        counts.fill(0);
352        for symbol in bounded_fse_symbols(source, max_symbol) {
353            counts[usize::from(symbol)] += 1;
354        }
355        counts.iter().sum::<usize>()
356    }
357
358    let mut counts = vec![0usize; usize::from(max_symbol) + 1];
359    let using_sample = !sample_data.is_empty();
360    let mut total = counts_total_for_source(
361        if using_sample {
362            sample_data
363        } else {
364            raw_content
365        },
366        max_symbol,
367        &mut counts,
368    );
369    if total <= 1 && using_sample && !raw_content.is_empty() {
370        total = counts_total_for_source(raw_content, max_symbol, &mut counts);
371    }
372    if total <= 1 {
373        return Err(io::Error::new(
374            io::ErrorKind::InvalidInput,
375            "insufficient symbol statistics for FSE table",
376        ));
377    }
378    let table = build_table_from_symbol_counts(&counts, max_log, false);
379    Ok(serialize_fse_table(&table))
380}
381
382fn finalized_content_budget(
383    sample_data: &[u8],
384    raw_fallback: &[u8],
385    dict_size: usize,
386) -> io::Result<usize> {
387    let min_content_size = 8usize;
388    let huf_len = serialize_huffman_table(sample_data, raw_fallback)?.len();
389    let of_len =
390        serialize_fse_table_from_corpus(sample_data, raw_fallback, MAX_OFFSET_CODE, OF_MAX_LOG)?
391            .len();
392    let ml_len = serialize_fse_table_from_corpus(
393        sample_data,
394        raw_fallback,
395        MAX_MATCH_LENGTH_CODE,
396        ML_MAX_LOG,
397    )?
398    .len();
399    let ll_len = serialize_fse_table_from_corpus(
400        sample_data,
401        raw_fallback,
402        MAX_LITERAL_LENGTH_CODE,
403        LL_MAX_LOG,
404    )?
405    .len();
406
407    let header_len = DICT_MAGIC_NUM.len() + 4 + huf_len + of_len + ml_len + ll_len + 12;
408    let max_content_budget = dict_size.saturating_sub(header_len);
409    if max_content_budget < min_content_size {
410        return Err(io::Error::new(
411            io::ErrorKind::InvalidInput,
412            "dictionary size too small to fit header and offset history",
413        ));
414    }
415    Ok(max_content_budget)
416}
417
418fn derive_dict_id(raw_content: &[u8]) -> u32 {
419    let mut h = 0xcbf29ce484222325u64;
420    for &b in raw_content {
421        h ^= u64::from(b);
422        h = h.wrapping_mul(0x100000001b3);
423    }
424    let compliant = (h % ((1u64 << 31) - 32768)) + 32768;
425    compliant as u32
426}
427
428/// Finalize raw dictionary content into a full zstd dictionary binary
429/// (`magic + dict_id + entropy tables + offset history + content`).
430pub fn finalize_raw_dict(
431    raw_content: &[u8],
432    sample_data: &[u8],
433    dict_size: usize,
434    options: FinalizeOptions,
435) -> io::Result<Vec<u8>> {
436    if raw_content.is_empty() {
437        return Err(io::Error::new(
438            io::ErrorKind::InvalidInput,
439            "raw dictionary content must not be empty",
440        ));
441    }
442    let mut out = Vec::with_capacity(dict_size.max(256));
443    out.extend_from_slice(&DICT_MAGIC_NUM);
444    let dict_id = options
445        .dict_id
446        .unwrap_or_else(|| derive_dict_id(raw_content));
447    if dict_id == 0 {
448        return Err(io::Error::new(
449            io::ErrorKind::InvalidInput,
450            "dictionary id must be non-zero",
451        ));
452    }
453    out.extend_from_slice(&dict_id.to_le_bytes());
454    out.extend_from_slice(serialize_huffman_table(sample_data, raw_content)?.as_slice());
455    out.extend_from_slice(
456        serialize_fse_table_from_corpus(sample_data, raw_content, MAX_OFFSET_CODE, OF_MAX_LOG)?
457            .as_slice(),
458    );
459    out.extend_from_slice(
460        serialize_fse_table_from_corpus(
461            sample_data,
462            raw_content,
463            MAX_MATCH_LENGTH_CODE,
464            ML_MAX_LOG,
465        )?
466        .as_slice(),
467    );
468    out.extend_from_slice(
469        serialize_fse_table_from_corpus(
470            sample_data,
471            raw_content,
472            MAX_LITERAL_LENGTH_CODE,
473            LL_MAX_LOG,
474        )?
475        .as_slice(),
476    );
477
478    // Repeat offsets: keep default bootstrap history.
479    out.extend_from_slice(&1u32.to_le_bytes());
480    out.extend_from_slice(&4u32.to_le_bytes());
481    out.extend_from_slice(&8u32.to_le_bytes());
482
483    let min_content_size = 8usize;
484    let max_content_budget = dict_size.saturating_sub(out.len());
485    if max_content_budget < min_content_size {
486        return Err(io::Error::new(
487            io::ErrorKind::InvalidInput,
488            "dictionary size too small to fit header and offset history",
489        ));
490    }
491
492    let content = if raw_content.len() > max_content_budget {
493        &raw_content[raw_content.len() - max_content_budget..]
494    } else {
495        raw_content
496    };
497    if content.len() < min_content_size {
498        out.resize(out.len() + (min_content_size - content.len()), 0);
499    }
500    out.extend_from_slice(content);
501    Ok(out)
502}
503
504/// Train a raw FastCOVER dictionary from a source stream.
505fn train_fastcover_internal(
506    sample: &[u8],
507    dict_size: usize,
508    options: &FastCoverOptions,
509) -> (Vec<u8>, FastCoverTuned) {
510    if options.optimize {
511        fastcover::optimize_fastcover_raw(
512            sample,
513            dict_size,
514            options.split_point,
515            options.accel,
516            options.d_candidates.as_slice(),
517            options.f_candidates.as_slice(),
518            options.k_candidates.as_slice(),
519        )
520    } else {
521        let params = fastcover::normalize_fastcover_params(FastCoverParams {
522            k: options.k,
523            d: options.d,
524            f: options.f,
525            accel: options.accel,
526        });
527        (
528            fastcover::train_fastcover_raw(sample, dict_size, params),
529            FastCoverTuned {
530                k: params.k,
531                d: params.d,
532                f: params.f,
533                accel: params.accel,
534                score: 0,
535            },
536        )
537    }
538}
539
540/// Train a raw FastCOVER dictionary directly from an in-memory sample.
541pub fn train_fastcover_raw_from_slice(
542    sample: &[u8],
543    dict_size: usize,
544    options: &FastCoverOptions,
545) -> io::Result<(Vec<u8>, FastCoverTuned)> {
546    if sample.is_empty() {
547        return Err(io::Error::new(
548            io::ErrorKind::InvalidInput,
549            "source stream is empty",
550        ));
551    }
552    let (dict, tuned) = train_fastcover_internal(sample, dict_size, options);
553    if dict.is_empty() && dict_size > 0 {
554        return Err(io::Error::new(
555            io::ErrorKind::InvalidInput,
556            "training sample is too small for FastCOVER",
557        ));
558    }
559    Ok((dict, tuned))
560}
561
562/// Train a raw FastCOVER dictionary from a source stream.
563///
564/// This function fully buffers the entire training corpus into memory via
565/// `read_to_end`, which can consume significant RAM for large inputs.
566pub fn create_fastcover_raw_dict_from_source<R: io::Read, W: io::Write>(
567    mut source: R,
568    output: &mut W,
569    dict_size: usize,
570    options: &FastCoverOptions,
571) -> io::Result<FastCoverTuned> {
572    let mut sample = Vec::new();
573    source.read_to_end(&mut sample)?;
574    let (dict, tuned) = train_fastcover_raw_from_slice(sample.as_slice(), dict_size, options)?;
575    output.write_all(dict.as_slice())?;
576    Ok(tuned)
577}
578
579/// Train and finalize a FastCOVER dictionary in pure Rust.
580///
581/// This function fully buffers the entire training corpus into memory via
582/// `read_to_end`, which can consume significant RAM for large inputs.
583pub fn create_fastcover_dict_from_source<R: io::Read, W: io::Write>(
584    mut source: R,
585    output: &mut W,
586    dict_size: usize,
587    fastcover: &FastCoverOptions,
588    finalize: FinalizeOptions,
589) -> io::Result<FastCoverTuned> {
590    let mut sample = Vec::new();
591    source.read_to_end(&mut sample)?;
592    if sample.is_empty() {
593        return Err(io::Error::new(
594            io::ErrorKind::InvalidInput,
595            "source stream is empty",
596        ));
597    }
598    let content_budget = finalized_content_budget(sample.as_slice(), sample.as_slice(), dict_size)?;
599    let (raw_dict, tuned) =
600        train_fastcover_raw_from_slice(sample.as_slice(), content_budget, fastcover)?;
601
602    let finalized = finalize_raw_dict(raw_dict.as_slice(), sample.as_slice(), dict_size, finalize)?;
603    output.write_all(finalized.as_slice())?;
604    Ok(tuned)
605}
606
607/// Build a finalized FastCOVER dictionary, attach it to a fastest-level
608/// frame compressor, and compress a fresh payload. Returns
609/// `(finalized_dictionary, compressed_frame, original_payload)` so a
610/// roundtrip check can decode `compressed_frame` against
611/// `finalized_dictionary` and compare to `original_payload`. The
612/// C-decoder roundtrip that consumes this lives in the `ffi-bench` crate;
613/// this side stays pure Rust.
614#[cfg(feature = "bench_internals")]
615pub(crate) fn dict_roundtrip_fixture() -> (
616    alloc::vec::Vec<u8>,
617    alloc::vec::Vec<u8>,
618    alloc::vec::Vec<u8>,
619) {
620    use crate::decoding::Dictionary;
621    use crate::encoding::{CompressionLevel, FrameCompressor};
622
623    let mut sample = alloc::vec::Vec::new();
624    for i in 0..512u32 {
625        sample.extend_from_slice(
626            alloc::format!(
627                "tenant=demo table=orders key={i} region=eu payload=aaaaabbbbbcccccdddddeeeee\n"
628            )
629            .as_bytes(),
630        );
631    }
632
633    let dict_size = 4096usize;
634    let content_budget = finalized_content_budget(sample.as_slice(), sample.as_slice(), dict_size)
635        .expect("content budget should be computable");
636    let raw = fastcover::train_fastcover_raw(
637        sample.as_slice(),
638        content_budget,
639        fastcover::FastCoverParams {
640            k: 256,
641            d: 8,
642            f: 20,
643            accel: 1,
644        },
645    );
646    let finalized = finalize_raw_dict(
647        raw.as_slice(),
648        sample.as_slice(),
649        dict_size,
650        FinalizeOptions::default(),
651    )
652    .expect("finalization should succeed");
653    let parsed =
654        Dictionary::decode_dict(finalized.as_slice()).expect("finalized dictionary should parse");
655    assert!(!parsed.dict_content.is_empty());
656
657    let mut payload = alloc::vec::Vec::new();
658    for idx in 0..96u32 {
659        payload.extend_from_slice(
660            alloc::format!("tenant=demo op=put key={idx} value=aaaaabbbbbcccccdddddeeeee\n")
661                .as_bytes(),
662        );
663    }
664
665    let mut compressed = alloc::vec::Vec::new();
666    let mut compressor = FrameCompressor::new(CompressionLevel::Fastest);
667    compressor
668        .set_dictionary(parsed)
669        .expect("dictionary should attach");
670    compressor.set_source(payload.as_slice());
671    compressor.set_drain(&mut compressed);
672    compressor.compress();
673
674    (finalized, compressed, payload)
675}
676
677#[cfg(test)]
678mod tests {
679    use super::*;
680    use crate::decoding::Dictionary;
681    use std::io::Cursor;
682    use std::string::ToString;
683
684    fn training_data() -> Vec<u8> {
685        let mut data = Vec::new();
686        for i in 0..512u32 {
687            data.extend_from_slice(
688                format!(
689                    "tenant=demo table=orders key={i} region=eu payload=aaaaabbbbbcccccdddddeeeee\n"
690                )
691                .as_bytes(),
692            );
693        }
694        data
695    }
696
697    #[test]
698    fn create_fastcover_dict_from_source_writes_non_empty_output() {
699        let sample = training_data();
700        let mut out = Vec::new();
701        let tuned = create_fastcover_dict_from_source(
702            Cursor::new(sample.as_slice()),
703            &mut out,
704            4096,
705            &FastCoverOptions::default(),
706            FinalizeOptions::default(),
707        )
708        .expect("fastcover+finalize should succeed");
709        assert!(!out.is_empty());
710        assert!(tuned.k > 0);
711        assert!(tuned.d > 0);
712    }
713
714    #[test]
715    fn create_fastcover_raw_dict_from_source_rejects_empty_source() {
716        let mut out = Vec::new();
717        let err = create_fastcover_raw_dict_from_source(
718            Cursor::new(Vec::<u8>::new()),
719            &mut out,
720            1024,
721            &FastCoverOptions::default(),
722        )
723        .expect_err("empty source must be rejected");
724        assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
725    }
726
727    #[test]
728    fn create_fastcover_dict_from_source_propagates_finalize_error() {
729        let sample = training_data();
730        let mut out = Vec::new();
731        let err = create_fastcover_dict_from_source(
732            Cursor::new(sample.as_slice()),
733            &mut out,
734            32,
735            &FastCoverOptions::default(),
736            FinalizeOptions::default(),
737        )
738        .expect_err("too-small dictionary budget must fail during finalize");
739        assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
740        assert!(err.to_string().contains("dictionary size too small"));
741    }
742
743    #[test]
744    fn create_fastcover_dict_from_source_rejects_empty_source() {
745        let mut out = Vec::new();
746        let err = create_fastcover_dict_from_source(
747            Cursor::new(Vec::<u8>::new()),
748            &mut out,
749            1024,
750            &FastCoverOptions::default(),
751            FinalizeOptions::default(),
752        )
753        .expect_err("empty source must be rejected");
754        assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
755    }
756
757    #[test]
758    fn create_raw_dict_from_source_early_returns_on_zero_dict_size() {
759        let sample = training_data();
760        let mut out = Vec::new();
761        create_raw_dict_from_source(Cursor::new(sample.as_slice()), sample.len(), &mut out, 0)
762            .expect("zero dict size should no-op");
763        assert!(out.is_empty());
764    }
765
766    #[test]
767    fn create_raw_dict_from_source_treats_source_size_as_hint() {
768        let sample = training_data();
769        let mut out = Vec::new();
770        create_raw_dict_from_source(Cursor::new(sample.as_slice()), 0, &mut out, 1024)
771            .expect("raw dictionary training should succeed");
772        assert!(!out.is_empty());
773    }
774
775    #[test]
776    fn create_raw_dict_from_source_handles_tiny_source_without_epochs() {
777        let sample = b"short";
778        let mut out = Vec::new();
779        create_raw_dict_from_source(Cursor::new(sample.as_slice()), sample.len(), &mut out, 3)
780            .expect("tiny source path should succeed");
781        assert_eq!(out, b"ort");
782    }
783
784    #[test]
785    fn create_raw_dict_from_source_propagates_read_error() {
786        struct FailingReader;
787        impl io::Read for FailingReader {
788            fn read(&mut self, _buf: &mut [u8]) -> io::Result<usize> {
789                Err(io::Error::other("read failed"))
790            }
791        }
792
793        let mut out = Vec::new();
794        let err = create_raw_dict_from_source(FailingReader, 1024, &mut out, 1024)
795            .expect_err("read failures must be returned");
796        assert_eq!(err.kind(), io::ErrorKind::Other);
797        assert_eq!(err.to_string(), "read failed");
798    }
799
800    #[test]
801    fn create_raw_dict_from_source_propagates_write_error() {
802        struct FailingWriter;
803        impl io::Write for FailingWriter {
804            fn write(&mut self, _buf: &[u8]) -> io::Result<usize> {
805                Err(io::Error::other("write failed"))
806            }
807            fn flush(&mut self) -> io::Result<()> {
808                Ok(())
809            }
810        }
811
812        let sample = b"short";
813        let mut out = FailingWriter;
814        let err =
815            create_raw_dict_from_source(Cursor::new(sample.as_slice()), sample.len(), &mut out, 3)
816                .expect_err("write failures must be returned");
817        assert_eq!(err.kind(), io::ErrorKind::Other);
818        assert_eq!(err.to_string(), "write failed");
819    }
820
821    #[test]
822    fn create_raw_dict_from_source_never_exceeds_requested_size() {
823        let dict_size = 4096usize;
824        let source: Vec<u8> = core::iter::repeat_n(b'a', 320_001).collect();
825        let mut out = Vec::new();
826        create_raw_dict_from_source(
827            Cursor::new(source.as_slice()),
828            source.len(),
829            &mut out,
830            dict_size,
831        )
832        .expect("raw dictionary training should succeed");
833        assert!(
834            out.len() <= dict_size,
835            "raw dictionary exceeded requested size: {} > {}",
836            out.len(),
837            dict_size
838        );
839    }
840
841    #[test]
842    fn train_fastcover_raw_from_slice_rejects_empty_sample() {
843        let err = train_fastcover_raw_from_slice(&[], 1024, &FastCoverOptions::default())
844            .expect_err("empty sample must be rejected");
845        assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
846    }
847
848    #[test]
849    fn train_fastcover_raw_from_slice_supports_non_optimized_params() {
850        let sample = training_data();
851        let options = FastCoverOptions {
852            optimize: false,
853            k: 128,
854            d: 6,
855            f: 18,
856            ..FastCoverOptions::default()
857        };
858        let (dict, tuned) =
859            train_fastcover_raw_from_slice(sample.as_slice(), 2048, &options).expect("must train");
860        assert!(!dict.is_empty());
861        assert!(dict.len() <= 2048);
862        assert_eq!(tuned.k, 128);
863        assert_eq!(tuned.d, 6);
864        assert_eq!(tuned.f, 18);
865        assert_eq!(tuned.score, 0);
866    }
867
868    #[test]
869    fn train_fastcover_raw_from_slice_rejects_tiny_sample_with_empty_dict() {
870        let sample = b"tiny";
871        let err = train_fastcover_raw_from_slice(sample, 1024, &FastCoverOptions::default())
872            .expect_err("tiny sample should not produce an empty dictionary successfully");
873        assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
874        assert_eq!(
875            err.to_string(),
876            "training sample is too small for FastCOVER"
877        );
878    }
879
880    #[test]
881    fn train_fastcover_raw_from_slice_normalizes_non_optimized_params() {
882        let sample = training_data();
883        let options = FastCoverOptions {
884            optimize: false,
885            k: 8,
886            d: 64,
887            f: 42,
888            ..FastCoverOptions::default()
889        };
890        let (_, tuned) =
891            train_fastcover_raw_from_slice(sample.as_slice(), 2048, &options).expect("must train");
892        assert_eq!(tuned.k, 32);
893        assert_eq!(tuned.d, 32);
894        assert_eq!(tuned.f, 20);
895    }
896
897    #[test]
898    fn finalize_raw_dict_rejects_empty_raw_content() {
899        let sample = training_data();
900        let err = finalize_raw_dict(&[], sample.as_slice(), 4096, FinalizeOptions::default())
901            .expect_err("empty raw dictionary must be rejected");
902        assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
903    }
904
905    #[test]
906    fn finalize_raw_dict_rejects_too_small_budget() {
907        let sample = training_data();
908        let raw = b"some-raw-bytes";
909        let err = finalize_raw_dict(raw, sample.as_slice(), 32, FinalizeOptions::default())
910            .expect_err("tiny dict_size must fail");
911        assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
912        assert!(err.to_string().contains("dictionary size too small"));
913    }
914
915    #[test]
916    fn finalize_raw_dict_pads_to_minimum_content_size() {
917        let sample = training_data();
918        let raw = b"x";
919        let finalized = finalize_raw_dict(raw, sample.as_slice(), 4096, FinalizeOptions::default())
920            .expect("finalize should pad small raw content");
921        let parsed = Dictionary::decode_dict(finalized.as_slice()).expect("finalized dict parses");
922        assert!(parsed.dict_content.len() >= 8);
923        assert_eq!(parsed.dict_content.last(), Some(&b'x'));
924    }
925
926    #[test]
927    fn finalize_raw_dict_rejects_zero_dict_id() {
928        let sample = training_data();
929        let raw = b"raw-fastcover-bytes";
930        let err = finalize_raw_dict(
931            raw,
932            sample.as_slice(),
933            4096,
934            FinalizeOptions { dict_id: Some(0) },
935        )
936        .expect_err("dict_id=0 must be rejected");
937        assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
938        assert_eq!(err.to_string(), "dictionary id must be non-zero");
939    }
940}