1mod cover;
26mod fastcover;
27mod frequency;
28mod reservoir;
29
30use crate::bit_io::BitWriter;
31use crate::blocks::sequence_section::{
32 MAX_LITERAL_LENGTH_CODE, MAX_MATCH_LENGTH_CODE, MAX_OFFSET_CODE,
33};
34use crate::decoding::dictionary::MAGIC_NUM as DICT_MAGIC_NUM;
35use crate::decoding::sequence_section_decoder::{LL_MAX_LOG, ML_MAX_LOG, OF_MAX_LOG};
36use crate::dictionary::reservoir::create_sample;
37use crate::fse::fse_encoder::{self, build_table_from_symbol_counts};
38use crate::huff0::HuffmanTable as HuffmanDecoderTable;
39use crate::huff0::huff0_encoder::{HuffmanEncoder, HuffmanTable as HuffmanEncoderTable};
40use core::cmp::Reverse;
41use cover::*;
42pub use fastcover::{
43 DEFAULT_D_CANDIDATES, DEFAULT_F_CANDIDATES, DEFAULT_K_CANDIDATES, FastCoverParams,
44 FastCoverTuned,
45};
46use std::{
47 boxed::Box,
48 collections::{BinaryHeap, HashMap},
49 format,
50 fs::{self, File},
51 io::{self, Read},
52 path::{Path, PathBuf},
53 vec,
58 vec::Vec,
59};
60
61const MAX_TRAINING_PREALLOC_BYTES: usize = 8 * 1024 * 1024;
62const MAX_HUFFMAN_STATS_BYTES: usize = 64 * 1024;
63
64#[derive(Debug, Clone)]
66pub struct FastCoverOptions {
67 pub optimize: bool,
68 pub split_point: f64,
69 pub accel: usize,
70 pub k: usize,
71 pub d: usize,
72 pub f: u32,
73 pub k_candidates: Vec<usize>,
74 pub d_candidates: Vec<usize>,
75 pub f_candidates: Vec<u32>,
76}
77
78impl Default for FastCoverOptions {
79 fn default() -> Self {
80 Self {
81 optimize: true,
82 split_point: 0.75,
83 accel: 1,
84 k: 256,
85 d: 8,
86 f: 20,
87 k_candidates: DEFAULT_K_CANDIDATES.to_vec(),
88 d_candidates: DEFAULT_D_CANDIDATES.to_vec(),
89 f_candidates: DEFAULT_F_CANDIDATES.to_vec(),
90 }
91 }
92}
93
94#[derive(Debug, Clone, Copy, Default)]
95pub struct FinalizeOptions {
96 pub dict_id: Option<u32>,
97}
98
99pub(super) struct DictParams {
104 pub segment_size: u32,
114}
115
116pub fn create_raw_dict_from_dir<P: AsRef<Path>, W: io::Write>(
136 path: P,
137 output: &mut W,
138 dict_size: usize,
139) -> Result<(), io::Error> {
140 let mut file_paths: Vec<PathBuf> = Vec::new();
142 let dir: fs::ReadDir = fs::read_dir(path)?;
143 fn recurse_read(dir: fs::ReadDir, file_paths: &mut Vec<PathBuf>) -> Result<(), io::Error> {
144 for entry in dir {
145 let entry = entry?;
146 if entry.file_type()?.is_dir() {
147 recurse_read(fs::read_dir(entry.path())?, file_paths)?;
148 } else {
149 file_paths.push(entry.path());
150 }
151 }
152 Ok(())
153 }
154 recurse_read(dir, &mut file_paths)?;
155
156 let mut total_file_len: u64 = 0;
158 let mut file_handles: Vec<fs::File> = Vec::new();
159 for path in file_paths {
160 let handle = File::open(path)?;
161 total_file_len += handle.metadata()?.len();
162 file_handles.push(handle);
163 }
164 let empty_reader: Box<dyn Read> = Box::new(io::empty());
165 let chained_files = file_handles
166 .iter()
167 .fold(empty_reader, |acc, reader| Box::new(acc.chain(reader)));
168
169 create_raw_dict_from_source(chained_files, total_file_len as usize, output, dict_size)?;
171 Ok(())
172}
173
174pub fn create_raw_dict_from_source<R: io::Read, W: io::Write>(
191 mut source: R,
192 source_size: usize,
193 output: &mut W,
194 dict_size: usize,
195) -> io::Result<()> {
196 if dict_size == 0 {
197 return Ok(());
198 }
199 let prealloc = source_size.min(MAX_TRAINING_PREALLOC_BYTES);
200 let mut all = Vec::with_capacity(prealloc);
201 source.read_to_end(&mut all)?;
202 if all.is_empty() {
203 return Ok(());
204 }
205
206 if all.len() < K {
207 let keep = usize::min(all.len(), dict_size);
208 output.write_all(&all[all.len() - keep..])?;
209 return Ok(());
210 }
211
212 let source_size = all.len();
213 vprintln!("create_dict: creating {dict_size} byte dict from {source_size} byte source");
214
215 let params = DictParams { segment_size: 2048 };
216 let num_segments = usize::max(1, source_size / params.segment_size as usize);
217 let denom = usize::max(1, source_size / (2 * num_segments));
221 let sample_scale = usize::max(1, usize::min(denom, 256));
222 let mut sample_size = source_size / sample_scale;
223 sample_size = usize::max(sample_size, usize::min(source_size, 16));
224 vprintln!("create_dict: creating {sample_size} byte sample of collection");
225 let mut sample_reader = all.as_slice();
226 let collection_sample = create_sample(&mut sample_reader, sample_size);
227
228 let mut pool: BinaryHeap<Reverse<Segment>> = BinaryHeap::new();
234 let (num_epochs, epoch_size_kmers) = compute_epoch_info(¶ms, dict_size, source_size / K);
235 let epoch_size = usize::max(K, epoch_size_kmers.saturating_mul(K));
236 vprintln!("create_dict: computed epoch info, using {num_epochs} epochs of {epoch_size} bytes");
237 let mut epoch_counter = 0;
238 let mut ctx = Context {
239 frequencies: HashMap::with_capacity(epoch_size / K),
240 };
241 for epoch_idx in 0..num_epochs {
245 let start = epoch_idx.saturating_mul(epoch_size);
246 if start >= all.len() {
247 break;
248 }
249 let end = if epoch_idx + 1 == num_epochs {
250 all.len()
251 } else {
252 usize::min(start.saturating_add(epoch_size), all.len())
253 };
254 let epoch = &all[start..end];
255 epoch_counter += 1;
256 let best_segment = pick_best_segment(¶ms, &mut ctx, epoch, &collection_sample);
257 vprintln!(
258 "\tcreate_dict: epoch {epoch_counter}/{num_epochs} has best segment score {}",
259 best_segment.score
260 );
261 pool.push(Reverse(best_segment));
262 ctx.frequencies.clear();
264 }
265 vprintln!(
266 "create_dict: {epoch_counter} epochs written, writing {} segments",
267 pool.len()
268 );
269 while let Some(segment) = pool.pop() {
272 output.write_all(&segment.0.raw)?;
273 }
274 Ok(())
275}
276
277fn serialize_huffman_table(sample_data: &[u8], raw_content: &[u8]) -> io::Result<Vec<u8>> {
278 fn bounded_huffman_stats(data: &[u8]) -> Vec<u8> {
279 if data.len() <= MAX_HUFFMAN_STATS_BYTES {
280 return data.to_vec();
281 }
282
283 let mut stats = Vec::with_capacity(MAX_HUFFMAN_STATS_BYTES);
284 for i in 0..MAX_HUFFMAN_STATS_BYTES {
285 let idx = i * data.len() / MAX_HUFFMAN_STATS_BYTES;
286 stats.push(data[idx]);
287 }
288 stats
289 }
290
291 let source = if sample_data.len() >= 2 {
292 sample_data
293 } else {
294 raw_content
295 };
296 let mut stats = bounded_huffman_stats(source);
297 if stats.len() < 2 || stats.iter().all(|b| *b == stats[0]) {
298 stats = (0u8..=255).collect();
299 }
300
301 let table = HuffmanEncoderTable::build_from_data(stats.as_slice());
302 let mut writer = BitWriter::new();
303 let mut encoder = HuffmanEncoder::new(&table, &mut writer);
304 encoder.encode(&[stats[0]], true);
305 let encoded = writer.dump();
306
307 let mut decoder = HuffmanDecoderTable::new();
308 let table_size = decoder
309 .build_decoder(encoded.as_slice())
310 .map_err(|e| io::Error::other(format!("failed to decode generated huffman table: {e}")))?;
311 Ok(encoded[..table_size as usize].to_vec())
312}
313
314fn serialize_fse_table(table: &fse_encoder::FSETable) -> Vec<u8> {
315 let mut writer = BitWriter::new();
316 table.write_table(&mut writer);
317 writer.dump()
318}
319
320fn bounded_fse_symbols(data: &[u8], max_symbol: u8) -> Vec<u8> {
321 let modulo = u16::from(max_symbol) + 1;
322 if data.is_empty() {
323 return Vec::from([0u8]);
324 }
325 if data.len() <= MAX_HUFFMAN_STATS_BYTES {
326 return data
327 .iter()
328 .map(|b| (u16::from(*b) % modulo) as u8)
329 .collect();
330 }
331
332 let mut out = Vec::with_capacity(MAX_HUFFMAN_STATS_BYTES);
333 for i in 0..MAX_HUFFMAN_STATS_BYTES {
334 let idx = i * data.len() / MAX_HUFFMAN_STATS_BYTES;
335 out.push((u16::from(data[idx]) % modulo) as u8);
336 }
337 out
338}
339
340fn serialize_fse_table_from_corpus(
341 sample_data: &[u8],
342 raw_content: &[u8],
343 max_symbol: u8,
344 max_log: u8,
345) -> io::Result<Vec<u8>> {
346 fn counts_total_for_source(source: &[u8], max_symbol: u8, counts: &mut [usize]) -> usize {
347 counts.fill(0);
348 for symbol in bounded_fse_symbols(source, max_symbol) {
349 counts[usize::from(symbol)] += 1;
350 }
351 counts.iter().sum::<usize>()
352 }
353
354 let mut counts = vec![0usize; usize::from(max_symbol) + 1];
355 let using_sample = !sample_data.is_empty();
356 let mut total = counts_total_for_source(
357 if using_sample {
358 sample_data
359 } else {
360 raw_content
361 },
362 max_symbol,
363 &mut counts,
364 );
365 if total <= 1 && using_sample && !raw_content.is_empty() {
366 total = counts_total_for_source(raw_content, max_symbol, &mut counts);
367 }
368 if total <= 1 {
369 return Err(io::Error::new(
370 io::ErrorKind::InvalidInput,
371 "insufficient symbol statistics for FSE table",
372 ));
373 }
374 let table = build_table_from_symbol_counts(&counts, max_log, false);
375 Ok(serialize_fse_table(&table))
376}
377
378fn finalized_content_budget(
379 sample_data: &[u8],
380 raw_fallback: &[u8],
381 dict_size: usize,
382) -> io::Result<usize> {
383 let min_content_size = 8usize;
384 let huf_len = serialize_huffman_table(sample_data, raw_fallback)?.len();
385 let of_len =
386 serialize_fse_table_from_corpus(sample_data, raw_fallback, MAX_OFFSET_CODE, OF_MAX_LOG)?
387 .len();
388 let ml_len = serialize_fse_table_from_corpus(
389 sample_data,
390 raw_fallback,
391 MAX_MATCH_LENGTH_CODE,
392 ML_MAX_LOG,
393 )?
394 .len();
395 let ll_len = serialize_fse_table_from_corpus(
396 sample_data,
397 raw_fallback,
398 MAX_LITERAL_LENGTH_CODE,
399 LL_MAX_LOG,
400 )?
401 .len();
402
403 let header_len = DICT_MAGIC_NUM.len() + 4 + huf_len + of_len + ml_len + ll_len + 12;
404 let max_content_budget = dict_size.saturating_sub(header_len);
405 if max_content_budget < min_content_size {
406 return Err(io::Error::new(
407 io::ErrorKind::InvalidInput,
408 "dictionary size too small to fit header and offset history",
409 ));
410 }
411 Ok(max_content_budget)
412}
413
414fn derive_dict_id(raw_content: &[u8]) -> u32 {
415 let mut h = 0xcbf29ce484222325u64;
416 for &b in raw_content {
417 h ^= u64::from(b);
418 h = h.wrapping_mul(0x100000001b3);
419 }
420 let compliant = (h % ((1u64 << 31) - 32768)) + 32768;
421 compliant as u32
422}
423
424pub fn finalize_raw_dict(
427 raw_content: &[u8],
428 sample_data: &[u8],
429 dict_size: usize,
430 options: FinalizeOptions,
431) -> io::Result<Vec<u8>> {
432 if raw_content.is_empty() {
433 return Err(io::Error::new(
434 io::ErrorKind::InvalidInput,
435 "raw dictionary content must not be empty",
436 ));
437 }
438 let mut out = Vec::with_capacity(dict_size.max(256));
439 out.extend_from_slice(&DICT_MAGIC_NUM);
440 let dict_id = options
441 .dict_id
442 .unwrap_or_else(|| derive_dict_id(raw_content));
443 if dict_id == 0 {
444 return Err(io::Error::new(
445 io::ErrorKind::InvalidInput,
446 "dictionary id must be non-zero",
447 ));
448 }
449 out.extend_from_slice(&dict_id.to_le_bytes());
450 out.extend_from_slice(serialize_huffman_table(sample_data, raw_content)?.as_slice());
451 out.extend_from_slice(
452 serialize_fse_table_from_corpus(sample_data, raw_content, MAX_OFFSET_CODE, OF_MAX_LOG)?
453 .as_slice(),
454 );
455 out.extend_from_slice(
456 serialize_fse_table_from_corpus(
457 sample_data,
458 raw_content,
459 MAX_MATCH_LENGTH_CODE,
460 ML_MAX_LOG,
461 )?
462 .as_slice(),
463 );
464 out.extend_from_slice(
465 serialize_fse_table_from_corpus(
466 sample_data,
467 raw_content,
468 MAX_LITERAL_LENGTH_CODE,
469 LL_MAX_LOG,
470 )?
471 .as_slice(),
472 );
473
474 out.extend_from_slice(&1u32.to_le_bytes());
476 out.extend_from_slice(&4u32.to_le_bytes());
477 out.extend_from_slice(&8u32.to_le_bytes());
478
479 let min_content_size = 8usize;
480 let max_content_budget = dict_size.saturating_sub(out.len());
481 if max_content_budget < min_content_size {
482 return Err(io::Error::new(
483 io::ErrorKind::InvalidInput,
484 "dictionary size too small to fit header and offset history",
485 ));
486 }
487
488 let content = if raw_content.len() > max_content_budget {
489 &raw_content[raw_content.len() - max_content_budget..]
490 } else {
491 raw_content
492 };
493 if content.len() < min_content_size {
494 out.resize(out.len() + (min_content_size - content.len()), 0);
495 }
496 out.extend_from_slice(content);
497 Ok(out)
498}
499
500fn train_fastcover_internal(
502 sample: &[u8],
503 dict_size: usize,
504 options: &FastCoverOptions,
505) -> (Vec<u8>, FastCoverTuned) {
506 if options.optimize {
507 fastcover::optimize_fastcover_raw(
508 sample,
509 dict_size,
510 options.split_point,
511 options.accel,
512 options.d_candidates.as_slice(),
513 options.f_candidates.as_slice(),
514 options.k_candidates.as_slice(),
515 )
516 } else {
517 let params = fastcover::normalize_fastcover_params(FastCoverParams {
518 k: options.k,
519 d: options.d,
520 f: options.f,
521 accel: options.accel,
522 });
523 (
524 fastcover::train_fastcover_raw(sample, dict_size, params),
525 FastCoverTuned {
526 k: params.k,
527 d: params.d,
528 f: params.f,
529 accel: params.accel,
530 score: 0,
531 },
532 )
533 }
534}
535
536pub fn train_fastcover_raw_from_slice(
538 sample: &[u8],
539 dict_size: usize,
540 options: &FastCoverOptions,
541) -> io::Result<(Vec<u8>, FastCoverTuned)> {
542 if sample.is_empty() {
543 return Err(io::Error::new(
544 io::ErrorKind::InvalidInput,
545 "source stream is empty",
546 ));
547 }
548 let (dict, tuned) = train_fastcover_internal(sample, dict_size, options);
549 if dict.is_empty() && dict_size > 0 {
550 return Err(io::Error::new(
551 io::ErrorKind::InvalidInput,
552 "training sample is too small for FastCOVER",
553 ));
554 }
555 Ok((dict, tuned))
556}
557
558pub fn create_fastcover_raw_dict_from_source<R: io::Read, W: io::Write>(
563 mut source: R,
564 output: &mut W,
565 dict_size: usize,
566 options: &FastCoverOptions,
567) -> io::Result<FastCoverTuned> {
568 let mut sample = Vec::new();
569 source.read_to_end(&mut sample)?;
570 let (dict, tuned) = train_fastcover_raw_from_slice(sample.as_slice(), dict_size, options)?;
571 output.write_all(dict.as_slice())?;
572 Ok(tuned)
573}
574
575pub fn create_fastcover_dict_from_source<R: io::Read, W: io::Write>(
580 mut source: R,
581 output: &mut W,
582 dict_size: usize,
583 fastcover: &FastCoverOptions,
584 finalize: FinalizeOptions,
585) -> io::Result<FastCoverTuned> {
586 let mut sample = Vec::new();
587 source.read_to_end(&mut sample)?;
588 if sample.is_empty() {
589 return Err(io::Error::new(
590 io::ErrorKind::InvalidInput,
591 "source stream is empty",
592 ));
593 }
594 let content_budget = finalized_content_budget(sample.as_slice(), sample.as_slice(), dict_size)?;
595 let (raw_dict, tuned) =
596 train_fastcover_raw_from_slice(sample.as_slice(), content_budget, fastcover)?;
597
598 let finalized = finalize_raw_dict(raw_dict.as_slice(), sample.as_slice(), dict_size, finalize)?;
599 output.write_all(finalized.as_slice())?;
600 Ok(tuned)
601}
602
603#[cfg(test)]
604mod tests {
605 use super::*;
606 use crate::decoding::Dictionary;
607 use crate::encoding::{CompressionLevel, FrameCompressor};
608 use std::io::Cursor;
609 use std::string::ToString;
610
611 fn training_data() -> Vec<u8> {
612 let mut data = Vec::new();
613 for i in 0..512u32 {
614 data.extend_from_slice(
615 format!(
616 "tenant=demo table=orders key={i} region=eu payload=aaaaabbbbbcccccdddddeeeee\n"
617 )
618 .as_bytes(),
619 );
620 }
621 data
622 }
623
624 #[test]
625 fn finalize_raw_dict_roundtrips_with_ffi_decoder() {
626 let sample = training_data();
627 let dict_size = 4096usize;
628 let content_budget =
629 finalized_content_budget(sample.as_slice(), sample.as_slice(), dict_size)
630 .expect("content budget should be computable");
631 let raw = fastcover::train_fastcover_raw(
632 sample.as_slice(),
633 content_budget,
634 FastCoverParams {
635 k: 256,
636 d: 8,
637 f: 20,
638 accel: 1,
639 },
640 );
641 let finalized = finalize_raw_dict(
642 raw.as_slice(),
643 sample.as_slice(),
644 dict_size,
645 FinalizeOptions::default(),
646 )
647 .expect("finalization should succeed");
648 let parsed = Dictionary::decode_dict(finalized.as_slice())
649 .expect("finalized dictionary should parse");
650 assert!(!parsed.dict_content.is_empty());
651
652 let mut payload = Vec::new();
653 for idx in 0..96u32 {
654 payload.extend_from_slice(
655 format!("tenant=demo op=put key={idx} value=aaaaabbbbbcccccdddddeeeee\n")
656 .as_bytes(),
657 );
658 }
659
660 let mut compressed = Vec::new();
661 let mut compressor = FrameCompressor::new(CompressionLevel::Fastest);
662 compressor
663 .set_dictionary(parsed)
664 .expect("dictionary should attach");
665 compressor.set_source(payload.as_slice());
666 compressor.set_drain(&mut compressed);
667 compressor.compress();
668
669 let mut ffi_decoder = zstd::bulk::Decompressor::with_dictionary(finalized.as_slice())
670 .expect("ffi decoder should accept finalized dictionary");
671 let mut decoded = Vec::with_capacity(payload.len());
672 let written = ffi_decoder
673 .decompress_to_buffer(compressed.as_slice(), &mut decoded)
674 .expect("ffi decoder should decode payload");
675 assert_eq!(written, payload.len());
676 assert_eq!(decoded, payload);
677 }
678
679 #[test]
680 fn create_fastcover_dict_from_source_writes_non_empty_output() {
681 let sample = training_data();
682 let mut out = Vec::new();
683 let tuned = create_fastcover_dict_from_source(
684 Cursor::new(sample.as_slice()),
685 &mut out,
686 4096,
687 &FastCoverOptions::default(),
688 FinalizeOptions::default(),
689 )
690 .expect("fastcover+finalize should succeed");
691 assert!(!out.is_empty());
692 assert!(tuned.k > 0);
693 assert!(tuned.d > 0);
694 }
695
696 #[test]
697 fn create_fastcover_raw_dict_from_source_rejects_empty_source() {
698 let mut out = Vec::new();
699 let err = create_fastcover_raw_dict_from_source(
700 Cursor::new(Vec::<u8>::new()),
701 &mut out,
702 1024,
703 &FastCoverOptions::default(),
704 )
705 .expect_err("empty source must be rejected");
706 assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
707 }
708
709 #[test]
710 fn create_fastcover_dict_from_source_propagates_finalize_error() {
711 let sample = training_data();
712 let mut out = Vec::new();
713 let err = create_fastcover_dict_from_source(
714 Cursor::new(sample.as_slice()),
715 &mut out,
716 32,
717 &FastCoverOptions::default(),
718 FinalizeOptions::default(),
719 )
720 .expect_err("too-small dictionary budget must fail during finalize");
721 assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
722 assert!(err.to_string().contains("dictionary size too small"));
723 }
724
725 #[test]
726 fn create_fastcover_dict_from_source_rejects_empty_source() {
727 let mut out = Vec::new();
728 let err = create_fastcover_dict_from_source(
729 Cursor::new(Vec::<u8>::new()),
730 &mut out,
731 1024,
732 &FastCoverOptions::default(),
733 FinalizeOptions::default(),
734 )
735 .expect_err("empty source must be rejected");
736 assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
737 }
738
739 #[test]
740 fn create_raw_dict_from_source_early_returns_on_zero_dict_size() {
741 let sample = training_data();
742 let mut out = Vec::new();
743 create_raw_dict_from_source(Cursor::new(sample.as_slice()), sample.len(), &mut out, 0)
744 .expect("zero dict size should no-op");
745 assert!(out.is_empty());
746 }
747
748 #[test]
749 fn create_raw_dict_from_source_treats_source_size_as_hint() {
750 let sample = training_data();
751 let mut out = Vec::new();
752 create_raw_dict_from_source(Cursor::new(sample.as_slice()), 0, &mut out, 1024)
753 .expect("raw dictionary training should succeed");
754 assert!(!out.is_empty());
755 }
756
757 #[test]
758 fn create_raw_dict_from_source_handles_tiny_source_without_epochs() {
759 let sample = b"short";
760 let mut out = Vec::new();
761 create_raw_dict_from_source(Cursor::new(sample.as_slice()), sample.len(), &mut out, 3)
762 .expect("tiny source path should succeed");
763 assert_eq!(out, b"ort");
764 }
765
766 #[test]
767 fn create_raw_dict_from_source_propagates_read_error() {
768 struct FailingReader;
769 impl io::Read for FailingReader {
770 fn read(&mut self, _buf: &mut [u8]) -> io::Result<usize> {
771 Err(io::Error::other("read failed"))
772 }
773 }
774
775 let mut out = Vec::new();
776 let err = create_raw_dict_from_source(FailingReader, 1024, &mut out, 1024)
777 .expect_err("read failures must be returned");
778 assert_eq!(err.kind(), io::ErrorKind::Other);
779 assert_eq!(err.to_string(), "read failed");
780 }
781
782 #[test]
783 fn create_raw_dict_from_source_propagates_write_error() {
784 struct FailingWriter;
785 impl io::Write for FailingWriter {
786 fn write(&mut self, _buf: &[u8]) -> io::Result<usize> {
787 Err(io::Error::other("write failed"))
788 }
789 fn flush(&mut self) -> io::Result<()> {
790 Ok(())
791 }
792 }
793
794 let sample = b"short";
795 let mut out = FailingWriter;
796 let err =
797 create_raw_dict_from_source(Cursor::new(sample.as_slice()), sample.len(), &mut out, 3)
798 .expect_err("write failures must be returned");
799 assert_eq!(err.kind(), io::ErrorKind::Other);
800 assert_eq!(err.to_string(), "write failed");
801 }
802
803 #[test]
804 fn create_raw_dict_from_source_never_exceeds_requested_size() {
805 let dict_size = 4096usize;
806 let source: Vec<u8> = core::iter::repeat_n(b'a', 320_001).collect();
807 let mut out = Vec::new();
808 create_raw_dict_from_source(
809 Cursor::new(source.as_slice()),
810 source.len(),
811 &mut out,
812 dict_size,
813 )
814 .expect("raw dictionary training should succeed");
815 assert!(
816 out.len() <= dict_size,
817 "raw dictionary exceeded requested size: {} > {}",
818 out.len(),
819 dict_size
820 );
821 }
822
823 #[test]
824 fn train_fastcover_raw_from_slice_rejects_empty_sample() {
825 let err = train_fastcover_raw_from_slice(&[], 1024, &FastCoverOptions::default())
826 .expect_err("empty sample must be rejected");
827 assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
828 }
829
830 #[test]
831 fn train_fastcover_raw_from_slice_supports_non_optimized_params() {
832 let sample = training_data();
833 let options = FastCoverOptions {
834 optimize: false,
835 k: 128,
836 d: 6,
837 f: 18,
838 ..FastCoverOptions::default()
839 };
840 let (dict, tuned) =
841 train_fastcover_raw_from_slice(sample.as_slice(), 2048, &options).expect("must train");
842 assert!(!dict.is_empty());
843 assert!(dict.len() <= 2048);
844 assert_eq!(tuned.k, 128);
845 assert_eq!(tuned.d, 6);
846 assert_eq!(tuned.f, 18);
847 assert_eq!(tuned.score, 0);
848 }
849
850 #[test]
851 fn train_fastcover_raw_from_slice_rejects_tiny_sample_with_empty_dict() {
852 let sample = b"tiny";
853 let err = train_fastcover_raw_from_slice(sample, 1024, &FastCoverOptions::default())
854 .expect_err("tiny sample should not produce an empty dictionary successfully");
855 assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
856 assert_eq!(
857 err.to_string(),
858 "training sample is too small for FastCOVER"
859 );
860 }
861
862 #[test]
863 fn train_fastcover_raw_from_slice_normalizes_non_optimized_params() {
864 let sample = training_data();
865 let options = FastCoverOptions {
866 optimize: false,
867 k: 8,
868 d: 64,
869 f: 42,
870 ..FastCoverOptions::default()
871 };
872 let (_, tuned) =
873 train_fastcover_raw_from_slice(sample.as_slice(), 2048, &options).expect("must train");
874 assert_eq!(tuned.k, 32);
875 assert_eq!(tuned.d, 32);
876 assert_eq!(tuned.f, 20);
877 }
878
879 #[test]
880 fn finalize_raw_dict_rejects_empty_raw_content() {
881 let sample = training_data();
882 let err = finalize_raw_dict(&[], sample.as_slice(), 4096, FinalizeOptions::default())
883 .expect_err("empty raw dictionary must be rejected");
884 assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
885 }
886
887 #[test]
888 fn finalize_raw_dict_rejects_too_small_budget() {
889 let sample = training_data();
890 let raw = b"some-raw-bytes";
891 let err = finalize_raw_dict(raw, sample.as_slice(), 32, FinalizeOptions::default())
892 .expect_err("tiny dict_size must fail");
893 assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
894 assert!(err.to_string().contains("dictionary size too small"));
895 }
896
897 #[test]
898 fn finalize_raw_dict_pads_to_minimum_content_size() {
899 let sample = training_data();
900 let raw = b"x";
901 let finalized = finalize_raw_dict(raw, sample.as_slice(), 4096, FinalizeOptions::default())
902 .expect("finalize should pad small raw content");
903 let parsed = Dictionary::decode_dict(finalized.as_slice()).expect("finalized dict parses");
904 assert!(parsed.dict_content.len() >= 8);
905 assert_eq!(parsed.dict_content.last(), Some(&b'x'));
906 }
907
908 #[test]
909 fn finalize_raw_dict_rejects_zero_dict_id() {
910 let sample = training_data();
911 let raw = b"raw-fastcover-bytes";
912 let err = finalize_raw_dict(
913 raw,
914 sample.as_slice(),
915 4096,
916 FinalizeOptions { dict_id: Some(0) },
917 )
918 .expect_err("dict_id=0 must be rejected");
919 assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
920 assert_eq!(err.to_string(), "dictionary id must be non-zero");
921 }
922}