1mod cover;
26mod fastcover;
27mod frequency;
28mod reservoir;
29
30use crate::bit_io::BitWriter;
31use crate::blocks::sequence_section::{
32 MAX_LITERAL_LENGTH_CODE, MAX_MATCH_LENGTH_CODE, MAX_OFFSET_CODE,
33};
34use crate::decoding::dictionary::MAGIC_NUM as DICT_MAGIC_NUM;
35use crate::decoding::sequence_section_decoder::{LL_MAX_LOG, ML_MAX_LOG, OF_MAX_LOG};
36use crate::dictionary::reservoir::create_sample;
37use crate::fse::fse_encoder::{self, build_table_from_symbol_counts};
38use crate::huff0::HuffmanTable as HuffmanDecoderTable;
39use crate::huff0::huff0_encoder::{HuffmanEncoder, HuffmanTable as HuffmanEncoderTable};
40use core::cmp::Reverse;
41use cover::*;
42pub use fastcover::{
43 DEFAULT_D_CANDIDATES, DEFAULT_F_CANDIDATES, DEFAULT_K_CANDIDATES, FastCoverParams,
44 FastCoverTuned,
45};
46use std::{
47 boxed::Box,
48 collections::{BinaryHeap, HashMap},
49 format,
50 fs::{self, File},
51 io::{self, Read},
52 path::{Path, PathBuf},
53 vec,
58 vec::Vec,
59};
60
61const MAX_TRAINING_PREALLOC_BYTES: usize = 8 * 1024 * 1024;
62const MAX_HUFFMAN_STATS_BYTES: usize = 64 * 1024;
63
64#[derive(Debug, Clone)]
66pub struct FastCoverOptions {
67 pub optimize: bool,
68 pub split_point: f64,
69 pub accel: usize,
70 pub k: usize,
71 pub d: usize,
72 pub f: u32,
73 pub k_candidates: Vec<usize>,
74 pub d_candidates: Vec<usize>,
75 pub f_candidates: Vec<u32>,
76}
77
78impl Default for FastCoverOptions {
79 fn default() -> Self {
80 Self {
81 optimize: true,
82 split_point: 0.75,
83 accel: 1,
84 k: 256,
85 d: 8,
86 f: 20,
87 k_candidates: DEFAULT_K_CANDIDATES.to_vec(),
88 d_candidates: DEFAULT_D_CANDIDATES.to_vec(),
89 f_candidates: DEFAULT_F_CANDIDATES.to_vec(),
90 }
91 }
92}
93
94#[derive(Debug, Clone, Copy, Default)]
95pub struct FinalizeOptions {
96 pub dict_id: Option<u32>,
97}
98
99pub(super) struct DictParams {
104 pub segment_size: u32,
114}
115
116pub fn create_raw_dict_from_dir<P: AsRef<Path>, W: io::Write>(
136 path: P,
137 output: &mut W,
138 dict_size: usize,
139) -> Result<(), io::Error> {
140 let mut file_paths: Vec<PathBuf> = Vec::new();
142 let dir: fs::ReadDir = fs::read_dir(path)?;
143 fn recurse_read(dir: fs::ReadDir, file_paths: &mut Vec<PathBuf>) -> Result<(), io::Error> {
144 for entry in dir {
145 let entry = entry?;
146 if entry.file_type()?.is_dir() {
147 recurse_read(fs::read_dir(entry.path())?, file_paths)?;
148 } else {
149 file_paths.push(entry.path());
150 }
151 }
152 Ok(())
153 }
154 recurse_read(dir, &mut file_paths)?;
155
156 let mut total_file_len: u64 = 0;
158 let mut file_handles: Vec<fs::File> = Vec::new();
159 for path in file_paths {
160 let handle = File::open(path)?;
161 total_file_len += handle.metadata()?.len();
162 file_handles.push(handle);
163 }
164 let empty_reader: Box<dyn Read> = Box::new(io::empty());
165 let chained_files = file_handles
166 .iter()
167 .fold(empty_reader, |acc, reader| Box::new(acc.chain(reader)));
168
169 create_raw_dict_from_source(chained_files, total_file_len as usize, output, dict_size)?;
171 Ok(())
172}
173
174pub fn create_raw_dict_from_source<R: io::Read, W: io::Write>(
191 mut source: R,
192 source_size: usize,
193 output: &mut W,
194 dict_size: usize,
195) -> io::Result<()> {
196 if dict_size == 0 {
197 return Ok(());
198 }
199 let prealloc = source_size.min(MAX_TRAINING_PREALLOC_BYTES);
200 let mut all = Vec::with_capacity(prealloc);
201 source.read_to_end(&mut all)?;
202 if all.is_empty() {
203 return Ok(());
204 }
205
206 if all.len() < K {
207 let keep = usize::min(all.len(), dict_size);
208 output.write_all(&all[all.len() - keep..])?;
209 return Ok(());
210 }
211
212 let source_size = all.len();
213 vprintln!("create_dict: creating {dict_size} byte dict from {source_size} byte source");
214
215 let params = DictParams { segment_size: 2048 };
216 let num_segments = usize::max(1, source_size / params.segment_size as usize);
217 let denom = usize::max(1, source_size / (2 * num_segments));
221 let sample_scale = usize::max(1, usize::min(denom, 256));
222 let mut sample_size = source_size / sample_scale;
223 sample_size = usize::max(sample_size, usize::min(source_size, 16));
224 vprintln!("create_dict: creating {sample_size} byte sample of collection");
225 let mut sample_reader = all.as_slice();
226 let collection_sample = create_sample(&mut sample_reader, sample_size);
227
228 let mut pool: BinaryHeap<Reverse<Segment>> = BinaryHeap::new();
234 let (num_epochs, epoch_size_kmers) = compute_epoch_info(¶ms, dict_size, source_size / K);
235 let epoch_size = usize::max(K, epoch_size_kmers * K);
240 vprintln!("create_dict: computed epoch info, using {num_epochs} epochs of {epoch_size} bytes");
241 let mut epoch_counter = 0;
242 let mut ctx = Context {
243 frequencies: HashMap::with_capacity(epoch_size / K),
244 };
245 for epoch_idx in 0..num_epochs {
249 let start = epoch_idx * epoch_size;
250 if start >= all.len() {
251 break;
252 }
253 let end = if epoch_idx + 1 == num_epochs {
254 all.len()
255 } else {
256 usize::min(start + epoch_size, all.len())
257 };
258 let epoch = &all[start..end];
259 epoch_counter += 1;
260 let best_segment = pick_best_segment(¶ms, &mut ctx, epoch, &collection_sample);
261 vprintln!(
262 "\tcreate_dict: epoch {epoch_counter}/{num_epochs} has best segment score {}",
263 best_segment.score
264 );
265 pool.push(Reverse(best_segment));
266 ctx.frequencies.clear();
268 }
269 vprintln!(
270 "create_dict: {epoch_counter} epochs written, writing {} segments",
271 pool.len()
272 );
273 while let Some(segment) = pool.pop() {
276 output.write_all(&segment.0.raw)?;
277 }
278 Ok(())
279}
280
281fn serialize_huffman_table(sample_data: &[u8], raw_content: &[u8]) -> io::Result<Vec<u8>> {
282 fn bounded_huffman_stats(data: &[u8]) -> Vec<u8> {
283 if data.len() <= MAX_HUFFMAN_STATS_BYTES {
284 return data.to_vec();
285 }
286
287 let mut stats = Vec::with_capacity(MAX_HUFFMAN_STATS_BYTES);
288 for i in 0..MAX_HUFFMAN_STATS_BYTES {
289 let idx = i * data.len() / MAX_HUFFMAN_STATS_BYTES;
290 stats.push(data[idx]);
291 }
292 stats
293 }
294
295 let source = if sample_data.len() >= 2 {
296 sample_data
297 } else {
298 raw_content
299 };
300 let mut stats = bounded_huffman_stats(source);
301 if stats.len() < 2 || stats.iter().all(|b| *b == stats[0]) {
302 stats = (0u8..=255).collect();
303 }
304
305 let table = HuffmanEncoderTable::build_from_data(stats.as_slice());
306 let mut writer = BitWriter::new();
307 let mut encoder = HuffmanEncoder::new(&table, &mut writer);
308 encoder.encode(&[stats[0]], true);
309 let encoded = writer.dump();
310
311 let mut decoder = HuffmanDecoderTable::new();
312 let table_size = decoder
313 .build_decoder(encoded.as_slice())
314 .map_err(|e| io::Error::other(format!("failed to decode generated huffman table: {e}")))?;
315 Ok(encoded[..table_size as usize].to_vec())
316}
317
318fn serialize_fse_table(table: &fse_encoder::FSETable) -> Vec<u8> {
319 let mut writer = BitWriter::new();
320 table.write_table(&mut writer);
321 writer.dump()
322}
323
324fn bounded_fse_symbols(data: &[u8], max_symbol: u8) -> Vec<u8> {
325 let modulo = u16::from(max_symbol) + 1;
326 if data.is_empty() {
327 return Vec::from([0u8]);
328 }
329 if data.len() <= MAX_HUFFMAN_STATS_BYTES {
330 return data
331 .iter()
332 .map(|b| (u16::from(*b) % modulo) as u8)
333 .collect();
334 }
335
336 let mut out = Vec::with_capacity(MAX_HUFFMAN_STATS_BYTES);
337 for i in 0..MAX_HUFFMAN_STATS_BYTES {
338 let idx = i * data.len() / MAX_HUFFMAN_STATS_BYTES;
339 out.push((u16::from(data[idx]) % modulo) as u8);
340 }
341 out
342}
343
344fn serialize_fse_table_from_corpus(
345 sample_data: &[u8],
346 raw_content: &[u8],
347 max_symbol: u8,
348 max_log: u8,
349) -> io::Result<Vec<u8>> {
350 fn counts_total_for_source(source: &[u8], max_symbol: u8, counts: &mut [usize]) -> usize {
351 counts.fill(0);
352 for symbol in bounded_fse_symbols(source, max_symbol) {
353 counts[usize::from(symbol)] += 1;
354 }
355 counts.iter().sum::<usize>()
356 }
357
358 let mut counts = vec![0usize; usize::from(max_symbol) + 1];
359 let using_sample = !sample_data.is_empty();
360 let mut total = counts_total_for_source(
361 if using_sample {
362 sample_data
363 } else {
364 raw_content
365 },
366 max_symbol,
367 &mut counts,
368 );
369 if total <= 1 && using_sample && !raw_content.is_empty() {
370 total = counts_total_for_source(raw_content, max_symbol, &mut counts);
371 }
372 if total <= 1 {
373 return Err(io::Error::new(
374 io::ErrorKind::InvalidInput,
375 "insufficient symbol statistics for FSE table",
376 ));
377 }
378 let table = build_table_from_symbol_counts(&counts, max_log, false);
379 Ok(serialize_fse_table(&table))
380}
381
382fn finalized_content_budget(
383 sample_data: &[u8],
384 raw_fallback: &[u8],
385 dict_size: usize,
386) -> io::Result<usize> {
387 let min_content_size = 8usize;
388 let huf_len = serialize_huffman_table(sample_data, raw_fallback)?.len();
389 let of_len =
390 serialize_fse_table_from_corpus(sample_data, raw_fallback, MAX_OFFSET_CODE, OF_MAX_LOG)?
391 .len();
392 let ml_len = serialize_fse_table_from_corpus(
393 sample_data,
394 raw_fallback,
395 MAX_MATCH_LENGTH_CODE,
396 ML_MAX_LOG,
397 )?
398 .len();
399 let ll_len = serialize_fse_table_from_corpus(
400 sample_data,
401 raw_fallback,
402 MAX_LITERAL_LENGTH_CODE,
403 LL_MAX_LOG,
404 )?
405 .len();
406
407 let header_len = DICT_MAGIC_NUM.len() + 4 + huf_len + of_len + ml_len + ll_len + 12;
408 let max_content_budget = dict_size.saturating_sub(header_len);
409 if max_content_budget < min_content_size {
410 return Err(io::Error::new(
411 io::ErrorKind::InvalidInput,
412 "dictionary size too small to fit header and offset history",
413 ));
414 }
415 Ok(max_content_budget)
416}
417
418fn derive_dict_id(raw_content: &[u8]) -> u32 {
419 let mut h = 0xcbf29ce484222325u64;
420 for &b in raw_content {
421 h ^= u64::from(b);
422 h = h.wrapping_mul(0x100000001b3);
423 }
424 let compliant = (h % ((1u64 << 31) - 32768)) + 32768;
425 compliant as u32
426}
427
428pub fn finalize_raw_dict(
431 raw_content: &[u8],
432 sample_data: &[u8],
433 dict_size: usize,
434 options: FinalizeOptions,
435) -> io::Result<Vec<u8>> {
436 if raw_content.is_empty() {
437 return Err(io::Error::new(
438 io::ErrorKind::InvalidInput,
439 "raw dictionary content must not be empty",
440 ));
441 }
442 let mut out = Vec::with_capacity(dict_size.max(256));
443 out.extend_from_slice(&DICT_MAGIC_NUM);
444 let dict_id = options
445 .dict_id
446 .unwrap_or_else(|| derive_dict_id(raw_content));
447 if dict_id == 0 {
448 return Err(io::Error::new(
449 io::ErrorKind::InvalidInput,
450 "dictionary id must be non-zero",
451 ));
452 }
453 out.extend_from_slice(&dict_id.to_le_bytes());
454 out.extend_from_slice(serialize_huffman_table(sample_data, raw_content)?.as_slice());
455 out.extend_from_slice(
456 serialize_fse_table_from_corpus(sample_data, raw_content, MAX_OFFSET_CODE, OF_MAX_LOG)?
457 .as_slice(),
458 );
459 out.extend_from_slice(
460 serialize_fse_table_from_corpus(
461 sample_data,
462 raw_content,
463 MAX_MATCH_LENGTH_CODE,
464 ML_MAX_LOG,
465 )?
466 .as_slice(),
467 );
468 out.extend_from_slice(
469 serialize_fse_table_from_corpus(
470 sample_data,
471 raw_content,
472 MAX_LITERAL_LENGTH_CODE,
473 LL_MAX_LOG,
474 )?
475 .as_slice(),
476 );
477
478 out.extend_from_slice(&1u32.to_le_bytes());
480 out.extend_from_slice(&4u32.to_le_bytes());
481 out.extend_from_slice(&8u32.to_le_bytes());
482
483 let min_content_size = 8usize;
484 let max_content_budget = dict_size.saturating_sub(out.len());
485 if max_content_budget < min_content_size {
486 return Err(io::Error::new(
487 io::ErrorKind::InvalidInput,
488 "dictionary size too small to fit header and offset history",
489 ));
490 }
491
492 let content = if raw_content.len() > max_content_budget {
493 &raw_content[raw_content.len() - max_content_budget..]
494 } else {
495 raw_content
496 };
497 if content.len() < min_content_size {
498 out.resize(out.len() + (min_content_size - content.len()), 0);
499 }
500 out.extend_from_slice(content);
501 Ok(out)
502}
503
504fn train_fastcover_internal(
506 sample: &[u8],
507 dict_size: usize,
508 options: &FastCoverOptions,
509) -> (Vec<u8>, FastCoverTuned) {
510 if options.optimize {
511 fastcover::optimize_fastcover_raw(
512 sample,
513 dict_size,
514 options.split_point,
515 options.accel,
516 options.d_candidates.as_slice(),
517 options.f_candidates.as_slice(),
518 options.k_candidates.as_slice(),
519 )
520 } else {
521 let params = fastcover::normalize_fastcover_params(FastCoverParams {
522 k: options.k,
523 d: options.d,
524 f: options.f,
525 accel: options.accel,
526 });
527 (
528 fastcover::train_fastcover_raw(sample, dict_size, params),
529 FastCoverTuned {
530 k: params.k,
531 d: params.d,
532 f: params.f,
533 accel: params.accel,
534 score: 0,
535 },
536 )
537 }
538}
539
540pub fn train_fastcover_raw_from_slice(
542 sample: &[u8],
543 dict_size: usize,
544 options: &FastCoverOptions,
545) -> io::Result<(Vec<u8>, FastCoverTuned)> {
546 if sample.is_empty() {
547 return Err(io::Error::new(
548 io::ErrorKind::InvalidInput,
549 "source stream is empty",
550 ));
551 }
552 let (dict, tuned) = train_fastcover_internal(sample, dict_size, options);
553 if dict.is_empty() && dict_size > 0 {
554 return Err(io::Error::new(
555 io::ErrorKind::InvalidInput,
556 "training sample is too small for FastCOVER",
557 ));
558 }
559 Ok((dict, tuned))
560}
561
562pub fn create_fastcover_raw_dict_from_source<R: io::Read, W: io::Write>(
567 mut source: R,
568 output: &mut W,
569 dict_size: usize,
570 options: &FastCoverOptions,
571) -> io::Result<FastCoverTuned> {
572 let mut sample = Vec::new();
573 source.read_to_end(&mut sample)?;
574 let (dict, tuned) = train_fastcover_raw_from_slice(sample.as_slice(), dict_size, options)?;
575 output.write_all(dict.as_slice())?;
576 Ok(tuned)
577}
578
579pub fn create_fastcover_dict_from_source<R: io::Read, W: io::Write>(
584 mut source: R,
585 output: &mut W,
586 dict_size: usize,
587 fastcover: &FastCoverOptions,
588 finalize: FinalizeOptions,
589) -> io::Result<FastCoverTuned> {
590 let mut sample = Vec::new();
591 source.read_to_end(&mut sample)?;
592 if sample.is_empty() {
593 return Err(io::Error::new(
594 io::ErrorKind::InvalidInput,
595 "source stream is empty",
596 ));
597 }
598 let content_budget = finalized_content_budget(sample.as_slice(), sample.as_slice(), dict_size)?;
599 let (raw_dict, tuned) =
600 train_fastcover_raw_from_slice(sample.as_slice(), content_budget, fastcover)?;
601
602 let finalized = finalize_raw_dict(raw_dict.as_slice(), sample.as_slice(), dict_size, finalize)?;
603 output.write_all(finalized.as_slice())?;
604 Ok(tuned)
605}
606
607#[cfg(feature = "bench_internals")]
615pub(crate) fn dict_roundtrip_fixture() -> (
616 alloc::vec::Vec<u8>,
617 alloc::vec::Vec<u8>,
618 alloc::vec::Vec<u8>,
619) {
620 use crate::decoding::Dictionary;
621 use crate::encoding::{CompressionLevel, FrameCompressor};
622
623 let mut sample = alloc::vec::Vec::new();
624 for i in 0..512u32 {
625 sample.extend_from_slice(
626 alloc::format!(
627 "tenant=demo table=orders key={i} region=eu payload=aaaaabbbbbcccccdddddeeeee\n"
628 )
629 .as_bytes(),
630 );
631 }
632
633 let dict_size = 4096usize;
634 let content_budget = finalized_content_budget(sample.as_slice(), sample.as_slice(), dict_size)
635 .expect("content budget should be computable");
636 let raw = fastcover::train_fastcover_raw(
637 sample.as_slice(),
638 content_budget,
639 fastcover::FastCoverParams {
640 k: 256,
641 d: 8,
642 f: 20,
643 accel: 1,
644 },
645 );
646 let finalized = finalize_raw_dict(
647 raw.as_slice(),
648 sample.as_slice(),
649 dict_size,
650 FinalizeOptions::default(),
651 )
652 .expect("finalization should succeed");
653 let parsed =
654 Dictionary::decode_dict(finalized.as_slice()).expect("finalized dictionary should parse");
655 assert!(!parsed.dict_content.is_empty());
656
657 let mut payload = alloc::vec::Vec::new();
658 for idx in 0..96u32 {
659 payload.extend_from_slice(
660 alloc::format!("tenant=demo op=put key={idx} value=aaaaabbbbbcccccdddddeeeee\n")
661 .as_bytes(),
662 );
663 }
664
665 let mut compressed = alloc::vec::Vec::new();
666 let mut compressor = FrameCompressor::new(CompressionLevel::Fastest);
667 compressor
668 .set_dictionary(parsed)
669 .expect("dictionary should attach");
670 compressor.set_source(payload.as_slice());
671 compressor.set_drain(&mut compressed);
672 compressor.compress();
673
674 (finalized, compressed, payload)
675}
676
677#[cfg(test)]
678mod tests {
679 use super::*;
680 use crate::decoding::Dictionary;
681 use std::io::Cursor;
682 use std::string::ToString;
683
684 fn training_data() -> Vec<u8> {
685 let mut data = Vec::new();
686 for i in 0..512u32 {
687 data.extend_from_slice(
688 format!(
689 "tenant=demo table=orders key={i} region=eu payload=aaaaabbbbbcccccdddddeeeee\n"
690 )
691 .as_bytes(),
692 );
693 }
694 data
695 }
696
697 #[test]
698 fn create_fastcover_dict_from_source_writes_non_empty_output() {
699 let sample = training_data();
700 let mut out = Vec::new();
701 let tuned = create_fastcover_dict_from_source(
702 Cursor::new(sample.as_slice()),
703 &mut out,
704 4096,
705 &FastCoverOptions::default(),
706 FinalizeOptions::default(),
707 )
708 .expect("fastcover+finalize should succeed");
709 assert!(!out.is_empty());
710 assert!(tuned.k > 0);
711 assert!(tuned.d > 0);
712 }
713
714 #[test]
715 fn create_fastcover_raw_dict_from_source_rejects_empty_source() {
716 let mut out = Vec::new();
717 let err = create_fastcover_raw_dict_from_source(
718 Cursor::new(Vec::<u8>::new()),
719 &mut out,
720 1024,
721 &FastCoverOptions::default(),
722 )
723 .expect_err("empty source must be rejected");
724 assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
725 }
726
727 #[test]
728 fn create_fastcover_dict_from_source_propagates_finalize_error() {
729 let sample = training_data();
730 let mut out = Vec::new();
731 let err = create_fastcover_dict_from_source(
732 Cursor::new(sample.as_slice()),
733 &mut out,
734 32,
735 &FastCoverOptions::default(),
736 FinalizeOptions::default(),
737 )
738 .expect_err("too-small dictionary budget must fail during finalize");
739 assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
740 assert!(err.to_string().contains("dictionary size too small"));
741 }
742
743 #[test]
744 fn create_fastcover_dict_from_source_rejects_empty_source() {
745 let mut out = Vec::new();
746 let err = create_fastcover_dict_from_source(
747 Cursor::new(Vec::<u8>::new()),
748 &mut out,
749 1024,
750 &FastCoverOptions::default(),
751 FinalizeOptions::default(),
752 )
753 .expect_err("empty source must be rejected");
754 assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
755 }
756
757 #[test]
758 fn create_raw_dict_from_source_early_returns_on_zero_dict_size() {
759 let sample = training_data();
760 let mut out = Vec::new();
761 create_raw_dict_from_source(Cursor::new(sample.as_slice()), sample.len(), &mut out, 0)
762 .expect("zero dict size should no-op");
763 assert!(out.is_empty());
764 }
765
766 #[test]
767 fn create_raw_dict_from_source_treats_source_size_as_hint() {
768 let sample = training_data();
769 let mut out = Vec::new();
770 create_raw_dict_from_source(Cursor::new(sample.as_slice()), 0, &mut out, 1024)
771 .expect("raw dictionary training should succeed");
772 assert!(!out.is_empty());
773 }
774
775 #[test]
776 fn create_raw_dict_from_source_handles_tiny_source_without_epochs() {
777 let sample = b"short";
778 let mut out = Vec::new();
779 create_raw_dict_from_source(Cursor::new(sample.as_slice()), sample.len(), &mut out, 3)
780 .expect("tiny source path should succeed");
781 assert_eq!(out, b"ort");
782 }
783
784 #[test]
785 fn create_raw_dict_from_source_propagates_read_error() {
786 struct FailingReader;
787 impl io::Read for FailingReader {
788 fn read(&mut self, _buf: &mut [u8]) -> io::Result<usize> {
789 Err(io::Error::other("read failed"))
790 }
791 }
792
793 let mut out = Vec::new();
794 let err = create_raw_dict_from_source(FailingReader, 1024, &mut out, 1024)
795 .expect_err("read failures must be returned");
796 assert_eq!(err.kind(), io::ErrorKind::Other);
797 assert_eq!(err.to_string(), "read failed");
798 }
799
800 #[test]
801 fn create_raw_dict_from_source_propagates_write_error() {
802 struct FailingWriter;
803 impl io::Write for FailingWriter {
804 fn write(&mut self, _buf: &[u8]) -> io::Result<usize> {
805 Err(io::Error::other("write failed"))
806 }
807 fn flush(&mut self) -> io::Result<()> {
808 Ok(())
809 }
810 }
811
812 let sample = b"short";
813 let mut out = FailingWriter;
814 let err =
815 create_raw_dict_from_source(Cursor::new(sample.as_slice()), sample.len(), &mut out, 3)
816 .expect_err("write failures must be returned");
817 assert_eq!(err.kind(), io::ErrorKind::Other);
818 assert_eq!(err.to_string(), "write failed");
819 }
820
821 #[test]
822 fn create_raw_dict_from_source_never_exceeds_requested_size() {
823 let dict_size = 4096usize;
824 let source: Vec<u8> = core::iter::repeat_n(b'a', 320_001).collect();
825 let mut out = Vec::new();
826 create_raw_dict_from_source(
827 Cursor::new(source.as_slice()),
828 source.len(),
829 &mut out,
830 dict_size,
831 )
832 .expect("raw dictionary training should succeed");
833 assert!(
834 out.len() <= dict_size,
835 "raw dictionary exceeded requested size: {} > {}",
836 out.len(),
837 dict_size
838 );
839 }
840
841 #[test]
842 fn train_fastcover_raw_from_slice_rejects_empty_sample() {
843 let err = train_fastcover_raw_from_slice(&[], 1024, &FastCoverOptions::default())
844 .expect_err("empty sample must be rejected");
845 assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
846 }
847
848 #[test]
849 fn train_fastcover_raw_from_slice_supports_non_optimized_params() {
850 let sample = training_data();
851 let options = FastCoverOptions {
852 optimize: false,
853 k: 128,
854 d: 6,
855 f: 18,
856 ..FastCoverOptions::default()
857 };
858 let (dict, tuned) =
859 train_fastcover_raw_from_slice(sample.as_slice(), 2048, &options).expect("must train");
860 assert!(!dict.is_empty());
861 assert!(dict.len() <= 2048);
862 assert_eq!(tuned.k, 128);
863 assert_eq!(tuned.d, 6);
864 assert_eq!(tuned.f, 18);
865 assert_eq!(tuned.score, 0);
866 }
867
868 #[test]
869 fn train_fastcover_raw_from_slice_rejects_tiny_sample_with_empty_dict() {
870 let sample = b"tiny";
871 let err = train_fastcover_raw_from_slice(sample, 1024, &FastCoverOptions::default())
872 .expect_err("tiny sample should not produce an empty dictionary successfully");
873 assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
874 assert_eq!(
875 err.to_string(),
876 "training sample is too small for FastCOVER"
877 );
878 }
879
880 #[test]
881 fn train_fastcover_raw_from_slice_normalizes_non_optimized_params() {
882 let sample = training_data();
883 let options = FastCoverOptions {
884 optimize: false,
885 k: 8,
886 d: 64,
887 f: 42,
888 ..FastCoverOptions::default()
889 };
890 let (_, tuned) =
891 train_fastcover_raw_from_slice(sample.as_slice(), 2048, &options).expect("must train");
892 assert_eq!(tuned.k, 32);
893 assert_eq!(tuned.d, 32);
894 assert_eq!(tuned.f, 20);
895 }
896
897 #[test]
898 fn finalize_raw_dict_rejects_empty_raw_content() {
899 let sample = training_data();
900 let err = finalize_raw_dict(&[], sample.as_slice(), 4096, FinalizeOptions::default())
901 .expect_err("empty raw dictionary must be rejected");
902 assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
903 }
904
905 #[test]
906 fn finalize_raw_dict_rejects_too_small_budget() {
907 let sample = training_data();
908 let raw = b"some-raw-bytes";
909 let err = finalize_raw_dict(raw, sample.as_slice(), 32, FinalizeOptions::default())
910 .expect_err("tiny dict_size must fail");
911 assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
912 assert!(err.to_string().contains("dictionary size too small"));
913 }
914
915 #[test]
916 fn finalize_raw_dict_pads_to_minimum_content_size() {
917 let sample = training_data();
918 let raw = b"x";
919 let finalized = finalize_raw_dict(raw, sample.as_slice(), 4096, FinalizeOptions::default())
920 .expect("finalize should pad small raw content");
921 let parsed = Dictionary::decode_dict(finalized.as_slice()).expect("finalized dict parses");
922 assert!(parsed.dict_content.len() >= 8);
923 assert_eq!(parsed.dict_content.last(), Some(&b'x'));
924 }
925
926 #[test]
927 fn finalize_raw_dict_rejects_zero_dict_id() {
928 let sample = training_data();
929 let raw = b"raw-fastcover-bytes";
930 let err = finalize_raw_dict(
931 raw,
932 sample.as_slice(),
933 4096,
934 FinalizeOptions { dict_id: Some(0) },
935 )
936 .expect_err("dict_id=0 must be rejected");
937 assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
938 assert_eq!(err.to_string(), "dictionary id must be non-zero");
939 }
940}