Skip to main content

haagenti_zstd/compress/
mod.rs

1//! Zstd compression pipeline.
2//!
3//! This module implements the compression side of Zstandard (RFC 8878).
4//!
5//! ## Pipeline Overview
6//!
7//! ```text
8//! Input Data
9//!     │
10//!     ▼
11//! ┌─────────────────────────────────────┐
12//! │  Compressibility Analysis           │
13//! │  - Entropy estimation               │
14//! │  - Pattern detection                │
15//! │  - Strategy selection               │
16//! └─────────────────────────────────────┘
17//!     │
18//!     ▼
19//! ┌─────────────────────────────────────┐
20//! │  Match Finding (LZ77)               │
21//! │  - Hash chain construction          │
22//! │  - Best match selection             │
23//! │  - Coverage analysis                │
24//! └─────────────────────────────────────┘
25//!     │
26//!     ▼
27//! ┌─────────────────────────────────────┐
28//! │  Block Encoding                     │
29//! │  - Literals (Huffman/Raw/RLE)       │
30//! │  - Sequences (FSE/RLE/Raw)          │
31//! │  - Block type selection             │
32//! └─────────────────────────────────────┘
33//!     │
34//!     ▼
35//! ┌─────────────────────────────────────┐
36//! │  Frame Assembly                     │
37//! │  - Magic number                     │
38//! │  - Frame header                     │
39//! │  - Blocks                           │
40//! │  - XXHash64 checksum                │
41//! └─────────────────────────────────────┘
42//!     │
43//!     ▼
44//! Compressed Output
45//! ```
46//!
47//! ## Novel Approach: Compressibility Fingerprinting
48//!
49//! Unlike traditional compressors that blindly attempt compression and
50//! fall back on failure, this implementation uses **compressibility
51//! fingerprinting** to predict the optimal encoding strategy before
52//! attempting compression.
53//!
54//! ### Benefits
55//!
56//! 1. **CPU Efficiency**: Skip compression attempts on incompressible data
57//! 2. **Better Strategy Selection**: Choose RLE vs FSE vs Raw based on analysis
58//! 3. **Reduced Branching**: Predict block type without trial encoding
59//!
60//! ### Fingerprint Components
61//!
62//! - **Entropy Estimate**: Shannon entropy from byte frequencies (0.0-8.0)
63//! - **Pattern Type**: Uniform, Periodic, TextLike, HighEntropy, Random
64//! - **Match Uniformity**: Whether matches have consistent offset/length
65//!
66//! ## RLE-First Sequence Encoding
67//!
68//! Instead of jumping straight to complex FSE sequence encoding, we
69//! leverage **RLE sequence mode** for uniform match patterns:
70//!
71//! ```text
72//! if all_sequences_have_same(ll_code, ml_code, of_code):
73//!     use RLE mode (3 bytes overhead vs FSE table overhead)
74//! else:
75//!     check reachability and try FSE with predefined tables
76//! ```
77//!
78//! This is simpler, faster, and often just as effective for patterns like:
79//! - "abcdabcdabcd..." (uniform 4-byte matches)
80//! - Repeated structures in binary data
81//!
82//! ## Components
83//!
84//! - `analysis`: Compressibility fingerprinting and strategy selection
85//! - `match_finder`: LZ77 match finding using hash chains
86//! - `sequences`: RLE-first sequence encoding with FSE fallback
87//! - `block`: Block-level encoding (literals + sequences)
88//!
89//! ## Strategy Selection
90//!
91//! | Pattern Type | Entropy | Match Coverage | Strategy |
92//! |--------------|---------|----------------|----------|
93//! | Uniform | 0.0 | N/A | RLE Block |
94//! | Periodic (≤4) | Low | High | RLE Sequences |
95//! | LowEntropy | Low | High | RLE Sequences |
96//! | TextLike | Medium | Medium | Predefined FSE |
97//! | HighEntropy | High | Low | Raw Block |
98//! | Random | 7.5+ | None | Raw Block |
99
100mod analysis;
101mod arena;
102pub mod block;
103mod match_finder;
104mod repeat_offset_finder;
105mod sequences;
106pub mod speculative;
107
108pub use analysis::{
109    // Phase 3: Ultra-fast entropy fingerprinting
110    fast_entropy_estimate,
111    fast_predict_block_type,
112    fast_should_compress,
113    CompressibilityFingerprint,
114    CompressionStrategy,
115    FastBlockType,
116    PatternType,
117};
118pub use arena::{Arena, ArenaVec, DEFAULT_ARENA_SIZE};
119pub use block::{encode_block, BlockEncoder};
120pub use match_finder::{CacheAligned, LazyMatchFinder, Match, MatchFinder};
121pub use repeat_offset_finder::RepeatOffsetMatchFinder;
122pub use sequences::{
123    analyze_for_rle, encode_sequences_fse, encode_sequences_fse_with_encoded, encode_sequences_rle,
124    encode_sequences_with_custom_tables, EncodedSequence, RleSuitability,
125};
126pub use speculative::{SpeculativeCompressor, SpeculativeStrategy};
127
128use crate::frame::{xxhash64, ZSTD_MAGIC};
129use crate::{CustomFseTables, CustomHuffmanTable};
130use haagenti_core::{CompressionLevel, Result};
131
/// Match finder variant based on compression level.
///
/// Chosen once at context construction time; see `CompressContext::new`
/// for the level-to-variant mapping.
#[derive(Debug)]
enum MatchFinderVariant {
    /// Fast greedy match finder for speed.
    Greedy(MatchFinder),
    /// Lazy match finder for better compression ratio.
    Lazy(LazyMatchFinder),
    /// Repeat offset-aware match finder for best compression.
    RepeatAware(RepeatOffsetMatchFinder),
}
142
/// Compression context holding state during compression.
///
/// Reusable across frames: the arena is reset (O(1)) at the start of each
/// compression call so temporary allocations are recycled between frames.
pub struct CompressContext {
    /// Compression level (affects match finding depth).
    #[allow(dead_code)] // Used for future FSE encoding depth tuning
    level: CompressionLevel,
    /// Match finder (greedy or lazy based on level).
    match_finder: MatchFinderVariant,
    /// Optional dictionary ID for dictionary compression.
    dictionary_id: Option<u32>,
    /// Arena for per-frame temporary allocations.
    /// Reduces allocation overhead by reusing memory between frames.
    arena: Arena,
    /// Optional custom FSE tables for sequence encoding.
    custom_tables: Option<CustomFseTables>,
    /// Optional custom Huffman table for literal encoding.
    custom_huffman: Option<CustomHuffmanTable>,
}
160
// Manual Debug impl since Arena uses Cell which has limited Debug output.
// The arena is summarized as usage/capacity numbers, and the optional custom
// tables are reported as presence flags rather than dumped in full.
impl core::fmt::Debug for CompressContext {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.debug_struct("CompressContext")
            .field("level", &self.level)
            .field("match_finder", &self.match_finder)
            .field("dictionary_id", &self.dictionary_id)
            // Arena summarized instead of printed directly.
            .field("arena_usage", &self.arena.usage())
            .field("arena_capacity", &self.arena.capacity())
            // Presence only; table contents are not useful in Debug output.
            .field("has_custom_tables", &self.custom_tables.is_some())
            .field("has_custom_huffman", &self.custom_huffman.is_some())
            .finish()
    }
}
175
176impl CompressContext {
177    /// Create a new compression context.
178    pub fn new(level: CompressionLevel) -> Self {
179        // Balanced search depths:
180        // - Fast mode prioritizes throughput with acceptable ratio
181        // - Higher levels trade throughput for compression ratio
182        // Note: Our hash chain + 4-byte prefix check is efficient, so we can
183        // afford moderate depth without excessive slowdown.
184        let search_depth = match level {
185            CompressionLevel::None => 1,
186            CompressionLevel::Fast => 8,     // Balanced for speed
187            CompressionLevel::Default => 24, // Good balance
188            CompressionLevel::Best => 48,    // Favor ratio
189            CompressionLevel::Ultra => 128,  // Maximum quality
190            CompressionLevel::Custom(n) => n.min(255) as usize,
191        };
192
193        // Match finder selection:
194        // - None: Greedy for maximum speed
195        // - Fast/Default: Lazy for good ratio/speed balance
196        // - Best/Ultra: RepeatAware for best compression (exploits repeat offsets)
197        let match_finder = match level {
198            CompressionLevel::None => MatchFinderVariant::Greedy(MatchFinder::new(search_depth)),
199            CompressionLevel::Best | CompressionLevel::Ultra => {
200                MatchFinderVariant::RepeatAware(RepeatOffsetMatchFinder::new(search_depth))
201            }
202            _ => MatchFinderVariant::Lazy(LazyMatchFinder::new(search_depth)),
203        };
204
205        Self {
206            level,
207            match_finder,
208            dictionary_id: None,
209            arena: Arena::with_default_size(),
210            custom_tables: None,
211            custom_huffman: None,
212        }
213    }
214
215    /// Create a compression context with custom FSE tables.
216    ///
217    /// Custom tables allow overriding the predefined FSE tables used for
218    /// sequence encoding. This can improve compression ratio when the data
219    /// has symbol distributions that differ from the predefined tables.
220    pub fn with_custom_tables(level: CompressionLevel, custom_tables: CustomFseTables) -> Self {
221        let mut ctx = Self::new(level);
222        ctx.custom_tables = Some(custom_tables);
223        ctx
224    }
225
226    /// Create a compression context with all custom options.
227    ///
228    /// This is the most flexible constructor, allowing both custom FSE tables
229    /// and custom Huffman tables to be specified.
230    pub fn with_options(
231        level: CompressionLevel,
232        custom_tables: Option<CustomFseTables>,
233        custom_huffman: Option<CustomHuffmanTable>,
234    ) -> Self {
235        let mut ctx = Self::new(level);
236        ctx.custom_tables = custom_tables;
237        ctx.custom_huffman = custom_huffman;
238        ctx
239    }
240
241    /// Create a new compression context with a custom arena size.
242    ///
243    /// Larger arena sizes can improve performance for large inputs by reducing
244    /// the number of heap allocations during compression.
245    pub fn with_arena_size(level: CompressionLevel, arena_size: usize) -> Self {
246        let mut ctx = Self::new(level);
247        ctx.arena = Arena::new(arena_size);
248        ctx
249    }
250
    /// Get the custom FSE tables, if any.
    pub fn custom_tables(&self) -> Option<&CustomFseTables> {
        self.custom_tables.as_ref()
    }

    /// Get the custom Huffman table, if any.
    pub fn custom_huffman(&self) -> Option<&CustomHuffmanTable> {
        self.custom_huffman.as_ref()
    }

    /// Get the arena's peak usage (useful for tuning arena size).
    ///
    /// Compare against the arena's capacity to decide whether a larger
    /// arena (see `with_arena_size`) would help.
    pub fn arena_peak_usage(&self) -> usize {
        self.arena.peak_usage()
    }

    /// Set dictionary ID for dictionary compression.
    ///
    /// NOTE(review): the stored ID is not read anywhere in this module's
    /// visible code — confirm the frame-header writer emits it in the
    /// Dict_ID field as RFC 8878 requires.
    pub fn set_dictionary_id(&mut self, dict_id: u32) {
        self.dictionary_id = Some(dict_id);
    }
270
    /// Find matches using the appropriate match finder variant.
    ///
    /// Dispatches to whichever finder was selected at construction time.
    /// For lazy matching, uses adaptive threshold scaling based on input size
    /// to optimize throughput for larger inputs while maintaining good ratio
    /// for smaller inputs.
    fn find_matches(&mut self, input: &[u8]) -> Vec<Match> {
        match &mut self.match_finder {
            MatchFinderVariant::Greedy(mf) => mf.find_matches(input),
            // Use find_matches_auto for adaptive lazy threshold
            MatchFinderVariant::Lazy(mf) => mf.find_matches_auto(input),
            // Repeat offset-aware matching for best compression
            MatchFinderVariant::RepeatAware(mf) => mf.find_matches(input),
        }
    }
285
286    /// Compress data into a Zstd frame.
287    ///
288    /// By default, does NOT include a checksum (matching reference zstd level 1 behavior).
289    /// Use `compress_with_checksum` for checksum-protected frames.
290    pub fn compress(&mut self, input: &[u8]) -> Result<Vec<u8>> {
291        self.compress_internal(input, false)
292    }
293
294    /// Compress data into a Zstd frame with XXH64 checksum.
295    ///
296    /// The checksum is the lower 32 bits of XXH64 of the original uncompressed content.
297    #[allow(dead_code)]
298    pub fn compress_with_checksum(&mut self, input: &[u8]) -> Result<Vec<u8>> {
299        self.compress_internal(input, true)
300    }
301
302    /// Compress using speculative multi-path compression.
303    ///
304    /// This runs multiple compression strategies in parallel (when the `parallel`
305    /// feature is enabled) and returns the smallest result. This provides optimal
306    /// compression without needing to know the data characteristics in advance.
307    ///
308    /// **When to use:**
309    /// - When compression ratio matters more than latency
310    /// - For mixed content where the optimal strategy varies
311    /// - When CPU cores are available for parallel execution
312    ///
313    /// **Performance characteristics:**
314    /// - Uses all available CPU cores via work-stealing
315    /// - Minimal overhead for small inputs (< 4KB uses single-threaded)
316    /// - Near-linear scaling for large inputs
317    #[allow(dead_code)]
318    pub fn compress_speculative(&self, input: &[u8]) -> Result<Vec<u8>> {
319        let compressor = speculative::SpeculativeCompressor::new();
320        compressor.compress(input)
321    }
322
323    /// Compress using fast speculative compression.
324    ///
325    /// Uses fewer strategies for lower latency while still trying multiple approaches.
326    #[allow(dead_code)]
327    pub fn compress_speculative_fast(&self, input: &[u8]) -> Result<Vec<u8>> {
328        let compressor = speculative::SpeculativeCompressor::fast();
329        compressor.compress(input)
330    }
331
332    /// Compress using best speculative compression.
333    ///
334    /// Uses aggressive strategies for maximum compression ratio.
335    #[allow(dead_code)]
336    pub fn compress_speculative_best(&self, input: &[u8]) -> Result<Vec<u8>> {
337        let compressor = speculative::SpeculativeCompressor::best();
338        compressor.compress(input)
339    }
340
341    /// Internal compression with optional checksum.
342    fn compress_internal(&mut self, input: &[u8], include_checksum: bool) -> Result<Vec<u8>> {
343        // Reset arena for this frame (O(1) operation)
344        // This allows reuse of temporary allocations from previous frames
345        self.arena.reset();
346
347        let mut output = Vec::with_capacity(input.len() + 32);
348
349        // Write magic number
350        output.extend_from_slice(&ZSTD_MAGIC.to_le_bytes());
351
352        // Write frame header
353        self.write_frame_header(input.len(), include_checksum, &mut output)?;
354
355        // Encode blocks
356        self.encode_blocks(input, &mut output)?;
357
358        // Write checksum if requested
359        if include_checksum {
360            let checksum = (xxhash64(input, 0) & 0xFFFFFFFF) as u32;
361            output.extend_from_slice(&checksum.to_le_bytes());
362        }
363
364        Ok(output)
365    }
366
    /// Get access to the arena for temporary allocations.
    ///
    /// This can be used by callers who want to use arena-allocated
    /// buffers for input data preparation. Note that the arena is reset at
    /// the start of every compression call, so such buffers do not survive
    /// across frames.
    pub fn arena(&self) -> &Arena {
        &self.arena
    }
374
375    /// Write frame header.
376    ///
377    /// Matches reference zstd's level 1 behavior:
378    /// - Uses window descriptor mode (not single_segment) for better compatibility
379    /// - No FCS for small frames
380    fn write_frame_header(
381        &self,
382        content_size: usize,
383        include_checksum: bool,
384        output: &mut Vec<u8>,
385    ) -> Result<()> {
386        // Frame descriptor layout:
387        // FHD: FCS_Field_Size[7:6], Single_Segment[5], Unused[4], Reserved[3], Checksum[2], Dict_ID[1:0]
388        let checksum_flag = if include_checksum { 0x04 } else { 0x00 };
389
390        // Match reference zstd behavior: use window descriptor mode (Single_Segment=0)
391        // This provides better cross-library compatibility
392        //
393        // Window_Size = 2^(10 + Exponent) * (1 + Mantissa/8)
394        // For simplicity, use Mantissa=0, so Window_Size = 2^(10 + Exponent)
395        // Window_Descriptor = Exponent << 3 (with Mantissa=0)
396
397        // Calculate window exponent
398        // Use minimum 19 (512KB) to match reference zstd behavior at level 1
399        // This provides better cross-decoder compatibility
400        let window_log = if content_size == 0 {
401            19u8 // 512KB default
402        } else {
403            let log = (content_size as f64).log2().ceil() as u8;
404            log.clamp(19, 30) // minimum 512KB window
405        };
406
407        // Use FCS for larger content (>128KB), skip for typical block sizes
408        // Reference zstd doesn't use FCS for 64KB inputs, uses window descriptor only
409        let (fcs_size, descriptor) = if content_size > 131071 {
410            // 4-byte FCS (FCS_Field_Size=10, Single_Segment=0)
411            (4, 0x80u8 | checksum_flag)
412        } else {
413            // No FCS (FCS_Field_Size=00, Single_Segment=0)
414            // Decoder will use window size as max content size
415            (0, checksum_flag)
416        };
417
418        output.push(descriptor);
419
420        // Window descriptor (always present when Single_Segment=0)
421        let window_descriptor = (window_log - 10) << 3;
422        output.push(window_descriptor);
423
424        // Write FCS if present
425        if fcs_size == 4 {
426            output.extend_from_slice(&(content_size as u32).to_le_bytes());
427        }
428
429        Ok(())
430    }
431
432    /// Encode input data into blocks.
433    fn encode_blocks(&mut self, input: &[u8], output: &mut Vec<u8>) -> Result<()> {
434        const MAX_BLOCK_SIZE: usize = 128 * 1024 - 1; // 128KB - 1
435
436        if input.is_empty() {
437            // Empty block
438            let header = 1u32; // last=1, type=Raw, size=0
439            output.push((header & 0xFF) as u8);
440            output.push(((header >> 8) & 0xFF) as u8);
441            output.push(((header >> 16) & 0xFF) as u8);
442            return Ok(());
443        }
444
445        let mut pos = 0;
446        while pos < input.len() {
447            let remaining = input.len() - pos;
448            let block_size = remaining.min(MAX_BLOCK_SIZE);
449            let is_last = pos + block_size >= input.len();
450
451            let block_data = &input[pos..pos + block_size];
452            self.encode_single_block(block_data, is_last, output)?;
453
454            pos += block_size;
455        }
456
457        Ok(())
458    }
459
460    /// Encode a single block.
461    fn encode_single_block(
462        &mut self,
463        input: &[u8],
464        is_last: bool,
465        output: &mut Vec<u8>,
466    ) -> Result<()> {
467        // Try to find the best encoding strategy
468        let (block_type, encoded) = self.select_block_encoding(input)?;
469
470        // Write block header
471        let mut header = if is_last { 1u32 } else { 0u32 };
472        header |= (block_type as u32) << 1;
473
474        let size = match block_type {
475            0 => input.len(),   // Raw: decompressed size
476            1 => input.len(),   // RLE: decompressed size
477            2 => encoded.len(), // Compressed: compressed size
478            _ => unreachable!(),
479        };
480
481        header |= (size as u32) << 3;
482
483        output.push((header & 0xFF) as u8);
484        output.push(((header >> 8) & 0xFF) as u8);
485        output.push(((header >> 16) & 0xFF) as u8);
486
487        // Write block data
488        output.extend_from_slice(&encoded);
489
490        Ok(())
491    }
492
    /// Select the best block encoding strategy using compressibility analysis.
    ///
    /// Returns `(block_type, payload)` where block_type is 0 = Raw,
    /// 1 = RLE, 2 = Compressed (matching the header written by
    /// `encode_single_block`).
    ///
    /// Uses a tiered approach for optimal throughput:
    /// - Small inputs (<4KB): Full fingerprint analysis for best decisions
    /// - Large inputs (>=4KB): Fast sampling to detect special cases, otherwise
    ///   proceed directly to match finding
    fn select_block_encoding(&mut self, input: &[u8]) -> Result<(u8, Vec<u8>)> {
        // For large inputs, use fast sampling to avoid O(n) histogram overhead
        if input.len() >= 4096 {
            return self.select_block_encoding_fast(input);
        }

        // Small inputs: use full fingerprint analysis
        let fingerprint = analysis::CompressibilityFingerprint::analyze(input);

        // Fast-path for uniform data (RLE block)
        if fingerprint.strategy == CompressionStrategy::RleBlock {
            return Ok((1, vec![input[0]]));
        }

        // Note: We don't skip match finding for "Random" patterns because
        // byte-level entropy can be misleading. Periodic patterns like 0,1,2,...255
        // have high entropy (8.0) but are highly compressible via offset matches.
        // Always try match finding and only fall back to raw if it doesn't help.

        // Find matches
        let matches = self.find_matches(input);

        // Refine strategy with match data (rebind as mutable for refinement)
        let mut fingerprint = fingerprint;
        fingerprint.refine_with_matches(input, &matches);

        // Choose encoding based on refined strategy
        match fingerprint.strategy {
            CompressionStrategy::RleBlock => Ok((1, vec![input[0]])),
            CompressionStrategy::RawBlock => {
                // Even "raw" candidates get a literals-only compression
                // attempt; keep the compressed form only if it actually wins.
                let compressed = self.encode_literals_only(input)?;
                if compressed.len() < input.len() {
                    Ok((2, compressed))
                } else {
                    Ok((0, input.to_vec()))
                }
            }
            CompressionStrategy::RleSequences | CompressionStrategy::PredefinedFse => {
                let compressed = self.encode_with_rle_sequences(input, &matches)?;
                // Fall back to a Raw block whenever compression doesn't shrink.
                if compressed.len() < input.len() {
                    Ok((2, compressed))
                } else {
                    Ok((0, input.to_vec()))
                }
            }
        }
    }
546
547    /// Fast block encoding selection for large inputs.
548    ///
549    /// Uses sampling-based analysis (~100 cycles) instead of full histogram (~10K cycles).
550    /// This dramatically improves throughput for 64KB+ inputs.
551    fn select_block_encoding_fast(&mut self, input: &[u8]) -> Result<(u8, Vec<u8>)> {
552        // Fast check for RLE (uniform data)
553        let first = input[0];
554        let is_uniform = input.len() >= 64 && {
555            // Check first 64 bytes
556            let first_uniform = input[..64].iter().all(|&b| b == first);
557            if first_uniform {
558                // Verify with sampling for rest
559                let step = input.len() / 32;
560                (1..32).all(|i| input.get(i * step).copied() == Some(first))
561            } else {
562                false
563            }
564        };
565
566        if is_uniform {
567            return Ok((1, vec![first]));
568        }
569
570        // Note: We don't skip match finding based on entropy because byte-level
571        // entropy can be misleading. Periodic patterns like 0,1,2,...255 have
572        // high entropy (8.0) but are highly compressible via offset matches.
573        // Always try match finding and only fall back to raw if it doesn't help.
574
575        // Proceed with match finding - most data will take this path
576        let matches = self.find_matches(input);
577
578        // Encode with sequences
579        let compressed = self.encode_with_rle_sequences(input, &matches)?;
580        if compressed.len() < input.len() {
581            Ok((2, compressed))
582        } else {
583            Ok((0, input.to_vec()))
584        }
585    }
586
    /// Encode a compressed block using sequence encoding.
    ///
    /// Sequences are analyzed via `analyze_for_rle` (which also pre-encodes
    /// them into codes + extra bits), but the sequence section itself is
    /// written with custom FSE tables when configured, otherwise with
    /// FSE/predefined mode — RLE sequence mode is deliberately avoided for
    /// cross-decoder compatibility (see the comment in the body).
    /// When a custom Huffman table is configured, it is used for literal
    /// encoding.
    ///
    /// Output layout: literals section first, then the sequences section.
    fn encode_with_rle_sequences(&mut self, input: &[u8], matches: &[Match]) -> Result<Vec<u8>> {
        // Pre-allocate output: compressed size is usually smaller than input
        let mut output = Vec::with_capacity(input.len());

        // Convert matches to sequences
        let (literals, seqs) = block::matches_to_sequences(input, matches);

        if seqs.is_empty() {
            // No sequences - use literals-only mode: the whole input is
            // emitted as literals followed by an empty sequences section.
            self.encode_literals_with_options(input, &mut output)?;
            block::encode_sequences(&[], &mut output)?;
        } else {
            // Analyze sequences (also pre-encodes into codes+extra bits)
            let suitability = sequences::analyze_for_rle(&seqs);

            // Encode literals using custom Huffman if available
            self.encode_literals_with_options(&literals, &mut output)?;

            // Use custom tables if available, otherwise use predefined
            if let Some(custom_tables) = &self.custom_tables {
                sequences::encode_sequences_with_custom_tables(
                    &suitability.encoded,
                    custom_tables,
                    &mut output,
                )?;
            } else {
                // Always use FSE/Predefined mode for cross-decoder compatibility.
                // RLE mode has subtle implementation differences that cause issues.
                sequences::encode_sequences_fse_with_encoded(&suitability.encoded, &mut output)?;
            }
        }

        Ok(output)
    }
627
628    /// Encode literals using custom Huffman table if available, otherwise use default logic.
629    fn encode_literals_with_options(&self, literals: &[u8], output: &mut Vec<u8>) -> Result<()> {
630        if let Some(custom_huffman) = &self.custom_huffman {
631            block::encode_literals_with_encoder(literals, custom_huffman.encoder(), output)
632        } else {
633            block::encode_literals(literals, output)
634        }
635    }
636
637    /// Encode a compressed block with only literals (no sequences).
638    fn encode_literals_only(&mut self, input: &[u8]) -> Result<Vec<u8>> {
639        let mut output = Vec::with_capacity(input.len());
640
641        // Encode all input as literals
642        self.encode_literals_with_options(input, &mut output)?;
643
644        // Empty sequences section
645        block::encode_sequences(&[], &mut output)?;
646
647        Ok(output)
648    }
649}
650
651// =============================================================================
652// Tests
653// =============================================================================
654
655#[cfg(test)]
656mod tests {
657    use super::*;
658
659    #[test]
660    fn test_compress_context_creation() {
661        let ctx = CompressContext::new(CompressionLevel::Default);
662        assert_eq!(ctx.level, CompressionLevel::Default);
663    }
664
665    #[test]
666    fn test_compress_empty() {
667        let mut ctx = CompressContext::new(CompressionLevel::Fast);
668        let result = ctx.compress(&[]).unwrap();
669
670        // Should have magic (4) + header (2: descriptor + 1-byte FCS) + empty block (3)
671        // Note: compress() does NOT include checksum by default
672        assert!(
673            result.len() >= 4 + 2 + 3,
674            "expected at least 9 bytes, got {}",
675            result.len()
676        );
677
678        // Verify magic
679        assert_eq!(&result[0..4], &[0x28, 0xB5, 0x2F, 0xFD]);
680    }
681
682    #[test]
683    fn test_compress_small() {
684        let mut ctx = CompressContext::new(CompressionLevel::Fast);
685        let input = b"Hello, World!";
686        let result = ctx.compress(input).unwrap();
687
688        // Should be valid frame
689        assert_eq!(&result[0..4], &[0x28, 0xB5, 0x2F, 0xFD]);
690    }
691
692    #[test]
693    fn test_compress_rle_detection() {
694        let mut ctx = CompressContext::new(CompressionLevel::Fast);
695        let input = vec![b'A'; 100];
696        let result = ctx.compress(&input).unwrap();
697
698        // RLE should be much smaller than raw
699        assert!(result.len() < input.len());
700    }
701
702    #[test]
703    fn test_compression_levels() {
704        for level in [
705            CompressionLevel::Fast,
706            CompressionLevel::Default,
707            CompressionLevel::Best,
708        ] {
709            let mut ctx = CompressContext::new(level);
710            let input = b"Test compression with different levels";
711            let result = ctx.compress(input);
712            assert!(result.is_ok());
713        }
714    }
715
716    // =========================================================================
717    // Track A.4: Compression Ratio Optimization Tests
718    // =========================================================================
719
720    #[test]
721    fn test_text_compression_ratio_baseline() {
722        // Test compression on text-like data
723        let data = b"The quick brown fox jumps over the lazy dog. ".repeat(1000);
724
725        let mut ctx = CompressContext::new(CompressionLevel::Default);
726        let compressed = ctx.compress(&data).unwrap();
727
728        let ratio = data.len() as f64 / compressed.len() as f64;
729
730        // Text should achieve at least 2.5x compression (baseline)
731        assert!(
732            ratio >= 2.5,
733            "Text compression ratio {:.2}x below baseline 2.5x",
734            ratio
735        );
736    }
737
738    #[test]
739    fn test_repetitive_data_compression_ratio() {
740        // Repetitive data should compress very well
741        let data = b"abcdabcdabcdabcd".repeat(5000);
742
743        let mut ctx = CompressContext::new(CompressionLevel::Default);
744        let compressed = ctx.compress(&data).unwrap();
745
746        let ratio = data.len() as f64 / compressed.len() as f64;
747
748        // Repetitive data should achieve at least 50x compression
749        assert!(
750            ratio >= 50.0,
751            "Repetitive data ratio {:.2}x below expected 50x",
752            ratio
753        );
754    }
755
756    #[test]
757    fn test_compression_levels_ratio_ordering() {
758        // Better compression levels should produce smaller or equal output
759        let data = b"This is test data that will be compressed at different levels. ".repeat(500);
760
761        let mut fast_ctx = CompressContext::new(CompressionLevel::Fast);
762        let mut default_ctx = CompressContext::new(CompressionLevel::Default);
763        let mut best_ctx = CompressContext::new(CompressionLevel::Best);
764
765        let fast = fast_ctx.compress(&data).unwrap();
766        let default = default_ctx.compress(&data).unwrap();
767        let best = best_ctx.compress(&data).unwrap();
768
769        // Best should be <= Default <= Fast (in terms of output size)
770        assert!(
771            best.len() <= default.len(),
772            "Best ({}) should be <= Default ({})",
773            best.len(),
774            default.len()
775        );
776        assert!(
777            default.len() <= fast.len() + 50, // Allow small tolerance for heuristics
778            "Default ({}) should be close to or better than Fast ({})",
779            default.len(),
780            fast.len()
781        );
782    }
783
784    #[test]
785    fn test_adaptive_search_depth_performance() {
786        // Larger data shouldn't take disproportionately longer
787        let small_data = vec![b'a'; 1000];
788        let large_data = vec![b'a'; 100_000];
789
790        // Use fast level for timing test
791        let mut ctx_small = CompressContext::new(CompressionLevel::Fast);
792        let mut ctx_large = CompressContext::new(CompressionLevel::Fast);
793
794        let start = std::time::Instant::now();
795        let _ = ctx_small.compress(&small_data).unwrap();
796        let small_time = start.elapsed();
797
798        let start = std::time::Instant::now();
799        let _ = ctx_large.compress(&large_data).unwrap();
800        let large_time = start.elapsed();
801
802        // Large should take at most 200x longer (100x more data + overhead)
803        let time_ratio = large_time.as_nanos() as f64 / small_time.as_nanos().max(1) as f64;
804        assert!(
805            time_ratio < 200.0,
806            "Time ratio {:.1}x exceeds expected linear scaling",
807            time_ratio
808        );
809    }
810
811    #[test]
812    fn test_lazy_match_finder_produces_matches() {
813        // Lazy match finder should find good matches
814        let data = b"abcdefabcdefabcdef".repeat(100);
815
816        let mut lazy_mf = LazyMatchFinder::new(24);
817        let matches = lazy_mf.find_matches(&data);
818
819        // Should find matches in repetitive data
820        assert!(
821            !matches.is_empty(),
822            "Lazy match finder should find matches in repetitive data"
823        );
824
825        // Matches should have valid offsets and lengths
826        for m in &matches {
827            assert!(m.offset > 0, "Match offset should be positive");
828            assert!(m.length >= 3, "Match length should be at least 3");
829        }
830    }
831
832    #[test]
833    fn test_long_distance_pattern_detection() {
834        // Create data with long-distance repeats
835        let pattern = b"This is a distinctive pattern.";
836        let mut data = Vec::new();
837        data.extend_from_slice(pattern);
838        data.extend_from_slice(&vec![b'x'; 10_000]); // 10KB gap
839        data.extend_from_slice(pattern); // Repeat
840
841        let mut ctx = CompressContext::new(CompressionLevel::Default);
842        let compressed = ctx.compress(&data).unwrap();
843
844        // Should compress well due to finding the long-distance match
845        // Compressed size should be noticeably smaller than input
846        assert!(
847            compressed.len() < data.len() - 20,
848            "Should find long-distance pattern match"
849        );
850    }
851
852    #[test]
853    fn test_entropy_based_strategy_selection() {
854        // Low entropy data should trigger efficient encoding
855        let low_entropy = vec![0u8; 100_000];
856
857        let mut ctx = CompressContext::new(CompressionLevel::Fast);
858        let compressed = ctx.compress(&low_entropy).unwrap();
859
860        // Low entropy should achieve extreme compression (RLE)
861        let ratio = low_entropy.len() as f64 / compressed.len() as f64;
862        assert!(
863            ratio > 1000.0,
864            "Low entropy ratio {:.0}x should be >1000x",
865            ratio
866        );
867    }
868
869    #[test]
870    fn test_mixed_content_compression() {
871        // Mixed content: some compressible, some not
872        let mut data = Vec::new();
873        data.extend_from_slice(b"Compressible repeated text. ".repeat(50).as_slice());
874        data.extend_from_slice(&(0u8..=255u8).cycle().take(1000).collect::<Vec<u8>>());
875        data.extend_from_slice(b"More compressible text here. ".repeat(50).as_slice());
876
877        let mut ctx = CompressContext::new(CompressionLevel::Default);
878        let compressed = ctx.compress(&data).unwrap();
879
880        // Should still achieve some compression
881        assert!(
882            compressed.len() < data.len(),
883            "Mixed content should still compress"
884        );
885    }
886
887    #[test]
888    fn test_speculative_compression_available() {
889        // Verify speculative compression is available
890        let data = b"Test data for speculative compression. ".repeat(100);
891
892        let ctx = CompressContext::new(CompressionLevel::Default);
893        let result = ctx.compress_speculative(&data);
894
895        assert!(result.is_ok(), "Speculative compression should work");
896        assert!(
897            result.unwrap().len() < data.len(),
898            "Should produce compression"
899        );
900    }
901
902    #[test]
903    fn test_compression_with_checksum() {
904        let data = b"Data to compress with checksum verification.".repeat(100);
905
906        let mut ctx = CompressContext::new(CompressionLevel::Default);
907        let with_checksum = ctx.compress_with_checksum(&data).unwrap();
908        let without_checksum = ctx.compress(&data).unwrap();
909
910        // With checksum should be 4 bytes larger (XXH64 lower 32 bits)
911        assert_eq!(
912            with_checksum.len(),
913            without_checksum.len() + 4,
914            "Checksum adds 4 bytes"
915        );
916    }
917
918    #[test]
919    fn test_arena_reuse_efficiency() {
920        let mut ctx = CompressContext::new(CompressionLevel::Default);
921
922        // Compress multiple times with different data to force arena usage
923        for i in 0..5 {
924            let data = format!("Test arena reuse iteration {}. ", i).repeat(1000);
925            let result = ctx.compress(data.as_bytes()).unwrap();
926            // Verify compression succeeded
927            assert!(!result.is_empty());
928        }
929
930        // Arena capacity should be available for inspection
931        let capacity = ctx.arena().capacity();
932        assert!(capacity > 0, "Arena should have capacity");
933    }
934
935    #[test]
936    fn test_block_size_handling() {
937        // Test compression with data larger than max block size (128KB - 1)
938        let large_data = vec![b'A'; 150_000]; // >128KB
939
940        let mut ctx = CompressContext::new(CompressionLevel::Fast);
941        let compressed = ctx.compress(&large_data).unwrap();
942
943        // Should produce valid compressed output
944        assert!(!compressed.is_empty());
945        // Should be smaller than input for RLE-able data
946        assert!(
947            compressed.len() < large_data.len(),
948            "Large RLE data should compress well"
949        );
950    }
951
952    #[test]
953    fn test_match_finder_variant_selection() {
954        // Different levels should use different match finder variants
955        let data = b"test pattern for matching variants".repeat(100);
956
957        // None uses Greedy
958        let mut none_ctx = CompressContext::new(CompressionLevel::None);
959        let none_result = none_ctx.compress(&data);
960        assert!(none_result.is_ok());
961
962        // Best uses RepeatAware
963        let mut best_ctx = CompressContext::new(CompressionLevel::Best);
964        let best_result = best_ctx.compress(&data);
965        assert!(best_result.is_ok());
966
967        // Best should typically produce smaller output for repetitive data
968        assert!(
969            best_result.unwrap().len() <= none_result.unwrap().len(),
970            "Best should be at least as good as None"
971        );
972    }
973
974    #[test]
975    fn test_repeat_offset_match_finder() {
976        // RepeatAware should find repeat offset patterns
977        let data = b"abcdefghijklmnopabcdefghijklmnopabcdefghijklmnop".repeat(50);
978
979        let mut mf = RepeatOffsetMatchFinder::new(48);
980        let matches = mf.find_matches(&data);
981
982        // Should find matches using repeat offsets
983        assert!(
984            !matches.is_empty(),
985            "RepeatAware should find matches in repetitive data"
986        );
987    }
988
989    #[test]
990    fn test_compression_fingerprint_accuracy() {
991        // Test fingerprint analysis accuracy
992        let uniform_data = vec![42u8; 1000];
993        let fp_uniform = analysis::CompressibilityFingerprint::analyze(&uniform_data);
994        assert_eq!(
995            fp_uniform.pattern,
996            PatternType::Uniform,
997            "Should detect uniform pattern"
998        );
999
1000        let text_data = b"The quick brown fox. ".repeat(100);
1001        let fp_text = analysis::CompressibilityFingerprint::analyze(&text_data);
1002        assert!(
1003            matches!(
1004                fp_text.pattern,
1005                PatternType::TextLike | PatternType::Periodic { .. } | PatternType::LowEntropy
1006            ),
1007            "Should detect text-like or periodic pattern, got {:?}",
1008            fp_text.pattern
1009        );
1010    }
1011
1012    #[test]
1013    fn test_fast_entropy_estimate() {
1014        // Test fast entropy estimation
1015        let low_entropy = vec![0u8; 10000];
1016        let est_low = fast_entropy_estimate(&low_entropy);
1017        assert!(est_low < 1.0, "Low entropy data should have low estimate");
1018
1019        let high_entropy: Vec<u8> = (0..10000).map(|i| (i % 256) as u8).collect();
1020        let est_high = fast_entropy_estimate(&high_entropy);
1021        assert!(
1022            est_high > 5.0,
1023            "High entropy data should have high estimate: {}",
1024            est_high
1025        );
1026    }
1027
1028    #[test]
1029    fn test_fast_predict_block_type() {
1030        // Test fast block type prediction
1031        let zeros = vec![0u8; 1000];
1032        assert_eq!(
1033            fast_predict_block_type(&zeros),
1034            FastBlockType::Rle,
1035            "Uniform data should predict RLE"
1036        );
1037
1038        let text = b"Hello world! ".repeat(100);
1039        let predicted = fast_predict_block_type(&text);
1040        assert!(
1041            predicted == FastBlockType::Compress || predicted == FastBlockType::Raw,
1042            "Text should predict Compress or Raw"
1043        );
1044    }
1045}