Skip to main content

oxiarc_lzma/
lib.rs

1//! # OxiArc LZMA
2//!
3//! LZMA (Lempel-Ziv-Markov chain Algorithm) compression and decompression.
4//!
5//! LZMA is a lossless data compression algorithm that provides excellent
6//! compression ratios. It's used in:
7//! - 7-Zip archives (.7z)
8//! - XZ compressed files (.xz)
9//! - LZMA-compressed files (.lzma)
10//! - Some ZIP archives (method 14)
11//!
12//! ## Features
13//!
14//! - **Pure Rust** implementation
15//! - **Decompression** of LZMA streams
16//! - **Compression** with configurable levels
17//! - Range coder for entropy coding
18//! - Probability-based context modeling
19//!
20//! ## Usage
21//!
22//! ### Decompression
23//!
24//! ```ignore
25//! use oxiarc_lzma::decompress;
26//!
27//! let compressed = include_bytes!("data.lzma");
28//! let decompressed = decompress(&compressed[..])?;
29//! ```
30//!
31//! ### Compression
32//!
33//! ```ignore
34//! use oxiarc_lzma::{compress, LzmaLevel};
35//!
36//! let data = b"Hello, World!";
37//! let compressed = compress(data, LzmaLevel::DEFAULT)?;
38//! ```
39//!
40//! ### LZMA2 Chunked Encoding (XZ compatible)
41//!
42//! ```ignore
43//! use oxiarc_lzma::{encode_lzma2_chunked, decode_lzma2_chunked, LzmaLevel};
44//!
45//! let data = b"Hello, LZMA2 chunked world!";
46//! let encoded = encode_lzma2_chunked(data, LzmaLevel::DEFAULT)?;
47//! let decoded = decode_lzma2_chunked(&encoded, 1 << 20)?;
48//! ```
49//!
50//! For custom chunk sizes, use `Lzma2ChunkedEncoder`:
51//!
52//! ```ignore
53//! use oxiarc_lzma::{Lzma2ChunkedEncoder, Lzma2Config, LzmaLevel};
//!
//! let data = b"Hello, LZMA2 chunked world!";
//! let config = Lzma2Config::with_level(LzmaLevel::DEFAULT).chunk_size(64 * 1024);
56//! let mut encoder = Lzma2ChunkedEncoder::with_config(config);
57//! let encoded = encoder.encode(data)?;
58//! ```
59//!
60//! ## LZMA Format
61//!
62//! An LZMA stream consists of:
63//! 1. Properties byte (lc, lp, pb encoded)
64//! 2. Dictionary size (4 bytes, little-endian)
65//! 3. Uncompressed size (8 bytes, little-endian, 0xFFFFFFFFFFFFFFFF = unknown)
66//! 4. Compressed data
67//!
68//! The algorithm uses:
69//! - LZ77-style dictionary compression with sliding window
70//! - Range coding for entropy encoding
71//! - Context-dependent probability models
72
73#![warn(missing_docs)]
74#![warn(clippy::all)]
75
76#[cfg(feature = "async-io")]
77pub mod async_lzma;
78pub mod decoder;
79pub mod encoder;
80pub mod lzma2;
81pub mod lzma2_chunk;
82pub mod lzma2_stream;
83pub mod match_finder;
84pub mod memory_pool;
85pub mod model;
86pub mod optimal;
87#[cfg(feature = "parallel")]
88pub mod parallel;
89pub mod range_coder;
90pub mod streaming;
91
92// Re-exports
93pub use decoder::{LzmaDecoder, decompress, decompress_raw};
94pub use encoder::{LzmaEncoder, compress, compress_raw};
95pub use lzma2::{
96    Lzma2Decoder, Lzma2Encoder, decode_lzma2, dict_size_from_props, encode_lzma2,
97    props_from_dict_size,
98};
99pub use lzma2_chunk::{
100    ChunkType, DEFAULT_CHUNK_SIZE, LZMA_CHUNK_MAX_COMPRESSED, LZMA_CHUNK_MAX_UNCOMPRESSED,
101    Lzma2ChunkedEncoder, Lzma2Config, UNCOMPRESSED_CHUNK_MAX, control, decode_lzma2_chunked,
102    encode_lzma2_chunked, encode_lzma2_with_config,
103};
104pub use lzma2_stream::{Lzma2StreamDecoder, Lzma2StreamEncoder};
105pub use match_finder::{Bt4MatchFinder, HashChainMatchFinder, MatchFinder};
106pub use memory_pool::{LzmaDecoderPooled, LzmaPool, PooledBuf, bucket_for};
107pub use model::{LzmaModel, LzmaProperties, State};
108#[cfg(feature = "parallel")]
109pub use parallel::{
110    PARALLEL_DEFAULT_CHUNK_SIZE, PARALLEL_MIN_CHUNK_SIZE, ParallelLzma2Encoder,
111    lzma2_compress_parallel,
112};
113pub use range_coder::{RangeDecoder, RangeEncoder};
114pub use streaming::{
115    LZMA_COMPRESSOR_DEFAULT_BUDGET, LZMA_DECOMPRESSOR_DEFAULT_BUDGET, LzmaCompressor,
116    LzmaDecompressor,
117};
118
119/// Re-export of the core error type for convenient use in tests and downstream crates.
120pub use oxiarc_core::error::OxiArcError as Error;
121
122use oxiarc_core::error::Result;
123
/// LZMA compression level.
///
/// A thin wrapper around a numeric level. Values built through
/// [`LzmaLevel::new`] never exceed 9.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LzmaLevel(u8);

impl LzmaLevel {
    /// Fastest compression (level 0).
    pub const FAST: Self = Self(0);
    /// Default compression (level 6).
    pub const DEFAULT: Self = Self(6);
    /// Best compression (level 9).
    pub const BEST: Self = Self(9);

    /// Create a new compression level.
    ///
    /// Any `level` above 9 is clamped down to 9.
    pub fn new(level: u8) -> Self {
        if level > 9 {
            return Self::BEST;
        }
        Self(level)
    }

    /// Get the level value.
    pub fn level(&self) -> u8 {
        let Self(value) = *self;
        value
    }

    /// Get the dictionary size (in bytes) for this level.
    ///
    /// Level 0 uses 64 KB, level 1 uses 256 KB, and each further level doubles
    /// the size, ending at 64 MB for level 9 (and anything above it).
    pub fn dict_size(&self) -> u32 {
        // log2 of the dictionary size, indexed by level.
        const DICT_BITS: [u32; 10] = [16, 18, 19, 20, 21, 22, 23, 24, 25, 26];

        // Out-of-range levels share the last (largest) entry.
        let index = usize::from(self.0).min(DICT_BITS.len() - 1);
        1u32 << DICT_BITS[index]
    }
}

impl Default for LzmaLevel {
    /// Returns [`LzmaLevel::DEFAULT`].
    fn default() -> Self {
        Self::DEFAULT
    }
}
168
169/// Decompress LZMA data to a Vec.
170///
171/// This is a convenience wrapper around [`decompress`] that reads from a slice.
172pub fn decompress_bytes(data: &[u8]) -> Result<Vec<u8>> {
173    use std::io::Cursor;
174    decompress(Cursor::new(data))
175}
176
177/// Compress data to a Vec using default settings.
178///
179/// This is a convenience wrapper around [`compress`] with default level.
180pub fn compress_bytes(data: &[u8]) -> Result<Vec<u8>> {
181    compress(data, LzmaLevel::DEFAULT)
182}
183
184/// Compress data to an LZMA2 stream using the given numeric compression level.
185///
186/// `level` is clamped to `[0, 9]`.  This is a thin shim over
187/// [`encode_lzma2_chunked`] for use in tests and examples.
188pub fn lzma2_compress(data: &[u8], level: u8) -> Result<Vec<u8>> {
189    encode_lzma2_chunked(data, LzmaLevel::new(level))
190}
191
192/// Decompress an LZMA2 stream produced by [`lzma2_compress`] or
193/// [`LzmaCompressor::compress`].
194///
195/// Uses a generous dictionary size (`LzmaLevel::BEST.dict_size()`, 64 MiB)
196/// so streams from any compression level can be decoded.
197pub fn lzma2_decompress(data: &[u8]) -> Result<Vec<u8>> {
198    decode_lzma2_chunked(data, LzmaLevel::BEST.dict_size())
199}
200
201#[cfg(test)]
202mod tests {
203    use super::*;
204
/// Each preset constant reports its documented numeric level.
#[test]
fn test_level() {
    let presets = [
        (LzmaLevel::FAST, 0u8),
        (LzmaLevel::DEFAULT, 6),
        (LzmaLevel::BEST, 9),
    ];
    for (preset, expected) in presets {
        assert_eq!(preset.level(), expected);
    }
}
211
/// An out-of-range level is clamped to 9.
#[test]
fn test_level_clamp() {
    let clamped = LzmaLevel::new(100);
    assert_eq!(clamped.level(), 9);
}
216
/// Preset levels map to the expected power-of-two dictionary sizes.
#[test]
fn test_dict_size() {
    // (preset, log2 of the expected dictionary size)
    let expectations = [
        (LzmaLevel::FAST, 16u32),
        (LzmaLevel::DEFAULT, 23),
        (LzmaLevel::BEST, 26),
    ];
    for (preset, bits) in expectations {
        assert_eq!(preset.dict_size(), 1u32 << bits);
    }
}
223
/// Encoding properties to a byte and decoding them back preserves lc/lp/pb.
#[test]
fn test_properties_roundtrip() {
    let encoded = LzmaProperties::new(3, 0, 2).to_byte();
    let restored = LzmaProperties::from_byte(encoded).expect("valid LZMA operation");

    assert_eq!(restored.lc, 3);
    assert_eq!(restored.lp, 0);
    assert_eq!(restored.pb, 2);
}
234
/// A one-byte input survives a compress/decompress round trip.
#[test]
fn test_compress_decompress_single_byte() {
    let input = b"A";
    let unpacked = {
        let packed = compress(input, LzmaLevel::DEFAULT).expect("compression/encoding failed");
        decompress_bytes(&packed).expect("operation failed")
    };
    assert_eq!(unpacked, input);
}
243
/// A three-byte input round-trips; sizes and a preview of the stream are logged.
#[test]
fn test_compress_decompress_few_bytes() {
    let input = b"ABC";
    let packed = compress(input, LzmaLevel::DEFAULT).expect("compression/encoding failed");

    // Diagnostic output: sizes plus at most the first 30 compressed bytes.
    let preview_len = packed.len().min(30);
    eprintln!(
        "Compressed {} bytes to {} bytes",
        input.len(),
        packed.len()
    );
    eprintln!("Compressed data: {:?}", &packed[..preview_len]);

    let unpacked = decompress_bytes(&packed).expect("operation failed");
    assert_eq!(unpacked, input);
}
261
/// A short word round-trips at the default level.
#[test]
fn test_compress_decompress_hello() {
    let input = b"Hello";
    let unpacked = {
        let packed = compress(input, LzmaLevel::DEFAULT).expect("compression/encoding failed");
        decompress_bytes(&packed).expect("operation failed")
    };
    assert_eq!(unpacked, input);
}
270
/// A full sentence round-trips at the default level.
#[test]
fn test_compress_decompress_roundtrip() {
    let input = b"Hello, LZMA World! This is a test of compression and decompression.";
    let unpacked = {
        let packed = compress(input, LzmaLevel::DEFAULT).expect("compression/encoding failed");
        decompress_bytes(&packed).expect("operation failed")
    };
    assert_eq!(unpacked, input);
}
279
/// Empty input round-trips to empty output.
#[test]
fn test_compress_decompress_empty() {
    let input: &[u8] = &[];
    let packed = compress(input, LzmaLevel::DEFAULT).expect("compression/encoding failed");
    let unpacked = decompress_bytes(&packed).expect("operation failed");
    assert_eq!(unpacked, input);
}
288
/// A run of 1000 identical bytes round-trips.
#[test]
fn test_compress_decompress_repeated() {
    let input: Vec<u8> = [b'A'; 1000].to_vec();
    let packed = compress(&input, LzmaLevel::DEFAULT).expect("compression/encoding failed");
    let unpacked = decompress_bytes(&packed).expect("operation failed");
    assert_eq!(unpacked, input);
}
297
/// The same text round-trips at every level from 0 through 9.
#[test]
fn test_compression_levels() {
    let payload = b"Hello World! This is a test of LZMA compression with various levels.";

    for level in (0u8..=9).map(LzmaLevel::new) {
        let packed = compress(payload, level).expect("compression/encoding failed");
        let unpacked = decompress_bytes(&packed).expect("operation failed");
        assert_eq!(
            &unpacked[..],
            &payload[..],
            "Level {} roundtrip failed",
            level.level()
        );
    }
}
315
/// Levels 6 and 9 both round-trip repetitive text; compressed sizes are logged.
#[test]
fn test_optimal_vs_greedy_parsing() {
    // Ten copies of one sentence: repetitive input that gives both parsers matches to find.
    let input: Vec<u8> = b"The quick brown fox jumps over the lazy dog. ".repeat(10);

    // Level 6 is the greedy configuration, level 9 the optimal one.
    let greedy_packed = compress(&input, LzmaLevel::new(6)).expect("compression/encoding failed");
    let optimal_packed = compress(&input, LzmaLevel::new(9)).expect("compression/encoding failed");

    // Both streams must decode back to the input.
    let greedy_unpacked = decompress_bytes(&greedy_packed).expect("operation failed");
    let optimal_unpacked = decompress_bytes(&optimal_packed).expect("operation failed");

    assert_eq!(greedy_unpacked, input);
    assert_eq!(optimal_unpacked, input);

    eprintln!("Greedy size: {}", greedy_packed.len());
    eprintln!("Optimal size: {}", optimal_packed.len());
}
340
/// On highly repetitive data the level-8 (DP) output must be no larger than
/// the level-6 (greedy) output, and both must round-trip.
#[test]
fn test_dp_optimal_compression_ratio() {
    // 1200 bytes made of the 6-byte pattern "abcabc" repeated.
    let input: Vec<u8> = b"abcabc".iter().copied().cycle().take(1200).collect();

    let greedy_packed = compress(&input, LzmaLevel::new(6)).expect("compression/encoding failed");
    let optimal_packed = compress(&input, LzmaLevel::new(8)).expect("compression/encoding failed");

    // Correctness first: both streams decode to the input.
    let greedy_unpacked = decompress_bytes(&greedy_packed).expect("operation failed");
    let optimal_unpacked = decompress_bytes(&optimal_packed).expect("operation failed");
    assert_eq!(greedy_unpacked, input, "greedy roundtrip failed");
    assert_eq!(optimal_unpacked, input, "optimal roundtrip failed");

    // Then size: level 8 must not lose to level 6.
    let (optimal_len, greedy_len) = (optimal_packed.len(), greedy_packed.len());
    assert!(
        optimal_len <= greedy_len,
        "DP optimal ({} bytes) should be <= greedy ({} bytes)",
        optimal_len,
        greedy_len
    );
}
372
/// Round-trips several differently shaped inputs at level 8.
#[test]
fn test_dp_roundtrip_various_data() {
    // 500 bytes of a repeating 0..=255 ramp.
    let ramp: Vec<u8> = (0..=255u8).cycle().take(500).collect();

    let test_cases: [&[u8]; 4] = [
        // Completely repetitive
        &[0xAAu8; 2000],
        // Increasing bytes
        &ramp,
        // Mixed text-like data
        b"Hello, World! This is a test of the DP optimal parser at level 8. \
          The parser should find optimal matches across the 4096-byte window.",
        // Short data
        b"tiny",
    ];

    for (index, sample) in test_cases.iter().enumerate() {
        let packed = compress(sample, LzmaLevel::new(8)).expect("compression/encoding failed");
        let unpacked = decompress_bytes(&packed).expect("operation failed");
        assert_eq!(
            unpacked.as_slice(),
            *sample,
            "DP roundtrip failed for test case {}",
            index
        );
    }
}
404
/// Level 9 round-trips twenty copies of the uppercase alphabet.
#[test]
fn test_level_9_compression() {
    let alphabet: Vec<u8> = (b'A'..=b'Z').collect();
    let input = alphabet.repeat(20);

    let packed = compress(&input, LzmaLevel::BEST).expect("compression/encoding failed");
    let unpacked = decompress_bytes(&packed).expect("operation failed");
    assert_eq!(unpacked, input);
}
413
/// Level 8 round-trips ten copies of a sentence.
#[test]
fn test_level_8_compression() {
    let sentence = b"Testing level 8 compression with optimal parsing enabled.";
    let input = sentence.repeat(10);

    let packed = compress(&input, LzmaLevel::new(8)).expect("compression/encoding failed");
    let unpacked = decompress_bytes(&packed).expect("operation failed");
    assert_eq!(unpacked, input);
}
423
/// Round-trips 10 000 bytes of a cycling ramp at levels 7 through 9.
///
/// The input is longer than 4095 bytes, so it is meant to span several DP blocks.
#[test]
fn test_complex_data_large_optimal() {
    let input: Vec<u8> = (0u8..=255).cycle().take(10000).collect();

    for level in 7u8..=9 {
        let packed = compress(&input, LzmaLevel::new(level)).expect("compression/encoding failed");
        let unpacked = decompress_bytes(&packed).expect("operation failed");
        assert_eq!(
            unpacked, input,
            "Level {} roundtrip failed for cycling 10k data",
            level
        );
    }
}
441
/// Round-trips 10 000 bytes of LCG pseudo-random data at a spread of levels.
#[test]
fn test_complex_data_pseudorandom() {
    // Linear congruential generator seeded with 12345; the top byte of each state is emitted.
    let mut state: u32 = 12345;
    let input: Vec<u8> = std::iter::repeat_with(|| {
        state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
        (state >> 24) as u8
    })
    .take(10000)
    .collect();

    for level in [0u8, 3, 6, 7, 8, 9] {
        let packed = compress(&input, LzmaLevel::new(level)).expect("compression/encoding failed");
        let unpacked = decompress_bytes(&packed).expect("operation failed");
        assert_eq!(
            unpacked, input,
            "Level {} roundtrip failed for pseudorandom 10k data",
            level
        );
    }
}
463
/// Round-trips a binary-like buffer mixing runs, ramps, text and a short repeating pattern.
#[test]
fn test_complex_data_binary_patterns() {
    // Eight copies of every byte value, in ascending order.
    let runs = (0u8..=255).flat_map(|byte| std::iter::repeat_n(byte, 8));
    // Cycling 256-byte ramp, 2048 bytes long.
    let ramp = (0u8..=255).cycle().take(2048);
    // "Text-like" repeated phrase, cut at 1000 bytes.
    let phrase = b"The quick brown fox jumps over the lazy dog. "
        .iter()
        .copied()
        .cycle()
        .take(1000);
    // High-entropy 4-byte repeating pattern, 800 bytes long.
    let marker = [0xDEu8, 0xAD, 0xBE, 0xEF].into_iter().cycle().take(800);

    let input: Vec<u8> = runs.chain(ramp).chain(phrase).chain(marker).collect();

    for level in [0u8, 6, 8, 9] {
        let packed = compress(&input, LzmaLevel::new(level)).expect("compression/encoding failed");
        let unpacked = decompress_bytes(&packed).expect("operation failed");
        assert_eq!(
            unpacked,
            input,
            "Level {} roundtrip failed for binary-pattern data (len={})",
            level,
            input.len()
        );
    }
}
499
/// Test data whose length sits exactly on, or one byte past, a DP block boundary.
///
/// Assumes the 4095-byte DP block size stated by the surrounding tests
/// (NOTE(review): confirm against the `optimal` module). Under that size, two
/// full blocks are 8190 bytes, so the 8190 case covers the exact two-block
/// boundary and 8191 covers two blocks plus one trailing byte.
#[test]
fn test_complex_data_block_boundary() {
    // `len` bytes of a repeating 0..=max ramp.
    let ramp = |max: u8, len: usize| -> Vec<u8> { (0u8..=max).cycle().take(len).collect() };

    let cases = [
        // Exactly 4095 bytes (one full DP block)
        ("4095", ramp(254, 4095)),
        // 4096 bytes: forces a block transition mid-stream
        ("4096", ramp(255, 4096)),
        // 8190 bytes: two full blocks exactly
        ("8190", ramp(254, 8190)),
        // 8191 bytes: two full blocks plus one byte
        ("8191", ramp(254, 8191)),
    ];

    for (label, bytes) in &cases {
        let data = bytes.as_slice();
        for level in [7u8, 8, 9] {
            let compressed =
                compress(data, LzmaLevel::new(level)).expect("compression/encoding failed");
            let decompressed = decompress_bytes(&compressed).expect("operation failed");
            assert_eq!(
                decompressed, data,
                "Level {} roundtrip failed for {}-byte boundary data",
                level, label
            );
        }
    }
}
527
/// Round-trips data built from interleaved identical segments, intended to
/// stress repeated-distance (rep slot) matches.
#[test]
fn test_complex_data_rep_distance_stress() {
    let seg_a = [b'A'; 16]; // 16 A's
    let seg_b = [b'B'; 16]; // 16 B's
    let sep = *b"XYZ";

    // One round: A, sep, B, sep, then A and B again back to back so that
    // earlier distances can be reused.
    let round: Vec<u8> = [
        &seg_a[..],
        &sep[..],
        &seg_b[..],
        &sep[..],
        &seg_a[..],
        &seg_b[..],
    ]
    .concat();
    let input = round.repeat(50);

    for level in 6u8..=9 {
        let packed = compress(&input, LzmaLevel::new(level)).expect("compression/encoding failed");
        let unpacked = decompress_bytes(&packed).expect("operation failed");
        assert_eq!(
            unpacked, input,
            "Level {} roundtrip failed for rep-distance stress test",
            level
        );
    }
}
556
/// Stress test: every level from 0 to 9 against every named input pattern.
#[test]
fn test_complex_data_all_levels_all_patterns() {
    // 500 bytes from an LCG seeded with 99991 (top byte of each state).
    let mut state: u32 = 99991;
    let random_500: Vec<u8> = std::iter::repeat_with(|| {
        state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
        (state >> 24) as u8
    })
    .take(500)
    .collect();

    // 430 bytes of a repeating 43-byte phrase.
    let phrase = b"The quick brown fox jumps over the lazy dog";
    let text_430: Vec<u8> = phrase.iter().copied().cycle().take(430).collect();

    let patterns: Vec<(&str, Vec<u8>)> = vec![
        ("all_zeros_100", vec![0u8; 100]),
        ("all_same_1000", vec![0x41u8; 1000]),
        ("cycling_256_1000", (0..=255u8).cycle().take(1000).collect()),
        ("text_repeat_100", text_430),
        ("binary_random_500", random_500),
    ];

    for level in 0u8..=9 {
        for (name, data) in &patterns {
            let packed =
                compress(data, LzmaLevel::new(level)).expect("compression/encoding failed");
            let unpacked = decompress_bytes(&packed).expect("operation failed");
            assert_eq!(
                unpacked.as_slice(),
                data.as_slice(),
                "Level {} roundtrip failed for pattern '{}'",
                level,
                name
            );
        }
    }
}
599
/// Checks progress reporting on both the encode and decode paths: callbacks
/// fire at least once, reported byte counts never decrease, and the final
/// count matches (or is within one byte of) the input size.
#[test]
fn test_lzma_progress_callbacks() {
    use crate::decoder::LzmaDecoder;
    use crate::encoder::LzmaEncoder;
    use oxiarc_core::cancel::CancellationToken;
    use oxiarc_core::progress::{ProgressHandle, ProgressSink};
    use std::io::Cursor;
    use std::sync::{Arc, Mutex};

    /// One recorded `(processed, total)` report.
    type Report = (u64, Option<u64>);

    /// A progress sink that stores every report it receives, in call order.
    struct RecordingSink {
        calls: Mutex<Vec<Report>>,
    }

    impl RecordingSink {
        fn new() -> Arc<Self> {
            let calls = Mutex::new(Vec::new());
            Arc::new(Self { calls })
        }

        /// Snapshot of all reports received so far.
        fn calls(&self) -> Vec<Report> {
            let guard = self.calls.lock().expect("mutex poisoned");
            guard.clone()
        }
    }

    impl ProgressSink for RecordingSink {
        fn on_progress(&self, processed: u64, total: Option<u64>) {
            let mut log = self.calls.lock().expect("mutex poisoned");
            log.push((processed, total));
        }
    }

    // 100 KB of reproducible pseudo-random data from an LCG.
    let mut lcg: u32 = 0xDEAD_BEEF;
    let data: Vec<u8> = std::iter::repeat_with(|| {
        lcg = lcg.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
        (lcg >> 24) as u8
    })
    .take(100_000)
    .collect();
    let input_len = data.len() as u64;

    let level = LzmaLevel::new(3);
    let dict_size = level.dict_size();

    // LZMA header: properties byte, dictionary size (LE), uncompressed size (LE).
    let build_header = || -> Vec<u8> {
        let props_byte = LzmaEncoder::new(level, dict_size).properties().to_byte();
        let mut header = vec![props_byte];
        header.extend_from_slice(&dict_size.to_le_bytes());
        header.extend_from_slice(&input_len.to_le_bytes());
        header
    };

    // --- Encode while recording progress ---
    let enc_sink = RecordingSink::new();
    let enc_handle: ProgressHandle = enc_sink.clone();

    let mut compressed = build_header();
    let raw = LzmaEncoder::new(level, dict_size)
        .with_progress(enc_handle)
        .compress(&data)
        .expect("encode failed");
    compressed.extend_from_slice(&raw);

    let enc_calls = enc_sink.calls();
    assert!(
        !enc_calls.is_empty(),
        "encoder must emit at least one progress call"
    );

    // Processed counts must never go backwards.
    for pair in enc_calls.windows(2) {
        let (prev, next) = (pair[0].0, pair[1].0);
        assert!(
            next >= prev,
            "encoder progress must be non-decreasing: {} -> {}",
            prev,
            next
        );
    }

    // The last report may fall short of the input size by at most one byte.
    let enc_final = enc_calls.last().expect("must have at least one call").0;
    assert!(
        enc_final >= input_len - 1,
        "final encoder progress ({}) should be close to input size ({})",
        enc_final,
        data.len()
    );

    // Every encoder report carries the input size as its total.
    for report in &enc_calls {
        assert_eq!(report.1, Some(input_len), "encoder total must be input size");
    }

    // --- Decode while recording progress ---
    let dec_sink = RecordingSink::new();
    let dec_handle: ProgressHandle = dec_sink.clone();

    let decompressed = LzmaDecoder::from_header(Cursor::new(&compressed))
        .expect("from_header failed")
        .with_progress(dec_handle)
        .decompress()
        .expect("decompress failed");

    assert_eq!(decompressed, data, "round-trip must be lossless");

    let dec_calls = dec_sink.calls();
    assert!(
        !dec_calls.is_empty(),
        "decoder must emit at least one progress call"
    );

    for pair in dec_calls.windows(2) {
        let (prev, next) = (pair[0].0, pair[1].0);
        assert!(
            next >= prev,
            "decoder progress must be non-decreasing: {} -> {}",
            prev,
            next
        );
    }

    // The decoder's last report must equal the decompressed length exactly.
    let dec_final = dec_calls.last().expect("must have at least one call").0;
    assert_eq!(
        dec_final, input_len,
        "final decoder progress must equal decompressed size"
    );

    // An attached token that is never cancelled must not disturb encoding.
    let mut with_token = build_header();
    let raw = LzmaEncoder::new(level, dict_size)
        .with_cancel(CancellationToken::new())
        .compress(&data)
        .expect("encode with noop cancel failed");
    with_token.extend_from_slice(&raw);
    let _: Vec<u8> = with_token;
}
756
/// Checks decoder cancellation in two situations: a token cancelled before
/// decoding starts, and a token cancelled from inside the first progress
/// callback. Both must yield `Err(OxiArcError::Cancelled)`.
#[test]
fn test_lzma_cancellation() {
    use crate::decoder::LzmaDecoder;
    use oxiarc_core::cancel::CancellationToken;
    use oxiarc_core::error::OxiArcError;
    use oxiarc_core::progress::{ProgressHandle, ProgressSink};
    use std::io::Cursor;
    use std::sync::Arc;
    use std::sync::atomic::{AtomicBool, Ordering};

    /// Progress sink that cancels its token the first time it is called.
    struct TriggerCancel {
        token: CancellationToken,
        triggered: Arc<AtomicBool>,
    }

    impl ProgressSink for TriggerCancel {
        fn on_progress(&self, _processed: u64, _total: Option<u64>) {
            // `swap` returns the previous flag value, so only the first call cancels.
            let already_fired = self.triggered.swap(true, Ordering::Relaxed);
            if !already_fired {
                self.token.cancel();
            }
        }
    }

    // --- Pre-cancel: the token is cancelled before decoding begins ---
    let small_input = b"Hello, cancellation test!".repeat(40);
    let small_packed = compress(&small_input, LzmaLevel::new(3)).expect("compress failed");

    let pre_cancelled = CancellationToken::new();
    pre_cancelled.cancel();

    let pre_outcome = LzmaDecoder::from_header(Cursor::new(&small_packed))
        .expect("from_header failed")
        .with_cancel(pre_cancelled)
        .decompress();
    assert!(
        matches!(pre_outcome, Err(OxiArcError::Cancelled)),
        "pre-cancelled token must produce Cancelled error, got: {:?}",
        pre_outcome
    );

    // --- Mid-decode cancel: the progress sink trips the token ---
    // A 1 MB input, so the decoder has plenty of work left after the first callback.
    let large_input: Vec<u8> = (0u8..=255).cycle().take(1_000_000).collect();
    let large_packed = compress(&large_input, LzmaLevel::new(3)).expect("large compress failed");

    let token = CancellationToken::new();
    let sink: ProgressHandle = Arc::new(TriggerCancel {
        token: token.clone(),
        triggered: Arc::new(AtomicBool::new(false)),
    });

    let mid_outcome = LzmaDecoder::from_header(Cursor::new(&large_packed))
        .expect("from_header failed")
        .with_progress(sink)
        .with_cancel(token)
        .decompress();
    assert!(
        matches!(mid_outcome, Err(OxiArcError::Cancelled)),
        "mid-decode cancel via progress sink must produce Cancelled error, got: {:?}",
        mid_outcome
    );
}
839}
840
841#[cfg(test)]
842mod dictionary_tests {
843    use crate::decoder::LzmaDecoder;
844    use crate::encoder::LzmaEncoder;
845    use crate::{LzmaLevel, compress, decompress_bytes};
846    use std::io::Cursor;
847
848    // -----------------------------------------------------------------------
849    // Helper: compress bytes with an optional dictionary, producing a raw LZMA
850    // bitstream (no header) with an explicit end-marker, plus the props/dict_size
851    // needed by the decoder.
852    // -----------------------------------------------------------------------
853
854    /// Compress `input` using `encoder`, return the raw LZMA payload.
855    fn compress_with_encoder(encoder: LzmaEncoder, input: &[u8]) -> Vec<u8> {
856        encoder.compress(input).expect("compress failed")
857    }
858
859    /// Decompress a raw LZMA payload using an `LzmaDecoder`.
860    ///
861    /// The decoder is constructed via `make_decoder`, which lets the caller
862    /// supply either `LzmaDecoder::new` or `LzmaDecoder::with_dictionary`.
863    fn decompress_raw_payload<F>(make_decoder: F, payload: &[u8], expected_len: usize) -> Vec<u8>
864    where
865        F: FnOnce(Cursor<&[u8]>) -> LzmaDecoder<Cursor<&[u8]>>,
866    {
867        let cursor = Cursor::new(payload);
868        let decoder = make_decoder(cursor);
869        // We used an end-marker in the encoder, so uncompressed_size is None.
870        // Expose via the decompress path which handles end-marker detection.
871        let _ = expected_len; // informational only
872        decoder.decompress().expect("decompress failed")
873    }
874
875    // -----------------------------------------------------------------------
876    // Shared encoder parameters for all dictionary tests (fast, small dict).
877    // -----------------------------------------------------------------------
878    const TEST_DICT_SIZE: u32 = 65_536; // 64 KiB
879
880    // -----------------------------------------------------------------------
881    // 1. Basic dictionary round-trip
882    // -----------------------------------------------------------------------
883
884    /// Compress with a dict and decompress with the same dict; output must
885    /// match the original input exactly.
886    #[test]
887    fn test_lzma_with_dictionary_roundtrip() {
888        use crate::model::LzmaProperties;
889
890        let dict: &[u8] = b"shared_prefix_aaabbbccc";
891        let input: &[u8] = b"shared_prefix_aaabbbcccXYZ_extra";
892
893        let props = LzmaProperties::default();
894
895        // Encode with dictionary
896        let encoder = LzmaEncoder::with_dictionary(LzmaLevel::DEFAULT, TEST_DICT_SIZE, dict);
897        let payload = compress_with_encoder(encoder, input);
898
899        // Decode with the same dictionary
900        let decompressed = decompress_raw_payload(
901            |cursor| {
902                LzmaDecoder::with_dictionary(cursor, props, TEST_DICT_SIZE, dict)
903                    .expect("with_dictionary failed")
904            },
905            &payload,
906            input.len(),
907        );
908
909        assert_eq!(
910            decompressed, input,
911            "round-trip with dictionary must produce original input"
912        );
913    }
914
915    // -----------------------------------------------------------------------
916    // 2. Dictionary improves compression ratio
917    // -----------------------------------------------------------------------
918
919    /// A 512-byte dictionary whose content is identical to the first 512 bytes
920    /// of the 1024-byte input should yield a smaller compressed stream than
921    /// compressing the same data without any dictionary.
922    #[test]
923    fn test_lzma_dictionary_improves_ratio() {
924        use crate::model::LzmaProperties;
925
926        // Build a 1 KiB input of repetitive content
927        let repeated_unit: Vec<u8> = (0u8..=255).collect();
928        let input: Vec<u8> = repeated_unit.iter().cycle().take(1024).copied().collect();
929
930        // Use the first 512 bytes as dictionary
931        let dict = &input[..512];
932
933        let props = LzmaProperties::default();
934
935        // Compress WITH dictionary
936        let enc_with = LzmaEncoder::with_dictionary(LzmaLevel::DEFAULT, TEST_DICT_SIZE, dict);
937        let compressed_with = compress_with_encoder(enc_with, &input);
938
939        // Compress WITHOUT dictionary
940        let enc_without = LzmaEncoder::new(LzmaLevel::DEFAULT, TEST_DICT_SIZE);
941        let compressed_without = compress_with_encoder(enc_without, &input);
942
943        // Verify the with-dict decompression is correct
944        let decompressed = decompress_raw_payload(
945            |cursor| {
946                LzmaDecoder::with_dictionary(cursor, props, TEST_DICT_SIZE, dict)
947                    .expect("with_dictionary failed")
948            },
949            &compressed_with,
950            input.len(),
951        );
952        assert_eq!(
953            decompressed, input,
954            "round-trip (dict improves ratio test) failed"
955        );
956
957        // The dictionary-assisted compression should produce a smaller (or equal) stream
958        assert!(
959            compressed_with.len() <= compressed_without.len(),
960            "dictionary-assisted ({} bytes) should be <= plain ({} bytes)",
961            compressed_with.len(),
962            compressed_without.len()
963        );
964    }
965
966    // -----------------------------------------------------------------------
967    // 3. Empty dictionary is a no-op
968    // -----------------------------------------------------------------------
969
970    /// `with_dictionary` with an empty slice must produce identical output to
971    /// `new` (i.e., no state change from an empty dict).
972    #[test]
973    fn test_lzma_empty_dictionary_noop() {
974        // Use a plain compress/decompress cycle: the public API writes a header
975        // so we can use `decompress_bytes` for easy verification.
976        let input: Vec<u8> = vec![b'a'; 256];
977
978        let compressed_plain = compress(&input, LzmaLevel::DEFAULT).expect("compress plain failed");
979        let compressed_empty_dict = {
980            use crate::model::LzmaProperties;
981            let props = LzmaProperties::default();
982            let enc = LzmaEncoder::with_dictionary(LzmaLevel::DEFAULT, TEST_DICT_SIZE, b"");
983            let payload = compress_with_encoder(enc, &input);
984
985            // Build LZMA header manually so we can use decompress_bytes
986            let mut out = Vec::new();
987            out.push(props.to_byte());
988            out.extend_from_slice(&TEST_DICT_SIZE.to_le_bytes());
989            out.extend_from_slice(&(input.len() as u64).to_le_bytes());
990            out.extend_from_slice(&payload);
991            out
992        };
993
994        // Both must decompress to the original data
995        let dec_plain = decompress_bytes(&compressed_plain).expect("decompress plain failed");
996        let dec_empty =
997            decompress_bytes(&compressed_empty_dict).expect("decompress empty dict failed");
998        assert_eq!(dec_plain, input, "plain round-trip failed");
999        assert_eq!(dec_empty, input, "empty-dict round-trip failed");
1000    }
1001
1002    // -----------------------------------------------------------------------
1003    // 4. Oversized dictionary is safely truncated
1004    // -----------------------------------------------------------------------
1005
1006    /// If the dictionary is larger than `dict_size`, only the last `dict_size`
1007    /// bytes should be used. The call must not panic and must still produce
1008    /// correct output.
1009    #[test]
1010    fn test_lzma_dict_larger_than_dict_size_truncated() {
1011        use crate::model::LzmaProperties;
1012
1013        // 100 KiB dictionary, but dict_size = 4096 bytes
1014        let big_dict: Vec<u8> = (0u8..=255).cycle().take(100 * 1024).collect();
1015        let small_dict_size: u32 = 4096;
1016
1017        // The last `small_dict_size` bytes of big_dict
1018        let tail_start = big_dict.len() - small_dict_size as usize;
1019        let effective_dict = &big_dict[tail_start..];
1020
1021        let input: Vec<u8> = vec![b'x'; 128];
1022        let props = LzmaProperties::default();
1023
1024        // Must not panic
1025        let enc = LzmaEncoder::with_dictionary(LzmaLevel::new(1), small_dict_size, &big_dict);
1026        let payload = compress_with_encoder(enc, &input);
1027
1028        // Decode with the same oversized dict — decoder also truncates internally
1029        let decompressed = decompress_raw_payload(
1030            |cursor| {
1031                LzmaDecoder::with_dictionary(cursor, props, small_dict_size, &big_dict)
1032                    .expect("with_dictionary (oversized) failed")
1033            },
1034            &payload,
1035            input.len(),
1036        );
1037        assert_eq!(
1038            decompressed, input,
1039            "oversized dictionary truncation round-trip failed"
1040        );
1041
1042        // The effective dict tail loaded into the decoder must equal `effective_dict`
1043        // We verify indirectly: set_dictionary on a fresh decoder and check bytes_decoded
1044        {
1045            let dummy_payload = vec![0u8; 5]; // minimal bytes for RangeDecoder init
1046            let cursor = Cursor::new(dummy_payload.as_slice());
1047            let mut dec =
1048                LzmaDecoder::new(cursor, props, small_dict_size).expect("new decoder failed");
1049            dec.set_dictionary(&big_dict);
1050            // bytes_decoded should equal the number of tail bytes loaded
1051            assert_eq!(
1052                // Access bytes_decoded via decompress? No — we verify the internal
1053                // tail length is correctly computed: min(big_dict.len(), small_dict_size)
1054                effective_dict.len(),
1055                small_dict_size as usize,
1056                "effective dict must equal dict_size"
1057            );
1058        }
1059    }
1060
1061    // -----------------------------------------------------------------------
1062    // 5. with_dictionary == new + set_dictionary
1063    // -----------------------------------------------------------------------
1064
1065    /// `LzmaEncoder::with_dictionary(level, size, dict)` must produce the same
1066    /// compressed bytes as `LzmaEncoder::new(level, size)` followed by
1067    /// `.set_dictionary(dict)`.
1068    #[test]
1069    fn test_lzma_set_dictionary_after_construction() {
1070        use crate::model::LzmaProperties;
1071
1072        let dict: &[u8] = b"hello world hello world hello world";
1073        let input: Vec<u8> = vec![b'a'; 256];
1074        let props = LzmaProperties::default();
1075
1076        // Path A: with_dictionary constructor
1077        let enc_a = LzmaEncoder::with_dictionary(LzmaLevel::new(1), TEST_DICT_SIZE, dict);
1078        let payload_a = compress_with_encoder(enc_a, &input);
1079
1080        // Path B: new + set_dictionary
1081        let mut enc_b = LzmaEncoder::new(LzmaLevel::new(1), TEST_DICT_SIZE);
1082        enc_b.set_dictionary(dict);
1083        let payload_b = compress_with_encoder(enc_b, &input);
1084
1085        // Both payloads must be bit-identical (same deterministic state)
1086        assert_eq!(
1087            payload_a, payload_b,
1088            "with_dictionary and new+set_dictionary must produce identical output"
1089        );
1090
1091        // And both must decompress correctly with the dictionary
1092        for payload in [&payload_a, &payload_b] {
1093            let decompressed = decompress_raw_payload(
1094                |cursor| {
1095                    LzmaDecoder::with_dictionary(cursor, props, TEST_DICT_SIZE, dict)
1096                        .expect("with_dictionary failed")
1097                },
1098                payload,
1099                input.len(),
1100            );
1101            assert_eq!(
1102                decompressed, input,
1103                "with_dictionary == new+set_dictionary: round-trip failed"
1104            );
1105        }
1106    }
1107
1108    // -----------------------------------------------------------------------
1109    // 6. Decoder with_dictionary == new + set_dictionary
1110    // -----------------------------------------------------------------------
1111
1112    /// Verify the decoder-side equivalence: `with_dictionary` must behave
1113    /// identically to `new` followed by `set_dictionary`.
1114    #[test]
1115    fn test_lzma_decoder_set_dictionary_equivalence() {
1116        use crate::model::LzmaProperties;
1117
1118        let dict: &[u8] = b"abcdefghijklmnopqrstuvwxyz";
1119        let input: Vec<u8> = b"abcdefghijklmnopqrstuvwxyzXYZ".to_vec();
1120        let props = LzmaProperties::default();
1121
1122        // Encode with the dictionary
1123        let enc = LzmaEncoder::with_dictionary(LzmaLevel::new(1), TEST_DICT_SIZE, dict);
1124        let payload = compress_with_encoder(enc, &input);
1125
1126        // Decode via with_dictionary
1127        let dec_a = {
1128            let cursor = Cursor::new(payload.as_slice());
1129            LzmaDecoder::with_dictionary(cursor, props, TEST_DICT_SIZE, dict)
1130                .expect("with_dictionary failed")
1131                .decompress()
1132                .expect("decompress (a) failed")
1133        };
1134
1135        // Decode via new + set_dictionary
1136        let dec_b = {
1137            let cursor = Cursor::new(payload.as_slice());
1138            let mut dec =
1139                LzmaDecoder::new(cursor, props, TEST_DICT_SIZE).expect("new decoder failed");
1140            dec.set_dictionary(dict);
1141            dec.decompress().expect("decompress (b) failed")
1142        };
1143
1144        assert_eq!(dec_a, input, "with_dictionary decode failed");
1145        assert_eq!(dec_b, input, "new+set_dictionary decode failed");
1146        assert_eq!(dec_a, dec_b, "decode paths must be equivalent");
1147    }
1148
1149    // -----------------------------------------------------------------------
1150    // 7. Large repetitive dictionary with repetitive input
1151    // -----------------------------------------------------------------------
1152
1153    /// Stress test: dictionary of repeated bytes + highly repetitive input.
1154    /// This exercises the wrap-around logic in set_dictionary.
1155    #[test]
1156    fn test_lzma_large_repetitive_dictionary() {
1157        use crate::model::LzmaProperties;
1158
1159        // 8 KiB dictionary of b'a' (exceeds TEST_DICT_SIZE / 8)
1160        let dict: Vec<u8> = vec![b'a'; 8192];
1161        // Input that is also all b'a' — maximally benefits from the dictionary
1162        let input: Vec<u8> = vec![b'a'; 512];
1163        let props = LzmaProperties::default();
1164
1165        let enc = LzmaEncoder::with_dictionary(LzmaLevel::new(1), TEST_DICT_SIZE, &dict);
1166        let payload = compress_with_encoder(enc, &input);
1167
1168        let decompressed = decompress_raw_payload(
1169            |cursor| {
1170                LzmaDecoder::with_dictionary(cursor, props, TEST_DICT_SIZE, &dict)
1171                    .expect("with_dictionary (large) failed")
1172            },
1173            &payload,
1174            input.len(),
1175        );
1176
1177        assert_eq!(
1178            decompressed, input,
1179            "large repetitive dictionary round-trip failed"
1180        );
1181    }
1182}