zesven 1.1.0

A pure Rust implementation of the 7z archive format
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
//! Fast LZMA2 encoding implementation.
//!
//! This module provides the core encoding logic for Fast LZMA2, including:
//! - Token types (literals and matches)
//! - Greedy match selection
//! - LZMA2 block output (uncompressed and compressed)

use std::io::{self, Write};

use super::radix_mf::RadixMatchFinder;

/// Maximum size for LZMA2 uncompressed chunks (64KB - 1).
const MAX_UNCOMPRESSED_CHUNK_SIZE: usize = 65535;

/// Block size for compression (1MB default).
pub const DEFAULT_BLOCK_SIZE: usize = 1024 * 1024;

/// Compression token representing either a literal byte or a match.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Token {
    /// A literal byte (no match found).
    Literal(u8),
    /// A match with distance and length.
    Match {
        /// Distance back in the history (1-indexed).
        distance: u32,
        /// Length of the match (2-273 bytes).
        length: u32,
    },
}

impl Token {
    /// Returns true if this is a literal token.
    pub fn is_literal(&self) -> bool {
        matches!(self, Token::Literal(_))
    }

    /// Returns true if this is a match token.
    pub fn is_match(&self) -> bool {
        matches!(self, Token::Match { .. })
    }
}

/// Performs greedy encoding using the match finder.
///
/// This simple greedy algorithm:
/// 1. At each position, looks for the best match
/// 2. If a match is found (length >= 2), emits it and advances by match length
/// 3. Otherwise, emits a literal and advances by 1
///
/// # Arguments
///
/// * `data` - The input data to encode
/// * `match_finder` - The match finder with pre-built tables
///
/// # Returns
///
/// A vector of tokens representing the encoded data.
pub fn encode_greedy(data: &[u8], match_finder: &RadixMatchFinder) -> Vec<Token> {
    let mut tokens = Vec::with_capacity(data.len());
    let mut pos = 0;

    while pos < data.len() {
        if let Some(m) = match_finder.get_match(data, pos) {
            if m.length >= 2 {
                tokens.push(Token::Match {
                    distance: m.offset,
                    length: m.length,
                });
                pos += m.length as usize;
                continue;
            }
        }
        tokens.push(Token::Literal(data[pos]));
        pos += 1;
    }

    tokens
}

/// Statistics about an encoding result.
#[derive(Debug, Clone, Default)]
pub struct EncodingStats {
    /// Number of literal tokens.
    pub literals: usize,
    /// Number of match tokens.
    pub matches: usize,
    /// Total bytes encoded by matches.
    pub matched_bytes: usize,
}

impl EncodingStats {
    /// Computes statistics from a token stream.
    pub fn from_tokens(tokens: &[Token]) -> Self {
        let mut stats = Self::default();
        for token in tokens {
            match token {
                Token::Literal(_) => stats.literals += 1,
                Token::Match { length, .. } => {
                    stats.matches += 1;
                    stats.matched_bytes += *length as usize;
                }
            }
        }
        stats
    }

    /// Returns the match ratio (matched bytes / total bytes).
    pub fn match_ratio(&self) -> f64 {
        let total = self.literals + self.matched_bytes;
        if total == 0 {
            0.0
        } else {
            self.matched_bytes as f64 / total as f64
        }
    }
}

/// Writes an LZMA2 uncompressed chunk.
///
/// Uncompressed chunks have the format:
/// ```text
/// [ctrl] [size_high] [size_low] [data...]
/// ```
///
/// Where ctrl is:
/// - 0x01: Uncompressed, reset dictionary
/// - 0x02: Uncompressed, no dictionary reset
///
/// # Arguments
///
/// * `output` - The output writer
/// * `data` - The uncompressed data (max 65535 bytes)
/// * `reset_dict` - Whether to reset the dictionary
pub fn write_uncompressed_chunk(
    output: &mut impl Write,
    data: &[u8],
    reset_dict: bool,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    // Chunk size is limited to 64KB - 1
    let chunk_size = data.len().min(MAX_UNCOMPRESSED_CHUNK_SIZE);
    let ctrl = if reset_dict { 0x01u8 } else { 0x02u8 };

    // Size is stored as (size - 1) in big-endian
    let size_minus_one = (chunk_size - 1) as u16;

    output.write_all(&[ctrl])?;
    output.write_all(&size_minus_one.to_be_bytes())?;
    output.write_all(&data[..chunk_size])?;

    Ok(())
}

/// Writes multiple LZMA2 uncompressed chunks for large data.
///
/// Splits data into chunks of up to 64KB each.
///
/// # Arguments
///
/// * `output` - The output writer
/// * `data` - The data to write
/// * `reset_dict_first` - Whether to reset dictionary on first chunk
pub fn write_uncompressed_data(
    output: &mut impl Write,
    data: &[u8],
    reset_dict_first: bool,
) -> io::Result<()> {
    if data.is_empty() {
        return Ok(());
    }

    let mut pos = 0;
    let mut first = true;

    while pos < data.len() {
        let remaining = data.len() - pos;
        let chunk_size = remaining.min(MAX_UNCOMPRESSED_CHUNK_SIZE);

        let reset = first && reset_dict_first;
        write_uncompressed_chunk(output, &data[pos..pos + chunk_size], reset)?;

        pos += chunk_size;
        first = false;
    }

    Ok(())
}

/// Writes the LZMA2 end-of-stream marker (0x00).
pub fn write_end_marker(output: &mut impl Write) -> io::Result<()> {
    output.write_all(&[0x00])
}

/// LZMA2 control byte flags for compressed chunks.
#[allow(dead_code)] // Constants module reserved for full LZMA2 encoder
mod ctrl {
    /// End of stream marker.
    pub const END_MARKER: u8 = 0x00;
    /// Uncompressed chunk, reset dictionary.
    pub const UNCOMPRESSED_RESET: u8 = 0x01;
    /// Uncompressed chunk, no dictionary reset.
    pub const UNCOMPRESSED_NO_RESET: u8 = 0x02;
    /// LZMA compressed chunk base (bits 0-4 are uncompressed size high bits).
    pub const LZMA_BASE: u8 = 0x80;
    /// LZMA compressed, reset state.
    pub const LZMA_RESET_STATE: u8 = 0x80;
    /// LZMA compressed, reset state and properties.
    pub const LZMA_RESET_STATE_PROPS: u8 = 0xC0;
    /// LZMA compressed, reset state, properties, and dictionary.
    pub const LZMA_RESET_ALL: u8 = 0xE0;
}

/// Reset mode for LZMA2 compressed chunks.
///
/// The mode determines what encoder state is reset at the start of a chunk.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum ChunkResetMode {
    /// No reset - continue with previous encoder state.
    /// Use for subsequent chunks after the first.
    None = 0xA0,
    /// Reset state only - keep dictionary and properties.
    StateReset = 0x80,
    /// Reset state and properties - keep dictionary.
    StatePropsReset = 0xC0,
    /// Reset everything - state, properties, and dictionary.
    /// Must be used for the first chunk.
    AllReset = 0xE0,
}

impl ChunkResetMode {
    /// Returns true if this mode includes properties in the chunk.
    pub fn includes_props(self) -> bool {
        matches!(
            self,
            ChunkResetMode::StatePropsReset | ChunkResetMode::AllReset
        )
    }
}

/// Maximum size for LZMA2 compressed chunks (64KB uncompressed).
const MAX_COMPRESSED_CHUNK_UNPACK_SIZE: usize = 1 << 16;

/// Writes an LZMA2 compressed chunk.
///
/// Compressed chunks have the format:
/// ```text
/// [ctrl] [unpacked_size_low] [packed_size] [props?] [data...]
/// ```
///
/// Where:
/// - ctrl: Control byte with reset mode and high 5 bits of (unpacked_size - 1)
/// - unpacked_size_low: Low 16 bits of (unpacked_size - 1) (big-endian u16)
/// - packed_size: Compressed data size minus 1 (big-endian u16)
/// - props: LZMA properties byte (only if reset mode includes props)
/// - data: Range-encoded LZMA data
///
/// # Arguments
///
/// * `output` - The output writer
/// * `compressed` - The range-encoded LZMA data
/// * `uncompressed_size` - Original uncompressed size (max 64KB)
/// * `reset_mode` - The reset mode for this chunk
/// * `props` - LZMA properties byte (required if reset_mode includes props)
pub fn write_compressed_chunk(
    output: &mut impl Write,
    compressed: &[u8],
    uncompressed_size: usize,
    reset_mode: ChunkResetMode,
    props: Option<u8>,
) -> io::Result<()> {
    if compressed.is_empty() || uncompressed_size == 0 {
        return Ok(());
    }

    // Validate chunk size
    debug_assert!(uncompressed_size <= MAX_COMPRESSED_CHUNK_UNPACK_SIZE);

    // Uncompressed size is stored as (size - 1)
    let unpack_size_minus1 = (uncompressed_size - 1) as u32;

    // Control byte: reset_mode base + high 5 bits of (uncompressed_size - 1)
    let unpack_high_bits = ((unpack_size_minus1 >> 16) & 0x1F) as u8;
    let ctrl = (reset_mode as u8) | unpack_high_bits;

    // Packed size minus 1
    let pack_size_minus1 = (compressed.len() - 1) as u16;

    // Low 16 bits of uncompressed size minus 1
    let unpack_size_low = (unpack_size_minus1 & 0xFFFF) as u16;

    output.write_all(&[ctrl])?;
    // LZMA2 format: unpack_size_low comes first, then pack_size
    // (per lzma_rust2 decoder: reads uncompressed_size, then compressed_size)
    output.write_all(&unpack_size_low.to_be_bytes())?;
    output.write_all(&pack_size_minus1.to_be_bytes())?;

    if reset_mode.includes_props() {
        // Default properties: lc=3, lp=0, pb=2 -> 0x5D
        output.write_all(&[props.unwrap_or(0x5D)])?;
    }

    output.write_all(compressed)?;

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_token_is_literal() {
        assert!(Token::Literal(b'a').is_literal());
        assert!(
            !Token::Match {
                distance: 1,
                length: 2
            }
            .is_literal()
        );
    }

    #[test]
    fn test_token_is_match() {
        assert!(!Token::Literal(b'a').is_match());
        assert!(
            Token::Match {
                distance: 1,
                length: 2
            }
            .is_match()
        );
    }

    #[test]
    fn test_encoding_stats() {
        let tokens = vec![
            Token::Literal(b'a'),
            Token::Literal(b'b'),
            Token::Match {
                distance: 2,
                length: 5,
            },
            Token::Literal(b'c'),
        ];

        let stats = EncodingStats::from_tokens(&tokens);
        assert_eq!(stats.literals, 3);
        assert_eq!(stats.matches, 1);
        assert_eq!(stats.matched_bytes, 5);
    }

    #[test]
    fn test_write_uncompressed_chunk() {
        let mut output = Vec::new();
        let data = b"Hello, World!";

        write_uncompressed_chunk(&mut output, data, true).unwrap();

        // Check control byte
        assert_eq!(output[0], 0x01); // reset dict

        // Check size (big-endian, size - 1)
        let size = u16::from_be_bytes([output[1], output[2]]);
        assert_eq!(size, 12); // 13 - 1 = 12

        // Check data
        assert_eq!(&output[3..], data);
    }

    #[test]
    fn test_write_uncompressed_chunk_no_reset() {
        let mut output = Vec::new();
        let data = b"Test";

        write_uncompressed_chunk(&mut output, data, false).unwrap();

        // Check control byte
        assert_eq!(output[0], 0x02); // no reset
    }

    #[test]
    fn test_write_end_marker() {
        let mut output = Vec::new();
        write_end_marker(&mut output).unwrap();
        assert_eq!(output, vec![0x00]);
    }

    #[test]
    fn test_encode_greedy_all_literals() {
        // Data with no repeats
        let data = b"abcdefgh";
        let mut mf = RadixMatchFinder::new(1024, 32);
        mf.build(data);

        let tokens = encode_greedy(data, &mf);

        // Should be all literals
        assert_eq!(tokens.len(), 8);
        for (i, token) in tokens.iter().enumerate() {
            assert_eq!(*token, Token::Literal(data[i]));
        }
    }

    #[test]
    fn test_encode_greedy_with_matches() {
        // Data with repeating pattern
        let data = b"abcabcabc";
        let mut mf = RadixMatchFinder::new(1024, 32);
        mf.build(data);

        let tokens = encode_greedy(data, &mf);

        // First "abc" should be literals, then matches for the rest
        let stats = EncodingStats::from_tokens(&tokens);
        assert!(stats.matches > 0, "Should find at least one match");
        assert!(stats.literals > 0, "Should have some literals");
    }

    #[test]
    fn test_write_uncompressed_data_large() {
        // Create data larger than one chunk (64KB)
        let data: Vec<u8> = (0..70000).map(|i| (i % 256) as u8).collect();
        let mut output = Vec::new();

        write_uncompressed_data(&mut output, &data, true).unwrap();

        // Should have two chunks
        // First chunk: ctrl + 2 bytes size + 65535 bytes data
        // Second chunk: ctrl + 2 bytes size + 4465 bytes data
        assert!(output.len() > data.len()); // Overhead from headers
    }

    #[test]
    fn test_chunk_reset_mode_includes_props() {
        assert!(!ChunkResetMode::None.includes_props());
        assert!(!ChunkResetMode::StateReset.includes_props());
        assert!(ChunkResetMode::StatePropsReset.includes_props());
        assert!(ChunkResetMode::AllReset.includes_props());
    }

    #[test]
    fn test_write_compressed_chunk_format() {
        let mut output = Vec::new();
        let compressed = vec![0x00, 0x01, 0x02, 0x03, 0x04]; // 5 bytes fake compressed data
        let uncompressed_size = 100;

        write_compressed_chunk(
            &mut output,
            &compressed,
            uncompressed_size,
            ChunkResetMode::AllReset,
            Some(0x5D),
        )
        .unwrap();

        // Check control byte: 0xE0 (AllReset) | 0 (high bits of 99)
        assert_eq!(output[0], 0xE0);

        // Check unpacked size - 1 (big-endian): 100 - 1 = 99 (comes first!)
        let unpack_size = u16::from_be_bytes([output[1], output[2]]);
        assert_eq!(unpack_size, 99);

        // Check packed size - 1 (big-endian): 5 - 1 = 4 (comes second!)
        let pack_size = u16::from_be_bytes([output[3], output[4]]);
        assert_eq!(pack_size, 4);

        // Check properties byte
        assert_eq!(output[5], 0x5D);

        // Check compressed data
        assert_eq!(&output[6..], &compressed[..]);
    }

    #[test]
    fn test_write_compressed_chunk_no_props() {
        let mut output = Vec::new();
        let compressed = vec![0xAA, 0xBB, 0xCC];
        let uncompressed_size = 50;

        write_compressed_chunk(
            &mut output,
            &compressed,
            uncompressed_size,
            ChunkResetMode::StateReset, // No props
            None,
        )
        .unwrap();

        // Check control byte: 0x80 (StateReset) | 0
        assert_eq!(output[0], 0x80);

        // No properties byte, so data starts at offset 5
        // Header: 1 (ctrl) + 2 (pack_size) + 2 (unpack_size) = 5 bytes
        assert_eq!(output.len(), 5 + 3);
        assert_eq!(&output[5..], &compressed[..]);
    }

    #[test]
    fn test_write_compressed_chunk_empty() {
        let mut output = Vec::new();

        // Empty compressed data should do nothing
        write_compressed_chunk(&mut output, &[], 100, ChunkResetMode::AllReset, None).unwrap();
        assert!(output.is_empty());

        // Zero uncompressed size should do nothing
        write_compressed_chunk(
            &mut output,
            &[0x01, 0x02],
            0,
            ChunkResetMode::AllReset,
            None,
        )
        .unwrap();
        assert!(output.is_empty());
    }
}