Skip to main content

oxiarc_zstd/
lz77.rs

1//! LZ77 match finder for Zstandard compression.
2//!
3//! This module implements the LZ77 parsing phase of Zstandard compression.
4//! It uses a hash chain data structure to efficiently find repeated byte
5//! sequences (matches) in the input data.
6//!
7//! # Architecture
8//!
9//! The match finder uses two tables:
10//! - **Hash table**: maps a 4-byte hash to the most recent position with that hash.
11//! - **Chain table**: links positions with the same hash, forming a chain of candidates.
12//!
13//! For low compression levels (1-2), only the hash table is consulted (greedy matching).
14//! For higher levels, the chain is followed up to `search_depth` entries, and
15//! lazy matching is used to improve compression ratios.
16//!
17//! # Dictionary Support
18//!
19//! The match finder supports an optional dictionary (history) buffer. Matches
20//! can reference bytes in the dictionary, which is logically prepended before
21//! the actual data. Offsets into the dictionary are computed accordingly.
22
23use oxiarc_core::error::Result;
24
25/// A match found by the LZ77 engine.
26#[derive(Debug, Clone, Copy)]
27pub struct Match {
28    /// Offset of the match (distance back from the current position).
29    /// An offset of 1 means the match starts at the immediately preceding byte.
30    pub offset: usize,
31    /// Length of the match in bytes.
32    pub length: usize,
33}
34
35/// Minimum match length for Zstandard.
36pub const MIN_MATCH: usize = 3;
37
38/// Maximum match length we search for (from Zstd match length table maximum).
39pub const MAX_MATCH: usize = 65539;
40
41/// Hash seed for the multiply-shift hash function.
42const HASH_PRIME: u32 = 0x9E3779B1;
43
44/// Compression level configuration.
45///
46/// Controls the trade-off between compression speed and ratio.
47/// Higher levels use larger hash/chain tables and deeper search,
48/// producing better compression at the cost of more CPU time.
49#[derive(Debug, Clone)]
50pub struct LevelConfig {
51    /// Hash log: hash table size = `1 << hash_log`.
52    pub hash_log: u8,
53    /// Chain log: chain table size = `1 << chain_log`. Set to 0 to disable chaining.
54    pub chain_log: u8,
55    /// Maximum number of positions checked when following a hash chain.
56    pub search_depth: u32,
57    /// Whether to use lazy matching (check position P+1 before committing to P).
58    pub lazy_matching: bool,
59    /// Minimum length improvement required for a lazy match to replace the current.
60    pub lazy_min_gain: usize,
61    /// Target block size in bytes for block splitting.
62    pub target_block_size: usize,
63}
64
65impl LevelConfig {
66    /// Get configuration for a given compression level (1-22).
67    ///
68    /// Levels are clamped to the range [1, 22]. Higher levels produce
69    /// better compression at the cost of more memory and CPU time.
70    pub fn for_level(level: i32) -> Self {
71        let level = level.clamp(1, 22);
72        match level {
73            1 => LevelConfig {
74                hash_log: 17,
75                chain_log: 0,
76                search_depth: 1,
77                lazy_matching: false,
78                lazy_min_gain: 0,
79                target_block_size: 128 * 1024,
80            },
81            2 => LevelConfig {
82                hash_log: 17,
83                chain_log: 0,
84                search_depth: 2,
85                lazy_matching: false,
86                lazy_min_gain: 0,
87                target_block_size: 128 * 1024,
88            },
89            3 => LevelConfig {
90                hash_log: 17,
91                chain_log: 16,
92                search_depth: 6,
93                lazy_matching: false,
94                lazy_min_gain: 0,
95                target_block_size: 128 * 1024,
96            },
97            4..=6 => LevelConfig {
98                hash_log: 18,
99                chain_log: 17,
100                search_depth: (level * 4) as u32,
101                lazy_matching: true,
102                lazy_min_gain: 1,
103                target_block_size: 128 * 1024,
104            },
105            7..=9 => LevelConfig {
106                hash_log: 19,
107                chain_log: 18,
108                search_depth: (level * 8) as u32,
109                lazy_matching: true,
110                lazy_min_gain: 1,
111                target_block_size: 128 * 1024,
112            },
113            10..=15 => LevelConfig {
114                hash_log: 20,
115                chain_log: 19,
116                search_depth: (level * 16) as u32,
117                lazy_matching: true,
118                lazy_min_gain: 2,
119                target_block_size: 128 * 1024,
120            },
121            _ => LevelConfig {
122                hash_log: 20,
123                chain_log: 20,
124                search_depth: (level * 32) as u32,
125                lazy_matching: true,
126                lazy_min_gain: 3,
127                target_block_size: 128 * 1024,
128            },
129        }
130    }
131}
132
133/// LZ77 sequence: a literal run followed by an optional match.
134///
135/// Each sequence represents a contiguous portion of the input:
136/// first `literals.len()` literal bytes, then `match_length` bytes
137/// copied from `offset` bytes back in the output history.
138#[derive(Debug, Clone)]
139pub struct Lz77Sequence {
140    /// Literal bytes before the match.
141    pub literals: Vec<u8>,
142    /// Match offset (distance back from current position). 0 means no match.
143    pub offset: usize,
144    /// Match length in bytes. 0 means no match (literal-only sequence).
145    pub match_length: usize,
146}
147
148/// LZ77 match finder using hash chains.
149///
150/// The match finder maintains a hash table and chain table that are indexed
151/// by position in a virtual address space where the dictionary is prepended
152/// to the actual data.
153pub struct MatchFinder {
154    /// Hash table: maps hash value to most recent position with that hash.
155    /// Positions are stored as u32 for memory efficiency.
156    hash_table: Vec<u32>,
157    /// Chain table: for each position, links to the previous position with
158    /// the same hash value. Only used when `chain_log > 0`.
159    chain_table: Vec<u32>,
160    /// Bitmask for hash table indexing.
161    hash_mask: u32,
162    /// Bitmask for chain table indexing. Zero if chaining is disabled.
163    chain_mask: u32,
164    /// Configuration for this compression level.
165    config: LevelConfig,
166}
167
168impl MatchFinder {
169    /// Create a new match finder with the given level configuration.
170    pub fn new(config: &LevelConfig) -> Self {
171        let hash_size = 1usize << config.hash_log;
172        let chain_size = if config.chain_log > 0 {
173            1usize << config.chain_log
174        } else {
175            0
176        };
177
178        Self {
179            hash_table: vec![0u32; hash_size],
180            chain_table: vec![0u32; chain_size],
181            hash_mask: (hash_size as u32).wrapping_sub(1),
182            chain_mask: if chain_size > 0 {
183                (chain_size as u32).wrapping_sub(1)
184            } else {
185                0
186            },
187            config: config.clone(),
188        }
189    }
190
191    /// Find LZ77 sequences in the input data.
192    ///
193    /// The optional `dict` slice acts as history prepended before `data`.
194    /// Matches may reference bytes in the dictionary. The returned sequences
195    /// cover exactly all bytes of `data` (not the dictionary).
196    ///
197    /// # Errors
198    ///
199    /// Returns an error if internal invariants are violated (should not happen
200    /// in normal operation).
201    pub fn find_sequences(&mut self, data: &[u8], dict: &[u8]) -> Result<Vec<Lz77Sequence>> {
202        if data.is_empty() {
203            return Ok(Vec::new());
204        }
205
206        let dict_len = dict.len();
207        // Build a combined view: dict ++ data.
208        // Positions in [0..dict_len) refer to dictionary bytes,
209        // positions in [dict_len..dict_len+data.len()) refer to data bytes.
210        let combined_len = dict_len + data.len();
211        let combined = CombinedBuffer::new(dict, data);
212
213        // Seed the hash table with dictionary positions.
214        self.seed_dictionary(&combined, dict_len);
215
216        let mut sequences = Vec::new();
217        let mut pos = dict_len; // Current position in combined space.
218        let mut literal_start = dict_len; // Start of current literal run.
219
220        while pos < combined_len {
221            // Need at least MIN_MATCH bytes remaining to find a match.
222            if pos + MIN_MATCH > combined_len {
223                break;
224            }
225
226            let best_match = self.find_best_match(&combined, pos, dict_len);
227
228            // Lazy matching: if we found a match at pos, check if pos+1 gives a better one.
229            let (final_match, advance_one) = if self.config.lazy_matching {
230                if let Some(m1) = best_match {
231                    if pos + 1 + MIN_MATCH <= combined_len {
232                        // Temporarily insert pos into hash/chain so pos+1 can reference it.
233                        self.insert_position(&combined, pos);
234                        let m2 = self.find_best_match(&combined, pos + 1, dict_len);
235                        if let Some(m2) = m2 {
236                            if m2.length > m1.length + self.config.lazy_min_gain {
237                                // Lazy match is better: emit literal at pos, use m2 at pos+1.
238                                (Some(m2), true)
239                            } else {
240                                (Some(m1), false)
241                            }
242                        } else {
243                            (Some(m1), false)
244                        }
245                    } else {
246                        (Some(m1), false)
247                    }
248                } else {
249                    (None, false)
250                }
251            } else {
252                (best_match, false)
253            };
254
255            if advance_one {
256                // Emit literal for current position, then advance to pos+1 for the match.
257                pos += 1;
258            }
259
260            if let Some(m) = final_match {
261                // Collect literals from literal_start to pos.
262                let literals: Vec<u8> = (literal_start..pos).map(|p| combined.get(p)).collect();
263
264                // Convert combined-space offset to data-relative offset.
265                // The match offset is already a distance back from pos.
266                sequences.push(Lz77Sequence {
267                    literals,
268                    offset: m.offset,
269                    match_length: m.length,
270                });
271
272                // Insert all positions covered by the match into hash/chain
273                // (so future matches can reference them).
274                let match_end = (pos + m.length).min(combined_len);
275                // Insert the match start position if not already inserted by lazy matching.
276                if !advance_one {
277                    self.insert_position(&combined, pos);
278                }
279                // Insert intermediate positions (skip first, already inserted).
280                for insert_pos in (pos + 1)..match_end {
281                    if insert_pos + MIN_MATCH <= combined_len {
282                        self.insert_position(&combined, insert_pos);
283                    }
284                }
285
286                pos = match_end;
287                literal_start = pos;
288            } else {
289                // No match found: insert position and advance.
290                self.insert_position(&combined, pos);
291                pos += 1;
292            }
293        }
294
295        // Remaining literals at the end of the data.
296        if literal_start < combined_len {
297            let literals: Vec<u8> = (literal_start..combined_len)
298                .map(|p| combined.get(p))
299                .collect();
300
301            sequences.push(Lz77Sequence {
302                literals,
303                offset: 0,
304                match_length: 0,
305            });
306        }
307
308        Ok(sequences)
309    }
310
311    /// Compute the hash of 4 bytes at the given position.
312    ///
313    /// Uses a multiply-shift scheme for fast, reasonably distributed hashes.
314    fn hash(&self, combined: &CombinedBuffer<'_>, pos: usize) -> u32 {
315        let b0 = combined.get(pos) as u32;
316        let b1 = combined.get(pos + 1) as u32;
317        let b2 = combined.get(pos + 2) as u32;
318        let b3 = combined.get(pos + 3) as u32;
319        let val = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
320        let h = val.wrapping_mul(HASH_PRIME);
321        (h >> (32 - self.config.hash_log)) & self.hash_mask
322    }
323
324    /// Insert a position into the hash table (and chain table if enabled).
325    fn insert_position(&mut self, combined: &CombinedBuffer<'_>, pos: usize) {
326        if pos + 4 > combined.len() {
327            return;
328        }
329
330        let h = self.hash(combined, pos) as usize;
331        let prev = self.hash_table[h];
332        self.hash_table[h] = pos as u32;
333
334        if !self.chain_table.is_empty() {
335            let chain_idx = (pos as u32) & self.chain_mask;
336            self.chain_table[chain_idx as usize] = prev;
337        }
338    }
339
340    /// Seed the hash table with dictionary positions so matches can reference the dict.
341    fn seed_dictionary(&mut self, combined: &CombinedBuffer<'_>, dict_len: usize) {
342        if dict_len < 4 {
343            return;
344        }
345        for pos in 0..=(dict_len.saturating_sub(4)) {
346            self.insert_position(combined, pos);
347        }
348    }
349
350    /// Find the best match at a given position by consulting the hash table
351    /// and optionally following the hash chain.
352    fn find_best_match(
353        &self,
354        combined: &CombinedBuffer<'_>,
355        pos: usize,
356        dict_len: usize,
357    ) -> Option<Match> {
358        if pos + 4 > combined.len() {
359            return None;
360        }
361
362        let h = self.hash(combined, pos) as usize;
363        let mut candidate = self.hash_table[h] as usize;
364
365        // The candidate must be strictly before pos.
366        if candidate >= pos {
367            return None;
368        }
369
370        let max_distance = if pos >= dict_len {
371            // In data region: can reference back through data and into dictionary.
372            pos
373        } else {
374            // In dictionary region (during seeding): no matches.
375            return None;
376        };
377
378        let mut best: Option<Match> = None;
379        let mut steps = 0u32;
380        let max_steps = self.config.search_depth;
381
382        loop {
383            if steps >= max_steps {
384                break;
385            }
386            if candidate >= pos {
387                break;
388            }
389
390            let distance = pos - candidate;
391            if distance > max_distance || distance == 0 {
392                break;
393            }
394
395            // Compare bytes at candidate and pos to determine match length.
396            let match_len = self.compute_match_length(combined, candidate, pos);
397
398            if match_len >= MIN_MATCH {
399                let is_better = match best {
400                    Some(ref b) => {
401                        match_len > b.length || (match_len == b.length && distance < b.offset)
402                    }
403                    None => true,
404                };
405                if is_better {
406                    let clamped_len = match_len.min(MAX_MATCH);
407                    best = Some(Match {
408                        offset: distance,
409                        length: clamped_len,
410                    });
411                    // If we found a very long match, stop searching.
412                    if clamped_len >= 128 {
413                        break;
414                    }
415                }
416            }
417
418            steps += 1;
419
420            // Follow the chain to the next candidate with the same hash.
421            if self.chain_table.is_empty() {
422                break;
423            }
424            let chain_idx = (candidate as u32) & self.chain_mask;
425            let next_candidate = self.chain_table[chain_idx as usize] as usize;
426            if next_candidate >= candidate || next_candidate == 0 {
427                // Chain is broken or loops back: stop.
428                break;
429            }
430            candidate = next_candidate;
431        }
432
433        best
434    }
435
436    /// Compute the number of matching bytes starting at positions `a` and `b`.
437    fn compute_match_length(&self, combined: &CombinedBuffer<'_>, a: usize, b: usize) -> usize {
438        let max_len = (combined.len() - b).min(MAX_MATCH);
439        let mut len = 0usize;
440
441        // Fast path: compare 8 bytes at a time when possible.
442        while len + 8 <= max_len {
443            let va = combined.get_u64(a + len);
444            let vb = combined.get_u64(b + len);
445            if va != vb {
446                // Find the first differing byte within this 8-byte chunk.
447                let diff = va ^ vb;
448                len += (diff.trailing_zeros() / 8) as usize;
449                return len;
450            }
451            len += 8;
452        }
453
454        // Byte-by-byte for the remaining tail.
455        while len < max_len {
456            if combined.get(a + len) != combined.get(b + len) {
457                break;
458            }
459            len += 1;
460        }
461
462        len
463    }
464
465    /// Reset the match finder for a new block.
466    ///
467    /// Clears the hash and chain tables so no stale positions from previous
468    /// blocks are referenced.
469    pub fn reset(&mut self) {
470        for entry in self.hash_table.iter_mut() {
471            *entry = 0;
472        }
473        for entry in self.chain_table.iter_mut() {
474            *entry = 0;
475        }
476    }
477}
478
479/// Combined view of dictionary + data as a single logical byte array.
480///
481/// Avoids copying by dispatching reads to the appropriate slice based
482/// on the position.
483struct CombinedBuffer<'a> {
484    /// Dictionary (history) bytes, logically at positions [0..dict.len()).
485    dict: &'a [u8],
486    /// Actual data bytes, logically at positions [dict.len()..dict.len()+data.len()).
487    data: &'a [u8],
488    /// Total logical length.
489    total_len: usize,
490}
491
492impl<'a> CombinedBuffer<'a> {
493    /// Create a new combined buffer from dict and data slices.
494    fn new(dict: &'a [u8], data: &'a [u8]) -> Self {
495        Self {
496            dict,
497            data,
498            total_len: dict.len() + data.len(),
499        }
500    }
501
502    /// Total length of the combined buffer.
503    fn len(&self) -> usize {
504        self.total_len
505    }
506
507    /// Get a single byte at the given logical position.
508    fn get(&self, pos: usize) -> u8 {
509        if pos < self.dict.len() {
510            self.dict[pos]
511        } else {
512            self.data[pos - self.dict.len()]
513        }
514    }
515
516    /// Read 8 bytes as a little-endian u64 starting at the given position.
517    ///
518    /// If the read spans the dict/data boundary or exceeds the buffer,
519    /// falls back to byte-by-byte assembly.
520    fn get_u64(&self, pos: usize) -> u64 {
521        let dict_len = self.dict.len();
522
523        // Fast path: entirely within one slice.
524        if pos + 8 <= dict_len {
525            // Entirely in dict.
526            let slice = &self.dict[pos..pos + 8];
527            return u64::from_le_bytes([
528                slice[0], slice[1], slice[2], slice[3], slice[4], slice[5], slice[6], slice[7],
529            ]);
530        }
531
532        let data_pos = pos.wrapping_sub(dict_len);
533        if pos >= dict_len && data_pos + 8 <= self.data.len() {
534            // Entirely in data.
535            let slice = &self.data[data_pos..data_pos + 8];
536            return u64::from_le_bytes([
537                slice[0], slice[1], slice[2], slice[3], slice[4], slice[5], slice[6], slice[7],
538            ]);
539        }
540
541        // Slow path: spans boundary or near end.
542        let mut bytes = [0u8; 8];
543        for (i, byte) in bytes.iter_mut().enumerate() {
544            let p = pos + i;
545            if p < self.total_len {
546                *byte = self.get(p);
547            }
548        }
549        u64::from_le_bytes(bytes)
550    }
551}
552
553#[cfg(test)]
554mod tests {
555    use super::*;
556
557    #[test]
558    fn test_level_config_clamping() {
559        let c = LevelConfig::for_level(-5);
560        assert_eq!(c.hash_log, 17); // Same as level 1
561
562        let c = LevelConfig::for_level(100);
563        assert_eq!(c.hash_log, 20); // Same as level 22
564    }
565
566    #[test]
567    fn test_level_config_ranges() {
568        for level in 1..=22 {
569            let c = LevelConfig::for_level(level);
570            assert!(c.hash_log >= 17);
571            assert!(c.hash_log <= 20);
572            assert!(c.search_depth >= 1);
573            assert!(c.target_block_size > 0);
574        }
575    }
576
577    #[test]
578    fn test_find_sequences_empty() {
579        let config = LevelConfig::for_level(1);
580        let mut finder = MatchFinder::new(&config);
581        let seqs = finder.find_sequences(&[], &[]).expect("should succeed");
582        assert!(seqs.is_empty());
583    }
584
585    #[test]
586    fn test_find_sequences_no_matches() {
587        let config = LevelConfig::for_level(1);
588        let mut finder = MatchFinder::new(&config);
589        // Data with no repeated patterns.
590        let data: Vec<u8> = (0..64).collect();
591        let seqs = finder.find_sequences(&data, &[]).expect("should succeed");
592
593        // All sequences should be literal-only (no matches in non-repeating data of this size).
594        let total_literals: usize = seqs.iter().map(|s| s.literals.len()).sum();
595        let total_match_bytes: usize = seqs.iter().map(|s| s.match_length).sum();
596        assert_eq!(total_literals + total_match_bytes, data.len());
597    }
598
599    #[test]
600    fn test_find_sequences_with_repeats() {
601        let config = LevelConfig::for_level(3);
602        let mut finder = MatchFinder::new(&config);
603        // Data with obvious repeats.
604        let mut data = Vec::new();
605        for _ in 0..10 {
606            data.extend_from_slice(b"ABCDEFGHIJ");
607        }
608        let seqs = finder.find_sequences(&data, &[]).expect("should succeed");
609
610        // Verify that sequences cover the entire input.
611        let total_bytes: usize = seqs.iter().map(|s| s.literals.len() + s.match_length).sum();
612        assert_eq!(total_bytes, data.len());
613
614        // There should be at least one match found in repeated data.
615        let has_match = seqs.iter().any(|s| s.match_length > 0);
616        assert!(has_match, "Expected at least one match in repeated data");
617    }
618
619    #[test]
620    fn test_find_sequences_all_same_byte() {
621        let config = LevelConfig::for_level(1);
622        let mut finder = MatchFinder::new(&config);
623        let data = vec![0xAAu8; 1000];
624        let seqs = finder.find_sequences(&data, &[]).expect("should succeed");
625
626        let total_bytes: usize = seqs.iter().map(|s| s.literals.len() + s.match_length).sum();
627        assert_eq!(total_bytes, data.len());
628
629        // Should find matches (lots of repeated bytes).
630        let total_match_bytes: usize = seqs.iter().map(|s| s.match_length).sum();
631        assert!(
632            total_match_bytes > 0,
633            "Expected matches in all-same-byte data"
634        );
635    }
636
637    #[test]
638    fn test_find_sequences_with_dictionary() {
639        let config = LevelConfig::for_level(3);
640        let mut finder = MatchFinder::new(&config);
641        let dict = b"Hello, World! This is a dictionary.";
642        let data = b"Hello, World! This is actual data.";
643        let seqs = finder
644            .find_sequences(data.as_slice(), dict.as_slice())
645            .expect("should succeed");
646
647        let total_bytes: usize = seqs.iter().map(|s| s.literals.len() + s.match_length).sum();
648        assert_eq!(total_bytes, data.len());
649
650        // The data starts identically to the dict, so there should be a match.
651        let has_match = seqs.iter().any(|s| s.match_length > 0);
652        assert!(has_match, "Expected match referencing dictionary");
653    }
654
655    #[test]
656    fn test_find_sequences_lazy_matching() {
657        let config = LevelConfig::for_level(5); // Lazy matching enabled
658        let mut finder = MatchFinder::new(&config);
659        // Construct data where lazy matching helps:
660        // "XABC...ABC..." where matching at pos 1 gives a longer match.
661        let mut data = Vec::new();
662        data.push(b'X');
663        data.extend_from_slice(b"ABCDEFGHIJKLMNOP");
664        data.push(b'Y');
665        data.extend_from_slice(b"ABCDEFGHIJKLMNOP");
666        let seqs = finder.find_sequences(&data, &[]).expect("should succeed");
667
668        let total_bytes: usize = seqs.iter().map(|s| s.literals.len() + s.match_length).sum();
669        assert_eq!(total_bytes, data.len());
670    }
671
672    #[test]
673    fn test_match_finder_reset() {
674        let config = LevelConfig::for_level(1);
675        let mut finder = MatchFinder::new(&config);
676
677        let data = vec![0xBBu8; 100];
678        let _ = finder.find_sequences(&data, &[]).expect("should succeed");
679
680        finder.reset();
681
682        // After reset, hash table should be zeroed.
683        assert!(finder.hash_table.iter().all(|&v| v == 0));
684    }
685
686    #[test]
687    fn test_find_sequences_short_data() {
688        let config = LevelConfig::for_level(1);
689        let mut finder = MatchFinder::new(&config);
690
691        // Data shorter than MIN_MATCH: all literals, no matches possible.
692        let data = b"AB";
693        let seqs = finder
694            .find_sequences(data.as_slice(), &[])
695            .expect("should succeed");
696        let total_bytes: usize = seqs.iter().map(|s| s.literals.len() + s.match_length).sum();
697        assert_eq!(total_bytes, data.len());
698        assert!(seqs.iter().all(|s| s.match_length == 0));
699    }
700
701    #[test]
702    fn test_find_sequences_exact_min_match() {
703        let config = LevelConfig::for_level(3);
704        let mut finder = MatchFinder::new(&config);
705
706        // Create data where only a 3-byte match is possible.
707        let mut data = Vec::new();
708        data.extend_from_slice(b"ABCXYZ");
709        data.extend_from_slice(b"ABCQRS");
710        let seqs = finder.find_sequences(&data, &[]).expect("should succeed");
711
712        let total_bytes: usize = seqs.iter().map(|s| s.literals.len() + s.match_length).sum();
713        assert_eq!(total_bytes, data.len());
714    }
715
716    #[test]
717    fn test_combined_buffer_get() {
718        let dict = b"DICT";
719        let data = b"DATA";
720        let combined = CombinedBuffer::new(dict.as_slice(), data.as_slice());
721
722        assert_eq!(combined.len(), 8);
723        assert_eq!(combined.get(0), b'D');
724        assert_eq!(combined.get(3), b'T');
725        assert_eq!(combined.get(4), b'D');
726        assert_eq!(combined.get(7), b'A');
727    }
728
729    #[test]
730    fn test_combined_buffer_get_u64() {
731        let dict = b"ABCD";
732        let data = b"EFGHIJKL";
733        let combined = CombinedBuffer::new(dict.as_slice(), data.as_slice());
734
735        // Read crossing dict/data boundary.
736        let val = combined.get_u64(2);
737        let expected = u64::from_le_bytes([b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'J']);
738        assert_eq!(val, expected);
739
740        // Read entirely in data.
741        let val = combined.get_u64(4);
742        let expected = u64::from_le_bytes([b'E', b'F', b'G', b'H', b'I', b'J', b'K', b'L']);
743        assert_eq!(val, expected);
744    }
745
746    #[test]
747    fn test_all_levels_produce_valid_output() {
748        let data = b"The quick brown fox jumps over the lazy dog. The quick brown fox.";
749        for level in 1..=22 {
750            let config = LevelConfig::for_level(level);
751            let mut finder = MatchFinder::new(&config);
752            let seqs = finder
753                .find_sequences(data.as_slice(), &[])
754                .expect("should succeed");
755
756            let total_bytes: usize = seqs.iter().map(|s| s.literals.len() + s.match_length).sum();
757            assert_eq!(
758                total_bytes,
759                data.len(),
760                "Level {} produced incorrect coverage",
761                level
762            );
763        }
764    }
765
766    #[test]
767    fn test_match_offset_validity() {
768        let config = LevelConfig::for_level(3);
769        let mut finder = MatchFinder::new(&config);
770
771        let mut data = Vec::new();
772        for _ in 0..5 {
773            data.extend_from_slice(b"REPEATED_PATTERN_");
774        }
775
776        let seqs = finder.find_sequences(&data, &[]).expect("should succeed");
777
778        // All match offsets should be > 0 and <= current position.
779        let mut pos = 0usize;
780        for seq in &seqs {
781            pos += seq.literals.len();
782            if seq.match_length > 0 {
783                assert!(seq.offset > 0, "Match offset must be positive");
784                assert!(
785                    seq.offset <= pos,
786                    "Match offset {} exceeds position {}",
787                    seq.offset,
788                    pos
789                );
790            }
791            pos += seq.match_length;
792        }
793    }
794}