atlas-archive-core 1.1.0

High-performance compression library with adaptive context modeling (Loom) and .nyx archives
Documentation
//! LZMA-style sliding window match finder for Atlas.
//!
//! This module implements the "Hash Chain" match finder algorithm as described
//! in the LZMA specification (and 7-Zip source). It is a component of the
//! LZMA compression algorithm, responsible for finding repeated byte sequences (matches)
//! within a sliding window.
//!
//! Note: This is NOT a full LZMA compressor (which requires Range Coding and
//! Markov Chain state machines); it provides the Lempel-Ziv parsing backend.

use crate::alloc::vec::Vec;

const MIN_MATCH: usize = 2;
const MAX_MATCH: usize = 273;
const HASH_SIZE: usize = 1 << 16;

/// A simple match finder state (Hash Chain implementation).
pub struct LzmaMatchFinder {
    head: [i32; HASH_SIZE],
    next: Vec<i32>,
    window_size: usize,
}

impl LzmaMatchFinder {
    pub fn new(window_size: usize) -> Self {
        Self {
            head: [-1i32; HASH_SIZE],
            next: crate::alloc::vec![ -1i32; window_size ],
            window_size,
        }
    }

    fn hash(bytes: &[u8]) -> usize {
        // Simple hash of 3 bytes.
        let h = ((bytes[0] as usize) << 10) ^ ((bytes[1] as usize) << 5) ^ (bytes[2] as usize);
        h % HASH_SIZE
    }

    /// Find the best match at the current position.
    /// Returns (distance, length).
    pub fn find_match(&mut self, data: &[u8], pos: usize) -> Option<(usize, usize)> {
        if pos + MIN_MATCH >= data.len() {
            return None;
        }

        let h = Self::hash(&data[pos..pos + 3]);
        let mut best_len = 0;
        let mut best_dist = 0;

        let mut curr = self.head[h];
        let mut chain_len = 0;

        while curr != -1 && chain_len < 64 {
            // Limit chain to maintain speed.
            let dist = pos - curr as usize;
            if dist >= self.window_size {
                break;
            }

            // Verify match.
            let mut match_len = 0;
            while match_len < MAX_MATCH
                && pos + match_len < data.len()
                && data[pos + match_len] == data[curr as usize + match_len]
            {
                match_len += 1;
            }

            if match_len > best_len {
                best_len = match_len;
                best_dist = dist;
            }

            curr = self.next[curr as usize % self.window_size];
            chain_len += 1;
        }

        // Update head.
        self.next[pos % self.window_size] = self.head[h];
        self.head[h] = pos as i32;

        if best_len >= MIN_MATCH {
            Some((best_dist, best_len))
        } else {
            None
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_lzma_match_finder_basic() {
        // "abc" repeated -> match at dist 3, len 3
        let data = b"abcabc"; 
        let window_size = 128;
        let mut finder = LzmaMatchFinder::new(window_size);

        // Process first "abc"
        finder.find_match(data, 0); // 'a'
        finder.find_match(data, 1); // 'b'
        finder.find_match(data, 2); // 'c'

        // Now at 3 ('a'), we should see the match back at 0
        let match_res = finder.find_match(data, 3);
        assert!(match_res.is_some());
        let (dist, len) = match_res.unwrap();
        // Distance is relative to current pos. 3 - 0 = 3.
        assert_eq!(dist, 3); 
        assert_eq!(len, 3);
    }

    #[test]
    fn test_lzma_match_finder_no_match() {
        let data = b"abcdef";
        let window_size = 128;
        let mut finder = LzmaMatchFinder::new(window_size);

        assert!(finder.find_match(data, 0).is_none());
        assert!(finder.find_match(data, 1).is_none());
        assert!(finder.find_match(data, 2).is_none());
    }
}