// lcpfs 2026.1.102

// Copyright 2025 LunaOS Contributors
// SPDX-License-Identifier: Apache-2.0

//! Dictionary-based compression and decompression.
//!
//! This module provides compression using a shared dictionary to
//! improve ratios for similar files.

use alloc::string::{String, ToString};
use alloc::vec::Vec;

use super::types::{
    CompressOp, CompressedHeader, CompressionDict, DictError, DictResult, MIN_MATCH_LEN,
    OP_DICT_REF, OP_LITERAL,
};

// ═══════════════════════════════════════════════════════════════════════════════
// CHECKSUM
// ═══════════════════════════════════════════════════════════════════════════════

/// Fold `data` into a 32-bit checksum used to verify decompressed output.
///
/// Each byte is added to the running value, the sum is multiplied by 31, and
/// the byte's index (truncated to `u32` and rotated left by 5 bits) is XORed
/// in. All arithmetic wraps, so the function never panics. An empty slice
/// yields `0`.
fn compute_checksum(data: &[u8]) -> u32 {
    data.iter().enumerate().fold(0u32, |acc, (index, &byte)| {
        let mixed = acc.wrapping_add(u32::from(byte)).wrapping_mul(31);
        mixed ^ (index as u32).rotate_left(5)
    })
}

// ═══════════════════════════════════════════════════════════════════════════════
// COMPRESSION
// ═══════════════════════════════════════════════════════════════════════════════

/// Compress data using a dictionary.
///
/// # Arguments
/// * `data` - Data to compress
/// * `dict` - Compression dictionary to use
///
/// # Returns
/// Compressed data including header
pub fn compress(data: &[u8], dict: &CompressionDict) -> DictResult<Vec<u8>> {
    if data.is_empty() {
        // Empty data: just header
        let header = CompressedHeader::new(dict.id, 0, 0, 0);
        return Ok(header.to_bytes().to_vec());
    }

    let checksum = compute_checksum(data);

    // Compress the data
    let ops = compress_to_ops(data, &dict.data);
    let compressed_data = encode_ops(&ops);

    // Build result with header
    let header = CompressedHeader::new(
        dict.id,
        data.len() as u32,
        compressed_data.len() as u32,
        checksum,
    );

    let mut result = Vec::with_capacity(CompressedHeader::SIZE + compressed_data.len());
    result.extend_from_slice(&header.to_bytes());
    result.extend_from_slice(&compressed_data);

    Ok(result)
}

/// Split `data` into a sequence of dictionary references and literal runs.
///
/// The input is scanned left to right. At each position the best dictionary
/// match is looked up; matches of at least `MIN_MATCH_LEN` bytes become a
/// dictionary reference, and everything in between is collected into literal
/// operations.
fn compress_to_ops(data: &[u8], dict: &[u8]) -> Vec<CompressOp> {
    let total = data.len();
    let mut ops = Vec::new();
    // Next input byte to examine.
    let mut cursor = 0;
    // Start of the literal run that has not been emitted yet.
    let mut pending_from = 0;

    while cursor < total {
        let (offset, len) = find_best_match(dict, &data[cursor..]);

        // No usable match here: the byte joins the pending literal run.
        if len < MIN_MATCH_LEN {
            cursor += 1;
            continue;
        }

        // Emit the bytes skipped since the previous match, if any.
        if pending_from < cursor {
            ops.push(CompressOp::literal(data[pending_from..cursor].to_vec()));
        }

        // NOTE(review): both casts truncate silently. `find_best_match` caps the
        // length at u16::MAX; confirm offsets also fit for dictionaries > 64 KiB.
        ops.push(CompressOp::dict_ref(offset as u16, len as u16));

        cursor += len;
        pending_from = cursor;
    }

    // Whatever trails the last match is one final literal.
    if pending_from < total {
        ops.push(CompressOp::literal(data[pending_from..].to_vec()));
    }

    ops
}

/// Serialize `ops` in order into a single byte buffer.
///
/// Each operation appends its own encoding via `CompressOp::encode`; an empty
/// slice yields an empty buffer.
fn encode_ops(ops: &[CompressOp]) -> Vec<u8> {
    ops.iter().fold(Vec::new(), |mut encoded, op| {
        op.encode(&mut encoded);
        encoded
    })
}

/// Find the best match for the start of `data` inside `dict`.
///
/// Performs a brute-force scan over dictionary offsets and returns
/// `(offset, length)` of the longest common prefix found. Ties keep the
/// lowest offset, and the scan stops at the first match of 64 bytes or more.
/// `(0, 0)` is returned when the dictionary is empty, when `data` is shorter
/// than `MIN_MATCH_LEN`, or when no match reaches `MIN_MATCH_LEN`.
///
/// Both components of the result are guaranteed to fit in a `u16`, because
/// the caller stores them in 16-bit fields. Previously only the length was
/// capped, so a match starting at offset 65536 or later in a large dictionary
/// was truncated by the caller's `as u16` cast and decoded to the wrong bytes.
fn find_best_match(dict: &[u8], data: &[u8]) -> (usize, usize) {
    if dict.is_empty() || data.len() < MIN_MATCH_LEN {
        return (0, 0);
    }

    // Maximum match length we can encode (u16)
    let max_match = data.len().min(u16::MAX as usize);
    // Only offsets 0..=u16::MAX can be encoded; a match may still extend past
    // that point as long as it starts inside the addressable range.
    let offset_limit = dict.len().min(u16::MAX as usize + 1);

    let mut best_offset = 0;
    let mut best_len = 0;

    for offset in 0..offset_limit {
        let max_len = (dict.len() - offset).min(max_match);

        // `max_len` never grows as `offset` advances, so once it cannot beat
        // the current best (or reach the minimum) no later offset can either.
        if max_len <= best_len || max_len < MIN_MATCH_LEN {
            break;
        }

        // Length of the common prefix, bounded by both slices.
        let len = dict[offset..offset + max_len]
            .iter()
            .zip(data)
            .take_while(|(d, s)| d == s)
            .count();

        if len > best_len && len >= MIN_MATCH_LEN {
            best_offset = offset;
            best_len = len;

            // Early exit if we found a very good match
            if best_len >= 64 {
                break;
            }
        }
    }

    (best_offset, best_len)
}

// ═══════════════════════════════════════════════════════════════════════════════
// DECOMPRESSION
// ═══════════════════════════════════════════════════════════════════════════════

/// Decompress a buffer produced by [`compress`] using the matching dictionary.
///
/// # Arguments
/// * `compressed` - Compressed data including header
/// * `dict` - Dictionary to use (must match header dict_id)
///
/// # Returns
/// Decompressed data. A header with an original size of zero yields an empty
/// vector without reading the payload.
///
/// # Errors
/// * [`DictError::InvalidData`] - the header cannot be parsed, the operation
///   stream is malformed, or the output length differs from the header
/// * [`DictError::DictNotFound`] - the header names a different dictionary id
/// * [`DictError::ChecksumMismatch`] - the output does not match the stored checksum
pub fn decompress(compressed: &[u8], dict: &CompressionDict) -> DictResult<Vec<u8>> {
    let header = match CompressedHeader::from_bytes(compressed) {
        Some(parsed) => parsed,
        None => return Err(DictError::InvalidData("invalid header".to_string())),
    };

    // The stream is only meaningful with the dictionary it was built against.
    if header.dict_id != dict.id {
        return Err(DictError::DictNotFound(header.dict_id));
    }

    let expected_len = header.original_size as usize;
    if expected_len == 0 {
        return Ok(Vec::new());
    }

    // Everything after the fixed-size header is the operation stream.
    let payload = &compressed[CompressedHeader::SIZE..];
    let output = decompress_ops(payload, &dict.data, expected_len)?;

    // Checksum first, then length, so a corrupted stream reports the checksum error.
    let actual = compute_checksum(&output);
    let expected = header.checksum;
    if actual != expected {
        return Err(DictError::ChecksumMismatch { expected, actual });
    }

    if output.len() != expected_len {
        return Err(DictError::InvalidData(alloc::format!(
            "size mismatch: expected {}, got {}",
            header.original_size,
            output.len()
        )));
    }

    Ok(output)
}

/// Replay an encoded operation stream against `dict` to rebuild the original bytes.
///
/// `expected_size` is the output length announced by the stream's header. It
/// is treated as an upper bound, not as a trusted allocation size:
///
/// * the output buffer is reserved fallibly, so an absurd size in a corrupt
///   header yields an error instead of an allocation abort;
/// * decoding stops with an error as soon as the output would exceed
///   `expected_size`, so a malformed stream cannot expand without limit.
///
/// # Errors
/// Returns [`DictError::InvalidData`] when the buffer cannot be reserved, an
/// operation cannot be decoded, a dictionary reference points outside `dict`,
/// or the output would grow beyond `expected_size`.
fn decompress_ops(compressed: &[u8], dict: &[u8], expected_size: usize) -> DictResult<Vec<u8>> {
    let mut result = Vec::new();
    if result.try_reserve_exact(expected_size).is_err() {
        return Err(DictError::InvalidData(alloc::format!(
            "cannot allocate {} bytes for decompressed output",
            expected_size
        )));
    }

    let mut pos = 0;

    while pos < compressed.len() {
        let (op, consumed) = CompressOp::decode(&compressed[pos..])
            .ok_or_else(|| DictError::InvalidData("invalid operation".to_string()))?;

        // `decode`'s contract is not visible here; an operation that consumes
        // no input would otherwise make this loop spin forever.
        if consumed == 0 {
            return Err(DictError::InvalidData("invalid operation".to_string()));
        }

        match op {
            CompressOp::DictRef { offset, length } => {
                let start = offset as usize;
                let end = start + length as usize;

                // `get` returns `None` when the range runs past the dictionary.
                let referenced = dict.get(start..end).ok_or_else(|| {
                    DictError::InvalidData(alloc::format!(
                        "dict reference out of bounds: {}..{}",
                        start,
                        end
                    ))
                })?;

                append_bounded(&mut result, referenced, expected_size)?;
            }
            CompressOp::Literal { data } => {
                append_bounded(&mut result, &data, expected_size)?;
            }
        }

        pos += consumed;
    }

    Ok(result)
}

/// Append `chunk` to `out`, failing instead of growing past `limit` bytes.
fn append_bounded(out: &mut Vec<u8>, chunk: &[u8], limit: usize) -> DictResult<()> {
    if chunk.len() > limit.saturating_sub(out.len()) {
        return Err(DictError::InvalidData(alloc::format!(
            "decompressed data exceeds expected size {}",
            limit
        )));
    }

    out.extend_from_slice(chunk);
    Ok(())
}

/// Decompress without verifying dictionary ID (for when dict is provided externally).
///
/// Apart from skipping the dictionary-id comparison, this applies the same
/// validation as [`decompress`]: the output must match both the checksum and
/// the original size stored in the header. The size check was previously
/// missing here, so a stream whose output had the wrong length was accepted
/// whenever the checksum happened to match.
///
/// # Arguments
/// * `compressed` - Compressed data including header
/// * `dict_data` - Raw dictionary bytes the stream was compressed against
///
/// # Errors
/// * [`DictError::InvalidData`] - the header cannot be parsed, the operation
///   stream is malformed, or the output length differs from the header
/// * [`DictError::ChecksumMismatch`] - the output does not match the stored checksum
pub fn decompress_with_dict_data(compressed: &[u8], dict_data: &[u8]) -> DictResult<Vec<u8>> {
    // Parse header
    let header = CompressedHeader::from_bytes(compressed)
        .ok_or_else(|| DictError::InvalidData("invalid header".to_string()))?;

    // Handle empty data
    if header.original_size == 0 {
        return Ok(Vec::new());
    }

    // Decompress
    let compressed_data = &compressed[CompressedHeader::SIZE..];
    let result = decompress_ops(compressed_data, dict_data, header.original_size as usize)?;

    // Verify checksum
    let actual_checksum = compute_checksum(&result);
    if actual_checksum != header.checksum {
        return Err(DictError::ChecksumMismatch {
            expected: header.checksum,
            actual: actual_checksum,
        });
    }

    // Verify size, mirroring `decompress`
    if result.len() != header.original_size as usize {
        return Err(DictError::InvalidData(alloc::format!(
            "size mismatch: expected {}, got {}",
            header.original_size,
            result.len()
        )));
    }

    Ok(result)
}

// ═══════════════════════════════════════════════════════════════════════════════
// COMPRESSION RATIO HELPERS
// ═══════════════════════════════════════════════════════════════════════════════

/// Ratio of compressed size (header included) to original size.
///
/// Values below `1.0` mean the compressed form is smaller than the input.
/// Empty input and compression failures both report a neutral `1.0`.
pub fn compression_ratio(data: &[u8], dict: &CompressionDict) -> f64 {
    let input_len = data.len();
    if input_len == 0 {
        return 1.0;
    }

    compress(data, dict).map_or(1.0, |packed| packed.len() as f64 / input_len as f64)
}

/// Decide whether compressing `data` with `dict` is worth it.
///
/// Inputs shorter than 100 bytes are always rejected without compressing.
/// Longer inputs are compressed once and accepted only when the result is
/// more than 5% smaller than the original.
pub fn is_beneficial(data: &[u8], dict: &CompressionDict) -> bool {
    // Below this length dictionary compression is not attempted at all.
    const MIN_INPUT_LEN: usize = 100;
    // Ratios at or above this value do not save enough to be worthwhile.
    const MAX_USEFUL_RATIO: f64 = 0.95;

    data.len() >= MIN_INPUT_LEN && compression_ratio(data, dict) < MAX_USEFUL_RATIO
}

// ═══════════════════════════════════════════════════════════════════════════════
// TESTS
// ═══════════════════════════════════════════════════════════════════════════════

#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Build a dictionary with id 1 and fixed metadata around the given bytes.
    fn make_dict(data: &[u8]) -> CompressionDict {
        CompressionDict::new(1, "test", data.to_vec(), "*", "ds", 0)
    }

    /// The checksum is deterministic and reacts to a single changed byte.
    #[test]
    fn test_checksum() {
        let data1 = b"hello world";
        let data2 = b"hello worle";

        let sum1 = compute_checksum(data1);
        let sum2 = compute_checksum(data2);

        assert_ne!(sum1, sum2);
        assert_eq!(sum1, compute_checksum(data1));
    }

    /// Empty input compresses to a bare header.
    #[test]
    fn test_compress_empty() {
        let dict = make_dict(b"some dict data");
        let result = compress(&[], &dict).unwrap();

        assert_eq!(result.len(), CompressedHeader::SIZE);
    }

    /// Data mixing dictionary matches and literals survives a round trip.
    #[test]
    fn test_compress_decompress_roundtrip() {
        let dict = make_dict(b"hello world this is a test");
        let data = b"hello world test hello";

        let compressed = compress(data, &dict).unwrap();
        let decompressed = decompress(&compressed, &dict).unwrap();

        assert_eq!(decompressed, data);
    }

    /// Input containing the whole dictionary plus extra text round-trips.
    #[test]
    fn test_compress_with_dict_matches() {
        let dict_data = b"the quick brown fox";
        let dict = make_dict(dict_data);

        // Data that contains dictionary substrings
        let data = b"the quick brown fox jumps over the quick fox";

        let compressed = compress(data, &dict).unwrap();
        let decompressed = decompress(&compressed, &dict).unwrap();

        assert_eq!(decompressed, data);

        // No size assertion: the header adds overhead, so small inputs are
        // not guaranteed to shrink.
    }

    /// Input sharing nothing useful with the dictionary still round-trips.
    #[test]
    fn test_compress_no_dict_match() {
        let dict = make_dict(b"aaaa bbbb cccc");
        let data = b"xxxx yyyy zzzz";

        let compressed = compress(data, &dict).unwrap();
        let decompressed = decompress(&compressed, &dict).unwrap();

        assert_eq!(decompressed, data);
    }

    /// Decompressing with a dictionary of a different id is rejected.
    #[test]
    fn test_decompress_wrong_dict() {
        let dict1 = CompressionDict::new(1, "d1", vec![1, 2, 3], "*", "ds", 0);
        let dict2 = CompressionDict::new(2, "d2", vec![4, 5, 6], "*", "ds", 0);

        let data = b"test data";
        let compressed = compress(data, &dict1).unwrap();

        let result = decompress(&compressed, &dict2);
        assert!(matches!(result, Err(DictError::DictNotFound(1))));
    }

    /// Operation generation produces a non-empty, encodable sequence.
    #[test]
    fn test_compress_to_ops() {
        let dict = b"hello";
        let data = b"hello world hello";

        let ops = compress_to_ops(data, dict);

        // Expected shape: dict_ref(hello), literal( world ), dict_ref(hello),
        // provided MIN_MATCH_LEN allows a 5-byte match.
        assert!(!ops.is_empty());

        // Verify ops can be encoded
        let encoded = encode_ops(&ops);
        assert!(!encoded.is_empty());
    }

    /// Match search returns the longest common prefix and its offset.
    #[test]
    fn test_find_best_match() {
        let dict = b"hello world";

        // "hello " (with space) matches at offset 0, 6 chars
        let (off1, len1) = find_best_match(dict, b"hello there");
        assert_eq!(off1, 0);
        assert_eq!(len1, 6); // "hello " matches

        // "world" sits at offset 6; the match ends where the dictionary ends
        let (off2, len2) = find_best_match(dict, b"world cup");
        assert_eq!(off2, 6);
        assert_eq!(len2, 5); // Just "world"

        let (_, len3) = find_best_match(dict, b"xyz");
        assert_eq!(len3, 0);
    }

    /// The ratio stays within a loose bound for input built from the dictionary.
    #[test]
    fn test_compression_ratio() {
        let dict = make_dict(b"repeated pattern here");
        let data = b"repeated pattern here and repeated pattern here again";

        let ratio = compression_ratio(data, &dict);
        // With good dict matches, ratio should be decent
        assert!(ratio <= 1.5); // Allow some overhead
    }

    /// `is_beneficial` rejects short input and otherwise follows the ratio.
    #[test]
    fn test_is_beneficial() {
        let dict = make_dict(b"common substring");

        // Below the 100-byte threshold the answer is "no" whatever the content.
        assert!(!is_beneficial(b"tiny", &dict));
        let just_below = vec![b'a'; 99];
        assert!(!is_beneficial(&just_below, &dict));

        // At or above the threshold the verdict must follow the measured
        // ratio. The previous sample was only 64 bytes long and therefore
        // never reached this path.
        let mut large_data: Vec<u8> = Vec::new();
        while large_data.len() < 100 {
            large_data.extend_from_slice(b"common substring ");
        }
        assert_eq!(
            is_beneficial(&large_data, &dict),
            compression_ratio(&large_data, &dict) < 0.95
        );
    }

    /// A 10 000-byte cyclic pattern round-trips through a 1 000-byte dictionary.
    #[test]
    fn test_large_data_roundtrip() {
        // Create larger test data
        let dict_data: Vec<u8> = (0..1000).map(|i| (i % 256) as u8).collect();
        let dict = make_dict(&dict_data);

        let mut data = Vec::with_capacity(10000);
        for i in 0..10000 {
            data.push((i % 256) as u8);
        }

        let compressed = compress(&data, &dict).unwrap();
        let decompressed = decompress(&compressed, &dict).unwrap();

        assert_eq!(decompressed, data);
    }

    /// Raw dictionary bytes are enough to decompress; the id is not consulted.
    #[test]
    fn test_decompress_with_dict_data() {
        let dict_data = b"test dictionary";
        let dict = make_dict(dict_data);

        let data = b"test dictionary is used here test";
        let compressed = compress(data, &dict).unwrap();

        // Decompress using just the raw dict data
        let decompressed = decompress_with_dict_data(&compressed, dict_data).unwrap();
        assert_eq!(decompressed, data);
    }
}