lcpfs 2026.1.102

// Copyright 2025 LunaOS Contributors
// SPDX-License-Identifier: Apache-2.0

//! Compression dictionary types.
//!
//! This module defines types for shared compression dictionaries
//! that can improve compression ratios for similar files.

use alloc::collections::BTreeMap;
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use core::fmt;

// ═══════════════════════════════════════════════════════════════════════════════
// CONSTANTS
// ═══════════════════════════════════════════════════════════════════════════════

/// Magic number for dictionary-compressed data.
///
/// The four bytes are the ASCII string `"LCDC"`. `CompressedHeader` stores
/// them at the very start of its serialized form.
pub const DICT_MAGIC: [u8; 4] = [0x4C, 0x43, 0x44, 0x43]; // "LCDC"

/// Default dictionary size (32KB).
///
/// Used as the initial `dict_size` of `TrainingOptions::default()`.
pub const DEFAULT_DICT_SIZE: usize = 32 * 1024;

/// Minimum dictionary size (4KB).
///
/// Lower clamp bound applied by `TrainingOptions::with_size`.
pub const MIN_DICT_SIZE: usize = 4 * 1024;

/// Maximum dictionary size (256KB).
///
/// Upper clamp bound applied by `TrainingOptions::with_size`.
pub const MAX_DICT_SIZE: usize = 256 * 1024;

/// Minimum match length for dictionary references.
///
/// Also serves as the default and the lower bound for
/// `TrainingOptions::min_substring_len`.
pub const MIN_MATCH_LEN: usize = 4;

/// Maximum match length (65535 bytes, i.e. 64 KiB - 1).
///
/// This is the largest value a `u16` can hold, which is the width of the
/// `length` field in `CompressOp::DictRef`.
pub const MAX_MATCH_LEN: usize = 65535;

/// Op code for dictionary reference.
///
/// First byte of an encoded `CompressOp::DictRef` record.
pub const OP_DICT_REF: u8 = 0x00;

/// Op code for literal data.
///
/// First byte of an encoded `CompressOp::Literal` record.
pub const OP_LITERAL: u8 = 0x01;

// ═══════════════════════════════════════════════════════════════════════════════
// COMPRESSION DICTIONARY
// ═══════════════════════════════════════════════════════════════════════════════

/// A compression dictionary containing common substrings.
///
/// Besides the raw dictionary bytes, each dictionary carries descriptive
/// metadata: the path pattern it applies to, the dataset it belongs to, and
/// the figures recorded when it was trained.
#[derive(Debug, Clone)]
pub struct CompressionDict {
    /// Unique dictionary ID.
    pub id: u64,
    /// Dictionary name (descriptive).
    pub name: String,
    /// Raw dictionary data.
    pub data: Vec<u8>,
    /// Pattern this dictionary is for (e.g., "*.json").
    pub pattern: String,
    /// Dataset this dictionary belongs to.
    pub dataset: String,
    /// Creation timestamp.
    pub created: u64,
    /// Number of samples used for training.
    pub sample_count: u32,
    /// Average compression ratio achieved.
    pub avg_ratio: f32,
}

impl CompressionDict {
    /// Create a new compression dictionary.
    ///
    /// The string arguments are copied into owned `String`s. Training
    /// metadata (`sample_count`, `avg_ratio`) starts at zero; use
    /// [`with_training_info`](Self::with_training_info) to fill it in.
    pub fn new(
        id: u64,
        name: &str,
        data: Vec<u8>,
        pattern: &str,
        dataset: &str,
        created: u64,
    ) -> Self {
        Self {
            id,
            name: name.to_string(),
            data,
            pattern: pattern.to_string(),
            dataset: dataset.to_string(),
            created,
            sample_count: 0,
            avg_ratio: 0.0,
        }
    }

    /// Set training metadata, consuming and returning the dictionary so the
    /// call can be chained onto [`new`](Self::new).
    pub fn with_training_info(mut self, sample_count: u32, avg_ratio: f32) -> Self {
        self.sample_count = sample_count;
        self.avg_ratio = avg_ratio;
        self
    }

    /// Get dictionary size (length of the raw dictionary data in bytes).
    pub fn size(&self) -> usize {
        self.data.len()
    }

    /// Check whether this dictionary's pattern matches `path`.
    ///
    /// Only a small subset of glob syntax is understood:
    ///
    /// * an empty pattern or `*` matches every path;
    /// * `*.ext` matches any path ending in `.ext` — the dot is part of the
    ///   match, so `*.json` accepts `a.json` but rejects `notjson`;
    /// * `prefix*` matches any path starting with `prefix`;
    /// * any other pattern must equal `path` exactly.
    pub fn matches_pattern(&self, path: &str) -> bool {
        let pattern = self.pattern.as_str();

        if pattern.is_empty() || pattern == "*" {
            return true;
        }

        // Extension form: drop only the leading `*` so the `.` separator is
        // still required in the path. (`*` is one ASCII byte, so slicing at
        // index 1 is always on a character boundary.)
        if pattern.starts_with("*.") {
            return path.ends_with(&pattern[1..]);
        }

        // Prefix form: everything before the trailing `*` must lead the path.
        if let Some(prefix) = pattern.strip_suffix('*') {
            return path.starts_with(prefix);
        }

        path == pattern
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// SUBSTRING ENTRY
// ═══════════════════════════════════════════════════════════════════════════════

/// Entry for tracking substring frequency during training.
#[derive(Debug, Clone)]
pub struct SubstringEntry {
    /// The substring data.
    pub data: Vec<u8>,
    /// Number of occurrences.
    pub count: u32,
    /// Total bytes saved if included in dictionary.
    pub savings: u64,
}

impl SubstringEntry {
    /// Create a new substring entry representing a single occurrence.
    ///
    /// `savings` starts at zero: one occurrence alone saves nothing, at
    /// least two are needed.
    pub fn new(data: Vec<u8>) -> Self {
        Self {
            data,
            count: 1,
            savings: 0,
        }
    }

    /// Record one more occurrence and recompute the savings estimate.
    ///
    /// For substrings longer than 4 bytes the estimate is
    /// `(count - 1) * (len - 4)`; shorter substrings leave `savings`
    /// untouched. The counter saturates at `u32::MAX` instead of overflowing,
    /// and the product saturates at `u64::MAX`.
    ///
    /// NOTE(review): the per-reference cost of 4 bytes is hard-coded here,
    /// while `CompressOp::DictRef` encodes to 5 bytes — confirm which figure
    /// is intended.
    pub fn increment(&mut self) {
        self.count = self.count.saturating_add(1);

        let len = self.data.len();
        if len > 4 {
            // After the saturating increment `count` is at least 1, so the
            // subtraction cannot underflow.
            let repeats = u64::from(self.count) - 1;
            // Each additional occurrence saves (length - 4) bytes.
            self.savings = repeats.saturating_mul((len - 4) as u64);
        }
    }

    /// Score for dictionary inclusion (higher = better).
    ///
    /// Currently this is just the accumulated `savings` estimate.
    pub fn score(&self) -> u64 {
        self.savings
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// COMPRESSION HEADER
// ═══════════════════════════════════════════════════════════════════════════════

/// Header for dictionary-compressed data.
#[derive(Debug, Clone, Copy)]
pub struct CompressedHeader {
    /// Magic number (LCDC).
    pub magic: [u8; 4],
    /// Dictionary ID used.
    pub dict_id: u64,
    /// Original data size.
    pub original_size: u32,
    /// Compressed data size (excluding header).
    pub compressed_size: u32,
    /// Checksum of original data.
    pub checksum: u32,
}

impl CompressedHeader {
    /// Header size in bytes.
    pub const SIZE: usize = 24;

    /// Create a new header.
    pub fn new(dict_id: u64, original_size: u32, compressed_size: u32, checksum: u32) -> Self {
        Self {
            magic: DICT_MAGIC,
            dict_id,
            original_size,
            compressed_size,
            checksum,
        }
    }

    /// Serialize header to bytes.
    pub fn to_bytes(&self) -> [u8; 24] {
        let mut buf = [0u8; 24];
        buf[0..4].copy_from_slice(&self.magic);
        buf[4..12].copy_from_slice(&self.dict_id.to_le_bytes());
        buf[12..16].copy_from_slice(&self.original_size.to_le_bytes());
        buf[16..20].copy_from_slice(&self.compressed_size.to_le_bytes());
        buf[20..24].copy_from_slice(&self.checksum.to_le_bytes());
        buf
    }

    /// Deserialize header from bytes.
    pub fn from_bytes(data: &[u8]) -> Option<Self> {
        if data.len() < 24 {
            return None;
        }

        let mut magic = [0u8; 4];
        magic.copy_from_slice(&data[0..4]);

        if magic != DICT_MAGIC {
            return None;
        }

        let dict_id = u64::from_le_bytes(data[4..12].try_into().ok()?);
        let original_size = u32::from_le_bytes(data[12..16].try_into().ok()?);
        let compressed_size = u32::from_le_bytes(data[16..20].try_into().ok()?);
        let checksum = u32::from_le_bytes(data[20..24].try_into().ok()?);

        Some(Self {
            magic,
            dict_id,
            original_size,
            compressed_size,
            checksum,
        })
    }

    /// Check if data starts with dictionary compression magic.
    pub fn is_dict_compressed(data: &[u8]) -> bool {
        data.len() >= 4 && data[0..4] == DICT_MAGIC
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// COMPRESSION OPERATIONS
// ═══════════════════════════════════════════════════════════════════════════════

/// A single compression operation.
///
/// A compressed stream is a sequence of encoded operations. Each starts with
/// a one-byte op code (`OP_DICT_REF` or `OP_LITERAL`) followed by
/// little-endian `u16` fields.
///
/// NOTE(review): `offset` is a `u16`, so a reference can only start within
/// the first 64 KiB of a dictionary, while `MAX_DICT_SIZE` allows 256 KiB —
/// confirm this is intended.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CompressOp {
    /// Reference to dictionary: offset and length.
    DictRef {
        /// Offset into the dictionary.
        offset: u16,
        /// Length of the reference.
        length: u16,
    },
    /// Literal data.
    Literal {
        /// The literal data bytes.
        data: Vec<u8>,
    },
}

impl CompressOp {
    /// Largest payload one encoded literal record can carry; its length
    /// prefix is a `u16`.
    const MAX_LITERAL_CHUNK: usize = u16::MAX as usize;

    /// Number of literal records needed to encode `len` payload bytes.
    /// An empty literal still occupies one (empty) record.
    fn literal_record_count(len: usize) -> usize {
        if len == 0 {
            1
        } else {
            1 + (len - 1) / Self::MAX_LITERAL_CHUNK
        }
    }

    /// Create a dictionary reference.
    pub fn dict_ref(offset: u16, length: u16) -> Self {
        Self::DictRef { offset, length }
    }

    /// Create a literal.
    pub fn literal(data: Vec<u8>) -> Self {
        Self::Literal { data }
    }

    /// Get the number of bytes [`encode`](Self::encode) appends for this
    /// operation.
    ///
    /// A dictionary reference always takes 5 bytes. A literal takes its
    /// payload plus a 3-byte record header for every started block of 65535
    /// payload bytes (at least one header, even for an empty literal).
    pub fn encoded_size(&self) -> usize {
        match self {
            // op(1) + offset(2) + length(2)
            Self::DictRef { .. } => 5,
            // per record: op(1) + length(2), then the payload itself
            Self::Literal { data } => 3 * Self::literal_record_count(data.len()) + data.len(),
        }
    }

    /// Get output size (decompressed).
    pub fn output_size(&self) -> usize {
        match self {
            Self::DictRef { length, .. } => *length as usize,
            Self::Literal { data } => data.len(),
        }
    }

    /// Encode the operation, appending its bytes to `buf`.
    ///
    /// A literal longer than 65535 bytes does not fit the `u16` length
    /// prefix, so it is written as several consecutive literal records.
    /// Decoding them in sequence yields the same bytes. Literals of 65535
    /// bytes or fewer are written as a single record.
    pub fn encode(&self, buf: &mut Vec<u8>) {
        match self {
            Self::DictRef { offset, length } => {
                buf.push(OP_DICT_REF);
                buf.extend_from_slice(&offset.to_le_bytes());
                buf.extend_from_slice(&length.to_le_bytes());
            }
            Self::Literal { data } => {
                if data.is_empty() {
                    // `chunks` yields nothing for an empty slice, but an
                    // empty literal is still encoded as one zero-length record.
                    buf.push(OP_LITERAL);
                    buf.extend_from_slice(&0u16.to_le_bytes());
                    return;
                }
                for chunk in data.chunks(Self::MAX_LITERAL_CHUNK) {
                    buf.push(OP_LITERAL);
                    // `chunk.len()` is at most `u16::MAX`, so the cast is lossless.
                    buf.extend_from_slice(&(chunk.len() as u16).to_le_bytes());
                    buf.extend_from_slice(chunk);
                }
            }
        }
    }

    /// Decode one operation from the front of `data`, returning
    /// `(op, bytes_consumed)`.
    ///
    /// Returns `None` if `data` is empty, starts with an unknown op code, or
    /// is too short for the record it announces.
    pub fn decode(data: &[u8]) -> Option<(Self, usize)> {
        let (&opcode, rest) = data.split_first()?;

        match opcode {
            OP_DICT_REF => {
                let fields = rest.get(..4)?;
                let offset = u16::from_le_bytes([fields[0], fields[1]]);
                let length = u16::from_le_bytes([fields[2], fields[3]]);
                Some((Self::DictRef { offset, length }, 5))
            }
            OP_LITERAL => {
                let prefix = rest.get(..2)?;
                let length = usize::from(u16::from_le_bytes([prefix[0], prefix[1]]));
                // `get` rejects a payload that runs past the end of the input.
                let payload = rest.get(2..2 + length)?;
                Some((
                    Self::Literal {
                        data: payload.to_vec(),
                    },
                    3 + length,
                ))
            }
            _ => None,
        }
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// TRAINING OPTIONS
// ═══════════════════════════════════════════════════════════════════════════════

/// Options for dictionary training.
///
/// Build a value with [`Default::default`] and refine it with the chained
/// setters below. The setters keep `min_substring_len <= max_substring_len`;
/// direct writes to the public fields are not checked.
#[derive(Debug, Clone)]
pub struct TrainingOptions {
    /// Target dictionary size.
    pub dict_size: usize,
    /// Minimum substring length to consider.
    pub min_substring_len: usize,
    /// Maximum substring length to consider.
    pub max_substring_len: usize,
    /// Minimum occurrences to include in dictionary.
    pub min_occurrences: u32,
}

impl Default for TrainingOptions {
    /// Defaults: `DEFAULT_DICT_SIZE` bytes, substrings of `MIN_MATCH_LEN`
    /// to 256 bytes, at least 2 occurrences.
    fn default() -> Self {
        Self {
            dict_size: DEFAULT_DICT_SIZE,
            min_substring_len: MIN_MATCH_LEN,
            max_substring_len: 256,
            min_occurrences: 2,
        }
    }
}

impl TrainingOptions {
    /// Set the target dictionary size, clamped to
    /// `MIN_DICT_SIZE..=MAX_DICT_SIZE`.
    pub fn with_size(mut self, size: usize) -> Self {
        self.dict_size = size.clamp(MIN_DICT_SIZE, MAX_DICT_SIZE);
        self
    }

    /// Set minimum substring length (never below `MIN_MATCH_LEN`).
    ///
    /// If the new minimum exceeds the current maximum, the maximum is raised
    /// to match so the length range never becomes empty.
    pub fn min_len(mut self, len: usize) -> Self {
        self.min_substring_len = len.max(MIN_MATCH_LEN);
        self.max_substring_len = self.max_substring_len.max(self.min_substring_len);
        self
    }

    /// Set maximum substring length.
    ///
    /// Values below the current minimum substring length are raised to that
    /// minimum so the length range never becomes empty.
    pub fn max_len(mut self, len: usize) -> Self {
        self.max_substring_len = len.max(self.min_substring_len);
        self
    }

    /// Set minimum occurrences (never below 2, since a substring seen once
    /// cannot be shared).
    pub fn min_count(mut self, count: u32) -> Self {
        self.min_occurrences = count.max(2);
        self
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// DICTIONARY STATISTICS
// ═══════════════════════════════════════════════════════════════════════════════

/// Running usage counters for a dictionary.
///
/// All counters start at zero (`Default`) and are only ever increased by the
/// `record_*` methods.
#[derive(Debug, Clone, Default)]
pub struct DictStats {
    /// Total compressions using this dictionary.
    pub compressions: u64,
    /// Total decompressions using this dictionary.
    pub decompressions: u64,
    /// Total bytes compressed.
    pub bytes_in: u64,
    /// Total bytes output (after compression).
    pub bytes_out: u64,
    /// Dictionary hits (successful references).
    pub dict_hits: u64,
    /// Dictionary misses (literals used).
    pub dict_misses: u64,
}

impl DictStats {
    /// Ratio of output bytes to input bytes (`bytes_out / bytes_in`), so a
    /// smaller value means better compression.
    ///
    /// Yields `1.0` when nothing has been compressed yet.
    pub fn compression_ratio(&self) -> f64 {
        match self.bytes_in {
            0 => 1.0,
            consumed => self.bytes_out as f64 / consumed as f64,
        }
    }

    /// Fraction of lookups that were dictionary hits, in `0.0..=1.0`.
    ///
    /// Yields `0.0` when no hits or misses have been recorded.
    pub fn hit_rate(&self) -> f64 {
        let lookups = self.dict_hits + self.dict_misses;
        if lookups > 0 {
            self.dict_hits as f64 / lookups as f64
        } else {
            0.0
        }
    }

    /// Fold the outcome of one compression run into the totals.
    pub fn record_compression(
        &mut self,
        input_size: u64,
        output_size: u64,
        hits: u64,
        misses: u64,
    ) {
        self.dict_misses += misses;
        self.dict_hits += hits;
        self.bytes_out += output_size;
        self.bytes_in += input_size;
        self.compressions += 1;
    }

    /// Count one decompression.
    pub fn record_decompression(&mut self) {
        self.decompressions += 1;
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// ERRORS
// ═══════════════════════════════════════════════════════════════════════════════

/// Errors reported by dictionary training, lookup and (de)compression.
#[derive(Debug, Clone)]
pub enum DictError {
    /// Invalid dictionary size.
    InvalidDictSize(usize),
    /// Dictionary not found.
    DictNotFound(u64),
    /// Not enough samples for training.
    InsufficientSamples(usize),
    /// Invalid compressed data.
    InvalidData(String),
    /// Checksum mismatch.
    ChecksumMismatch {
        /// Expected checksum value.
        expected: u32,
        /// Actual checksum value.
        actual: u32,
    },
    /// Dictionary too large.
    DictTooLarge(usize),
    /// No matching dictionary found.
    NoMatchingDict(String),
}

impl fmt::Display for DictError {
    /// Render a short, lower-case, human-readable description of the error,
    /// including the payload carried by the variant.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            DictError::InvalidDictSize(bytes) => {
                write!(f, "invalid dictionary size: {}", bytes)
            }
            DictError::DictTooLarge(bytes) => {
                write!(f, "dictionary too large: {}", bytes)
            }
            DictError::DictNotFound(dict_id) => {
                write!(f, "dictionary not found: {}", dict_id)
            }
            DictError::NoMatchingDict(ref target) => {
                write!(f, "no matching dictionary for: {}", target)
            }
            DictError::InsufficientSamples(have) => {
                write!(f, "insufficient samples: {} (need at least 2)", have)
            }
            DictError::InvalidData(ref reason) => {
                write!(f, "invalid compressed data: {}", reason)
            }
            DictError::ChecksumMismatch { expected, actual } => write!(
                f,
                "checksum mismatch: expected {}, got {}",
                expected, actual
            ),
        }
    }
}

/// Shorthand for a `Result` whose error type is [`DictError`].
pub type DictResult<T> = Result<T, DictError>;

// ═══════════════════════════════════════════════════════════════════════════════
// TESTS
// ═══════════════════════════════════════════════════════════════════════════════

// Unit tests for the types in this module. Each test builds values directly
// and checks field contents, serialized layouts or formatted output.
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::format;
    use alloc::vec;

    // Construction stores the given id and data; an extension pattern
    // accepts a path with that extension and rejects a different one.
    #[test]
    fn test_compression_dict() {
        let dict = CompressionDict::new(
            1,
            "json_dict",
            vec![1, 2, 3, 4],
            "*.json",
            "pool/data",
            12345,
        );

        assert_eq!(dict.id, 1);
        assert_eq!(dict.size(), 4);
        assert!(dict.matches_pattern("file.json"));
        assert!(!dict.matches_pattern("file.xml"));
    }

    // Exercises the three wildcard forms: `*.ext`, `prefix*` and bare `*`.
    #[test]
    fn test_pattern_matching() {
        let dict = CompressionDict::new(1, "test", vec![], "*.log", "ds", 0);

        assert!(dict.matches_pattern("app.log"));
        assert!(dict.matches_pattern("/var/log/system.log"));
        assert!(!dict.matches_pattern("app.txt"));

        let dict2 = CompressionDict::new(1, "test", vec![], "/var/*", "ds", 0);
        assert!(dict2.matches_pattern("/var/log"));
        assert!(!dict2.matches_pattern("/home/log"));

        let dict3 = CompressionDict::new(1, "test", vec![], "*", "ds", 0);
        assert!(dict3.matches_pattern("anything"));
    }

    // An 8-byte substring saves (8 - 4) bytes for every occurrence after
    // the first.
    #[test]
    fn test_substring_entry() {
        let mut entry = SubstringEntry::new(vec![1, 2, 3, 4, 5, 6, 7, 8]);

        assert_eq!(entry.count, 1);
        assert_eq!(entry.savings, 0); // First occurrence doesn't save

        entry.increment();
        assert_eq!(entry.count, 2);
        assert_eq!(entry.savings, 4); // 8 - 4 = 4 bytes saved

        entry.increment();
        assert_eq!(entry.count, 3);
        assert_eq!(entry.savings, 8); // 2 * 4 = 8 bytes saved
    }

    // Header serialization round-trips every field and starts with the magic.
    #[test]
    fn test_compressed_header() {
        let header = CompressedHeader::new(12345, 1000, 500, 0xDEADBEEF);

        let bytes = header.to_bytes();
        assert_eq!(bytes.len(), 24);
        assert_eq!(&bytes[0..4], &DICT_MAGIC);

        let parsed = CompressedHeader::from_bytes(&bytes).unwrap();
        assert_eq!(parsed.dict_id, 12345);
        assert_eq!(parsed.original_size, 1000);
        assert_eq!(parsed.compressed_size, 500);
        assert_eq!(parsed.checksum, 0xDEADBEEF);
    }

    // Magic detection needs at least four bytes and an exact match on them;
    // trailing bytes are irrelevant.
    #[test]
    fn test_header_magic_check() {
        assert!(CompressedHeader::is_dict_compressed(&DICT_MAGIC));
        assert!(CompressedHeader::is_dict_compressed(&[
            0x4C, 0x43, 0x44, 0x43, 0x00
        ]));
        assert!(!CompressedHeader::is_dict_compressed(&[
            0x00, 0x00, 0x00, 0x00
        ]));
        assert!(!CompressedHeader::is_dict_compressed(&[0x4C]));
    }

    // A dictionary reference encodes to 5 bytes and decodes back unchanged.
    #[test]
    fn test_compress_op_dict_ref() {
        let op = CompressOp::dict_ref(100, 50);
        assert_eq!(op.encoded_size(), 5);
        assert_eq!(op.output_size(), 50);

        let mut buf = Vec::new();
        op.encode(&mut buf);
        assert_eq!(buf.len(), 5);

        let (decoded, consumed) = CompressOp::decode(&buf).unwrap();
        assert_eq!(consumed, 5);
        assert_eq!(decoded, op);
    }

    // A literal encodes to a 3-byte record header plus its payload and
    // decodes back unchanged.
    #[test]
    fn test_compress_op_literal() {
        let op = CompressOp::literal(vec![1, 2, 3, 4, 5]);
        assert_eq!(op.encoded_size(), 8); // 1 + 2 + 5
        assert_eq!(op.output_size(), 5);

        let mut buf = Vec::new();
        op.encode(&mut buf);
        assert_eq!(buf.len(), 8);

        let (decoded, consumed) = CompressOp::decode(&buf).unwrap();
        assert_eq!(consumed, 8);
        assert_eq!(decoded, op);
    }

    // Builder setters store in-range values as given.
    #[test]
    fn test_training_options() {
        let opts = TrainingOptions::default()
            .with_size(16 * 1024)
            .min_len(8)
            .min_count(3);

        assert_eq!(opts.dict_size, 16 * 1024);
        assert_eq!(opts.min_substring_len, 8);
        assert_eq!(opts.min_occurrences, 3);
    }

    // One recorded compression: ratio is 500/1000 and hit rate is 100/120.
    #[test]
    fn test_dict_stats() {
        let mut stats = DictStats::default();

        stats.record_compression(1000, 500, 100, 20);
        assert_eq!(stats.compressions, 1);
        assert_eq!(stats.bytes_in, 1000);
        assert_eq!(stats.bytes_out, 500);
        assert_eq!(stats.compression_ratio(), 0.5);
        assert!((stats.hit_rate() - 0.833).abs() < 0.01);
    }

    // Display output includes the variant's payload or a recognizable keyword.
    #[test]
    fn test_error_display() {
        let e = DictError::InvalidDictSize(100);
        assert!(format!("{}", e).contains("100"));

        let e = DictError::DictNotFound(42);
        assert!(format!("{}", e).contains("42"));

        let e = DictError::ChecksumMismatch {
            expected: 1,
            actual: 2,
        };
        assert!(format!("{}", e).contains("mismatch"));
    }
}