// lcpfs 2026.1.102
//
// LCP File System - A ZFS-inspired copy-on-write filesystem for Rust
// Copyright 2025 LunaOS Contributors
// SPDX-License-Identifier: Apache-2.0

//! Shared Compression Dictionaries for LCPFS.
//!
//! This module provides dictionary-based compression that can significantly
//! improve compression ratios for files with similar content (e.g., JSON files,
//! log files, configuration files).
//!
//! # How It Works
//!
//! 1. **Training**: Analyze sample files to find common substrings
//! 2. **Dictionary**: Store these common substrings in a shared dictionary
//! 3. **Compression**: Replace occurrences with dictionary references
//! 4. **Decompression**: Expand references using the dictionary
//!
//! # Example
//!
//! ```ignore
//! use lcpfs::dictcomp::{auto_train_default, compress_auto, decompress_auto};
//!
//! // Train a dictionary from sample files
//! let samples: &[&[u8]] = &[
//!     b"{\"name\": \"Alice\", \"age\": 30}",
//!     b"{\"name\": \"Bob\", \"age\": 25}",
//!     b"{\"name\": \"Charlie\", \"age\": 35}",
//! ];
//!
//! let dict = auto_train_default("json_dict", "pool/data", "*.json", samples, /* timestamp */ 0)?;
//!
//! // Compress a file using the dictionary
//! let data = b"{\"name\": \"Dave\", \"age\": 40}";
//! let compressed = compress_auto("pool/data", "person.json", data)?;
//!
//! // Decompress
//! let decompressed = decompress_auto(&compressed)?;
//! assert_eq!(decompressed, data);
//! ```
//!
//! # Binary Format
//!
//! Compressed data format:
//! ```text
//! Header (24 bytes):
//!   - Magic: "LCDC" (4 bytes)
//!   - Dict ID: u64 (8 bytes)
//!   - Original size: u32 (4 bytes)
//!   - Compressed size: u32 (4 bytes)
//!   - Checksum: u32 (4 bytes)
//!
//! Operations:
//!   - Dict ref: 0x00 [offset:u16][length:u16]
//!   - Literal:  0x01 [length:u16][data...]
//! ```
//!
//! # Best Practices
//!
//! - Train dictionaries with representative samples
//! - Use separate dictionaries for different file types
//! - Monitor compression ratios and retrain if needed
//! - Dictionary size of 32KB is usually optimal

// Submodules: compression/decompression engine, global dictionary store,
// dictionary training, and shared type definitions.
pub mod compress;
pub mod store;
pub mod train;
pub mod types;

// Re-exports
// Core compress/decompress entry points and ratio helpers.
pub use compress::{
    compress, compression_ratio, decompress, decompress_with_dict_data, is_beneficial,
};
// Dictionary registry: auto-training, path-pattern lookup, per-dict and
// global statistics, and the `*_auto` convenience wrappers that resolve
// the dictionary from the dataset/path pair.
pub use store::{
    GlobalStats, auto_train, auto_train_default, clear_all, compress_auto, decompress_auto,
    find_dict_for_path, get_all_stats, get_dict, get_global_stats, get_stats, is_dict_compressed,
    list_dicts, next_dict_id, register_dict, remove_dict,
};
// Dictionary construction from sample corpora.
pub use train::{train, train_dictionary, train_dictionary_default};
// Wire-format constants, header/ops types, errors, and tuning knobs.
pub use types::{
    CompressOp, CompressedHeader, CompressionDict, DEFAULT_DICT_SIZE, DICT_MAGIC, DictError,
    DictResult, DictStats, MAX_DICT_SIZE, MAX_MATCH_LEN, MIN_DICT_SIZE, MIN_MATCH_LEN, OP_DICT_REF,
    OP_LITERAL, SubstringEntry, TrainingOptions,
};

// ═══════════════════════════════════════════════════════════════════════════════
// INTEGRATION TESTS
// ═══════════════════════════════════════════════════════════════════════════════

#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Smoke test: every re-exported constant/type is reachable from the
    /// module root (catches accidental removal from the `pub use` lists).
    #[test]
    fn test_exports_accessible() {
        // Verify all re-exports are accessible
        let _ = DICT_MAGIC;
        let _ = DEFAULT_DICT_SIZE;
        let _ = TrainingOptions::default();
    }

    /// End-to-end: train on samples, register, then compress/decompress
    /// unseen-but-similar data and verify a lossless round trip.
    #[test]
    fn test_full_workflow() {
        // Train dictionary
        let samples: &[&[u8]] = &[
            b"header: value\ncontent: data\n",
            b"header: other\ncontent: more\n",
            b"header: test\ncontent: info\n",
        ];

        // Offset the id so parallel tests sharing the global store don't collide.
        let id = next_dict_id() + 1000; // Avoid conflicts
        let dict = train(
            id,
            "http_headers",
            "*.http",
            "test_pool",
            samples,
            &TrainingOptions::default().with_size(1024),
            12345,
        )
        .unwrap();

        register_dict(dict.clone());

        // Compress new data
        let data = b"header: new\ncontent: fresh\n";
        let compressed = compress(data, &dict).unwrap();

        // Decompress
        let decompressed = decompress(&compressed, &dict).unwrap();
        assert_eq!(decompressed, data);
    }

    /// Round trip for JSON-shaped content, the primary use case cited in the
    /// module docs.
    #[test]
    fn test_json_compression() {
        let samples: &[&[u8]] = &[
            br#"{"type":"user","id":1,"name":"Alice"}"#,
            br#"{"type":"user","id":2,"name":"Bob"}"#,
            br#"{"type":"user","id":3,"name":"Charlie"}"#,
            br#"{"type":"product","id":100,"name":"Widget"}"#,
        ];

        let id = next_dict_id() + 2000;
        let dict = train(
            id,
            "json_dict",
            "*.json",
            "json_pool",
            samples,
            &TrainingOptions::default().with_size(512),
            0,
        )
        .unwrap();

        register_dict(dict.clone());

        // Compress similar JSON
        let data = br#"{"type":"user","id":99,"name":"Dave"}"#;
        let compressed = compress(data, &dict).unwrap();
        let decompressed = decompress(&compressed, &dict).unwrap();

        assert_eq!(decompressed, data);
    }

    /// Round trip for log-line content with a shared timestamp/level prefix.
    #[test]
    fn test_log_compression() {
        let samples: &[&[u8]] = &[
            b"2025-01-01 10:00:00 INFO Starting application\n",
            b"2025-01-01 10:00:01 DEBUG Loading configuration\n",
            b"2025-01-01 10:00:02 INFO Configuration loaded\n",
            b"2025-01-01 10:00:03 WARN Low memory warning\n",
        ];

        let id = next_dict_id() + 3000;
        let dict = train(
            id,
            "log_dict",
            "*.log",
            "log_pool",
            samples,
            &TrainingOptions::default(),
            0,
        )
        .unwrap();

        register_dict(dict.clone());

        let data = b"2025-01-01 10:00:04 INFO Processing complete\n";
        let compressed = compress(data, &dict).unwrap();
        let decompressed = decompress(&compressed, &dict).unwrap();

        assert_eq!(decompressed, data);
    }

    /// The `*_auto` path: `auto_train` registers the dictionary itself, so
    /// `compress_auto`/`decompress_auto` must resolve it by dataset + pattern.
    #[test]
    fn test_auto_workflow() {
        // Use unique dataset name
        let samples: &[&[u8]] = &[
            b"config: value1\noption: true\n",
            b"config: value2\noption: false\n",
        ];

        // Binding intentionally unused: auto_train registers the dictionary
        // in the global store as a side effect, which is what this test needs.
        let _dict = auto_train(
            "auto_config",
            "unique_auto_ds",
            "*.conf",
            samples,
            &TrainingOptions::default().with_size(512),
            0,
        )
        .unwrap();

        // Compress using auto
        let data = b"config: value3\noption: true\n";
        let compressed = compress_auto("unique_auto_ds", "app.conf", data).unwrap();

        // Decompress using auto
        let decompressed = decompress_auto(&compressed).unwrap();
        assert_eq!(decompressed, data);
    }

    /// `find_dict_for_path` must match on the registered glob pattern within
    /// a dataset, and return `None` when no pattern matches.
    #[test]
    fn test_dictionary_pattern_matching() {
        let id1 = next_dict_id() + 4000;
        let dict1 = CompressionDict::new(id1, "json", vec![1, 2], "*.json", "pattern_ds", 0);
        register_dict(dict1);

        let id2 = next_dict_id() + 4001;
        let dict2 = CompressionDict::new(id2, "xml", vec![3, 4], "*.xml", "pattern_ds", 0);
        register_dict(dict2);

        // Should find json dict
        let found_json = find_dict_for_path("pattern_ds", "data.json");
        assert!(found_json.is_some());
        assert_eq!(found_json.unwrap().id, id1);

        // Should find xml dict
        let found_xml = find_dict_for_path("pattern_ds", "data.xml");
        assert!(found_xml.is_some());
        assert_eq!(found_xml.unwrap().id, id2);

        // Should not find anything
        let not_found = find_dict_for_path("pattern_ds", "data.txt");
        assert!(not_found.is_none());
    }

    /// Registering a dictionary and compressing through it must be reflected
    /// in the global statistics counters.
    #[test]
    fn test_statistics_tracking() {
        let id = next_dict_id() + 5000;
        let dict = CompressionDict::new(
            id,
            "stats_dict",
            b"common data".to_vec(),
            "*",
            "stats_track_ds",
            0,
        );
        register_dict(dict.clone());

        // Compress some data; results are discarded — only the side effect
        // on the stats counters matters here.
        let data1 = b"common data is here";
        let data2 = b"more common data";

        let _ = compress(data1, &dict);
        let _ = compress(data2, &dict);

        // Check global stats (>= because other tests share the global store).
        let global = get_global_stats();
        assert!(global.dict_count >= 1);
        assert!(global.dict_bytes > 0);
    }

    /// `compression_ratio` must return a positive estimate for data that
    /// overlaps the dictionary content.
    #[test]
    fn test_compression_ratio_check() {
        let dict = CompressionDict::new(
            next_dict_id() + 6000,
            "ratio_dict",
            b"repeated pattern here is a long string".to_vec(),
            "*",
            "ratio_ds",
            0,
        );

        // Data with dict matches should have good ratio
        let data = b"repeated pattern here is a long string and repeated pattern here again";
        let ratio = compression_ratio(data, &dict);

        // Ratio should be reasonable
        assert!(ratio > 0.0);
    }

    /// Header round trip: serialized header starts with the magic and every
    /// field survives `to_bytes` → `from_bytes` intact.
    #[test]
    fn test_header_format() {
        let header = CompressedHeader::new(12345, 1000, 500, 0xABCD);

        let bytes = header.to_bytes();
        assert_eq!(&bytes[0..4], &DICT_MAGIC);

        let parsed = CompressedHeader::from_bytes(&bytes).unwrap();
        assert_eq!(parsed.dict_id, 12345);
        assert_eq!(parsed.original_size, 1000);
        assert_eq!(parsed.compressed_size, 500);
        assert_eq!(parsed.checksum, 0xABCD);
    }
}