// firecloud-storage 0.2.0
//
// Chunking, compression, and local storage for FireCloud distributed storage.
//! FastCDC content-defined chunking

use crate::StorageResult;
use fastcdc::v2020::FastCDC;
use firecloud_core::{Chunk, ChunkHash, ChunkMetadata, CompressionType};
use std::io::Read;
use tracing::debug;

/// Configuration for file chunking
/// Configuration for file chunking.
///
/// The three sizes are forwarded verbatim to `FastCDC::new` in
/// [`FileChunker::chunk_bytes`]; they should satisfy
/// `min_size <= avg_size <= max_size`.
// NOTE(review): the fastcdc v2020 constructor also enforces absolute
// lower/upper bounds on each value — confirm against the fastcdc crate
// docs before accepting arbitrary user-supplied configs.
#[derive(Debug, Clone)]
pub struct ChunkingConfig {
    /// Minimum chunk size in bytes
    pub min_size: u32,
    /// Average chunk size in bytes (target; actual chunks vary around this)
    pub avg_size: u32,
    /// Maximum chunk size in bytes
    pub max_size: u32,
}

impl Default for ChunkingConfig {
    /// Defaults tuned for ~1 MB average chunks: 64 KB minimum, 4 MB maximum.
    fn default() -> Self {
        const KIB: u32 = 1024;
        Self {
            min_size: 64 * KIB,        // 64 KB
            avg_size: KIB * KIB,       // 1 MB
            max_size: 4 * KIB * KIB,   // 4 MB
        }
    }
}

/// Chunks files using FastCDC algorithm
/// Chunks files using FastCDC algorithm.
///
/// Holds only the size configuration; each call to `chunk_bytes` /
/// `chunk_reader` constructs a fresh `FastCDC` cutter over the input.
pub struct FileChunker {
    // Size bounds (min/avg/max) handed to FastCDC on every chunking call.
    config: ChunkingConfig,
}

impl FileChunker {
    /// Create a new chunker with the default [`ChunkingConfig`].
    pub fn new() -> Self {
        Self::with_config(ChunkingConfig::default())
    }

    /// Create a chunker with a custom config.
    pub fn with_config(config: ChunkingConfig) -> Self {
        Self { config }
    }

    /// Chunk a file from a reader.
    ///
    /// Buffers the entire stream in memory before chunking (MVP approach;
    /// a streaming variant is planned), then delegates to [`Self::chunk_bytes`].
    ///
    /// # Errors
    /// Propagates any I/O error from `read_to_end`.
    pub fn chunk_reader<R: Read>(&self, mut reader: R) -> StorageResult<Vec<Chunk>> {
        let mut buf = Vec::new();
        reader.read_to_end(&mut buf)?;
        self.chunk_bytes(&buf)
    }

    /// Chunk raw bytes with FastCDC.
    ///
    /// Empty input yields an empty chunk list. Each produced chunk carries a
    /// content hash, uncompressed/unencrypted metadata, and an owned copy of
    /// its byte range.
    pub fn chunk_bytes(&self, data: &[u8]) -> StorageResult<Vec<Chunk>> {
        if data.is_empty() {
            return Ok(Vec::new());
        }

        let cutter = FastCDC::new(
            data,
            self.config.min_size,
            self.config.avg_size,
            self.config.max_size,
        );

        let chunks: Vec<Chunk> = cutter
            .map(|cut| {
                let slice = &data[cut.offset..cut.offset + cut.length];
                let hash = ChunkHash::hash(slice);

                debug!(
                    "Chunk: offset={}, size={}, hash={}",
                    cut.offset, cut.length, hash
                );

                Chunk {
                    metadata: ChunkMetadata {
                        hash,
                        size: cut.length as u64,
                        original_size: cut.length as u64,
                        compression: CompressionType::None,
                        encrypted: false,
                    },
                    // Owned copy per chunk; chunks do not borrow from `data`.
                    data: bytes::Bytes::copy_from_slice(slice),
                }
            })
            .collect();

        debug!("Chunked {} bytes into {} chunks", data.len(), chunks.len());

        Ok(chunks)
    }

    /// Reassemble chunks back into the original contiguous data.
    ///
    /// Concatenates chunk payloads in order; capacity is reserved up front
    /// so the output is built with a single allocation.
    pub fn reassemble(chunks: &[Chunk]) -> bytes::Bytes {
        let total: usize = chunks.iter().map(|c| c.data.len()).sum();

        let mut assembled = Vec::with_capacity(total);
        chunks
            .iter()
            .for_each(|chunk| assembled.extend_from_slice(&chunk.data));

        bytes::Bytes::from(assembled)
    }
}

impl Default for FileChunker {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A file smaller than `min_size` must come back as exactly one chunk,
    /// with metadata sizes matching the payload.
    #[test]
    fn test_chunk_small_file() {
        let chunker = FileChunker::new();
        let data = b"Hello, FireCloud! This is a test file.";

        let chunks = chunker.chunk_bytes(data).unwrap();

        // Small file should produce single chunk
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].data.as_ref(), data);
        // Metadata must agree with the payload: no compression, so
        // size == original_size == payload length.
        assert_eq!(chunks[0].metadata.size, data.len() as u64);
        assert_eq!(chunks[0].metadata.original_size, data.len() as u64);
    }

    /// Chunk-then-reassemble must round-trip byte-for-byte.
    #[test]
    fn test_reassemble() {
        let chunker = FileChunker::new();

        // Create data larger than min chunk size
        let data: Vec<u8> = (0..200_000).map(|i| (i % 256) as u8).collect();

        let chunks = chunker.chunk_bytes(&data).unwrap();
        let reassembled = FileChunker::reassemble(&chunks);

        assert_eq!(reassembled.as_ref(), data.as_slice());
    }

    /// Empty input yields no chunks, and reassembling an empty chunk list
    /// yields empty bytes (degenerate round-trip).
    #[test]
    fn test_empty_file() {
        let chunker = FileChunker::new();
        let chunks = chunker.chunk_bytes(&[]).unwrap();
        assert!(chunks.is_empty());
        assert!(FileChunker::reassemble(&chunks).is_empty());
    }
}