// firecloud_storage/chunker.rs

//! FastCDC content-defined chunking

use crate::StorageResult;
use fastcdc::v2020::FastCDC;
use firecloud_core::{Chunk, ChunkHash, ChunkMetadata, CompressionType};
use std::io::Read;
use tracing::debug;

/// Size bounds (in bytes) handed to FastCDC when selecting chunk
/// boundaries. Expected ordering: `min_size <= avg_size <= max_size`.
#[derive(Debug, Clone)]
pub struct ChunkingConfig {
    /// Minimum chunk size in bytes
    pub min_size: u32,
    /// Average chunk size in bytes
    pub avg_size: u32,
    /// Maximum chunk size in bytes
    pub max_size: u32,
}

impl Default for ChunkingConfig {
    /// Defaults to a 64 KiB minimum, 1 MiB average, and 4 MiB maximum.
    fn default() -> Self {
        const KIB: u32 = 1024;
        ChunkingConfig {
            min_size: 64 * KIB,
            avg_size: KIB * KIB,
            max_size: 4 * KIB * KIB,
        }
    }
}

30/// Chunks files using FastCDC algorithm
31pub struct FileChunker {
32    config: ChunkingConfig,
33}
34
35impl FileChunker {
36    /// Create a new chunker with default config
37    pub fn new() -> Self {
38        Self {
39            config: ChunkingConfig::default(),
40        }
41    }
42
43    /// Create with custom config
44    pub fn with_config(config: ChunkingConfig) -> Self {
45        Self { config }
46    }
47
48    /// Chunk a file from a reader
49    pub fn chunk_reader<R: Read>(&self, mut reader: R) -> StorageResult<Vec<Chunk>> {
50        // Read entire file into memory (for MVP; streaming version later)
51        let mut data = Vec::new();
52        reader.read_to_end(&mut data)?;
53
54        self.chunk_bytes(&data)
55    }
56
57    /// Chunk raw bytes
58    pub fn chunk_bytes(&self, data: &[u8]) -> StorageResult<Vec<Chunk>> {
59        if data.is_empty() {
60            return Ok(Vec::new());
61        }
62
63        let chunker = FastCDC::new(
64            data,
65            self.config.min_size,
66            self.config.avg_size,
67            self.config.max_size,
68        );
69
70        let mut chunks = Vec::new();
71
72        for chunk_data in chunker {
73            let chunk_bytes = &data[chunk_data.offset..chunk_data.offset + chunk_data.length];
74            let hash = ChunkHash::hash(chunk_bytes);
75
76            debug!(
77                "Chunk: offset={}, size={}, hash={}",
78                chunk_data.offset,
79                chunk_data.length,
80                hash
81            );
82
83            let chunk = Chunk {
84                metadata: ChunkMetadata {
85                    hash,
86                    size: chunk_data.length as u64,
87                    original_size: chunk_data.length as u64,
88                    compression: CompressionType::None,
89                    encrypted: false,
90                },
91                data: bytes::Bytes::copy_from_slice(chunk_bytes),
92            };
93
94            chunks.push(chunk);
95        }
96
97        debug!("Chunked {} bytes into {} chunks", data.len(), chunks.len());
98
99        Ok(chunks)
100    }
101
102    /// Reassemble chunks back into original data
103    pub fn reassemble(chunks: &[Chunk]) -> bytes::Bytes {
104        let total_size: usize = chunks.iter().map(|c| c.data.len()).sum();
105        let mut result = Vec::with_capacity(total_size);
106
107        for chunk in chunks {
108            result.extend_from_slice(&chunk.data);
109        }
110
111        bytes::Bytes::from(result)
112    }
113}
114
115impl Default for FileChunker {
116    fn default() -> Self {
117        Self::new()
118    }
119}
120
#[cfg(test)]
mod tests {
    use super::*;

    /// A payload far smaller than `min_size` must come back as exactly
    /// one chunk whose bytes match the input.
    #[test]
    fn test_chunk_small_file() {
        let input = b"Hello, FireCloud! This is a test file.";
        let chunks = FileChunker::new().chunk_bytes(input).unwrap();

        assert_eq!(1, chunks.len());
        assert_eq!(input, chunks[0].data.as_ref());
    }

    /// Chunking followed by reassembly must be lossless.
    #[test]
    fn test_reassemble() {
        // 200 KB of a repeating byte pattern — above the default
        // min_size, so boundary selection actually runs.
        let original: Vec<u8> = (0..200_000).map(|i| (i % 256) as u8).collect();

        let chunks = FileChunker::new().chunk_bytes(&original).unwrap();
        let roundtrip = FileChunker::reassemble(&chunks);

        assert_eq!(original.as_slice(), roundtrip.as_ref());
    }

    /// Zero-length input produces zero chunks.
    #[test]
    fn test_empty_file() {
        assert!(FileChunker::new().chunk_bytes(&[]).unwrap().is_empty());
    }
}